Some applications can insert the text matched by the regex or by capturing groups converted to uppercase or lowercase. The Just Great Software applications allow you to prefix the matched text token \0 and the backreferences \1 through \99 with a letter that changes the case of the inserted text. U is for uppercase, L for lowercase, I for initial capitals (first letter of each word is uppercase, rest is lowercase), and F for first capital (first letter in the inserted text is uppercase, rest is lowercase). The letter only affects the case of the backreference that it is part of.
When the regex (?i)(Helló) (Wórld) matches HeLlÓ WóRlD
the replacement text \U1 \L2 \I0 \F0
becomes HELLÓ wórld Helló Wórld Helló wórld.
I edited my script in BBEdit and it worked fine.
Problems Solved 2017-11-22 22:56 GMT-0600
This has been a great thread, with two great solutions to two different problems.
Unfortunately, SD6 does NOT support case change in its Find RegEx; however, the question became the genesis for two other solutions.
Here’s a crude implementer of that kind of regex. As it stands, the back references in the replacement template must be case-changing ones, but I think the code could be modified to handle straight ones.
use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions
-- Run the demonstration; its return value becomes the result of the script.
main()

-- Demonstration driver: feeds a fixed sample text, ICU-style regex pattern and replacement template
-- to getEditedText() and returns the edited text.
on main()
	set sampleText to "HeLlÓ WóRlD"
	set samplePattern to "(?i)(Helló) (Wórld)"
	set sampleTemplate to "$U1 $L2 $I0 $F0 $0"
	return getEditedText(sampleText, samplePattern, sampleTemplate)
	
	(* -- Or from Script Debugger's script menu:
tell application "Script Debugger"
set userInput to paragraphs of text returned of (display dialog "Please enter an ICU-style regex pattern and replacement template, in that order, on separate lines. Back references in the template may include a case-change code (\"U\", \"L\", \"I\", or \"F\") between the \"$\" and capture group number:" default answer linefeed buttons {"Cancel", "OK"} cancel button 1 with title "Find and change case …" with icon note)
end tell
if (((count userInput) is not 2) or (userInput contains {""})) then error number -128
set {theRegexPattern, theReplacementTemplate} to userInput
tell application "Script Debugger" to set theText to source text of front document
set editedText to getEditedText(theText, theRegexPattern, theReplacementTemplate)
tell application "Script Debugger" to set source text of front document to editedText
*)
end main
(*
	getEditedText(theText, theRegexPattern, theReplacementTemplate)
	
	Returns, as AppleScript text, a copy of theText in which every match of theRegexPattern has been
	replaced with text built from theReplacementTemplate.
	
	theText                : the text to edit.
	theRegexPattern        : an ICU-style regex pattern, as accepted by NSRegularExpression.
	theReplacementTemplate : the replacement text. A back reference is a "$", an optional case code and a
	                         one- or two-digit capture group number (0 = the whole match). Case codes:
	                           U  the captured text in upper case
	                           L  the captured text in lower case
	                           I  the captured text with initial capitals (NSString's capitalizedString)
	                           F  first character upper case, the rest lower case
	                         A two-digit number which begins with "0" or exceeds the number of capture
	                         groups is read as a one-digit group number followed by a literal digit.
	                         A backslash makes the following character literal; a "$" preceded by a
	                         backslash is not treated as a back reference.
	
	Raises an error carrying NSRegularExpression's description if theRegexPattern is not a valid pattern.
	If nothing matches, the text is returned unchanged.
*)
on getEditedText(theText, theRegexPattern, theReplacementTemplate)
	set |⌘| to current application
	set stringBeingEdited to |⌘|'s class "NSMutableString"'s stringWithString:(theText)
	set theRegexPattern to |⌘|'s class "NSString"'s stringWithString:(theRegexPattern)
	set theReplacementTemplate to |⌘|'s class "NSString"'s stringWithString:(theReplacementTemplate)
	
	-- Compile the input regex. An invalid pattern gives missing value, so report it here rather than
	-- letting a later message to missing value fail with an unrelated error.
	set {mainRegex, regexError} to |⌘|'s class "NSRegularExpression"'s regularExpressionWithPattern:(theRegexPattern) options:(0) |error|:(reference)
	if (mainRegex is missing value) then error (regexError's localizedDescription() as text)
	
	-- Get the input regex's matches in the input text.
	set mainRegexMatches to mainRegex's matchesInString:(stringBeingEdited) options:(0) range:({0, stringBeingEdited's |length|()})
	set mainRegexMatchCount to (count mainRegexMatches)
	-- Act on any found.
	if (mainRegexMatchCount > 0) then
		-- Use another regex to find any back-references in the replacement template! (Unescaped dollar sign, optional case code, 1 or 2 digits.)
		set backReferenceRegex to |⌘|'s class "NSRegularExpression"'s regularExpressionWithPattern:("(?<!\\\\)(\\$[ULIF]?)([0-9]{1,2})") options:(0) |error|:(missing value)
		set backReferenceMatchesInTemplate to (backReferenceRegex's matchesInString:(theReplacementTemplate) options:(0) range:({0, theReplacementTemplate's |length|()})) as list
		-- If there are any, get their ranges in the template, their prefixes ("$" and any case-change code), and capture group numbers.
		set backReferenceCount to (count backReferenceMatchesInTemplate)
		if (backReferenceCount > 0) then
			set numberOfCaptureGroupsInMainRegex to (mainRegexMatches's firstObject()'s numberOfRanges()) - 1
			repeat with i from 1 to backReferenceCount
				set thisBackReferenceMatch to item i of backReferenceMatchesInTemplate
				set backReferenceRange to thisBackReferenceMatch's range()
				set backReferencePrefix to (theReplacementTemplate's substringWithRange:(thisBackReferenceMatch's rangeAtIndex:(1))) as text
				set backReferenceCaptureGroupNumber to (theReplacementTemplate's substringWithRange:(thisBackReferenceMatch's rangeAtIndex:(2))) as text
				-- Make adjustments if it seems that the capture group number obtained includes what should be a following literal digit.
				if (((count backReferenceCaptureGroupNumber) is 2) and ((backReferenceCaptureGroupNumber begins with "0") or ((backReferenceCaptureGroupNumber as integer) > numberOfCaptureGroupsInMainRegex))) then
					set backReferenceCaptureGroupNumber to backReferenceCaptureGroupNumber div 10 -- ie. the first digit as integer.
					set backReferenceRange to {backReferenceRange's location(), (backReferenceRange's |length|()) - 1} -- Exclude the second digit.
				else
					set backReferenceCaptureGroupNumber to backReferenceCaptureGroupNumber as integer
				end if
				-- Replace the match object in the list with the three values the substitution loop needs.
				set item i of backReferenceMatchesInTemplate to {backReferenceRange, backReferencePrefix, backReferenceCaptureGroupNumber}
			end repeat
		end if
		
		-- Perform the specified substitutions in the main text. Working from the last match to the first
		-- keeps the ranges of the matches still to be processed valid while the text changes length.
		repeat with i from mainRegexMatchCount to 1 by -1
			set thisRegexMatch to item i of mainRegexMatches
			-- The replacement string for each match of the main regex is derived from a copy of the replacement template.
			set matchReplacementString to theReplacementTemplate's mutableCopy()
			-- If the template contains any back references, replace each with the text to which it refers in the current match.
			-- (Last to first again, so that earlier template ranges stay valid.)
			repeat with j from backReferenceCount to 1 by -1
				-- Get this back reference's range in the template, its "prefix", and its capture group number.
				set {backReferenceRange, backReferencePrefix, backReferenceCaptureGroupNumber} to item j of backReferenceMatchesInTemplate
				-- Get the string matched by the capture group. A group which took no part in the match has a
				-- zero-length range whose location can't be used with substringWithRange:, so it contributes an empty string.
				set rangeMatchedByCaptureGroup to (thisRegexMatch's rangeAtIndex:(backReferenceCaptureGroupNumber))
				if ((|length| of rangeMatchedByCaptureGroup) = 0) then
					set stringMatchedByCaptureGroup to (|⌘|'s class "NSString"'s stringWithString:(""))
				else
					set stringMatchedByCaptureGroup to (stringBeingEdited's substringWithRange:(rangeMatchedByCaptureGroup))
				end if
				-- Apply any specified case change to it.
				if (backReferencePrefix ends with "U") then
					set stringMatchedByCaptureGroup to stringMatchedByCaptureGroup's uppercaseString()
				else if (backReferencePrefix ends with "L") then
					set stringMatchedByCaptureGroup to stringMatchedByCaptureGroup's lowercaseString()
				else if (backReferencePrefix ends with "I") then
					set stringMatchedByCaptureGroup to stringMatchedByCaptureGroup's capitalizedString()
				else if (backReferencePrefix ends with "F") then
					-- An empty capture has no first character to capitalise.
					if ((stringMatchedByCaptureGroup's |length|()) > 0) then
						-- Measure the first composed character sequence so that a multi-unit first character isn't split.
						set character1Length to (stringMatchedByCaptureGroup's rangeOfComposedCharacterSequenceAtIndex:(0))'s |length|()
						set initialCapital to (stringMatchedByCaptureGroup's substringToIndex:(character1Length))'s uppercaseString()
						set stringMatchedByCaptureGroup to (initialCapital's stringByAppendingString:((stringMatchedByCaptureGroup's substringFromIndex:(character1Length))'s lowercaseString()))
					end if
				end if
				-- Double any backslashes in the captured text so that the unescaping pass below restores
				-- them instead of treating them as template escapes and deleting them.
				set stringMatchedByCaptureGroup to (stringMatchedByCaptureGroup's stringByReplacingOccurrencesOfString:("\\") withString:("\\\\"))
				-- Replace the back reference in the replacement string with the possibly modified capture group text.
				tell matchReplacementString to replaceCharactersInRange:(backReferenceRange) withString:(stringMatchedByCaptureGroup)
			end repeat
			-- When all the back references have been so replaced, delete any escapement and replace the match in the main text with the completed replacement string.
			tell matchReplacementString to replaceOccurrencesOfString:("\\\\(.)") withString:("$1") options:(|⌘|'s NSRegularExpressionSearch) range:({0, its |length|()})
			tell stringBeingEdited to replaceCharactersInRange:(thisRegexMatch's range()) withString:(matchReplacementString)
		end repeat
	end if
	
	-- When all the pattern matches have been replaced, return the edited text.
	return stringBeingEdited as text
end getEditedText
Aha! I’d thought of that but had got it into my head there was no such thing! I’ve now edited the script above, but I’ll check it again in the morning when/if I’m properly awake.
OK. I’ve tidied up the code a bit and modified it so that it also handles back references not specifying case changes. To make it more ICU-like, back references in the replacement template must now be prefixed with dollar signs instead of with backslashes. If a back reference contains an apparent two-digit group number which is more than the number of capture groups, the number’s interpreted as being a one-digit group number followed by a literal digit character. Backslashes can now be used in the replacement template to indicate literal characters if required.
I have question about returning Capture Groups using ASObjC RegEx in general.
The above statement returns the CGs concatenated in a single string.
How can I return CGs as items in a List, as in: {$0, $1, $2}
Here is a handler for ASObjC RegEx, written by @ShaneStanley and @ccstone, that I have used. It is here where I’d like to get the CGs in a list.
-- Finds every match of thePattern in theString and returns a list containing, for each match, the text
-- produced by expanding templateStr (an NSRegularExpression replacement template such as "$1 $2")
-- against that match. Returns {} when nothing matches.
-- Raises an error carrying NSRegularExpression's description if thePattern is not a valid pattern.
on regexFindWithCapture:thePattern fromString:theString resultTemplate:templateStr
	set theString to current application's NSString's stringWithString:theString
	-- Ask for the NSError by reference so that an invalid pattern can be reported directly,
	-- instead of failing later when a message is sent to missing value.
	set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
	if theRegEx is missing value then error (theError's localizedDescription() as text)
	set theFinds to theRegEx's matchesInString:theString options:0 range:{0, theString's |length|()}
	set theResult to current application's NSMutableArray's array()
	repeat with aFind in theFinds
		-- Expand the template for this match only; the source string itself isn't modified.
		set foundString to (theRegEx's replacementStringForResult:aFind inString:theString |offset|:0 template:templateStr)
		(theResult's addObject:foundString)
	end repeat
	return theResult as list
end regexFindWithCapture:fromString:resultTemplate:
Here are two options. The first finds only the first match in the string, and returns a list of the contents of all capture groups for that match. The second finds all matches, so the result is a list of lists.
-- Finds the FIRST match of thePattern in theString and returns a list of texts: item 1 is the whole
-- match and the following items are the capture groups in order. A capture group with a zero-length
-- range (for instance an optional group which took no part in the match) is returned as "".
-- Returns {} when there is no match.
-- Raises an error carrying NSRegularExpression's description if thePattern is not a valid pattern.
on regexFindCaptures:thePattern inString:theString
	set theString to current application's NSString's stringWithString:theString
	set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
	if theRegEx is missing value then error theError's localizedDescription() as text
	set theFind to theRegEx's firstMatchInString:theString options:0 range:{0, theString's |length|()}
	set theResult to current application's NSMutableArray's array()
	-- firstMatchInString: gives missing value when nothing matches; report that as an empty list.
	if theFind is missing value then return theResult as list
	set groupCount to theFind's numberOfRanges()
	repeat with i from 0 to (groupCount - 1)
		set theRange to (theFind's rangeAtIndex:i)
		if |length| of theRange = 0 then
			-- Keep a placeholder so that the remaining groups stay at their expected positions.
			(theResult's addObject:"")
		else
			(theResult's addObject:(theString's substringWithRange:theRange))
		end if
	end repeat
	return theResult as list
end regexFindCaptures:inString:
And:
-- Finds ALL matches of thePattern in theString and returns a list of lists, one sublist per match.
-- In each sublist, item 1 is the whole match and the following items are the capture groups in order;
-- a group whose range has zero length is represented by "".
-- Raises an error carrying NSRegularExpression's description if thePattern can't be compiled.
on regexFindAllCaptures:thePattern inString:theString
	set |⌘| to current application
	set sourceString to |⌘|'s NSString's stringWithString:theString
	set {compiledRegex, compileError} to |⌘|'s NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
	if compiledRegex is missing value then error compileError's localizedDescription() as text
	set wholeRange to {0, sourceString's |length|()}
	set allMatches to compiledRegex's matchesInString:sourceString options:0 range:wholeRange
	set matchLists to |⌘|'s NSMutableArray's array()
	repeat with thisMatch in allMatches
		set captureList to |⌘|'s NSMutableArray's array()
		set rangeCount to thisMatch's numberOfRanges()
		repeat with rangeNumber from 1 to rangeCount
			-- Range indices are zero-based: 0 is the whole match.
			set groupRange to (thisMatch's rangeAtIndex:(rangeNumber - 1))
			if (|length| of groupRange) is not 0 then
				(captureList's addObject:(sourceString's substringWithRange:groupRange))
			else
				(captureList's addObject:"")
			end if
		end repeat
		(matchLists's addObject:captureList)
	end repeat
	return matchLists as list
end regexFindAllCaptures:inString:
Here’s a demo/test case for anyone who is interested:
use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions
-- Demo: collect the whole match and the three date components of every date in the sample text.
set sourceStr to " 2016-01-11 and some text and 2017-02-22"
set reFind to "(\\d{4})-(\\d{2})-(\\d{2})"
set reResults to my regexFindAllCaptures:reFind inString:sourceStr
-->{{"2016-01-11", "2016", "01", "11"}, {"2017-02-22", "2017", "02", "22"}}

-- Finds ALL matches of thePattern in theString and returns a list of lists, one sublist per match.
-- In each sublist, item 1 is the whole match and the following items are the capture groups in order.
-- A capture group with a zero-length range (for instance an optional group which took no part in the
-- match) is returned as "" so that the other groups keep their positions.
-- Raises an error carrying NSRegularExpression's description if thePattern is not a valid pattern.
on regexFindAllCaptures:thePattern inString:theString
	set theString to current application's NSString's stringWithString:theString
	-- Ask for the NSError by reference so that an invalid pattern is reported directly.
	set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
	if theRegEx is missing value then error (theError's localizedDescription() as text)
	set theFinds to theRegEx's matchesInString:theString options:0 range:{0, theString's |length|()}
	set theResult to current application's NSMutableArray's array()
	repeat with aFind in theFinds
		set subResult to current application's NSMutableArray's array()
		set groupCount to aFind's numberOfRanges()
		repeat with i from 0 to (groupCount - 1)
			set theRange to (aFind's rangeAtIndex:i)
			if |length| of theRange = 0 then
				-- The range of an unmatched group can't be passed to substringWithRange:.
				(subResult's addObject:"")
			else
				(subResult's addObject:(theString's substringWithRange:theRange))
			end if
		end repeat
		(theResult's addObject:subResult)
	end repeat
	return theResult as list
end regexFindAllCaptures:inString:
OK, thanks again Shane.
Here is my merge and refactor of Shane’s handlers into one handler.
I added a bit of error handling.
If anyone sees issues or has suggestions, please post.
use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions
property LF : linefeed -- line separator used when building error messages

set sourceStr to " 2016-01-11 and some text and 2017-02-22"
set reFind to "(\\d{4})-(\\d{2})-(\\d{2})"

set reFirstMatch to my regexFindCaptures:reFind inString:sourceStr findAll:false
-->{"2016-01-11", "2016", "01", "11"}

set reAllMatches to my regexFindCaptures:reFind inString:sourceStr findAll:true
-->{{"2016-01-11", "2016", "01", "11"}, {"2017-02-22", "2017", "02", "22"}}

--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
on regexFindCaptures:thePattern inString:theString findAll:theFindAll
	----------------------------------------------------------
	-- Find Match & Return Match with All Capture Groups as list
	-- if theFindAll is true, then Find ALL matches (Global) & return as list of lists
	-- else Find FIRST match, & return simple list of match & capture groups
	-- If NO matches, return empty list {}
	-- If an Optional Capture Group is not found/matched, it is returned as empty string ""
	-- On any error (including an invalid RegEx pattern) the error message is copied to the
	-- clipboard and the error is raised again for the caller.
	-- This handler is a merge and refactor of the two handlers provided by @ShaneStanley
	-- (http://forum.latenightsw.com//t/does-sd6-find-regex-support-case-change/816/8)
	--
	-- All errors are mine.
	local theFinds, theResult, subResult, groupCount
	try
		set theString to current application's NSString's stringWithString:theString
		-- Ask for the NSError by reference so that an invalid pattern is reported as such.
		set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
		if theRegEx is missing value then error ("Invalid RegEx Pattern:" & LF & thePattern & LF & LF & "ASObjC Error: " & (theError's localizedDescription() as text))
		-- Created before branching so that "no match" always yields an empty list.
		set theResult to current application's NSMutableArray's array()
		--------------------------------------------------
		if (theFindAll) then ### FIND ALL MATCHES ###
			-------------------------------------------------
			set theFinds to theRegEx's matchesInString:theString options:0 range:{0, theString's |length|()}
			repeat with aFind in theFinds
				set subResult to current application's NSMutableArray's array()
				set groupCount to aFind's numberOfRanges()
				repeat with i from 0 to (groupCount - 1)
					set theRange to (aFind's rangeAtIndex:i)
					if |length| of theRange = 0 then
						-- Unmatched group: its range can't be passed to substringWithRange:.
						(subResult's addObject:"")
					else
						(subResult's addObject:(theString's substringWithRange:theRange))
					end if
				end repeat
				(theResult's addObject:subResult)
			end repeat -- theFinds
			--------------------------------------------
		else ### FIND FIRST MATCH ###
			-------------------------------------------
			set theFind to theRegEx's firstMatchInString:theString options:0 range:{0, theString's |length|()}
			-- firstMatchInString: gives missing value when nothing matches.
			if theFind is not missing value then
				set groupCount to theFind's numberOfRanges()
				repeat with i from 0 to (groupCount - 1)
					set theRange to (theFind's rangeAtIndex:i)
					if |length| of theRange = 0 then
						(theResult's addObject:"")
					else
						(theResult's addObject:(theString's substringWithRange:theRange))
					end if
				end repeat
			end if
		end if -- (theFindAll)
	on error errMsg number errNum
		-- Re-raise every error; swallowing one would leave theResult undefined at the return below.
		set errMsg to "ASObjC RegEx ERROR #" & errNum & LF & errMsg
		set the clipboard to errMsg
		error errMsg
	end try
	return theResult as list
end regexFindCaptures:inString:findAll:
You can actually trap directly for that. Change this:
set theRegEx to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(missing value)
To this:
set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
if theRegEx is missing value then error theError's localizedDescription() as text
The other place it could go wrong is if there’s no match for a particular group. You could also catch that by changing this:
set theRange to theFind's rangeAtIndex:i
if |length| of theRange > 0 then -- skip if not found
(theResult's addObject:(theString's substringWithRange:theRange))
end if
Following this pattern for the regexFindCaptures for first match, I made this change to the find all matches block:
repeat with i from 0 to (groupCount - 1)
### (subResult's addObject:(theString's substringWithRange:(aFind's rangeAtIndex:i)))
set theRange to (aFind's rangeAtIndex:i)
if |length| of theRange > 0 then -- skip if not found
(subResult's addObject:(theString's substringWithRange:theRange))
end if
end repeat
Seems to work OK for successful finds. Not sure when it is supposed to go through the if block.
Group 3 in Match 1 of your example is just such a case. Before my fix, this resulted in no results for the second case.
However, my proposed “fix” is just skipping the empty case, whereas it should really be inserting an empty string for it, so any others are still in the correct order. So the relevant part should be:
repeat with i from 0 to (groupCount - 1)
set theRange to (aFind's rangeAtIndex:i)
if |length| of theRange = 0 then
(subResult's addObject:"")
else
(subResult's addObject:(theString's substringWithRange:theRange))
end if
end repeat
(The problem stems from the fact that the range for Group 3 in Match 1 of your example is returned as {location:NSNotFound, length:0}, and NSNotFound is too big an integer for AppleScript to cope with.)
OK. Great!!! It works perfectly. I believe we have a gold release candidate (unless you find more issues).
Here’s the script with all of the changes Shane has provided:
use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions
property LF : linefeed -- line separator used when building error messages

set sourceStr to " 2016-01-11 and some text and 2017-02A-22"
set reFind to "(\\d{4})-(\\d{2})(A)?-(\\d{2})"

set reFirstMatch to my regexFindCaptures:reFind inString:sourceStr findAll:false
-->{"2016-01-11", "2016", "01", "", "11"}

set reAllMatches to my regexFindCaptures:reFind inString:sourceStr findAll:true
-->{{"2016-01-11", "2016", "01", "", "11"}, {"2017-02A-22", "2017", "02", "A", "22"}}

--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
on regexFindCaptures:thePattern inString:theString findAll:theFindAll
	----------------------------------------------------------
	-- Find Match & Return Match with All Capture Groups as list
	-- if theFindAll is true, then Find ALL matches (Global) & return as list of lists
	-- else Find FIRST match, & return simple list of match & capture groups
	-- If NO matches, return empty list {}
	-- If an Optional Capture Group is not found/matched, it is returned as empty string ""
	-- On any error (including an invalid RegEx pattern) the message is copied to the clipboard,
	-- shown in a dialog, and the error is raised again for the caller.
	-- This handler is a merge and refactor of the two handlers provided by @ShaneStanley
	-- (http://forum.latenightsw.com//t/does-sd6-find-regex-support-case-change/816/8)
	--
	-- All errors are mine.
	local theFinds, theResult, subResult, groupCount
	try
		set theString to current application's NSString's stringWithString:theString
		-- Ask for the NSError by reference so that an invalid pattern is reported as such.
		set {theRegEx, theError} to current application's NSRegularExpression's regularExpressionWithPattern:thePattern options:0 |error|:(reference)
		if theRegEx is missing value then error ("Invalid RegEx Pattern." & LF & theError's localizedDescription() as text)
		-- Created before branching so that "no match" always yields an empty list.
		set theResult to current application's NSMutableArray's array()
		--------------------------------------------------
		if (theFindAll) then ### FIND ALL MATCHES ###
			-------------------------------------------------
			set theFinds to theRegEx's matchesInString:theString options:0 range:{0, theString's |length|()}
			repeat with aFind in theFinds
				set subResult to current application's NSMutableArray's array()
				set groupCount to aFind's numberOfRanges()
				repeat with i from 0 to (groupCount - 1)
					set theRange to (aFind's rangeAtIndex:i)
					if |length| of theRange = 0 then
						--- Optional Capture Group was NOT Matched ---
						(subResult's addObject:"")
					else
						--- Capture Group was Matched ---
						(subResult's addObject:(theString's substringWithRange:theRange))
					end if
				end repeat
				(theResult's addObject:subResult)
			end repeat -- theFinds
			--------------------------------------------
		else ### FIND FIRST MATCH ###
			-------------------------------------------
			set theFind to theRegEx's firstMatchInString:theString options:0 range:{0, theString's |length|()}
			-- firstMatchInString: gives missing value when nothing matches; leave the result empty
			-- in that case instead of sending numberOfRanges() to missing value.
			if theFind is not missing value then
				set groupCount to theFind's numberOfRanges()
				repeat with i from 0 to (groupCount - 1)
					set theRange to (theFind's rangeAtIndex:i)
					if |length| of theRange = 0 then
						--- Optional Capture Group was NOT Matched ---
						(theResult's addObject:"")
					else
						--- Capture Group was Matched ---
						(theResult's addObject:(theString's substringWithRange:theRange))
					end if
				end repeat
			end if
		end if -- (theFindAll)
	on error errMsg number errNum
		set errMsg to "ASObjC RegEx ERROR #" & errNum & LF & errMsg
		set the clipboard to errMsg
		display dialog errMsg with title (name of me) with icon stop
		error errMsg
	end try
	return theResult as list
end regexFindCaptures:inString:findAll:
I’ve made some further changes to my script in post 3 which fix a couple of bugs, move the analysis of the replacement template to before the main repeat, correct some comments and text which should have been changed in the previous update, and rename a handler and some variables to try and make it clearer what they represent!