From 418671fdb06c1920414056f9b47245aa062f7b6f Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Wed, 25 Mar 2009 11:29:11 +0000 Subject: Jon Strait: 26778, 26781: extra options for PCRE matching --- ChangeLog | 5 ++++- Doc/Zsh/mod_pcre.yo | 37 ++++++++++++++++++++++++++++++++-- Src/Modules/pcre.c | 57 ++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 87 insertions(+), 12 deletions(-) diff --git a/ChangeLog b/ChangeLog index 52e2d6dd7..31f4d6a45 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2009-03-25 Peter Stephenson + * Jon Strait: 26778, 26781: Doc/Zsh/mod_pcre.yo, + Src/Modules/pcre.c: a couple of extra options for PCRE matching. + * Michael Hwang: 26776: Src/builtin.c: improved column alignment with print -c -P. @@ -11487,5 +11490,5 @@ ***************************************************** * This is used by the shell to define $ZSH_PATCHLEVEL -* $Revision: 1.4636 $ +* $Revision: 1.4637 $ ***************************************************** diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo index 33b864478..9b8d9d6a7 100644 --- a/Doc/Zsh/mod_pcre.yo +++ b/Doc/Zsh/mod_pcre.yo @@ -6,7 +6,7 @@ The tt(zsh/pcre) module makes some commands available as builtins: startitem() findex(pcre_compile) -item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))( +item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))( Compiles a perl-compatible regular expression. Option tt(-a) will force the pattern to be anchored. @@ -15,6 +15,8 @@ Option tt(-m) will compile a multi-line pattern; that is, tt(^) and tt($) will match newlines within the pattern. Option tt(-x) will compile an extended pattern, wherein whitespace and tt(#) comments are ignored. +Option tt(-s) makes the dot metacharacter match all characters, +including those that indicate newline. ) findex(pcre_study) item(tt(pcre_study))( @@ -22,7 +24,8 @@ Studies the previously-compiled PCRE which may result in faster matching. 
) findex(pcre_match) -item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))( +item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \ +[ tt(-n) var(offset) ] [ tt(-b) ] var(string))( Returns successfully if tt(string) matches the previously-compiled PCRE. @@ -35,6 +38,36 @@ var(MATCH) will be set to the entire matched portion of the string, unless the tt(-v) option is given, in which case the variable var(var) will be set. No variables are altered if there is no successful match. +The tt(-n) option starts searching for a match from the +byte var(offset) position in var(string). If the tt(-b) option is given, +the variable var(ZPCRE_OP) will be set to an offset pair string, +representing the byte offset positions of the entire matched portion +within var(string). For example, a var(ZPCRE_OP) set to "32 45" indicates +that the matched portion began at byte offset 32 and ended at byte offset 44. +Here, byte offset position 45 is the position directly after the matched +portion. Keep in mind that the byte position isn't necessarily the same +as the character position when UTF-8 characters are involved. +Consequently, the byte offset positions are only to be relied on in the +context of using them for subsequent searches on var(string), using an offset +position as an argument to the tt(-n) option. This is mostly +used to implement the "find all non-overlapping matches" functionality. + +A simple example of "find all non-overlapping matches": + +example( +string="The following zip codes: 78884 90210 99513" +pcre_compile -m "\d{5}" +accum=() +pcre_match -b -- $string +while [[ $? 
-eq 0 ]] do + b=($=ZPCRE_OP) + accum+=$MATCH + pcre_match -b -n $b[2] -- $string +done +print -l $accum + + +) ) enditem() diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c index 4f8daff80..08205d144 100644 --- a/Src/Modules/pcre.c +++ b/Src/Modules/pcre.c @@ -82,6 +82,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func)) if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS; if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE; if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED; + if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL; if (zpcre_utf8_enabled()) pcre_opts |= PCRE_UTF8; @@ -137,9 +138,11 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f /**/ static int -zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr) +zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, + int want_offset_pair, int matchedinarr) { char **captures, *match_all, **matches; + char offset_all[50]; int capture_start = 1; if (matchedinarr) @@ -148,9 +151,14 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr matchvar = "MATCH"; if (substravar == NULL) substravar = "match"; - + /* captures[0] will be entire matched string, [1] first substring */ - if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) { + if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) { + /* Set to the offsets of the complete match */ + if (want_offset_pair) { + sprintf(offset_all, "%d %d", ovec[0], ovec[1]); + setsparam("ZPCRE_OP", ztrdup(offset_all)); + } match_all = ztrdup(captures[0]); setsparam(matchvar, match_all); matches = zarrdup(&captures[capture_start]); @@ -161,6 +169,22 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr return 0; } +/**/ +static int +getposint(char *instr, char *nam) +{ + char *eptr; + int ret; + + ret = (int)zstrtol(instr, &eptr, 10); + if (*eptr || ret < 
0) { + zwarnnam(nam, "integer expected: %s", instr); + return -1; + } + + return ret; +} + /**/ static int bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) @@ -169,6 +193,10 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) char *matched_portion = NULL; char *receptacle = NULL; int return_value = 1; + /* The subject length and offset start are both int values in pcre_exec */ + int subject_len; + int offset_start = 0; + int want_offset_pair = 0; if (pcre_pattern == NULL) { zwarnnam(nam, "no pattern has been compiled"); @@ -181,6 +209,12 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) if(OPT_HASARG(ops,c='v')) { matched_portion = OPT_ARG(ops,c); } + if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */ + offset_start = getposint(OPT_ARG(ops,c), nam); + } + /* With -b, also report the byte offsets of the entire match in ZPCRE_OP */ + if(OPT_ISSET(ops,'b')) want_offset_pair = 1; + if(!*args) { zwarnnam(nam, "not enough arguments"); } @@ -194,12 +228,17 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) ovecsize = (capcount+1)*3; ovec = zalloc(ovecsize*sizeof(int)); - ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize); - + subject_len = (int)strlen(*args); + + if (offset_start < 0 || (offset_start > 0 && offset_start >= subject_len)) + ret = PCRE_ERROR_NOMATCH; + else + ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize); + if (ret==0) return_value = 0; else if (ret==PCRE_ERROR_NOMATCH) /* no match */; else if (ret>0) { - zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0); + zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0); return_value = 0; } else { @@ -258,7 +297,7 @@ cond_pcre_match(char **a, int id) break; } else if (r>0) { - zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH)); + 
zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH)); return_value = 1; break; } @@ -289,8 +328,8 @@ static struct conddef cotab[] = { #endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */ static struct builtin bintab[] = { - BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx", NULL), - BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:", NULL), + BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL), + BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL), BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL) }; -- cgit 1.4.1