diff options
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | Doc/Zsh/cond.yo | 24 | ||||
-rw-r--r-- | Src/Modules/pcre.c | 79 | ||||
-rw-r--r-- | Src/Modules/regex.c | 56 | ||||
-rw-r--r-- | Test/C02cond.ztst | 33 |
5 files changed, 188 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog index 35600b5e1..530decde1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2010-01-17 Peter Stephenson <p.w.stephenson@ntlworld.com> + + * 27600: Doc/Zsh/cond.yo, Src/Modules/pcre.c, Src/Modules/regex.c, + Test/C02cond.ztst: extend =~ syntax to set MBEGIN and MEND + with MATCH and mbegin and mend with match. + 2010-01-16 Peter Stephenson <p.w.stephenson@ntlworld.com> * Frank: 27599: Doc/Zsh/contrib.yo: fix formatting documentation @@ -12585,5 +12591,5 @@ ***************************************************** * This is used by the shell to define $ZSH_PATCHLEVEL -* $Revision: 1.4862 $ +* $Revision: 1.4863 $ ***************************************************** diff --git a/Doc/Zsh/cond.yo b/Doc/Zsh/cond.yo index 9d8f145f8..4b7304407 100644 --- a/Doc/Zsh/cond.yo +++ b/Doc/Zsh/cond.yo @@ -117,13 +117,29 @@ the tt(zsh/pcre) module, else it is tested as a POSIX extended regular expression using the tt(zsh/regex) module. Upon successful match, some variables will be updated; no variables are changed if the matching fails. + +If the option tt(BASH_REMATCH) is not set the scalar parameter +tt(MATCH) is set to the substring that matched the pattern and +the integer parameters tt(MBEGIN) and tt(MEND) to the index of the start +and end, respectively, of the match in var(string), such that if +var(string) is contained in variable tt(var) the expression +`${var[$MBEGIN,$MEND]}' is identical to `$MATCH'. The setting +of the option tt(KSH_ARRAYS) is respected. Likewise, the array +tt(match) is set to the substrings that matched parenthesised +subexpressions and the arrays tt(mbegin) and tt(mend) to the indices of +the start and end positions, respectively, of the substrings within +var(string). The arrays are not set if there were no parenthesised +subexpressions. 
For example, if the string `tt(a short string)' is matched +against the regular expression `tt(s(...)t)', then (assuming the option +tt(KSH_ARRAYS) is not set) tt(MATCH), tt(MBEGIN) +and tt(MEND) are `tt(short)', 3 and 7, respectively, while tt(match), +tt(mbegin) and tt(mend) are single entry arrays containing +the strings `tt(hor)', `tt(4)' and `tt(6)', respectively. + If the option tt(BASH_REMATCH) is set the array tt(BASH_REMATCH) is set to the substring that matched the pattern followed by the substrings that matched parenthesised -subexpressions within the pattern; otherwise, the scalar parameter -tt(MATCH) is set to the substring that matched the pattern and -and the array tt(match) to the substrings that matched parenthesised -subexpressions. +subexpressions within the pattern. ) item(var(string1) tt(<) var(string2))( true if var(string1) comes before var(string2) diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c index 08205d144..f8b79adea 100644 --- a/Src/Modules/pcre.c +++ b/Src/Modules/pcre.c @@ -138,8 +138,9 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f /**/ static int -zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, - int want_offset_pair, int matchedinarr) +zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, + char *substravar, int want_offset_pair, int matchedinarr, + int want_begin_end) { char **captures, *match_all, **matches; char offset_all[50]; @@ -154,6 +155,7 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr /* captures[0] will be entire matched string, [1] first substring */ if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) { + int nelem = arrlen(captures)-1; /* Set to the offsets of the complete match */ if (want_offset_pair) { sprintf(offset_all, "%d %d", ovec[0], ovec[1]); @@ -161,8 +163,70 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr } match_all = 
ztrdup(captures[0]); setsparam(matchvar, match_all); - matches = zarrdup(&captures[capture_start]); - setaparam(substravar, matches); + /* + * If we're setting match, mbegin, mend we only do + * so if there were parenthesised matches, for consistency + * (c.f. regex.c). + */ + if (!want_begin_end || nelem) { + matches = zarrdup(&captures[capture_start]); + setaparam(substravar, matches); + } + + if (want_begin_end) { + char *ptr = arg; + zlong offs = 0; + + /* Count the characters before the match */ + MB_METACHARINIT(); + while (ptr < arg + ovec[0]) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + setiparam("MBEGIN", offs + !isset(KSHARRAYS)); + /* Add on the characters in the match */ + while (ptr < arg + ovec[1]) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + setiparam("MEND", offs + !isset(KSHARRAYS) - 1); + if (nelem) { + char **mbegin, **mend, **bptr, **eptr; + int i, *ipair; + + bptr = mbegin = zalloc(nelem+1); + eptr = mend = zalloc(nelem+1); + + for (ipair = ovec + 2, i = 0; + i < nelem; + ipair += 2, i++, bptr++, eptr++) + { + char buf[DIGBUFSIZE]; + ptr = arg; + offs = 0; + /* Find the start offset */ + MB_METACHARINIT(); + while (ptr < arg + ipair[0]) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + convbase(buf, offs + !isset(KSHARRAYS), 10); + *bptr = ztrdup(buf); + /* Continue to the end offset */ + while (ptr < arg + ipair[1]) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + convbase(buf, offs + !isset(KSHARRAYS) - 1, 10); + *eptr = ztrdup(buf); + } + *bptr = *eptr = NULL; + + setaparam("mbegin", mbegin); + setaparam("mend", mend); + } + } + pcre_free_substring_list((const char **)captures); } @@ -238,7 +302,8 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) if (ret==0) return_value = 0; else if (ret==PCRE_ERROR_NOMATCH) /* no match */; else if (ret>0) { - zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0); + zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, + 
want_offset_pair, 0, 0); return_value = 0; } else { @@ -297,7 +362,9 @@ cond_pcre_match(char **a, int id) break; } else if (r>0) { - zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH)); + zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, + isset(BASHREMATCH), + !isset(BASHREMATCH)); return_value = 1; break; } diff --git a/Src/Modules/regex.c b/Src/Modules/regex.c index 8a9f3e608..25dbddf07 100644 --- a/Src/Modules/regex.c +++ b/Src/Modules/regex.c @@ -108,11 +108,65 @@ zcond_regex_match(char **a, int id) if (isset(BASHREMATCH)) { setaparam("BASH_REMATCH", arr); } else { + zlong offs; + char *ptr; + m = matches; s = ztrduppfx(lhstr + m->rm_so, m->rm_eo - m->rm_so); setsparam("MATCH", s); - if (nelem) + /* + * Count the characters before the match. + */ + ptr = lhstr; + offs = 0; + MB_METACHARINIT(); + while (ptr < lhstr + m->rm_so) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + setiparam("MBEGIN", offs + !isset(KSHARRAYS)); + /* + * Add on the characters in the match. + */ + while (ptr < lhstr + m->rm_eo) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + setiparam("MEND", offs + !isset(KSHARRAYS) - 1); + if (nelem) { + char **mbegin, **mend, **bptr, **eptr; + bptr = mbegin = (char **)zalloc(nelem+1); + eptr = mend = (char **)zalloc(nelem+1); + + for (m = matches + start, n = start; + n <= (int)re.re_nsub; + ++n, ++m, ++bptr, ++eptr) + { + char buf[DIGBUFSIZE]; + ptr = lhstr; + offs = 0; + /* Find the start offset */ + MB_METACHARINIT(); + while (ptr < lhstr + m->rm_so) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + convbase(buf, offs + !isset(KSHARRAYS), 10); + *bptr = ztrdup(buf); + /* Continue to the end offset */ + while (ptr < lhstr + m->rm_eo) { + offs++; + ptr += MB_METACHARLEN(ptr); + } + convbase(buf, offs + !isset(KSHARRAYS) - 1, 10); + *eptr = ztrdup(buf); + } + *bptr = *eptr = NULL; + setaparam("match", arr); + setaparam("mbegin", mbegin); + setaparam("mend", mend); + } } } else diff --git a/Test/C02cond.ztst b/Test/C02cond.ztst index 
de82dcbe2..b0e278f4b 100644 --- a/Test/C02cond.ztst +++ b/Test/C02cond.ztst @@ -251,6 +251,39 @@ F:Failures in these cases do not indicate a problem in the shell. fi 0:regex tests shouldn't crash + if zmodload -i zsh/regex 2>/dev/null; then + string="this has stuff in it" + bad_regex=0 + if [[ $string =~ "h([a-z]*) s([a-z]*) " ]]; then + if [[ "$MATCH $MBEGIN $MEND" != "has stuff 6 15" ]]; then + print -r "regex variables MATCH MBEGIN MEND: + '$MATCH $MBEGIN $MEND' + should be: + 'has stuff 6 15'" >&2 + bad_regex=1 + else + results=("as 7 8" "tuff 11 14") + for i in 1 2; do + if [[ "$match[$i] $mbegin[$i] $mend[$i]" != $results[i] ]]; then + print -r "regex variables match[$i] mbegin[$i] mend[$i]: + '$match[$i] $mbegin[$i] $mend[$i]' + should be + '$results[$i]'" >&2 + break + fi + done + fi + else + print -r "regex failed to match '$string'" >&2 + fi + (( bad_regex )) || print OK + else + # if it didn't load, tough, but not a test error + print OK + fi +0:MATCH, MBEGIN, MEND, match, mbegin, mend +>OK + %clean # This works around a bug in rm -f in some versions of Cygwin chmod 644 unmodish |