diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | Src/Modules/pcre.c | 61 | ||||
-rw-r--r-- | Test/V07pcre.ztst | 106 |
3 files changed, 160 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog index 934fb881f..47fc89954 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2011-10-24 Phil Pennock <pdpennock@users.sourceforge.net> + + * 29838: Src/Modules/pcre.c: metafy/unmetafy strings, to + correctly handle non-ASCII characters in UTF-8 for regexp + matches. + + * unposted: Test/V07pcre.ztst: some PCRE tests + 2011-10-23 Peter Stephenson <p.w.stephenson@ntlworld.com> * users/16492: MACHINES: OpenIndiana issue. @@ -15484,5 +15492,5 @@ ***************************************************** * This is used by the shell to define $ZSH_PATCHLEVEL -* $Revision: 1.5481 $ +* $Revision: 1.5482 $ ***************************************************** diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c index e1a897944..e36013163 100644 --- a/Src/Modules/pcre.c +++ b/Src/Modules/pcre.c @@ -77,6 +77,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func)) { int pcre_opts = 0, pcre_errptr; const char *pcre_error; + char *target; if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED; if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS; @@ -92,8 +93,13 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func)) if (pcre_pattern) pcre_free(pcre_pattern); - pcre_pattern = pcre_compile(*args, pcre_opts, &pcre_error, &pcre_errptr, NULL); + target = ztrdup(*args); + unmetafy(target, NULL); + + pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL); + free(target); + if (pcre_pattern == NULL) { zwarnnam(nam, "error in regex: %s", pcre_error); @@ -161,7 +167,7 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, sprintf(offset_all, "%d %d", ovec[0], ovec[1]); setsparam("ZPCRE_OP", ztrdup(offset_all)); } - match_all = ztrdup(captures[0]); + match_all = metafy(captures[0], -1, META_DUP); setsparam(matchvar, match_all); /* * If we're setting match, mbegin, mend we only do @@ -169,7 +175,15 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, * (c.f. regex.c). */ if (!want_begin_end || nelem) { - matches = zarrdup(&captures[capture_start]); + char **x, **y; + y = &captures[capture_start]; + matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1)); + do { + if (*y) + *x++ = metafy(*y, -1, META_DUP); + else + *x++ = NULL; + } while (*y++); setaparam(substravar, matches); } @@ -255,6 +269,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) { int ret, capcount, *ovec, ovecsize, c; char *matched_portion = NULL; + char *plaintext = NULL; char *receptacle = NULL; int return_value = 1; /* The subject length and offset start are both int values in pcre_exec */ @@ -278,7 +293,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) } /* For the entire match, 'Return' the offset byte positions instead of the matched string */ if(OPT_ISSET(ops,'b')) want_offset_pair = 1; - + if(!*args) { zwarnnam(nam, "not enough arguments"); } @@ -288,26 +303,28 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func)) zwarnnam(nam, "error %d in fullinfo", ret); return 1; } - + ovecsize = (capcount+1)*3; ovec = zalloc(ovecsize*sizeof(int)); - - subject_len = (int)strlen(*args); + + plaintext = ztrdup(*args); + unmetafy(plaintext, NULL); + subject_len = (int)strlen(plaintext); if (offset_start < 0 || offset_start >= subject_len) ret = PCRE_ERROR_NOMATCH; else - ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize); + ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize); if (ret==0) return_value = 0; else if (ret==PCRE_ERROR_NOMATCH) /* no match */; else if (ret>0) { - zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, + zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle, want_offset_pair, 0, 0); return_value = 0; } else { - zwarnnam(nam, "error in pcre_exec"); + zwarnnam(nam, "error in pcre_exec [%d]", ret); } if (ovec) @@ -322,7 +339,8 @@ cond_pcre_match(char **a, int id) { pcre *pcre_pat; const char *pcre_err; - char *lhstr, *rhre, *avar=NULL; + char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL; + char *p; int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize; int return_value = 0; @@ -331,6 +349,10 @@ cond_pcre_match(char **a, int id) lhstr = cond_str(a,0,0); rhre = cond_str(a,1,0); + lhstr_plain = ztrdup(lhstr); + rhre_plain = ztrdup(rhre); + unmetafy(lhstr_plain, NULL); + unmetafy(rhre_plain, NULL); pcre_pat = NULL; ov = NULL; @@ -339,7 +361,7 @@ cond_pcre_match(char **a, int id) switch(id) { case CPCRE_PLAIN: - pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL); + pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL); if (pcre_pat == NULL) { zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err); break; @@ -347,7 +369,7 @@ cond_pcre_match(char **a, int id) pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt); ovsize = (capcnt+1)*3; ov = zalloc(ovsize*sizeof(int)); - r = pcre_exec(pcre_pat, NULL, lhstr, strlen(lhstr), 0, 0, ov, ovsize); + r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize); /* r < 0 => error; r==0 match but not enough size in ov * r > 0 => (r-1) substrings found; r==1 => no substrings */ @@ -356,13 +378,16 @@ cond_pcre_match(char **a, int id) return_value = 1; break; } - else if (r==PCRE_ERROR_NOMATCH) return 0; /* no match */ + else if (r==PCRE_ERROR_NOMATCH) { + return_value = 0; /* no match */ + break; + } else if (r<0) { - zwarn("pcre_exec() error: %d", r); + zwarn("pcre_exec() error [%d]", r); break; } else if (r>0) { - zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, + zpcre_get_substrings(lhstr_plain, ov, r, NULL, avar, 0, isset(BASHREMATCH), !isset(BASHREMATCH)); return_value = 1; @@ -371,6 +396,10 @@ cond_pcre_match(char **a, int id) break; } + if (lhstr_plain) + free(lhstr_plain); + if(rhre_plain) + free(rhre_plain); if (pcre_pat) pcre_free(pcre_pat); if (ov) diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst new file mode 100644 index 000000000..4dd173557 --- /dev/null +++ b/Test/V07pcre.ztst @@ -0,0 +1,106 @@ +%prep + + zmodload zsh/pcre + setopt rematch_pcre +# Find a UTF-8 locale. + setopt multibyte +# Don't let LC_* override our choice of locale. + unset -m LC_\* + mb_ok= + langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 + $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) + for LANG in $langs; do + if [[ é = ? ]]; then + mb_ok=1 + break; + fi + done + if [[ -z $mb_ok ]]; then + ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" + else + print -u $ZTST_fd Testing PCRE multibyte with locale $LANG + mkdir multibyte.tmp && cd multibyte.tmp + fi + +%test + + [[ 'foo→bar' =~ .([^[:ascii:]]). ]] + print $MATCH + print $match[1] +0:Basic non-ASCII regexp matching +>o→b +>→ + + [[ foo =~ f.+ ]] ; print $? + [[ foo =~ x.+ ]] ; print $? + [[ ! foo =~ f.+ ]] ; print $? + [[ ! foo =~ x.+ ]] ; print $? + [[ foo =~ f.+ && bar =~ b.+ ]] ; print $? + [[ foo =~ x.+ && bar =~ b.+ ]] ; print $? + [[ foo =~ f.+ && bar =~ x.+ ]] ; print $? + [[ ! foo =~ f.+ && bar =~ b.+ ]] ; print $? + [[ foo =~ f.+ && ! bar =~ b.+ ]] ; print $? + [[ ! ( foo =~ f.+ && bar =~ b.+ ) ]] ; print $? + [[ ! foo =~ x.+ && bar =~ b.+ ]] ; print $? + [[ foo =~ x.+ && ! bar =~ b.+ ]] ; print $? + [[ ! ( foo =~ x.+ && bar =~ b.+ ) ]] ; print $? +0:Regex result inversion detection +>0 +>1 +>1 +>0 +>0 +>1 +>1 +>1 +>1 +>1 +>0 +>1 +>0 + +# Note that PCRE_ANCHORED only means anchored at the start +# Also note that we don't unset MATCH/match on failed match (and it's an +# open issue as to whether or not we should) + pcre_compile '.(→.)' + pcre_match foo→bar + print $? $MATCH $match ; unset MATCH match + pcre_match foo.bar + print $? $MATCH $match ; unset MATCH match + pcre_match foo†bar + print $? $MATCH $match ; unset MATCH match + pcre_match foo→†ar + print $? $MATCH $match ; unset MATCH match + pcre_study + pcre_match foo→bar + print $? $MATCH $match ; unset MATCH match + pcre_compile -a '.(→.)' + pcre_match foo→bar + print $? $MATCH $match ; unset MATCH match + pcre_match o→bar + print $? $MATCH $match ; unset MATCH match + pcre_match o→b + print $? $MATCH $match ; unset MATCH match + pcre_compile 'x.(→.)' + pcre_match xo→t + print $? $MATCH $match ; unset MATCH match + pcre_match Xo→t + print $? $MATCH $match ; unset MATCH match + pcre_compile -i 'x.(→.)' + pcre_match xo→t + print $? $MATCH $match ; unset MATCH match + pcre_match Xo→t + print $? $MATCH $match ; unset MATCH match +0:pcre_compile interface testing: basic, anchored & case-insensitive +>0 o→b →b +>1 +>1 +>0 o→† →† +>0 o→b →b +>1 +>0 o→b →b +>0 o→b →b +>0 xo→t →t +>1 +>0 xo→t →t +>0 Xo→t →t |