about summary refs log tree commit diff
diff options
context:
space:
mode:
author Phil Pennock <pdpennock@users.sourceforge.net> 2011-10-24 11:31:25 +0000
committer Phil Pennock <pdpennock@users.sourceforge.net> 2011-10-24 11:31:25 +0000
commit 2f3c16d40fe1e8c6aa351224fd50530024c4f5c6 (patch)
tree 466380965df722386261e8d72512ad4a34dbd0bd
parent 332fbbd7002b3aba778e624b92531c83bb3bef62 (diff)
download zsh-2f3c16d40fe1e8c6aa351224fd50530024c4f5c6.tar.gz
zsh-2f3c16d40fe1e8c6aa351224fd50530024c4f5c6.tar.xz
zsh-2f3c16d40fe1e8c6aa351224fd50530024c4f5c6.zip
29838: metafy/unmetafy strings for PCRE matching (UTF-8 fixes)
-rw-r--r-- ChangeLog 10
-rw-r--r-- Src/Modules/pcre.c 61
-rw-r--r-- Test/V07pcre.ztst 106
3 files changed, 160 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index 934fb881f..47fc89954 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2011-10-24  Phil Pennock  <pdpennock@users.sourceforge.net>
+
+	* 29838: Src/Modules/pcre.c: metafy/unmetafy strings, to
+	correctly handle non-ASCII characters in UTF-8 for regexp
+	matches.
+
+	* unposted: Test/V07pcre.ztst: some PCRE tests
+
 2011-10-23  Peter Stephenson  <p.w.stephenson@ntlworld.com>
 
 	* users/16492: MACHINES: OpenIndiana issue.
@@ -15484,5 +15492,5 @@
 
 *****************************************************
 * This is used by the shell to define $ZSH_PATCHLEVEL
-* $Revision: 1.5481 $
+* $Revision: 1.5482 $
 *****************************************************
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index e1a897944..e36013163 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -77,6 +77,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
 {
     int pcre_opts = 0, pcre_errptr;
     const char *pcre_error;
+    char *target;
     
     if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
     if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
@@ -92,8 +93,13 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
     if (pcre_pattern)
 	pcre_free(pcre_pattern);
 
-    pcre_pattern = pcre_compile(*args, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+    target = ztrdup(*args);
+    unmetafy(target, NULL);
+
+    pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
     
+    free(target);
+
     if (pcre_pattern == NULL)
     {
 	zwarnnam(nam, "error in regex: %s", pcre_error);
@@ -161,7 +167,7 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
 	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
 	    setsparam("ZPCRE_OP", ztrdup(offset_all));
 	}
-	match_all = ztrdup(captures[0]);
+	match_all = metafy(captures[0], -1, META_DUP);
 	setsparam(matchvar, match_all);
 	/*
 	 * If we're setting match, mbegin, mend we only do
@@ -169,7 +175,15 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
 	 * (c.f. regex.c).
 	 */
 	if (!want_begin_end || nelem) {
-	    matches = zarrdup(&captures[capture_start]);
+	    char **x, **y;
+	    y = &captures[capture_start];
+	    matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1));
+	    do {
+		if (*y)
+		    *x++ = metafy(*y, -1, META_DUP);
+		else
+		    *x++ = NULL;
+	    } while (*y++);
 	    setaparam(substravar, matches);
 	}
 
@@ -255,6 +269,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
     int ret, capcount, *ovec, ovecsize, c;
     char *matched_portion = NULL;
+    char *plaintext = NULL;
     char *receptacle = NULL;
     int return_value = 1;
     /* The subject length and offset start are both int values in pcre_exec */
@@ -278,7 +293,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     }
     /* For the entire match, 'Return' the offset byte positions instead of the matched string */
     if(OPT_ISSET(ops,'b')) want_offset_pair = 1; 
-    
+
     if(!*args) {
 	zwarnnam(nam, "not enough arguments");
     }
@@ -288,26 +303,28 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 	zwarnnam(nam, "error %d in fullinfo", ret);
 	return 1;
     }
-    
+
     ovecsize = (capcount+1)*3;
     ovec = zalloc(ovecsize*sizeof(int));
-    
-    subject_len = (int)strlen(*args);
+
+    plaintext = ztrdup(*args);
+    unmetafy(plaintext, NULL);
+    subject_len = (int)strlen(plaintext);
 
     if (offset_start < 0 || offset_start >= subject_len)
 	ret = PCRE_ERROR_NOMATCH;
     else
-	ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
+	ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
 
     if (ret==0) return_value = 0;
     else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle,
+	zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
 			     want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
-	zwarnnam(nam, "error in pcre_exec");
+	zwarnnam(nam, "error in pcre_exec [%d]", ret);
     }
     
     if (ovec)
@@ -322,7 +339,8 @@ cond_pcre_match(char **a, int id)
 {
     pcre *pcre_pat;
     const char *pcre_err;
-    char *lhstr, *rhre, *avar=NULL;
+    char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL;
+    char *p;
     int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
     int return_value = 0;
 
@@ -331,6 +349,10 @@ cond_pcre_match(char **a, int id)
 
     lhstr = cond_str(a,0,0);
     rhre = cond_str(a,1,0);
+    lhstr_plain = ztrdup(lhstr);
+    rhre_plain = ztrdup(rhre);
+    unmetafy(lhstr_plain, NULL);
+    unmetafy(rhre_plain, NULL);
     pcre_pat = NULL;
     ov = NULL;
 
@@ -339,7 +361,7 @@ cond_pcre_match(char **a, int id)
 
     switch(id) {
 	 case CPCRE_PLAIN:
-		pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL);
+		pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
 		if (pcre_pat == NULL) {
 		    zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
 		    break;
@@ -347,7 +369,7 @@ cond_pcre_match(char **a, int id)
                 pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
     		ovsize = (capcnt+1)*3;
 		ov = zalloc(ovsize*sizeof(int));
-    		r = pcre_exec(pcre_pat, NULL, lhstr, strlen(lhstr), 0, 0, ov, ovsize);
+    		r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize);
 		/* r < 0 => error; r==0 match but not enough size in ov
 		 * r > 0 => (r-1) substrings found; r==1 => no substrings
 		 */
@@ -356,13 +378,16 @@ cond_pcre_match(char **a, int id)
 		    return_value = 1;
 		    break;
 		}
-	        else if (r==PCRE_ERROR_NOMATCH) return 0; /* no match */
+	        else if (r==PCRE_ERROR_NOMATCH) {
+		    return_value = 0; /* no match */
+		    break;
+		}
 		else if (r<0) {
-		    zwarn("pcre_exec() error: %d", r);
+		    zwarn("pcre_exec() error [%d]", r);
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0,
+		    zpcre_get_substrings(lhstr_plain, ov, r, NULL, avar, 0,
 					 isset(BASHREMATCH),
 					 !isset(BASHREMATCH));
 		    return_value = 1;
@@ -371,6 +396,10 @@ cond_pcre_match(char **a, int id)
 		break;
     }
 
+    if (lhstr_plain)
+	free(lhstr_plain);
+    if(rhre_plain)
+	free(rhre_plain);
     if (pcre_pat)
 	pcre_free(pcre_pat);
     if (ov)
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
new file mode 100644
index 000000000..4dd173557
--- /dev/null
+++ b/Test/V07pcre.ztst
@@ -0,0 +1,106 @@
+%prep
+
+  zmodload zsh/pcre
+  setopt rematch_pcre
+# Find a UTF-8 locale.
+  setopt multibyte
+# Don't let LC_* override our choice of locale.
+  unset -m LC_\*
+  mb_ok=
+  langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
+	 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8'))
+  for LANG in $langs; do
+    if [[ é = ? ]]; then
+      mb_ok=1
+      break;
+    fi
+  done
+  if [[ -z $mb_ok ]]; then
+    ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
+  else
+    print -u $ZTST_fd Testing PCRE multibyte with locale $LANG
+    mkdir multibyte.tmp && cd multibyte.tmp
+  fi
+
+%test
+
+  [[ 'foo→bar' =~ .([^[:ascii:]]). ]]
+  print $MATCH
+  print $match[1]
+0:Basic non-ASCII regexp matching
+>o→b
+>→
+
+  [[ foo =~ f.+ ]] ; print $?
+  [[ foo =~ x.+ ]] ; print $?
+  [[ ! foo =~ f.+ ]] ; print $?
+  [[ ! foo =~ x.+ ]] ; print $?
+  [[ foo =~ f.+ && bar =~ b.+ ]] ; print $?
+  [[ foo =~ x.+ && bar =~ b.+ ]] ; print $?
+  [[ foo =~ f.+ && bar =~ x.+ ]] ; print $?
+  [[ ! foo =~ f.+ && bar =~ b.+ ]] ; print $?
+  [[ foo =~ f.+ && ! bar =~ b.+ ]] ; print $?
+  [[ ! ( foo =~ f.+ && bar =~ b.+ ) ]] ; print $?
+  [[ ! foo =~ x.+ && bar =~ b.+ ]] ; print $?
+  [[ foo =~ x.+ && ! bar =~ b.+ ]] ; print $?
+  [[ ! ( foo =~ x.+ && bar =~ b.+ ) ]] ; print $?
+0:Regex result inversion detection
+>0
+>1
+>1
+>0
+>0
+>1
+>1
+>1
+>1
+>1
+>0
+>1
+>0
+
+# Note that PCRE_ANCHORED only means anchored at the start
+# Also note that we don't unset MATCH/match on failed match (and it's an
+# open issue as to whether or not we should)
+  pcre_compile '.(→.)'
+  pcre_match foo→bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match foo.bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match foo†bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match foo→†ar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_study
+  pcre_match foo→bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_compile -a '.(→.)'
+  pcre_match foo→bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match o→bar
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match o→b
+  print $? $MATCH $match ; unset MATCH match
+  pcre_compile 'x.(→.)'
+  pcre_match xo→t
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match Xo→t
+  print $? $MATCH $match ; unset MATCH match
+  pcre_compile -i 'x.(→.)'
+  pcre_match xo→t
+  print $? $MATCH $match ; unset MATCH match
+  pcre_match Xo→t
+  print $? $MATCH $match ; unset MATCH match
+0:pcre_compile interface testing: basic, anchored & case-insensitive
+>0 o→b →b
+>1
+>1
+>0 o→† →†
+>0 o→b →b
+>1
+>0 o→b →b
+>0 o→b →b
+>0 xo→t →t
+>1
+>0 xo→t →t
+>0 Xo→t →t