about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog8
-rw-r--r--Doc/Zsh/cond.yo24
-rw-r--r--Src/Modules/pcre.c79
-rw-r--r--Src/Modules/regex.c56
-rw-r--r--Test/C02cond.ztst33
5 files changed, 188 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index 35600b5e1..530decde1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2010-01-17  Peter Stephenson  <p.w.stephenson@ntlworld.com>
+
+	* 27600: Doc/Zsh/cond.yo, Src/Modules/pcre.c, Src/Modules/regex.c,
+	Test/C02cond.ztst: extend =~ syntax to set MBEGIN and MEND
+	with MATCH and mbegin and mend with match.
+
 2010-01-16  Peter Stephenson  <p.w.stephenson@ntlworld.com>
 
 	* Frank: 27599: Doc/Zsh/contrib.yo: fix formatting documentation
@@ -12585,5 +12591,5 @@
 
 *****************************************************
 * This is used by the shell to define $ZSH_PATCHLEVEL
-* $Revision: 1.4862 $
+* $Revision: 1.4863 $
 *****************************************************
diff --git a/Doc/Zsh/cond.yo b/Doc/Zsh/cond.yo
index 9d8f145f8..4b7304407 100644
--- a/Doc/Zsh/cond.yo
+++ b/Doc/Zsh/cond.yo
@@ -117,13 +117,29 @@ the tt(zsh/pcre) module, else it is tested as a POSIX
 extended regular expression using the tt(zsh/regex) module.
 Upon successful match, some variables will be updated; no variables
 are changed if the matching fails.
+
+If the option tt(BASH_REMATCH) is not set the scalar parameter
+tt(MATCH) is set to the substring that matched the pattern and
+the integer parameters tt(MBEGIN) and tt(MEND) to the index of the start
+and end, respectively, of the match in var(string), such that if
+var(string) is contained in variable tt(var) the expression
+`${var[$MBEGIN,$MEND]}' is identical to `$MATCH'.  The setting
+of the option tt(KSH_ARRAYS) is respected.  Likewise, the array
+tt(match) is set to the substrings that matched parenthesised
+subexpressions and the arrays tt(mbegin) and tt(mend) to the indices of
+the start and end positions, respectively, of the substrings within
+var(string).  The arrays are not set if there were no parenthesised
+subexpressions.  For example, if the string `tt(a short string)' is matched
+against the regular expression `tt(s(...)t)', then (assuming the option
+tt(KSH_ARRAYS) is not set) tt(MATCH), tt(MBEGIN)
+and tt(MEND) are `tt(short)', 3 and 7, respectively, while tt(match),
+tt(mbegin) and tt(mend) are single entry arrays containing
+the strings `tt(hor)', `tt(4)' and `tt(6)', respectively.
+
 If the option tt(BASH_REMATCH) is set the array
 tt(BASH_REMATCH) is set to the substring that matched the pattern
 followed by the substrings that matched parenthesised
-subexpressions within the pattern; otherwise, the scalar parameter
-tt(MATCH) is set to the substring that matched the pattern and
-and the array tt(match) to the substrings that matched parenthesised
-subexpressions.
+subexpressions within the pattern.
 )
 item(var(string1) tt(<) var(string2))(
 true if var(string1) comes before var(string2)
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 08205d144..f8b79adea 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -138,8 +138,9 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
 
 /**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, 
-    int want_offset_pair, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
+		     char *substravar, int want_offset_pair, int matchedinarr,
+		     int want_begin_end)
 {
     char **captures, *match_all, **matches;
     char offset_all[50];
@@ -154,6 +155,7 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
     
     /* captures[0] will be entire matched string, [1] first substring */
     if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+	int nelem = arrlen(captures)-1;
 	/* Set to the offsets of the complete match */
 	if (want_offset_pair) {
 	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
@@ -161,8 +163,70 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
 	}
 	match_all = ztrdup(captures[0]);
 	setsparam(matchvar, match_all);
-	matches = zarrdup(&captures[capture_start]);
-	setaparam(substravar, matches);
+	/*
+	 * If we're setting match, mbegin, mend we only do
+	 * so if there were parenthesised matches, for consistency
+	 * (c.f. regex.c).
+	 */
+	if (!want_begin_end || nelem) {
+	    matches = zarrdup(&captures[capture_start]);
+	    setaparam(substravar, matches);
+	}
+
+	if (want_begin_end) {
+	    char *ptr = arg;
+	    zlong offs = 0;
+
+	    /* Count the characters before the match */
+	    MB_METACHARINIT();
+	    while (ptr < arg + ovec[0]) {
+		offs++;
+		ptr += MB_METACHARLEN(ptr);
+	    }
+	    setiparam("MBEGIN", offs + !isset(KSHARRAYS));
+	    /* Add on the characters in the match */
+	    while (ptr < arg + ovec[1]) {
+		offs++;
+		ptr += MB_METACHARLEN(ptr);
+	    }
+	    setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
+	    if (nelem) {
+		char **mbegin, **mend, **bptr, **eptr;
+		int i, *ipair;
+
+		bptr = mbegin = zalloc(nelem+1);
+		eptr = mend = zalloc(nelem+1);
+
+		for (ipair = ovec + 2, i = 0;
+		     i < nelem;
+		     ipair += 2, i++, bptr++, eptr++)
+		{
+		    char buf[DIGBUFSIZE];
+		    ptr = arg;
+		    offs = 0;
+		    /* Find the start offset */
+		    MB_METACHARINIT();
+		    while (ptr < arg + ipair[0]) {
+			offs++;
+			ptr += MB_METACHARLEN(ptr);
+		    }
+		    convbase(buf, offs + !isset(KSHARRAYS), 10);
+		    *bptr = ztrdup(buf);
+		    /* Continue to the end offset */
+		    while (ptr < arg + ipair[1]) {
+			offs++;
+			ptr += MB_METACHARLEN(ptr);
+		    }
+		    convbase(buf, offs + !isset(KSHARRAYS) - 1, 10);
+		    *eptr = ztrdup(buf);
+		}
+		*bptr = *eptr = NULL;
+
+		setaparam("mbegin", mbegin);
+		setaparam("mend", mend);
+	    }
+	}
+
 	pcre_free_substring_list((const char **)captures);
     }
 
@@ -238,7 +302,8 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if (ret==0) return_value = 0;
     else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
+	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle,
+			     want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
@@ -297,7 +362,9 @@ cond_pcre_match(char **a, int id)
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
+		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0,
+					 isset(BASHREMATCH),
+					 !isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
diff --git a/Src/Modules/regex.c b/Src/Modules/regex.c
index 8a9f3e608..25dbddf07 100644
--- a/Src/Modules/regex.c
+++ b/Src/Modules/regex.c
@@ -108,11 +108,65 @@ zcond_regex_match(char **a, int id)
 	    if (isset(BASHREMATCH)) {
 		setaparam("BASH_REMATCH", arr);
 	    } else {
+		zlong offs;
+		char *ptr;
+
 		m = matches;
 		s = ztrduppfx(lhstr + m->rm_so, m->rm_eo - m->rm_so);
 		setsparam("MATCH", s);
-		if (nelem)
+		/*
+		 * Count the characters before the match.
+		 */
+		ptr = lhstr;
+		offs = 0;
+		MB_METACHARINIT();
+		while (ptr < lhstr + m->rm_so) {
+		    offs++;
+		    ptr += MB_METACHARLEN(ptr);
+		}
+		setiparam("MBEGIN", offs + !isset(KSHARRAYS));
+		/*
+		 * Add on the characters in the match.
+		 */
+		while (ptr < lhstr + m->rm_eo) {
+		    offs++;
+		    ptr += MB_METACHARLEN(ptr);
+		}
+		setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
+		if (nelem) {
+		    char **mbegin, **mend, **bptr, **eptr;
+		    bptr = mbegin = (char **)zalloc(nelem+1);
+		    eptr = mend = (char **)zalloc(nelem+1);
+
+		    for (m = matches + start, n = start;
+			 n <= (int)re.re_nsub;
+			 ++n, ++m, ++bptr, ++eptr)
+		    {
+			char buf[DIGBUFSIZE];
+			ptr = lhstr;
+			offs = 0;
+			/* Find the start offset */
+			MB_METACHARINIT();
+			while (ptr < lhstr + m->rm_so) {
+			    offs++;
+			    ptr += MB_METACHARLEN(ptr);
+			}
+			convbase(buf, offs + !isset(KSHARRAYS), 10);
+			*bptr = ztrdup(buf);
+			/* Continue to the end offset */
+			while (ptr < lhstr + m->rm_eo) {
+			    offs++;
+			    ptr += MB_METACHARLEN(ptr);
+			}
+			convbase(buf, offs + !isset(KSHARRAYS) - 1, 10);
+			*eptr = ztrdup(buf);
+		    }
+		    *bptr = *eptr = NULL;
+
 		    setaparam("match", arr);
+		    setaparam("mbegin", mbegin);
+		    setaparam("mend", mend);
+		}
 	    }
 	}
 	else
diff --git a/Test/C02cond.ztst b/Test/C02cond.ztst
index de82dcbe2..b0e278f4b 100644
--- a/Test/C02cond.ztst
+++ b/Test/C02cond.ztst
@@ -251,6 +251,39 @@ F:Failures in these cases do not indicate a problem in the shell.
   fi
 0:regex tests shouldn't crash
 
+  if zmodload -i zsh/regex 2>/dev/null; then
+    string="this has stuff in it"
+    bad_regex=0
+    if [[ $string =~ "h([a-z]*) s([a-z]*) " ]]; then
+      if [[ "$MATCH $MBEGIN $MEND" != "has stuff  6 15" ]]; then
+	print -r "regex variables MATCH MBEGIN MEND:
+  '$MATCH $MBEGIN $MEND'
+  should be:
+  'has stuff  6 15'" >&2
+        bad_regex=1
+      else
+	results=("as 7 8" "tuff 11 14")
+	for i in 1 2; do
+	  if [[ "$match[$i] $mbegin[$i] $mend[$i]" != $results[i] ]]; then
+	    print -r "regex variables match[$i] mbegin[$i] mend[$i]:
+  '$match[$i] $mbegin[$i] $mend[$i]'
+  should be
+  '$results[$i]'" >&2
+	    break
+	  fi
+	done
+      fi
+    else
+      print -r "regex failed to match '$string'" >&2
+    fi
+    (( bad_regex )) || print OK
+  else
+    # if it didn't load, tough, but not a test error
+    print OK
+  fi
+0:MATCH, MBEGIN, MEND, match, mbegin, mend
+>OK
+
 %clean
   # This works around a bug in rm -f in some versions of Cygwin
   chmod 644 unmodish