From bb61da36aaeeaa70413cdf5bc66d7a71194f93e5 Mon Sep 17 00:00:00 2001 From: Stephane Chazelas Date: Mon, 6 Sep 2021 14:43:01 -0700 Subject: 45180: clarify doc for POSIX EREs, fix an issue with PCRE when the replacement was empty or generated more than one element --- ChangeLog | 5 +++ Doc/Zsh/contrib.yo | 5 ++- Functions/Example/zpgrep | 15 +++++-- Functions/Misc/regexp-replace | 102 +++++++++++++++++++++++++++++++----------- 4 files changed, 95 insertions(+), 32 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4eea4b614..eb75cf235 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2021-09-06 Bart Schaefer + * Stephane Chazelas: 45180: Doc/Zsh/contrib.yo, + Functions/Example/zpgrep, Functions/Misc/regexp-replace: clarify + doc for POSIX EREs, fix an issue with PCRE when the replacement + was empty or generated more than one element + * zeurkous: 49154: Doc/Zsh/exec.yo: clarify status on exec failure * Marlon Richert: 49378: Src/parse.c: skip check for collision diff --git a/Doc/Zsh/contrib.yo b/Doc/Zsh/contrib.yo index e74528341..94059eff6 100644 --- a/Doc/Zsh/contrib.yo +++ b/Doc/Zsh/contrib.yo @@ -4335,7 +4335,7 @@ See also the tt(pager), tt(prompt) and tt(rprompt) styles below. findex(regexp-replace) item(tt(regexp-replace) var(var) var(regexp) var(replace))( Use regular expressions to perform a global search and replace operation -on a variable. POSIX extended regular expressions are used, +on a variable. POSIX extended regular expressions (ERE) are used, unless the option tt(RE_MATCH_PCRE) has been set, in which case Perl-compatible regular expressions are used (this requires the shell to be linked against the tt(pcre) @@ -4353,6 +4353,9 @@ and arithmetic expressions which will be replaced: in particular, a reference to tt($MATCH) will be replaced by the text matched by the pattern. The return status is 0 if at least one match was performed, else 1. + +Note that if using POSIX EREs, the tt(^) or word boundary operators +(where available) may not work properly. ) findex(run-help) item(tt(run-help) var(cmd))( diff --git a/Functions/Example/zpgrep b/Functions/Example/zpgrep index 8b1edaa1c..556e58cd6 100644 --- a/Functions/Example/zpgrep +++ b/Functions/Example/zpgrep @@ -2,24 +2,31 @@ # zpgrep() { -local file pattern +local file pattern ret pattern=$1 shift +ret=1 if ((! ARGC)) then set -- - fi -pcre_compile $pattern +zmodload zsh/pcre || return +pcre_compile -- "$pattern" pcre_study for file do if [[ "$file" == - ]] then - while read -u0 buf; do pcre_match $buf && print $buf; done + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done else - while read -u0 buf; do pcre_match $buf && print $buf; done < "$file" + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done < "$file" fi done +return "$ret" } diff --git a/Functions/Misc/regexp-replace b/Functions/Misc/regexp-replace index dec105524..0d5948075 100644 --- a/Functions/Misc/regexp-replace +++ b/Functions/Misc/regexp-replace @@ -8,36 +8,84 @@ # $ and backtick substitutions; in particular, $MATCH will be replaced # by the portion of the string matched by the regular expression. -integer pcre +# we use positional parameters instead of variables to avoid +# clashing with the user's variable. Make sure we start with 3 and only +# 3 elements: +argv=("$1" "$2" "$3") -[[ -o re_match_pcre ]] && pcre=1 +# $4 records whether pcre is enabled as that information would otherwise +# be lost after emulate -L zsh +4=0 +[[ -o re_match_pcre ]] && 4=1 emulate -L zsh -(( pcre )) && setopt re_match_pcre - -# $4 is the string to be matched -4=${(P)1} -# $5 is the final string -5= -# 6 indicates if we made a change -6= + + local MATCH MBEGIN MEND local -a match mbegin mend -while [[ -n $4 ]]; do - if [[ $4 =~ $2 ]]; then - # append initial part and subsituted match - 5+=${4[1,MBEGIN-1]}${(e)3} - # truncate remaining string - 4=${4[MEND+1,-1]} - # indicate we did something - 6=1 - else - break - fi -done -5+=$4 - -eval ${1}=${(q)5} -# status 0 if we did something, else 1. -[[ -n $6 ]] +if (( $4 )); then + # if using pcre, we're using pcre_match and a running offset + # That's needed for ^, \A, \b, and look-behind operators to work + # properly. + + zmodload zsh/pcre || return 2 + pcre_compile -- "$2" && pcre_study || return 2 + + # $4 is the current *byte* offset, $5, $6 reserved for later use + 4=0 6= + + local ZPCRE_OP + while pcre_match -b -n $4 -- "${(P)1}"; do + # append offsets and computed replacement to the array + # we need to perform the evaluation in a scalar assignment so that if + # it generates an array, the elements are converted to string (by + # joining with the first chararacter of $IFS as usual) + 5=${(e)3} + argv+=(${(s: :)ZPCRE_OP} "$5") + + # for 0-width matches, increase offset by 1 to avoid + # infinite loop + 4=$((argv[-2] + (argv[-3] == argv[-2]))) + done + + (($# > 6)) || return # no match + + set +o multibyte + + # $5 contains the result, $6 the current offset + 5= 6=1 + for 2 3 4 in "$@[7,-1]"; do + 5+=${(P)1[$6,$2]}$4 + 6=$(($3 + 1)) + done + 5+=${(P)1[$6,-1]} +else + # in ERE, we can't use an offset so ^, (and \<, \b, \B, [[:<:]] where + # available) won't work properly. + + # $4 is the string to be matched + 4=${(P)1} + + while [[ -n $4 ]]; do + if [[ $4 =~ $2 ]]; then + # append initial part and substituted match + 5+=${4[1,MBEGIN-1]}${(e)3} + # truncate remaining string + if ((MEND < MBEGIN)); then + # zero-width match, skip one character for the next match + ((MEND++)) + 5+=${4[1]} + fi + 4=${4[MEND+1,-1]} + # indicate we did something + 6=1 + else + break + fi + done + [[ -n $6 ]] || return # no match + 5+=$4 +fi + +eval $1=\$5 -- cgit 1.4.1