summary refs log tree commit diff
diff options
context:
space:
mode:
authorPeter Stephenson <pws@users.sourceforge.net>2003-08-30 19:12:18 +0000
committerPeter Stephenson <pws@users.sourceforge.net>2003-08-30 19:12:18 +0000
commitbab0f3b920687f21a3c181401bfd5443a94cbe15 (patch)
treee189474b56018bd61cb2a394be2fbe76339fba94
parentc9c5f9da9366d1587c588507433cec35ef243579 (diff)
downloadzsh-bab0f3b920687f21a3c181401bfd5443a94cbe15.tar.gz
zsh-bab0f3b920687f21a3c181401bfd5443a94cbe15.tar.xz
zsh-bab0f3b920687f21a3c181401bfd5443a94cbe15.zip
18982: comments/rant for paramsubst(), accidentally uncommited
-rw-r--r--Src/subst.c572
1 files changed, 564 insertions, 8 deletions
diff --git a/Src/subst.c b/Src/subst.c
index bc7015eef..d2d75ce79 100644
--- a/Src/subst.c
+++ b/Src/subst.c
@@ -300,6 +300,15 @@ multsub(char **s, char ***a, int *isarr, char *sep)
 	while (nonempty(&foo))
 	    *p++ = (char *)ugetnode(&foo);
 	*p = NULL;
+	/*
+	 * This is the most obscure way of deciding whether a value is
+	 * an array it would be possible to imagine.  It seems to result
+	 * partly because we don't pass down the qt and ssub flags from
+	 * paramsubst() through prefork() properly, partly because we
+	 * don't tidy up to get back the return type from multsub we
+	 * need properly.  The crux of neatening this up is to get rid
+	 * of the following test.
+	 */
 	if (a && mult_isarr) {
 	    *a = r;
 	    *isarr = SCANPM_MATCHMANY;
@@ -817,6 +826,23 @@ subst_parse_str(char **sp, int single, int err)
 #define	isstring(c) ((c) == '$' || (char)(c) == String || (char)(c) == Qstring)
 #define isbrack(c)  ((c) == '[' || (char)(c) == Inbrack)
 
+/*
+ * Given a linked list l with node n, perform parameter substitution
+ * starting from *str.  Return the node with the substitutuion performed
+ * or NULL if it failed.
+ *
+ * If qt is true, the `$' was quoted.  TODO: why can't we just look
+ * to see if the first character was String or Qstring?
+ *
+ * If ssub is true, we are being called via singsubst(), which means
+ * the result will be a single word.  TODO: can we generate the
+ * single word at the end?  TODO: if not, or maybe in any case,
+ * can we pass down the ssub flag from prefork with the other flags
+ * instead of pushing it into different arguments?  (How exactly
+ * to qt and ssub differ?  Are both necessary, if so is there some
+ * better way of separating the two?)
+ */
+
 /**/
 LinkNode
 paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
@@ -824,43 +850,207 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
     char *aptr = *str, c, cc;
     char *s = aptr, *fstr, *idbeg, *idend, *ostr = (char *) getdata(n);
     int colf;			/* != 0 means we found a colon after the name */
+    /*
+     * There are far too many flags.  They need to be grouped
+     * together into some structure which ties them to where they
+     * came from.
+     *
+     * Some flags have a an obscure relationship to their effect which
+     * depends on incrementing them to particular values in particular
+     * ways.
+     */
+    /*
+     * Whether the value is an array (in aval) or not (in val).  There's
+     * a movement from storing the value in the stuff read from the
+     * parameter (the value v) to storing them in val and aval.
+     * However, sometimes you find v reappearing temporarily.
+     *
+     * The values -1 and 2 are special to isarr.  It looks like 2 is
+     * some kind of an internal flag to do with whether the array's been
+     * copied, in which case I don't know why we don't use the copied
+     * flag, but they do both occur close together so they presumably
+     * have different effects.  The value -1 is isued to force us to
+     * keep an empty array.  It's tested in the YUK chunk (I mean the
+     * one explicitly marked as such).
+     */
     int isarr = 0;
+    /*
+     * This is just the setting of the option except we need to
+     * take account of ^ and ^^.
+     */
     int plan9 = isset(RCEXPANDPARAM);
+    /*
+     * Likwise, but with ~ and ~~.  Also, we turn it off later
+     * on if qt is passed down.
+     */
     int globsubst = isset(GLOBSUBST);
+    /*
+     * Indicates ${#pm}, massaged by whichlen which is set by
+     * the (c), (w), and (W) flags to indicate how we take the length.
+     */
     int getlen = 0;
     int whichlen = 0;
+    /*
+     * Indicates ${+pm}: a simple boolean for once.
+     */
     int chkset = 0;
+    /*
+     * Indicates we have tried to get a value in v but that was
+     * unset.  I don't quite understand why (v == NULL) isn't
+     * good enough, but there are places where we seem to need
+     * to second guess whether a value is a real value or not.
+     */
     int vunset = 0;
+    /*
+     * Indicates (t) flag, i.e. print out types.  The code for
+     * this actually isn't too horrifically inbred compared with
+     * that for (P).
+     */
     int wantt = 0;
+    /*
+     * Indicates spliting a string into an array.  There aren't
+     * actually that many special cases for this --- which may
+     * be why it doesn't work properly; we split in some cases
+     * where we shouldn't, in particular on the multsubs for
+     * handling embedded values for ${...=...} and the like.
+     */
     int spbreak = isset(SHWORDSPLIT) && !ssub && !qt;
+    /* Scalar and array value, see isarr above */
     char *val = NULL, **aval = NULL;
+    /*
+     * Padding based on setting in parameter rather than substitution
+     * flags.  This is only used locally.
+     */
     unsigned int fwidth = 0;
+    /*
+     * vbuf and v are both used to retrieve parameter values; this
+     * is a kludge, we pass down vbuf and it may or may not return v.
+     */
     struct value vbuf;
     Value v = NULL;
+    /*
+     * This expressive name refers to the set of flags which
+     * is applied to matching for #, %, / and their doubled variants:
+     * (M), (R), (B), (E), (N), (S).
+     */
     int flags = 0;
+    /* Value from (I) flag, used for ditto. */
     int flnum = 0;
+    /*
+     * sortit is an obscure combination of the settings for (o), (O),
+     * (i) and (n). casind is (i) and numord is (n); these are
+     * separate so we can have fun doing the obscure combinatorics later.
+     * indord is the (a) flag, which for consistency doesn't get
+     * combined into sortit.
+     */
     int sortit = 0, casind = 0, numord = 0, indord = 0;
+    /* (u): straightforward. */
     int unique = 0;
+    /* combination of (L), (U) and (C) flags. */
     int casmod = 0;
+    /*
+     * quotemod says we are doing either (q) (positive), (Q) (negative)
+     * or not (0).  quotetype counts the q's for the first case.
+     * quoterr is simply (X) but gets passed around a lot because the
+     * combination (eX) needs it.
+     */
     int quotemod = 0, quotetype = 0, quoteerr = 0;
+    /*
+     * (V) flag: fairly straightforward, except that as with so
+     * many flags it's not easy to decide where to put it in the order.
+     */
     int visiblemod = 0;
+    /*
+     * The (z) flag, nothing to do with SH_WORD_SPLIT which is tied
+     * spbreak, see above; fairly straighforward in use but c.f.
+     * the comment for visiblemod.
+     */
     int shsplit = 0;
+    /*
+     * The separator from (j) and (s) respectively, or (F) and (f)
+     * respectively (hardwired to "\n" in that case).  Slightly
+     * confusingly also used for ${#pm}, thought that's at least
+     * documented in the manual
+     */
     char *sep = NULL, *spsep = NULL;
+    /*
+     * Padding strings.  The left and right padding strings which
+     * are repeated, then the ones which only occur once, for
+     * the (l) and (r) flags.
+     */
     char *premul = NULL, *postmul = NULL, *preone = NULL, *postone = NULL;
-    char *replstr = NULL;	/* replacement string for /orig/repl */
+    /* Replacement string for /orig/repl and //orig/repl */
+    char *replstr = NULL;
+    /* The numbers for (l) and (r) */
     zlong prenum = 0, postnum = 0;
+    /*
+     * Whether the value has been copied.  Optimisation:  if we
+     * are modifying an expression, we only need to copy it the
+     * first time, and if we don't modify it we can just use the
+     * value from the parameter or input.
+     */
     int copied = 0;
+    /*
+     * The (A) flag for array assignment, with consequences for
+     * splitting and joining; (AA) gives arrasg == 2 for associative
+     * arrays.
+     */
     int arrasg = 0;
+    /*
+     * The (e) flag.  As we need to do extra work not quite
+     * at the end, the effect of this is kludged in in several places.
+     */
     int eval = 0;
+    /*
+     * The (P) flag.  This interacts a bit obscurely with whether
+     * or not we are dealing with a sub expression (subexp).
+     */
     int aspar = 0;
+    /*
+     * The (%) flag, c.f. visiblemod again.
+     */	
     int presc = 0;
+    /*
+     * The (@) flag; interacts obscurely with qt and isarr.
+     * This is one of the things that decides whether multsub
+     * will produce an array, but in an extremely indirect fashion.
+     */
     int nojoin = 0;
-    char inbrace = 0;		/* != 0 means ${...}, otherwise $... */
+    /*
+     * != 0 means ${...}, otherwise $...  What works without braces
+     * is largely a historical artefact (everything works with braces,
+     * I sincerely hope).
+     */
+    char inbrace = 0;
+    /*
+     * Use for the (k) flag.  Goes down into the parameter code,
+     * sometimes.
+     */
     char hkeys = 0;
+    /*
+     * Used for the (v) flag, ditto.  Not quite sure why they're
+     * separate, but the tradition seems to be that things only
+     * get combined when that makes the result more obscure rather
+     * than less.
+     */
     char hvals = 0;
+    /*
+     * Whether we had to evaluate a subexpression, i.e. an
+     * internal ${...} or $(...) or plain $pm.  We almost don't
+     * need to remember this (which would be neater), but the (P)
+     * flag means the subexp and !subexp code is obscurely combined,
+     * and the argument passing to fetchvalue has another kludge.
+     */
     int subexp;
 
     *s++ = '\0';
+    /*
+     * Nothing to do unless the character following the $ is
+     * something we recognise.
+     *
+     * Shouldn't this be a table or something?  We test for all
+     * these later on, too.
+     */
     if (!ialnum(c = *s) && c != '#' && c != Pound && c != '-' &&
 	c != '!' && c != '$' && c != String && c != Qstring &&
 	c != '?' && c != Quest && c != '_' &&
@@ -872,9 +1062,21 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	return n;
     }
     DPUTS(c == '{', "BUG: inbrace == '{' in paramsubst()");
+    /*
+     * Extra processing if there is an opening brace: mostly
+     * flags in parentheses, but also one ksh hack.
+     */
     if (c == Inbrace) {
 	inbrace = 1;
 	s++;
+	/*
+	 * In ksh emulation a leading `!' is a special flag working
+	 * sort of like our (k).
+	 * TODO: this is one of very few cases tied directly to
+	 * the emulation mode rather than an option.  Since ksh
+	 * doesn't have parameter flags it might be neater to
+	 * handle this with the ^, =, ~ stuff, below.
+	 */
 	if ((c = *s) == '!' && s[1] != Outbrace && emulation == EMULATE_KSH) {
 	    hkeys = SCANPM_WANTKEYS;
 	    s++;
@@ -882,6 +1084,14 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    char *t, sav;
 	    int tt = 0;
 	    zlong num;
+	    /*
+	     * The (p) flag is (uniquely) only remembered within
+	     * this block.  It says we do print-style handling
+	     * on the values for flags, but only on those.
+	     * This explains the ghastly macro, but why can't it
+	     * be a function?  UNTOK_AND_ESCAPE is defined
+	     * so that the argument must be an lvalue.
+	     */
 	    int escapes = 0;
 	    int klen;
 #define UNTOK(C)  (itok(C) ? ztokens[(C) - Pound] : (C))
@@ -1089,22 +1299,40 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    s++;
 	}
     }
+    /* Sort is done by indexing on sortit-1:
+     *   bit 1: ascending (o)/descending (O)
+     *   bit 2: case sensitive/independent (i)
+     *   bit 3: strict order/numeric (n)
+     * unless indord (a) is set set, in which case only test for
+     * descending by assuming only (O) is possible (not verified).
+     */
     if (sortit)
 	sortit += (casind << 1) + (numord << 2);
 
+    /*
+     * premul, postmul specify the padding character to be used
+     * multiple times with the (l) and (r) flags respectively.
+     */
     if (!premul)
 	premul = " ";
     if (!postmul)
 	postmul = " ";
 
+    /*
+     * Look for special unparenthesised flags.
+     * TODO: could make these able to appear inside parentheses, too,
+     * i.e. ${(^)...} etc.
+     */
     for (;;) {
 	if ((c = *s) == '^' || c == Hat) {
+	    /* RC_EXPAND_PARAM on or off (doubled )*/
 	    if ((c = *++s) == '^' || c == Hat) {
 		plan9 = 0;
 		s++;
 	    } else
 		plan9 = 1;
 	} else if ((c = *s) == '=' || c == Equals) {
+	    /* SH_WORD_SPLIT on or off (doubled). spbreak = 2 means force */
 	    if ((c = *++s) == '=' || c == Equals) {
 		spbreak = 0;
 		s++;
@@ -1114,19 +1342,33 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		   (iident(cc = s[1])
 		    || cc == '*' || cc == Star || cc == '@'
 		    || cc == '-' || (cc == ':' && s[2] == '-')
-		    || (isstring(cc) && (s[2] == Inbrace || s[2] == Inpar))))
+		    || (isstring(cc) && (s[2] == Inbrace || s[2] == Inpar)))) {
 	    getlen = 1 + whichlen, s++;
-	else if (c == '~' || c == Tilde) {
+	    /*
+	     * Return the length of the parameter.
+	     * getlen can be more than 1 to indicate characters (2),
+	     * words ignoring multiple delimiters (3), words taking
+	     * account of multiple delimiters.  delimiter is in
+	     * spsep, NULL means $IFS.
+	     */
+	} else if (c == '~' || c == Tilde) {
+	    /* GLOB_SUBST on or off (doubled) */
 	    if ((c = *++s) == '~' || c == Tilde) {
 		globsubst = 0;
 		s++;
 	    } else
 		globsubst = 1;
 	} else if (c == '+') {
+	    /*
+	     * Return whether indicated parameter is set. 
+	     * Try to handle this when parameter is named
+	     * by (P) (second part of test).
+	     */
 	    if (iident(s[1]) || (aspar && isstring(s[1]) &&
 				 (s[2] == Inbrace || s[2] == Inpar)))
 		chkset = 1, s++;
 	    else if (!inbrace) {
+		/* Special case for `$+' on its own --- leave unmodified */
 		*aptr = '$';
 		*str = aptr + 1;
 		return n;
@@ -1134,13 +1376,31 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		zerr("bad substitution", NULL, 0);
 		return NULL;
 	    }
-	} else if (inbrace && INULL(*s))
+	} else if (inbrace && INULL(*s)) {
+	    /*
+	     * Handles things like ${(f)"$(<file)"} by skipping 
+	     * the double quotes.  We don't need to know what was
+	     * actually there; the presence of a String or Qstring
+	     * is good enough.
+	     */
 	    s++;
-	else
+	} else
 	    break;
     }
+    /* Don't activate special pattern characters if inside quotes */
     globsubst = globsubst && !qt;
 
+    /*
+     * At this point, we usually expect a parameter name.
+     * However, there may be a nested ${...} or $(...).
+     * These say that the parameter itself is somewhere inside,
+     * or that there isn't a parameter and we will get the values
+     * from a command substitution itself.  In either case,
+     * the current instance of paramsubst() doesn't fetch a value,
+     * it just operates on what gets passed up.
+     * (The first ought to have been {...}, reserving ${...}
+     * for substituting a value at that point, but it's too late now.)
+     */
     idbeg = s;
     if ((subexp = (inbrace && s[-1] && isstring(*s) &&
 		   (s[1] == Inbrace || s[1] == Inpar)))) {
@@ -1151,26 +1411,78 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	skipparens(*s, *s == Inpar ? Outpar : Outbrace, &s);
 	sav = *s;
 	*s = 0;
+	/*
+	 * This handles arrays.  TODO: this is not the most obscure call to
+	 * multsub() (see below) but even so it would be nicer to pass down
+	 * and back the arrayness more rationally.  In that case, we should
+	 * remove the aspar test and extract a value from an array, if
+	 * necessary, when we handle (P) lower down.
+	 */
 	if (multsub(&val, (aspar ? NULL : &aval), &isarr, NULL) && quoted) {
+	    /* Empty quoted string --- treat as null string, not elided */
 	    isarr = -1;
 	    aval = (char **) hcalloc(sizeof(char *));
 	    aspar = 0;
 	} else if (aspar)
 	    idbeg = val;
 	*s = sav;
+	/*
+	 * This tests for the second double quote in an expression
+	 * like ${(f)"$(<file)"}, compare above.
+	 */
 	while (INULL(*s))
 	    s++;
 	v = (Value) NULL;
     } else if (aspar) {
+	/*
+	 * No subexpression, but in any case the value is going
+	 * to give us the name of a parameter on which we do
+	 * our remaining processing.  In other words, this
+	 * makes ${(P)param} work like ${(P)${param}}.  (Probably
+	 * better looked at, this is the basic code for ${(P)param}
+	 * and it's been kludged into the subexp code because no
+	 * opportunity for a kludge has been neglected.)
+	 */
 	if ((v = fetchvalue(&vbuf, &s, 1, (qt ? SCANPM_DQUOTED : 0)))) {
 	    val = idbeg = getstrvalue(v);
 	    subexp = 1;
 	} else
 	    vunset = 1;
     }
+    /*
+     * We need to retrieve a value either if we haven't already
+     * got it from a subexpression, or if the processing so
+     * far has just yielded us a parameter name to be processed
+     * with (P).
+     */
     if (!subexp || aspar) {
 	char *ov = val;
 
+	/*
+	 * Second argument: decide whether to use the subexpression or
+	 *   the string next on the line as the parameter name.
+	 * Third argument:  decide how processing for brackets
+	 *   1 means full processing
+	 *   -1 appears to mean something along the lines of
+	 *     only handle single digits and don't handle brackets.
+	 *     I *think* (but it's really only a guess) that this
+	 *     is used by the test below the wantt handling, so
+	 *     that in certain cases we handle brackets there.
+	 *   0 would apparently mean something like we know we
+	 *     should have the name of a scalar and we get cross
+	 *     if there's anything present which disagrees with that
+	 * but you will search fetchvalue() in vain for comments on this.
+	 * Fourth argument gives flags to do with keys, values, quoting,
+	 * assigning depending on context and parameter flags.
+	 *
+	 * This is the last mention of subexp, so presumably this
+	 * is what the code which makes sure subexp is set if aspar (the
+	 * (P) flag) is set.  I *think* what's going on here is the
+	 * second argument is for both input and output: with
+	 * subexp, we only want the input effect, whereas normally
+	 * we let fetchvalue set the main string pointer s to
+	 * the end of the bit it's fetched.
+	 */
 	if (!(v = fetchvalue(&vbuf, (subexp ? &ov : &s),
 			     (wantt ? -1 :
 			      ((unset(KSHARRAYS) || inbrace) ? 1 : -1)),
@@ -1181,6 +1493,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    vunset = 1;
 
 	if (wantt) {
+	    /*
+	     * Handle the (t) flag: value now becomes the type
+	     * information for the parameter.
+	     */
 	    if (v && v->pm && !(v->pm->flags & PM_UNSET)) {
 		int f = v->pm->flags;
 
@@ -1227,8 +1543,24 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    isarr = 0;
 	}
     }
+    /*
+     * We get in here two ways; either we need to convert v into
+     * the local value system, or we need to get rid of brackets
+     * even if there isn't a v.
+     */
     while (v || ((inbrace || (unset(KSHARRAYS) && vunset)) && isbrack(*s))) {
 	if (!v) {
+	    /*
+	     * Index applied to non-existent parameter; we may or may
+	     * not have a value to index, however.  Create a temporary
+	     * empty parameter as a trick, and index on that.  This
+	     * usually happens the second time around the loop when
+	     * we've used up the original parameter value and want to
+	     * apply a subscript to what's left.  However, it's also
+	     * possible it's got something to do with some of that murky
+	     * passing of -1's as the third argument to fetchvalue() to
+	     * inhibit bracket parsing at that stage.
+	     */
 	    Param pm;
 	    char *os = s;
 
@@ -1251,6 +1583,21 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    if (getindex(&s, v, qt) || s == os)
 		break;
 	}
+	/*
+	 * This is where we extract a value (we know now we have
+	 * one) into the local parameters for a scalar (val) or
+	 * array (aval) value.  TODO: move val and aval into
+	 * a structure with a discriminator.  Hope we can make
+	 * more things array values at this point and dearrayify later.
+	 * v->isarr tells us whether the stuff form down below looks
+	 * like an array.  Unlike multsub() this is probably clean
+	 * enough to keep, although possibly the parameter passing
+	 * needs reorganising.
+	 *
+	 * I think we get to discard the existing value of isarr
+	 * here because it's already been taken account of, either
+	 * in the subexp stuff or immediately above.
+	 */
 	if ((isarr = v->isarr)) {
 	    /* No way to get here with v->inv != 0, so getvaluearr() *
 	     * is called by getarrvalue(); needn't test PM_HASHED.   */
@@ -1260,7 +1607,18 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    } else
 		aval = getarrvalue(v);
 	} else {
+	    /* Value retrieved from parameter/subexpression is scalar */
 	    if (v->pm->flags & PM_ARRAY) {
+		/*
+		 * Although the value is a scalar, the parameter
+		 * itself is an array.  Presumably this is due to
+		 * being quoted, or doing single substitution or something,
+		 * TODO: we're about to do some definitely stringy
+		 * stuff, so something like this bit is probably
+		 * necessary.  However, I'd like to leave any
+		 * necessary joining of arrays until this point
+		 * to avoid the multsub() horror.
+		 */
 		int tmplen = arrlen(v->pm->gets.afn(v->pm));
 
 		if (v->start < 0)
@@ -1269,6 +1627,15 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		    vunset = 1;
 	    }
 	    if (!vunset) {
+		/*
+		 * There really is a value.  Apply any necessary
+		 * padding or case transformation.  Note these
+		 * are the per-parameter transformations specified
+		 * with typeset, not the per-substitution ones set
+		 * by flags.  TODO: maybe therefore this would
+		 * be more consistent if moved into getstrvalue()?
+		 * Bet that's easier said than done.
+		 */
 		val = getstrvalue(v);
 		fwidth = v->pm->ct ? v->pm->ct : strlen(val);
 		switch (v->pm->flags & (PM_LEFT | PM_RIGHT_B | PM_RIGHT_Z)) {
@@ -1335,10 +1702,25 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		}
 	    }
 	}
+	/*
+	 * Finished with the original parameter and its indices;
+	 * carry on looping to see if we need to do more indexing.
+	 * This means we final get rid of v in favour of val and
+	 * aval.  We could do with somehow encapsulating the bit
+	 * where we need v.
+	 */
 	v = NULL;
 	if (!inbrace)
 	    break;
     }
+    /*
+     * We're now past the name or subexpression; the only things
+     * which can happen now are a closing brace, one of the standard
+     * parameter postmodifiers, or a history-style colon-modifier.
+     *
+     * Again, this duplicates tests for characters we're about to
+     * examine properly later on.
+     */
     if (inbrace &&
 	(c = *s) != '-' && c != '+' && c != ':' && c != '%'  && c != '/' &&
 	c != '=' && c != Equals &&
@@ -1348,6 +1730,30 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	zerr("bad substitution", NULL, 0);
 	return NULL;
     }
+    /*
+     * Join arrays up if we're in quotes and there isn't some
+     * override such as (@).
+     * TODO: hmm, if we're called as part of some recursive
+     * substitution do we want to delay this until we get back to
+     * the top level?  Or is if there's a qt (i.e. this parameter
+     * substitution is in quotes) always good enough?  Potentially
+     * we may be OK by now --- all potential `@'s and subexpressions
+     * have been handled, including any [@] index which comes up
+     * by virture of v->isarr being set to SCANPM_ISVAR_AT which
+     * is now in isarr.
+     *
+     * However, if we are replacing multsub() with something that
+     * doesn't mangle arrays, we may need to delay this step until after
+     * the foo:- or foo:= or whatever that causes that.  Note the value
+     * (string or array) at this point is irrelevant if we are going to
+     * be doing that.  This would mean // and stuff get applied
+     * arraywise even if quoted.  That's probably wrong, so maybe
+     * this just stays.
+     *
+     * We do a separate stage of dearrayification in the YUK chunk,
+     * I think mostly because of the way we make array or scalar
+     * values appear to the caller.
+     */
     if (isarr) {
 	if (nojoin)
 	    isarr = -1;
@@ -1358,9 +1764,20 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
     }
 
     idend = s;
-    if (inbrace)
+    if (inbrace) {
+	/*
+	 * This is to match a closing double quote in case
+	 * we didn't have a subexpression, e.g. ${"foo"}.
+	 * This form is pointless, but logically it ought to work.
+	 */
 	while (INULL(*s))
 	    s++;
+    }
+    /*
+     * We don't yet know whether a `:' introduces a history-style
+     * colon modifier or qualifies something like ${...:=...}.
+     * But if we remember the colon here it's easy to check later.
+     */
     if ((colf = *s == ':'))
 	s++;
 
@@ -1391,13 +1808,18 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 
     if (inbrace && ((c = *s) == '-' ||
 		    c == '+' ||
-		    c == ':' ||
+		    c == ':' ||	/* i.e. a doubled colon */
 		    c == '=' || c == Equals ||
 		    c == '%' ||
 		    c == '#' || c == Pound ||
 		    c == '?' || c == Quest ||
 		    c == '/')) {
 
+	/*
+	 * Default index is 1 if no (I) or (I) gave zero.   But
+	 * why don't we set the default explicitly at the start
+	 * and massage any passed index where we set flnum anyway?
+	 */
 	if (!flnum)
 	    flnum++;
 	if (c == '%')
@@ -1455,6 +1877,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    *ptr = '\0';
 	}
 
+	/* See if this was ${...:-...}, ${...:=...}, etc. */
 	if (colf)
 	    flags |= SUB_ALL;
 	/*
@@ -1487,12 +1910,23 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		 * (except according to $@ rules); but this leaves the
 		 * unquoted substrings unsplit, and other code below
 		 * for spbreak splits even within the quoted substrings.
+		 *
+		 * TODO: I think multsub needs to be told enough to
+		 * decide about splitting with spbreak at this point
+		 * (and equally in the `=' handler below).  Then
+		 * we can turn off spbreak to avoid the join & split
+		 * nastiness later.
+		 *
+		 * What we really want to do is make this look as
+		 * if it were the result of an assignment from
+		 * the same value, taking account of quoting.
 		 */
 		multsub(&val, (aspar ? NULL : &aval), &isarr, NULL);
 		copied = 1;
 	    }
 	    break;
 	case ':':
+	    /* this must be `::=', unconditional assignment */
 	    if (*s != '=' && *s != Equals)
 		goto noclosebrace;
 	    vunset = 1;
@@ -1507,11 +1941,22 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		*idend = '\0';
 		val = dupstring(s);
 		isarr = 0;
+		/*
+		 * TODO: this is one of those places where I don't
+		 * think we want to do the joining until later on.
+		 * We also need to handle spbreak and spsep at this
+		 * point and unset them.
+		 */
 		if (spsep || spbreak || !arrasg)
 		    multsub(&val, NULL, NULL, sep);
 		else
 		    multsub(&val, &aval, &isarr, NULL);
 		if (arrasg) {
+		    /*
+		     * This is an array assignment in a context
+		     * where we have no syntactic way of finding
+		     * out what an array element is.  So we just guess.
+		     */
 		    char *arr[2], **t, **a, **p;
 		    if (spsep || spbreak) {
 			aval = sepsplit(val, spsep, 0, 1);
@@ -1635,6 +2080,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 #endif
 	    }
 
+	    /*
+	     * Either loop over an array doing replacements or
+	     * do the replacment on a string.
+	     */
 	    if (!vunset && isarr) {
 		getmatcharr(&aval, s, flags, flnum, replstr);
 		copied = 1;
@@ -1647,6 +2096,11 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    break;
 	}
     } else {			/* no ${...=...} or anything, but possible modifiers. */
+	/*
+	 * Handler ${+...}.  TODO: strange, why do we handle this only
+	 * if there isn't a trailing modifier?  Why don't we do this
+	 * e.g. when we hanlder the ${(t)...} flag?
+	 */
 	if (chkset) {
 	    val = dupstring(vunset ? "0" : "1");
 	    isarr = 0;
@@ -1659,6 +2113,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    val = dupstring("");
 	}
 	if (colf) {
+	    /*
+	     * History style colon modifiers.  May need to apply
+	     * on multiple elements of an array.
+	     */
 	    s--;
 	    if (unset(KSHARRAYS) || inbrace) {
 		if (!isarr)
@@ -1695,6 +2153,13 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
     }
     if (errflag)
 	return NULL;
+    /*
+     * This handles taking a length with ${#foo} and variations.
+     * TODO: again. one might naively have thought this had the
+     * same sort of effect as the ${(t)...} flag and the ${+...}
+     * test, although in this case we do need the value rather
+     * the the parameter, so maybe it's a bit different.
+     */
     if (getlen) {
 	long len = 0;
 	char buf[14];
@@ -1725,6 +2190,23 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	val = dupstring(buf);
 	isarr = 0;
     }
+    /*
+     * I think this mult_isarr stuff here is used to pass back
+     * the setting of whether we are an array to multsub, and
+     * thence to the top-level paramsubst().  The way the
+     * setting is passed back is completely obscure, however.
+     * It's presumably at this point because we try to remember
+     * whether the value was `really' an array before massaging
+     * some special cases.
+     *
+     * TODO: YUK.  This is not the right place to turn arrays into
+     * scalars; we should pass back as an array, and let the calling
+     * code decide how to deal with it.  This is almost certainly
+     * a lot harder than it sounds.  Do we really need to handle
+     * one-element arrays as scalars at this point?  Couldn't
+     * we just test for it later rather than having a multiple-valued
+     * wave-function for isarr?
+     */
     mult_isarr = isarr;
     if (isarr > 0 && !plan9 && (!aval || !aval[0])) {
 	val = dupstring("");
@@ -1739,6 +2221,12 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
     }
     /* ssub is true when we are called from singsub (via prefork).
      * It means that we must join arrays and should not split words. */
+    /*
+     * TODO: this is what is screwing up the use of SH_WORD_SPLIT
+     * after `:-' etc.  If we fix multsub(), we might get away
+     * with simply unsetting the appropriate flags when they
+     * get handled.
+     */
     if (ssub || spbreak || spsep || sep) {
 	if (isarr)
 	    val = sepjoin(aval, sep, 1), isarr = 0;
@@ -1753,6 +2241,9 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	}
 	mult_isarr = isarr;
     }
+    /*
+     * Perform case modififications.
+     */
     if (casmod) {
 	if (isarr) {
 	    char **ap;
@@ -1782,6 +2273,9 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		makecapitals(&val);
 	}
     }
+    /*
+     * Perform prompt-style modifications.
+     */
     if (presc) {
 	int ops = opts[PROMPTSUBST], opb = opts[PROMPTBANG];
 	int opp = opts[PROMPTPERCENT], len;
@@ -1790,6 +2284,14 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    opts[PROMPTPERCENT] = 1;
 	    opts[PROMPTSUBST] = opts[PROMPTBANG] = 0;
 	}
+	/*
+	 * TODO:  It would be really quite nice to abstract the
+	 * isarr and !issarr code into a function which gets
+	 * passed a pointer to a function with the effect of
+	 * the promptexpand bit.  Then we could use this for
+	 * a lot of stuff and bury val/aval/isarr inside a structure
+	 * which gets passed to it.
+	 */
 	if (isarr) {
 	    char **ap;
 
@@ -1820,6 +2322,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	opts[PROMPTBANG] = opb;
 	opts[PROMPTPERCENT] = opp;
     }
+    /*
+     * One of the possible set of quotes to apply, depending on
+     * the repetitions of the (q) flag.
+     */
     if (quotemod) {
 	if (--quotetype > 3)
 	    quotetype = 3;
@@ -1903,6 +2409,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    }
 	}
     }
+    /*
+     * Transform special characters in the string to make them
+     * printable.
+     */
     if (visiblemod) {
 	if (isarr) {
 	    char **ap;
@@ -1916,6 +2426,11 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    val = nicedupstring(val);
 	}
     }
+    /*
+     * Nothing particularly to do with SH_WORD_SPLIT --- this
+     * performs lexical splitting on a string as specified by
+     * the (z) flag.
+     */
     if (shsplit) {
 	LinkList list = NULL;
 
@@ -1944,6 +2459,21 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	}
 	copied = 1;
     }
+    /*
+     * TODO: hmm.  At this point we have to be on our toes about
+     * whether we're putting stuff into a line or not, i.e.
+     * we don't want to do this from a recursive call; this is
+     * probably part of the point of the mult_isarr monkey business.
+     * Rather than passing back flags in a non-trivial way, maybe
+     * we could decide on the basis of flags passed down to us.
+     *
+     * This is the ideal place to do any last-minute conversion from
+     * array to strings.  However, given all the transformations we've
+     * already done, probably if it's going to be done it will already
+     * have been.  (I'd really like to keep everying in aval or
+     * equivalent and only locally decide if we need to treat it
+     * as a scalar.)
+     */
     if (isarr) {
 	char *x;
 	char *y;
@@ -1951,6 +2481,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	int i;
 	LinkNode on = n;
 
+	/* Handle the (u) flag; we need this before the next test */
 	if (unique) {
 	    if(!copied)
 		aval = arrdup(aval);
@@ -1960,6 +2491,16 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		zhuniqarray(aval);
 	}
 	if ((!aval[0] || !aval[1]) && !plan9) {
+	    /*
+	     * Empty array or single element.  Currently you only
+	     * get a single element array at this point from the
+	     * unique expansion above. but we can potentially
+	     * have other reasons.
+	     *
+	     * The following test removes the markers
+	     * from surrounding double quotes, but I don't know why
+	     * that's necessary.
+	     */
 	    int vallen;
 	    if (aptr > (char *) getdata(n) &&
 		aptr[-1] == Dnull && *fstr == Dnull)
@@ -1977,6 +2518,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    setdata(n, y);
 	    return n;
 	}
+	/* Handle (o) and (O) and their variants */
 	if (sortit) {
 	    if (!copied)
 		aval = arrdup(aval);
@@ -2004,6 +2546,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	    }
 	}
 	if (plan9) {
+	    /* Handle RC_EXPAND_PARAM */
 	    LinkNode tn;
 	    local_list1(tl);
 
@@ -2049,6 +2592,14 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 		return n;
 	    }
 	} else {
+	    /*
+	     * Not RC_EXPAND_PARAM: simply join the first and
+	     * last values.
+	     * TODO: how about removing the restriction that
+	     * aval[1] is non-NULL to promote consistency?, or
+	     * simply changing the test so that we drop into
+	     * the scalar branch, instead of tricking isarr?
+	     */
 	    x = aval[0];
 	    if (prenum || postnum)
 		x = dopadding(x, prenum, postnum, preone, postone,
@@ -2095,6 +2646,11 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub)
 	if (eval)
 	    n = on;
     } else {
+	/*
+	 * Scalar value.  Handle last minute transformations
+	 * such as left- or right-padding and the (e) flag to
+	 * revaluate the result.
+	 */
 	int xlen;
 	char *x;
 	char *y;