regsub converter: empty match advances by 1 byte to avoid loops

2019-02-18 11:08:21 +01:00
parent acf7efcff2
commit 04906a5835
2 changed files with 55 additions and 39 deletions
--- a/docs/formats.html
+++ b/docs/formats.html
@ -666,6 +666,13 @@ Otherwise <em>precision</em> is the index (counting from 1) of the match to repl
 Without <em>precision</em> (or 0), all matches are replaced.
 </p>
 <p>
 When replacing multiple matches, the search for the next match starts directly after the
 string just replaced, so that the <em>subst</em> string itself will never be modified recursively.
 <span class="new">
 However, if an empty string is matched, searching advances by 1 character in order to
 avoid matching the same empty string again.</span>
 </p>
 <p>
 In input this converter pre-processes data received from the device before
 following converters read it.
 Converters preceding this one will read unmodified input.
--- a/src/RegexpConverter.cc
+++ b/src/RegexpConverter.cc
@ -197,54 +197,63 @@ static void regsubst(const StreamFormat& fmt, StreamBuffer& buffer, size_t start
            debug("pcre_exec: no match\n");
            break;
        }
        if (!(fmt.flags & sign_flag) && n < fmt.prec) // without + flag
        {
            // do not yet replace this match
            c += ovector[1];
            continue;
        }
        // replace subexpressions
        l = ovector[1] - ovector[0];
-        debug("before [%d]= \"%s\"\n", ovector[0], buffer.expand(start+c,ovector[0])());
+
-        debug("match  [%d]= \"%s\"\n", l, buffer.expand(start+c+ovector[0],l)());
+        // no prec: replace all matches
-        for (r = 1; r < rc; r++)
+        // prec with + flag: replace first prec matches
-            debug("sub%d = \"%s\"\n", r, buffer.expand(start+c+ovector[r*2], ovector[r*2+1]-ovector[r*2])());
+        // prec without + flag: replace only match number prec
-        debug("after     = \"%s\"\n", buffer.expand(start+c+ovector[1])());
+
-        s = subst;
+        if ((fmt.flags & sign_flag) || n >= fmt.prec)
        debug("subs      = \"%s\"\n", s.expand()());
        for (r = 0; r < (int)s.length(); r++)
        {
-            debug("check \"%s\"\n", s.expand(r)());
+            // replace subexpressions
-            if (s[r] == esc)
+            debug("before [%d]= \"%s\"\n", ovector[0], buffer.expand(start+c,ovector[0])());
            debug("match  [%d]= \"%s\"\n", l, buffer.expand(start+c+ovector[0],l)());
            for (r = 1; r < rc; r++)
                debug("sub%d = \"%s\"\n", r, buffer.expand(start+c+ovector[r*2], ovector[r*2+1]-ovector[r*2])());
            debug("after     = \"%s\"\n", buffer.expand(start+c+ovector[1])());
            s = subst;
            debug("subs      = \"%s\"\n", s.expand()());
            for (r = 0; r < (int)s.length(); r++)
            {
-                unsigned char ch = s[r+1];
+                debug("check \"%s\"\n", s.expand(r)());
-                debug("found escaped \\%u, in range 1-%d?\n", ch, rc-1);
+                if (s[r] == esc)
                if (ch != 0 && ch < rc) // escaped 1 - 9 : replace with subexpr
                {
-                    ch *= 2;
+                    unsigned char ch = s[r+1];
-                    rl = ovector[ch+1] - ovector[ch];
+                    debug("found escaped \\%u, in range 1-%d?\n", ch, rc-1);
-                    debug("yes, replace \\%d: \"%s\"\n", ch/2, buffer.expand(start+c+ovector[ch], rl)());
+                    if (ch != 0 && ch < rc) // escaped 1 - 9 : replace with subexpr
-                    s.replace(r, 2, buffer(start+c+ovector[ch]), rl);
+                    {
-                    r += rl - 1;
+                        ch *= 2;
                        rl = ovector[ch+1] - ovector[ch];
                        debug("yes, replace \\%d: \"%s\"\n", ch/2, buffer.expand(start+c+ovector[ch], rl)());
                        s.replace(r, 2, buffer(start+c+ovector[ch]), rl);
                        r += rl - 1;
                    }
                    else
                    {
                        debug("no, use literal \\%u\n", ch);
                        s.remove(r, 1); // just remove escape
                    }
                }
-                else
+                else if (s[r] == '&') // unescaped & : replace with match
                {
-                    debug("no, use literal \\%u\n", ch);
+                    debug("replace &: \"%s\"\n", buffer.expand(start+c+ovector[0], l)());
-                    s.remove(r, 1); // just remove escape
+                    s.replace(r, 1, buffer(start+c+ovector[0]), l);
                    r += l - 1;
                }
                else continue;
                debug("subs = \"%s\"\n", s.expand()());
            }
-            else if (s[r] == '&') // unescaped & : replace with match
+            buffer.replace(start+c+ovector[0], l, s);
-            {
+            length -= l;
-                debug("replace &: \"%s\"\n", buffer.expand(start+c+ovector[0], l)());
+            length += s.length();
-                s.replace(r, 1, buffer(start+c+ovector[0]), l);
+            c += s.length();
-                r += l - 1;
+        }
-            }
+        c += ovector[0];
-            else continue;
+        if (l == 0)
-            debug("subs = \"%s\"\n", s.expand()());
+        {
            debug("pcre_exec: empty match\n");
            c++; // Empty strings may lead to an endless loop. Match them only once.
        }
        buffer.replace(start+c+ovector[0], l, s);
        length += s.length() - l;
        c += ovector[0] + s.length();
        if (n == fmt.prec) // max match reached
        {
            debug("pcre_exec: max match %d reached\n", n);