regsub converter added

2016-06-14 11:36:36 +02:00
parent 126da8c499
commit 2ca8a129f7
3 changed files with 216 additions and 32 deletions
--- a/doc/formats.html
+++ b/doc/formats.html
@ -528,7 +528,7 @@ In input, the next byte or bytes must match the checksum.
 </dl>

 <a name="regex"></a>
-<h2>12. Regular Expresion STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
+<h2>13. Regular Expression STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
 <p>
 This input-only format matches <a target="ex"
 href="http://www.pcre.org/" >Perl compatible regular expressions (PCRE)</a>.
@ -569,9 +569,60 @@ Example: <code>%.1/&lt;title&gt;(.*)&lt;\/title&gt;/</code> returns
 the title of an HTML page, skipps anything before the
 <code>&lt;title&gt</code> tag and leaves anything after the
 <code>&lt;/title&gt;</code> tag in the input buffer.
+</p>
+<a name="regsub"></a>
+<h2>14. Regular Expression Substitution Pseudo-Converter (<code>%#/<em>regex</em>/<em>subst</em>/</code>)</h2>
+<p>
+This is a variant of the previous converter (note the <code>#</code>)
+but instead of returning the matching string,
+it can be used as a pre-processor for input or
+as a post-processor for output.
+</p>
+<p>
+Any match of the <em>regex</em> is replaced by the string <em>subst</em> with any
+<code>&</code> or <code>\0</code> in <em>subst</em> replaced with the match itself and any
+<code>\1</code> through <code>\9</code> with the corresponding sub-expressions.
+To get a literal <code>&</code> or <code>\</code> in the substitution write
+<code>\&</code> or <code>\\</code>.
+</p>
+<p>
+If <em>width</em> is specified, it limits the number of characters processed.
+If the <code>-</code> flag is used (i.e. <em>width</em> looks like a negative number)
+only the last <em>width</em> characters are processed, else the first.
+Without <em>width</em> all available characters are processed.
+</p>
+<p>
+If <em>prec</em> is specified, it limits the number of times the substitution is applied.
+Without <em>prec</em>, the substitution is applied as often as possible.
+</p>
+<p>
+In input, this converter pre-processes the data received from the device
+before the converters that follow it read that data.
+Converters before this one will see unmodified input.
+Thus place this converter before those whose input should be pre-processed.
+</p>
+<p>
+In output, it post-processes the data already formatted by the converters
+that precede it, before that data is sent to the device.
+Converters after this one will send their output unmodified.
+Thus place this converter after those whose output should be post-processed.
+</p>
+<p>
+Examples:<br>
+<code>%#-10.2/ab/X/</code> replaces the string <code>ab</code> with <code>X</code>
+at most 2 times in the last 10 characters.
+(<code>abcabcabcabc</code> becomes <code>abcXcXcabc</code>)<br>
+<code>%#/..\B/&:/</code> writes <code>:</code> after every second character
+which is not at the end of a word.
+(<code>0b19353134</code> becomes <code>0b:19:35:31:34</code>)<br>
+<code>%#/://</code> removes all <code>:</code>.
+(<code>0b:19:35:31:34</code> becomes <code>0b19353134</code>)<br>
+<code>%#/([^+-]*)([+-])/\2\1/</code> moves a postfix sign to the front.
+(<code>1.23-</code> becomes <code>-1.23</code>)<br>
+
 </p>
 <a name="mantexp"></a>
-<h2>13. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
+<h2>15. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
 <p>
 This exotic and experimental format matches numbers in the format
 <i>[sign] mantissa sign exponent</i>, e.g <code>+123-4</code> meaning
@ -591,7 +642,7 @@ the usual way (always sign, left justified, space instead of + sign).
 Flags <code>#</code> and <code>0</code> are unsupported.
 </p>
 <a name="timestamp"></a>
-<h2>14. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
+<h2>16. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
 <p>
 This format reads or writes timestamps and converts them to a double number.
 The value represents the number of seconds since 1970 (the UNIX epoch).
--- a/doc/nav.html
+++ b/doc/nav.html
@ -132,6 +132,7 @@ div div div a {list-style-type:circle;}
  <a target="_parent" href="formats.html#bcd"       title="Binary coded decimal LONG converter">%D</a>
  <a target="_parent" href="formats.html#chksum"    title="Checksum pseudo converter">%&lt;<em>checksum</em>&gt;</a>
  <a target="_parent" href="formats.html#regex"     title="Perl regular expression STRING converter">%/<em>regex</em>/</a>
+  <a target="_parent" href="formats.html#regsub"    title="Perl regular expression substitution pseudo converter">%#/<em>regex</em>/<em>subst</em>/</a>
  <a target="_parent" href="formats.html#mantexp"   title="MantissaExponent DOUBLE converter">%m</a>
  <a target="_parent" href="formats.html#timestamp" title="Timestamp DOUBLE converter">%T</a>
 </div>
--- a/src/RegexpConverter.cc
+++ b/src/RegexpConverter.cc
@ -37,15 +37,17 @@

 class RegexpConverter : public StreamFormatConverter
 {
-    int parse (const StreamFormat&, StreamBuffer&, const char*&, bool);
-    int scanString(const StreamFormat&, const char*, char*, size_t);
+    int parse (const StreamFormat& fmt, StreamBuffer&, const char*&, bool);
+    int scanString(const StreamFormat& fmt, const char*, char*, size_t);
+    int scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor);
+    bool printPseudo(const StreamFormat& fmt, StreamBuffer& output);
 };

 int RegexpConverter::
 parse(const StreamFormat& fmt, StreamBuffer& info,
    const char*& source, bool scanFormat)
 {
-    if (!scanFormat)
+    if (!scanFormat && !(fmt.flags & alt_flag))
    {
        error("Format conversion %%/regexp/ is only allowed in input formats\n");
        return false;
@ -55,28 +57,24 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
        error("Subexpression index %d too big (>9)\n", fmt.prec);
        return false;
    }    
-    if (fmt.flags & (left_flag|space_flag|zero_flag|alt_flag))
-    {
-        error("Use of modifiers '-', ' ', '0', '#'"
-            "not allowed with %%/regexp/ conversion\n");
-        return false;
-    }
+
    StreamBuffer pattern;
    while (*source != '/')
    {
        if (!*source) {
-            error("Missing closing '/' after %%/ format conversion\n");
+            error("Missing closing '/' after %%/%s format conversion\n", pattern());
            return false;
        }
        if (*source == esc) {
            source++;
-            pattern.print("\\x%02x", *source++ & 0xFF);
+            pattern.append('\\');
            continue;
        }
        pattern.append(*source++);
    }
    source++;
    debug("regexp = \"%s\"\n", pattern());
+    
    const char* errormsg;
    int eoffset;
    pcre* code = pcre_compile(pattern(), 0, 
@ -87,6 +85,29 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
        return false;
    }
    info.append(&code, sizeof(code));
+
+    if (fmt.flags & alt_flag)
+    {
+        StreamBuffer subst;
+        while (*source != '/')
+        {
+            if (!*source) {
+                error("Missing closing '/' after %%#/%s/%s format conversion\n", pattern(), subst());
+                return false;
+            }
+            if (*source == esc) {
+                source++;
+                subst.append('\\');
+                if (*source <= 9) subst.append('0'+*source++);
+                continue;
+            }
+            subst.append(*source++);
+        }
+        source++;
+        debug("subst = \"%s\"\n", subst());
+        info.append(subst).append('\0');
+        return pseudo_format;
+    }
    return string_format;
 }

@ -94,32 +115,143 @@ int RegexpConverter::
 scanString(const StreamFormat& fmt, const char* input,
    char* value, size_t maxlen)
 {
-    pcre* code;
-    size_t len;
    int ovector[30];
    int rc;
-    int subexpr = 0;
+    unsigned int l;
    
-    memcpy (&code, fmt.info, sizeof(code));
+    const char* info = fmt.info;
+    pcre* code = extract<pcre*>(info);
+    int length = fmt.width > 0 ? fmt.width : strlen(input);
+    int subexpr = fmt.prec > 0 ? fmt.prec : 0;
    
-    len = fmt.width > 0 ? fmt.width : strlen(input);
-    subexpr = fmt.prec > 0 ? fmt.prec : 0;
-    rc = pcre_exec(code, NULL, input, len, 0, 0, ovector, 30);
-    if (rc < 1) return -1;
-    if (fmt.flags & skip_flag) return ovector[1];
-    len = ovector[subexpr*2+1] - ovector[subexpr*2];
-    if (len >= maxlen) {
+    debug("input = \"%s\"\n", input);
+    debug("length=%d\n", length);
+    
+    rc = pcre_exec(code, NULL, input, length, 0, 0, ovector, 30);
+    debug("pcre_exec match \"%.*s\" result = %d\n", length, input, rc);
+    if ((subexpr && rc <= subexpr) || rc < 0)
+    {
+        /* error or no match or not enough sub-expressions */
+        return -1;
+    }
+    if (fmt.flags & skip_flag) return ovector[subexpr*2+1];
+
+    l = ovector[subexpr*2+1] - ovector[subexpr*2];
+    if (l >= maxlen) {
        if (!(fmt.flags & sign_flag)) {
-            error("Regexp: Matching string \"%s\" too long (%d>%d bytes). You may want to try the + flag: \"%%+/.../\"\n",
-                StreamBuffer(input+ovector[subexpr*2], len).expand()(),
-                (int)len, (int)maxlen-1);
+            error("Regexp: Matching string \"%s\" too long (%d>%ld bytes). You may want to try the + flag: \"%%+/.../\"\n",
+                StreamBuffer(input + ovector[subexpr*2],l).expand()(),
+                l, (long)maxlen-1);
            return -1;
        }
-        len = maxlen-1;
+        l = maxlen-1;
    }
-    memcpy(value, input+ovector[subexpr*2], len);
-    value[len]=0;
-    return ovector[1];
+    memcpy(value, input + ovector[subexpr*2], l);
+    value[l] = '\0';
+    return ovector[1]; /* consume input until end of match */;
+}
+
+/*
+ * Replace matches of a compiled regular expression inside a section of buffer.
+ *
+ * code:   compiled PCRE pattern.
+ * buffer: text that is modified in place.
+ * start:  offset in buffer where processing begins.
+ * length: number of bytes to process. 0 means everything from start to the
+ *         end of buffer. A negative value selects the last -length bytes of
+ *         buffer (but nothing before start). Values reaching beyond the end
+ *         of buffer are clipped.
+ * subst:  replacement text. "&" and "\0" expand to the whole match, "\1" to
+ *         "\9" to the corresponding sub-expression, "\&" and "\\" to a
+ *         literal "&" or "\". A reference to a sub-expression that did not
+ *         take part in the match expands to nothing.
+ * max:    maximum number of substitutions. 0 or a negative value means no
+ *         limit.
+ */
+static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length, const char* subst, int max)
+{
+    int rc, l, c, r, rl, g, groups, n=0;
+    int ovector[30];
+    StreamBuffer s;
+    if (length == 0)
+    {
+        length = buffer.length() - start;
+    }
+    else if (length < 0)
+    {
+        /* process the tail of the buffer */
+        length = -length;
+        if (length > buffer.length() - start)
+            length = buffer.length() - start;
+        start = buffer.length() - length;
+    }
+    else
+    {
+        if (length > buffer.length() - start)
+            length = buffer.length() - start;
+    }
+    debug("regsubst buffer=\"%s\", start=%ld, length=%ld, subst = \"%s\", max = %d\n",
+        buffer.expand()(), start, length, subst, max);
+    /* c is the offset (relative to start) of the text not yet scanned */
+    for (c = 0; c < length; )
+    {
+        rc = pcre_exec(code, NULL, buffer(start+c), length-c, 0, 0, ovector, 30);
+        debug("pcre_exec match \"%.*s\" result = %d\n", (int)length-c, buffer(start+c), rc);
+
+        if (rc < 0 || (max && n++ == max))
+            return; /* no match or maximum substitutions reached */
+        /* rc == 0 means ovector was too small for all sub-expressions,
+           but the 10 pairs that fit into it (\0 ... \9) have been filled in */
+        groups = rc ? rc : 10;
+        l = ovector[1] - ovector[0];
+        debug("start = \"%s\"\n", buffer(start+c));
+        debug("match = \"%.*s\"\n", l, buffer(start+c+ovector[0]));
+        for (r = 1; r < rc; r++)
+            debug("sub%d = \"%.*s\"\n", r, ovector[r*2+1]-ovector[r*2], buffer(start+c+ovector[r*2]));
+        debug("rest  = \"%s\"\n", buffer(start+c+ovector[1]));
+        /* expand & and \N references in a fresh copy of subst */
+        s = subst;
+        debug("subs = \"%s\"\n", s.expand()());
+        for (r = 0; r < s.length(); r++)
+        {
+            debug("check \"%s\"\n", s(r));
+            if (s[r] == '\\')
+            {
+                /* do not look behind a trailing backslash */
+                unsigned char ch = r+1 < s.length() ? s[r+1] : 0;
+                if (ch >= '0' && ch <= '9')
+                {
+                    g = ch - '0';
+                    if (g < groups && ovector[g*2] >= 0)
+                    {
+                        rl = ovector[g*2+1] - ovector[g*2];
+                        debug("replace \\%d: \"%.*s\"\n", g, rl, buffer(start+c+ovector[g*2]));
+                        s.replace(r, 2, buffer(start+c+ovector[g*2]), rl);
+                    }
+                    else
+                    {
+                        /* sub-expression does not exist or did not match:
+                           its ovector entries are -1 or were never set */
+                        rl = 0;
+                        s.remove(r, 2);
+                    }
+                    /* continue behind the inserted text */
+                    r += rl - 1;
+                }
+                else if (ch == '\\' || ch == '&')
+                {
+                    /* drop the backslash; the literal character moves to
+                       index r and the loop increment steps over it */
+                    s.remove(r, 1);
+                }
+            }
+            else if (s[r] == '&')
+            {
+                debug("replace &: \"%.*s\"\n", l,  buffer(start+c+ovector[0]));
+                s.replace(r, 1, buffer(start+c+ovector[0]), l);
+                r += l - 1;
+            }
+            else continue;
+            debug("subs = \"%s\"\n", s());
+        }
+        buffer.replace(start+c+ovector[0], l, s);
+        length += s.length() - l;
+        /* continue behind the replacement: skip the unmatched text in front
+           of the match as well, so the replacement is never scanned again */
+        c += ovector[0] + s.length();
+        /* after an empty match step over one character, otherwise the same
+           empty match would be found at the same place forever */
+        if (l == 0) c++;
+    }
+}
+
+/*
+ * Input side of the %#/regex/subst/ pseudo-converter: re-writes the input
+ * buffer from cursor on, before it is read by anything else.
+ *
+ * fmt.info holds what parse() stored: the compiled pattern pointer followed
+ * by the zero terminated substitution string.
+ * fmt.width limits the number of bytes processed; with the '-' flag the
+ * limit is passed on negated, which makes regsubst() work on the tail.
+ * fmt.prec is passed on as the maximum number of substitutions.
+ *
+ * Always returns 0.
+ */
+int RegexpConverter::
+scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor)
+{
+    /* re-write input buffer */
+    const char* info = fmt.info;
+    pcre* code;
+    long length;
+
+    /* NOTE(review): extract() presumably advances info behind the pointer,
+       so that info is the substitution string afterwards -- confirm */
+    code = extract<pcre*>(info);
+    if (fmt.flags & left_flag) length = -fmt.width;
+    else length = fmt.width;
+    regsubst(code, input, cursor, length, info, fmt.prec);
+    return 0;
+}
+
+/*
+ * Output side of the %#/regex/subst/ pseudo-converter: re-writes the output
+ * buffer, starting at its beginning.
+ *
+ * fmt.info holds what parse() stored: the compiled pattern pointer followed
+ * by the zero terminated substitution string.
+ * fmt.width limits the number of bytes processed; with the '-' flag the
+ * limit is passed on negated, which makes regsubst() work on the tail.
+ * fmt.prec is passed on as the maximum number of substitutions.
+ *
+ * Always returns true.
+ */
+bool RegexpConverter::
+printPseudo(const StreamFormat& fmt, StreamBuffer& output)
+{
+    /* re-write output buffer */
+    const char* info = fmt.info;
+    pcre* code;
+    long length;
+
+    /* NOTE(review): extract() presumably advances info behind the pointer,
+       so that info is the substitution string afterwards -- confirm */
+    code = extract<pcre*>(info);
+    if (fmt.flags & left_flag) length = -fmt.width;
+    else length = fmt.width;
+    regsubst(code, output, 0, length, info, fmt.prec);
+    return true;
+}

 RegisterConverter (RegexpConverter, "/");