change the meaning of pre for regsub slightly

2016-06-15 14:52:45 +02:00
parent 06e212c66e
commit bc67317b0b
3 changed files with 119 additions and 100 deletions
--- a/doc/formats.html
+++ b/doc/formats.html
@ -367,7 +367,7 @@ endian</em>, i.e. least significant byte first.
 With the <code>0</code> flag, the value is unsigned, otherwise signed.
 </p>
 <p>
-In output, the <em>prec</em> (or sizeof(long) whatever is less) least
+In output, the <em>precision</em> (or sizeof(long), whichever is less) least
 significant bytes of the value are sign extended or zero extended
 (depending on the <code>0</code> flag) to <em>width</em> bytes.
 </p>
@ -434,7 +434,7 @@ The <em>width</em> field is the byte number from which to start
 calculating the checksum.
 Default is 0, i.e. the first byte of the input or output of the current
 command.
-The last byte is <em>prec</em> bytes before the checksum (default 0).
+The last byte is <em>precision</em> bytes before the checksum (default 0).
 For example in <code>"abcdefg%&lt;xor&gt;"</code> the checksum is calculated
 from <code>abcdefg</code>,
 but in <code>"abcdefg%2.1&lt;xor&gt;"</code> only from <code>cdef</code>.
@ -534,35 +534,38 @@ This input-only format matches <a target="ex"
 href="http://www.pcre.org/" >Perl compatible regular expressions (PCRE)</a>.
 It is only available if a PCRE library is installed.
 </p>
+<div class="box">
 <p>
 If PCRE is not available for your host or cross architecture, download
 the sourcecode from <a target="ex" href="http://www.pcre.org/">www.pcre.org</a>
 and try my EPICS compatible <a target="ex"
 href="http://epics.web.psi.ch/software/streamdevice/pcre/Makefile">Makefile</a>
-to compile it like a normal EPICS application.
+to compile it like a normal EPICS support module.
 The Makefile is known to work with EPICS 3.14.8 and PCRE 7.2.
 In your RELEASE file define the variable <code>PCRE</code> so that
 it points to the install location of PCRE.
 </p>
 <p>
-If PCRE is already installed on your system, use the variables
-<code>PCRE_INCLUDE</code> and <code>PCRE_LIB</code> instead to provide
-the install directories of <code>pcre.h</code> and the library.
-</p>
-<p>
-If you have PCRE installed in different locations for different (cross)
-architectures, define the variables in RELEASE.Common.&lt;architecture&gt;
-instead of the global RELEASE file.
+If PCRE is already installed on (some of) your systems, you may add
+architectures where PCRE can be found in standard include and library
+locations to the variable <code>WITH_SYSTEM_PCRE</code>.
+If either the header file or the library is in a non-standard place,
+set in your RELEASE file the variables <code>PCRE_INCLUDE_<em>arch</em></code>
+and/or <code>PCRE_LIB_<em>arch</em></code> for the respective architectures
+to the correct directories or set
+<code>PCRE_INCLUDE</code> and/or <code>PCRE_LIB</code>
+in architecture specific RELEASE.Common.<em>arch</em> files.
 </p>
+</div>
 <p>
 If the regular expression is not anchored, i.e. does not start with
 <code>^</code>, leading non-matching input is skipped. 
 A maximum of <em>width</em> bytes is matched, if specified.
-If <em>prec</em> is given, it specifies the sub-expression whose match
+If <em>precision</em> is given, it specifies the sub-expression whose match
 is returned.
 Otherwise the complete match is returned.
 In any case, the complete match is consumed from the input buffer.
-If the expression contains a <code>/</code> it must be escaped.
+If the expression contains a <code>/</code> it must be escaped like <code>\/</code>.
 </p>
 <p>
 Example: <code>%.1/&lt;title&gt;(.*)&lt;\/title&gt;/</code> returns
@ -579,48 +582,63 @@ it can be used as a pre-processor for input or
 as a post-processor for output.
 </p>
 <p>
-Any match of the <em>regex</em> is replaced by the string <em>subst</em> with any
-<code>&</code> or <code>\0</code> in <em>subst</em> replaced with the match itself and any
-<code>\1</code> through <code>\9</code> with the corresponding sub-expressions.
-To get a literal <code>&</code> or <code>\</code> in the substitution write
-<code>\&</code> or <code>\\</code>.
+Matches of the <em>regex</em> are replaced by the string <em>subst</em> with all
+<code>&</code> or <code>\0</code> in <em>subst</em> replaced with the match itself and all
+<code>\1</code> through <code>\9</code> replaced with the match of the corresponding sub-expression.
+To get a literal <code>&</code> or <code>\</code> or <code>/</code> in the substitution write
+<code>\&</code> or <code>\\</code> or <code>\/</code>.
+There is no way to specify literal bytes with values less than or equal to 9 in the
+substitution!
 </p>
 <p>
 If <em>width</em> is specified, it limits the number of characters processed.
 If the <code>-</code> flag is used (i.e. <em>width</em> looks like a negative number)
-only the last <em>width</em> caracters are processed, else the first.
-Without <em>width</em> all available characters are processed.
+only the last <em>width</em> characters are processed, else the first.
+Without <em>width</em> (or 0) all available characters are processed.
 </p>
 <p>
-If <em>prec</em> is specified, it limits the number of times the substitution is applied.
-Without <em>prec</em>, the substitution is applied as often as possible.
+If <em>precision</em> is specified, it indicates which matches to replace.
+With the <code>+</code> flag given, <em>precision</em> is the maximum
+number of matches to replace.
+Otherwise <em>precision</em> is the index (counting from 1) of the match to replace. 
+Without <em>precision</em> (or 0), all matches are replaced.
 </p>
 <p>
 In input this converter pre-processes data received from the device before
-other converters after this one read it.
-Converters before this one will see unmodified input.
+following converters read it.
+Converters preceding this one will read unmodified input.
 Thus place this converter before those whose input should be pre-processed.
 </p>
 <p>
-In output it post-processes data already formatted by other converters before this one
+In output it post-processes data already formatted by preceding converters
 before sending it to the device.
-Converters after this one will send their output unmodified.
+Converters following this one will send their output unmodified.
 Thus place this converter after those whose output should be post-processed.
 </p>
 <p>
-Examples:<br>
-<code>%#-10.2/ab/X/</code> replaces the string <code>ab</code> with <code>X</code>
+Examples:
+<div class="indent">
+<code>%#+-10.2/ab/X/</code> replaces the string <code>ab</code> with <code>X</code>
 at most 2 times in the last 10 characters.
-(<code>abcabcabcabc</code> becomes <code>abcXcXcabc</code>)<br>
-<code>%#/..\B/&:/</code> writes <code>:</code> after every second character
+(<code>abcabcabcabc</code> becomes <code>abcXcXcabc</code>)
+</div>
+<div class="indent">
+<code>%#/\\/\//</code> replaces all <code>\</code> with <code>/</code>
+(<code>\dir\file</code> becomes <code>/dir/file</code>)
+</div>
+<div class="indent">
+<code>%#/..\B/&:/</code> inserts <code>:</code> after every second character
 which is not at the end of a word.
-(<code>0b19353134</code> becomes <code>0b:19:35:31:34</code>)<br>
-<code>%#/://</code> removes all <code>:</code>.
-(<code>0b:19:35:31:34</code> becomes <code>0b19353134</code>)<br>
+(<code>0b19353134</code> becomes <code>0b:19:35:31:34</code>)
+</div>
+<div class="indent">
+<code>%#/://</code> removes all <code>:</code> characters.
+(<code>0b:19:35:31:34</code> becomes <code>0b19353134</code>)
+</div>
+<div class="indent">
 <code>%#/([^+-]*)([+-])/\2\1/</code> moves a postfix sign to the front.
 (<code>1.23-</code> becomes <code>-1.23</code>)<br>
-
-</p>
+</div>
 <a name="mantexp"></a>
 <h2>15. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
 <p>
@ -679,7 +697,7 @@ In output, the system function <em>strftime()</em> is used to format the time.
 There may be differences in the implementation between operating systems.
 </p>
 <p>
-In input, <em>StreamDevice</em> used its own implementation because many
+In input, <em>StreamDevice</em> uses its own implementation because many
 systems are missing the <em>strptime()</em> function and additional formats
 are supported.
 </p>
--- a/doc/stream.css
+++ b/doc/stream.css
@ -88,6 +88,16 @@ code {
    text-align:left;
 }

+.box {
+    margin-left:1ex;
+    margin-right:1ex;
+    margin-top:0.5ex;
+    padding: 0 1ex;
+    border: 1px solid black;
+    text-align:left;
+    background-color:#f0f0f0;
+}
+
 #navleft {
    position:fixed;
    left:0;
--- a/src/RegexpConverter.cc
+++ b/src/RegexpConverter.cc
@ -23,7 +23,7 @@
 #include "string.h"
 #include "pcre.h"

-// Perl regular expressions (PCRE)   %/regexp/
+// Perl regular expressions (PCRE) %/regexp/ and  %#/regexp/subst/

 /* Notes:
 - Memory for compiled regexp is allocated in parse but never freed.
@ -65,15 +65,22 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
            error("Missing closing '/' after %%/%s format conversion\n", pattern());
            return false;
        }
-        if (*source == esc) {
-            source++;
-            pattern.append('\\');
-            continue;
+        if (*source == esc) {          // handle escaped chars
+            if (*++source != '/')      // just un-escape /
+            {
+                pattern.append('\\');
+                if ((*source & 0x7f) < 0x30) // handle control chars
+                {
+                    pattern.print("x%02x", *source++);
+                    continue;
+                }
+                // fall through for PCRE codes like \B
+            }
        }
        pattern.append(*source++);
    }
    source++;
-    debug("regexp = \"%s\"\n", pattern());
+    debug("regexp = \"%s\"\n", pattern.expand()());
    
    const char* errormsg;
    int eoffset;
@ -89,22 +96,19 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
    if (fmt.flags & alt_flag)
    {
        StreamBuffer subst;
+        debug("check for subst in \"%s\"\n", StreamBuffer(source).expand()());        
        while (*source != '/')
        {
            if (!*source) {
                error("Missing closing '/' after %%#/%s/%s format conversion\n", pattern(), subst());
                return false;
            }
-            if (*source == esc) {
-                source++;
-                subst.append('\\');
-                if (*source <= 9) subst.append('0'+*source++);
-                continue;
-            }
+            if (*source == esc)
+                subst.append(*source++);
            subst.append(*source++);
        }
        source++;
-        debug("subst = \"%s\"\n", subst());
+        debug("subst = \"%s\"\n", subst.expand()());
        info.append(subst).append('\0');
        return pseudo_format;
    }
@ -131,7 +135,7 @@ scanString(const StreamFormat& fmt, const char* input,
    debug("pcre_exec match \"%.*s\" result = %d\n", length, input, rc);
    if ((subexpr && rc <= subexpr) || rc < 0)
    {
-        /* error or no match or not enough sub-expressions */
+        // error or no match or not enough sub-expressions
        return -1;
    }
    if (fmt.flags & skip_flag) return ovector[subexpr*2+1];
@ -148,40 +152,41 @@ scanString(const StreamFormat& fmt, const char* input,
    }
    memcpy(value, input + ovector[subexpr*2], l);
    value[l] = '\0';
-    return ovector[1]; /* consume input until end of match */;
+    return ovector[1]; // consume input until end of match 
 }

-static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length, const char* subst, int max)
+static void regsubst(const StreamFormat& fmt, StreamBuffer& buffer, long start)
 {
-    int rc, l, c, r, rl, n=0;
+    const char* subst = fmt.info;
+    pcre* code = extract<pcre*>(subst);
+    long length;
+    int rc, l, c, r, rl, n;
    int ovector[30];
    StreamBuffer s;
-    if (length == 0)
-    {
-        length = buffer.length() - start;
-    }
-    else if (length < 0)
-    {
-        length = -length;
-        if (length > buffer.length() - start)
-            length = buffer.length() - start;
+
+    length = buffer.length() - start;
+    if (fmt.width && fmt.width < length)
+        length = fmt.width;
+    if (fmt.flags & sign_flag)
        start = buffer.length() - length;
-    }
-    else
-    {
-        if (length > buffer.length() - start)
-            length = buffer.length() - start;
-    }
-    debug("regsubst buffer=\"%s\", start=%ld, length=%ld, subst = \"%s\", max = %d\n",
-        buffer.expand()(), start, length, subst, max);
-    for (c = 0; c < length; )
+
+    debug("regsubst buffer=\"%s\", start=%ld, length=%ld, subst = \"%s\"\n",
+        buffer.expand()(), start, length, subst);
+    
+    for (c = 0, n = 1; c < length; n++)
    {
        rc = pcre_exec(code, NULL, buffer(start+c), length-c, 0, 0, ovector, 30);
        debug("pcre_exec match \"%.*s\" result = %d\n", (int)length-c, buffer(start+c), rc);
+        if (rc < 0) // no match 
+            return;
            
-        if (rc < 0 || (max && n++ == max))
-            return; /* no match or maximum substitutions reached */
-        /* replace & by match in subst */
+        if (!(fmt.flags & sign_flag) && n < fmt.prec) // without + flag
+        {
+            // do not yet replace this match
+            c += ovector[1];
+            continue;
+        }
+        // replace & by match in subst
        l = ovector[1] - ovector[0];
        debug("start = \"%s\"\n", buffer(start+c));
        debug("match = \"%.*s\"\n", l, buffer(start+c+ovector[0]));
@ -192,22 +197,22 @@ static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length,
        debug("subs = \"%s\"\n", s.expand()());
        for (r = 0; r < s.length(); r++)
        {
-            debug("check \"%s\"\n", s(r));
-            if (s[r] == '\\')
+            debug("check \"%s\"\n", s.expand(r)());
+            if (s[r] == esc)
            {
                unsigned char ch = s[r+1];
-                if (ch >= '0' && ch <= '9')
+                if (ch < 9) // escaped 0 - 9 : replace with subexpr
                {
-                    ch = (ch - '0')*2;
+                    ch *= 2;
                    rl = ovector[ch+1] - ovector[ch];
                    debug("replace \\%d: \"%.*s\"\n", ch/2, rl, buffer(start+c+ovector[ch]));
                    s.replace(r, 2, buffer(start+c+ovector[ch]), rl);
                    r += rl - 1;
                }
-                else if (ch == '\\' || ch == '&')
-                    s.remove(r, 1);
+                else
+                    s.remove(r, 1); // just remove escape
            }
-            else if (s[r] == '&')
+            else if (s[r] == '&') // unescaped & : replace with match
            {
                debug("replace &: \"%.*s\"\n", l,  buffer(start+c+ovector[0]));
                s.replace(r, 1, buffer(start+c+ovector[0]), l);
@ -219,6 +224,8 @@ static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length,
        buffer.replace(start+c+ovector[0], l, s);
        length += s.length() - l;
        c += s.length();
+        if (n == fmt.prec) // max match reached
+            return;
    }
 }

@ -226,15 +233,7 @@ int RegexpConverter::
 scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor)
 {
    /* re-write input buffer */
-    const char* info = fmt.info;
-    pcre* code;
-    long length;
-    StreamBuffer subst;
-    
-    code = extract<pcre*>(info);
-    if (fmt.flags & left_flag) length = -fmt.width;
-    else length = fmt.width;
-    regsubst(code, input, cursor, length, info, fmt.prec);
+    regsubst(fmt, input, cursor);
    return 0;
 }

@ -242,15 +241,7 @@ bool RegexpConverter::
 printPseudo(const StreamFormat& fmt, StreamBuffer& output)
 {
    /* re-write output buffer */
-    const char* info = fmt.info;
-    pcre* code;
-    long length;
-    StreamBuffer subst;
-    
-    code = extract<pcre*>(info);
-    if (fmt.flags & left_flag) length = -fmt.width;
-    else length = fmt.width;
-    regsubst(code, output, 0, length, info, fmt.prec);
+    regsubst(fmt, output, 0);
    return true;
 }