regsub converter added

This commit is contained in:
2016-06-14 11:36:36 +02:00
parent 126da8c499
commit 2ca8a129f7
3 changed files with 216 additions and 32 deletions

View File

@ -528,7 +528,7 @@ In input, the next byte or bytes must match the checksum.
</dl>
<a name="regex"></a>
<h2>12. Regular Expression STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
<h2>13. Regular Expression STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
<p>
This input-only format matches <a target="ex"
href="http://www.pcre.org/" >Perl compatible regular expressions (PCRE)</a>.
@ -569,9 +569,60 @@ Example: <code>%.1/&lt;title&gt;(.*)&lt;\/title&gt;/</code> returns
the title of an HTML page, skips anything before the
<code>&lt;title&gt;</code> tag and leaves anything after the
<code>&lt;/title&gt;</code> tag in the input buffer.
</p>
<a name="regsub"></a>
<h2>14. Regular Expression Substitution Pseudo-Converter (<code>%#/<em>regex</em>/<em>subst</em>/</code>)</h2>
<p>
This is a variant of the previous converter (note the <code>#</code>)
but instead of returning the matching string,
it can be used as a pre-processor for input or
as a post-processor for output.
</p>
<p>
Any match of the <em>regex</em> is replaced by the string <em>subst</em> with any
<code>&</code> or <code>\0</code> in <em>subst</em> replaced with the match itself and any
<code>\1</code> through <code>\9</code> with the corresponding sub-expressions.
To get a literal <code>&</code> or <code>\</code> in the substitution write
<code>\&</code> or <code>\\</code>.
</p>
<p>
If <em>width</em> is specified, it limits the number of characters processed.
If the <code>-</code> flag is used (i.e. <em>width</em> looks like a negative number)
only the last <em>width</em> characters are processed, else the first.
Without <em>width</em> all available characters are processed.
</p>
<p>
If <em>prec</em> is specified, it limits the number of times the substitution is applied.
Without <em>prec</em>, the substitution is applied as often as possible.
</p>
<p>
In input this converter pre-processes data received from the device before
other converters after this one read it.
Converters before this one will see unmodified input.
Thus place this converter before those whose input should be pre-processed.
</p>
<p>
In output it post-processes the data already formatted by the converters preceding this one,
before that data is sent to the device.
Converters after this one will send their output unmodified.
Thus place this converter after those whose output should be post-processed.
</p>
<p>
Examples:<br>
<code>%#-10.2/ab/X/</code> replaces the string <code>ab</code> with <code>X</code>
at most 2 times in the last 10 characters.
(<code>abcabcabcabc</code> becomes <code>abcXcXcabc</code>)<br>
<code>%#/..\B/&:/</code> writes <code>:</code> after every second character
which is not at the end of a word.
(<code>0b19353134</code> becomes <code>0b:19:35:31:34</code>)<br>
<code>%#/://</code> removes all <code>:</code>.
(<code>0b:19:35:31:34</code> becomes <code>0b19353134</code>)<br>
<code>%#/([^+-]*)([+-])/\2\1/</code> moves a postfix sign to the front.
(<code>1.23-</code> becomes <code>-1.23</code>)<br>
</p>
<a name="mantexp"></a>
<h2>13. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
<h2>15. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
<p>
This exotic and experimental format matches numbers in the format
<i>[sign] mantissa sign exponent</i>, e.g <code>+123-4</code> meaning
@ -591,7 +642,7 @@ the usual way (always sign, left justified, space instead of + sign).
Flags <code>#</code> and <code>0</code> are unsupported.
</p>
<a name="timestamp"></a>
<h2>14. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
<h2>16. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
<p>
This format reads or writes timestamps and converts them to a double number.
The value represents the number of seconds since 1970 (the UNIX epoch).

View File

@ -132,6 +132,7 @@ div div div a {list-style-type:circle;}
<a target="_parent" href="formats.html#bcd" title="Binary coded decimal LONG converter">%D</a>
<a target="_parent" href="formats.html#chksum" title="Checksum pseudo converter">%&lt;<em>checksum</em>&gt;</a>
<a target="_parent" href="formats.html#regex" title="Perl regular expression STRING converter">%/<em>regex</em>/</a>
<a target="_parent" href="formats.html#regsub" title="Perl regular expression substitution pseudo converter">%#/<em>regex</em>/<em>subst</em>/</a>
<a target="_parent" href="formats.html#mantexp" title="MantissaExponent DOUBLE converter">%m</a>
<a target="_parent" href="formats.html#timestamp" title="Timestamp DOUBLE converter">%T</a>
</div>

View File

@ -37,15 +37,17 @@
class RegexpConverter : public StreamFormatConverter
{
int parse (const StreamFormat&, StreamBuffer&, const char*&, bool);
int scanString(const StreamFormat&, const char*, char*, size_t);
int parse (const StreamFormat& fmt, StreamBuffer&, const char*&, bool);
int scanString(const StreamFormat& fmt, const char*, char*, size_t);
int scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor);
bool printPseudo(const StreamFormat& fmt, StreamBuffer& output);
};
int RegexpConverter::
parse(const StreamFormat& fmt, StreamBuffer& info,
const char*& source, bool scanFormat)
{
if (!scanFormat)
if (!scanFormat && !(fmt.flags & alt_flag))
{
error("Format conversion %%/regexp/ is only allowed in input formats\n");
return false;
@ -55,28 +57,24 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
error("Subexpression index %d too big (>9)\n", fmt.prec);
return false;
}
if (fmt.flags & (left_flag|space_flag|zero_flag|alt_flag))
{
error("Use of modifiers '-', ' ', '0', '#'"
"not allowed with %%/regexp/ conversion\n");
return false;
}
StreamBuffer pattern;
while (*source != '/')
{
if (!*source) {
error("Missing closing '/' after %%/ format conversion\n");
error("Missing closing '/' after %%/%s format conversion\n", pattern());
return false;
}
if (*source == esc) {
source++;
pattern.print("\\x%02x", *source++ & 0xFF);
pattern.append('\\');
continue;
}
pattern.append(*source++);
}
source++;
debug("regexp = \"%s\"\n", pattern());
const char* errormsg;
int eoffset;
pcre* code = pcre_compile(pattern(), 0,
@ -87,6 +85,29 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
return false;
}
info.append(&code, sizeof(code));
if (fmt.flags & alt_flag)
{
StreamBuffer subst;
while (*source != '/')
{
if (!*source) {
error("Missing closing '/' after %%#/%s/%s format conversion\n", pattern(), subst());
return false;
}
if (*source == esc) {
source++;
subst.append('\\');
if (*source <= 9) subst.append('0'+*source++);
continue;
}
subst.append(*source++);
}
source++;
debug("subst = \"%s\"\n", subst());
info.append(subst).append('\0');
return pseudo_format;
}
return string_format;
}
@ -94,32 +115,143 @@ int RegexpConverter::
scanString(const StreamFormat& fmt, const char* input,
char* value, size_t maxlen)
{
pcre* code;
size_t len;
int ovector[30];
int rc;
int subexpr = 0;
unsigned int l;
memcpy (&code, fmt.info, sizeof(code));
const char* info = fmt.info;
pcre* code = extract<pcre*>(info);
int length = fmt.width > 0 ? fmt.width : strlen(input);
int subexpr = fmt.prec > 0 ? fmt.prec : 0;
len = fmt.width > 0 ? fmt.width : strlen(input);
subexpr = fmt.prec > 0 ? fmt.prec : 0;
rc = pcre_exec(code, NULL, input, len, 0, 0, ovector, 30);
if (rc < 1) return -1;
if (fmt.flags & skip_flag) return ovector[1];
len = ovector[subexpr*2+1] - ovector[subexpr*2];
if (len >= maxlen) {
debug("input = \"%s\"\n", input);
debug("length=%d\n", length);
rc = pcre_exec(code, NULL, input, length, 0, 0, ovector, 30);
debug("pcre_exec match \"%.*s\" result = %d\n", length, input, rc);
if ((subexpr && rc <= subexpr) || rc < 0)
{
/* error or no match or not enough sub-expressions */
return -1;
}
if (fmt.flags & skip_flag) return ovector[subexpr*2+1];
l = ovector[subexpr*2+1] - ovector[subexpr*2];
if (l >= maxlen) {
if (!(fmt.flags & sign_flag)) {
error("Regexp: Matching string \"%s\" too long (%d>%d bytes). You may want to try the + flag: \"%%+/.../\"\n",
StreamBuffer(input+ovector[subexpr*2], len).expand()(),
(int)len, (int)maxlen-1);
error("Regexp: Matching string \"%s\" too long (%d>%ld bytes). You may want to try the + flag: \"%%+/.../\"\n",
StreamBuffer(input + ovector[subexpr*2],l).expand()(),
l, (long)maxlen-1);
return -1;
}
len = maxlen-1;
l = maxlen-1;
}
memcpy(value, input+ovector[subexpr*2], len);
value[len]=0;
return ovector[1];
memcpy(value, input + ovector[subexpr*2], l);
value[l] = '\0';
return ovector[1]; /* consume input until end of match */;
}
/*
 * Apply a regular expression substitution in place to a window of a buffer.
 *
 * code    compiled PCRE pattern
 * buffer  buffer that is modified
 * start   index of the first byte of buffer that may be processed
 * length   0: process everything from start to the end of buffer
 *         >0: process at most the first length bytes after start
 *         <0: process at most the last -length bytes of buffer
 *             (but never anything before start)
 * subst   null terminated substitution string: & and \0 stand for the
 *         whole match, \1 ... \9 for sub-expressions, \& and \\ for a
 *         literal & or \
 * max     maximal number of substitutions, values <= 0 mean "no limit"
 *
 * Each search starts directly after the previous replacement, so text
 * inserted by a substitution is never matched again.
 */
static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length, const char* subst, int max)
{
    int rc, l, c, r, rl, n=0;
    int ovector[30];
    StreamBuffer s;

    /* clip the processing window [start, start+length) to the buffer */
    if (length == 0)
    {
        length = buffer.length() - start;
    }
    else if (length < 0)
    {
        length = -length;
        if (length > buffer.length() - start)
            length = buffer.length() - start;
        start = buffer.length() - length;
    }
    else
    {
        if (length > buffer.length() - start)
            length = buffer.length() - start;
    }

    debug("regsubst buffer=\"%s\", start=%ld, length=%ld, subst = \"%s\", max = %d\n",
        buffer.expand()(), start, length, subst, max);

    /* c is the offset inside the window where the next search begins */
    for (c = 0; c < length; )
    {
        rc = pcre_exec(code, NULL, buffer(start+c), length-c, 0, 0, ovector, 30);
        debug("pcre_exec match \"%.*s\" result = %d\n", (int)length-c, buffer(start+c), rc);
        if (rc < 0 || (max > 0 && n++ == max))
            return; /* no match or maximum substitutions reached */

        /* rc == 0: match found but ovector too small for all groups;
           pcre_exec then fills the first two thirds, i.e. 10 pairs */
        if (rc == 0) rc = 10;

        /* replace & by match in subst */
        l = ovector[1] - ovector[0];
        debug("start = \"%s\"\n", buffer(start+c));
        debug("match = \"%.*s\"\n", l, buffer(start+c+ovector[0]));
        for (r = 1; r < rc; r++)
            debug("sub%d = \"%.*s\"\n", r, ovector[r*2+1]-ovector[r*2], buffer(start+c+ovector[r*2]));
        debug("rest = \"%s\"\n", buffer(start+c+ovector[1]));
        s = subst;
        debug("subs = \"%s\"\n", s.expand()());
        for (r = 0; r < s.length(); r++)
        {
            debug("check \"%s\"\n", s(r));
            if (s[r] == '\\')
            {
                unsigned char ch = s[r+1];
                if (ch >= '0' && ch <= '9')
                {
                    int group = ch - '0';
                    if (group < rc && ovector[group*2] >= 0)
                    {
                        rl = ovector[group*2+1] - ovector[group*2];
                        debug("replace \\%d: \"%.*s\"\n", group, rl, buffer(start+c+ovector[group*2]));
                        s.replace(r, 2, buffer(start+c+ovector[group*2]), rl);
                    }
                    else
                    {
                        /* group does not exist or did not take part in the
                           match: its ovector entries are unset, so insert
                           nothing instead of reading them */
                        rl = 0;
                        s.remove(r, 2);
                    }
                    /* continue behind the inserted text */
                    r += rl - 1;
                }
                else if (ch == '\\' || ch == '&')
                {
                    /* drop the backslash only; r now indexes the literal
                       character and the loop increment steps over it */
                    s.remove(r, 1);
                }
            }
            else if (s[r] == '&')
            {
                debug("replace &: \"%.*s\"\n", l, buffer(start+c+ovector[0]));
                s.replace(r, 1, buffer(start+c+ovector[0]), l);
                r += l - 1;
            }
            else continue;
            debug("subs = \"%s\"\n", s());
        }
        buffer.replace(start+c+ovector[0], l, s);
        length += s.length() - l;
        /* skip the unmatched text before the match and the replacement */
        c += ovector[0] + s.length();
        /* an empty match consumes nothing: step over one character so that
           the same position is not matched again forever */
        if (l == 0) c++;
    }
}
/*
 * Input side of the %#/regex/subst/ pseudo converter: re-writes the input
 * buffer in place, starting at cursor, by calling regsubst().
 *
 * fmt.info   compiled pattern followed by the substitution string
 * fmt.width  number of bytes to process; with the '-' flag (left_flag)
 *            it is passed negated so that regsubst() processes the end
 *            of the buffer instead of the beginning
 * fmt.prec   passed to regsubst() as the maximal number of substitutions
 *
 * Always returns 0.
 */
int RegexpConverter::
scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor)
{
    /* re-write input buffer */
    const char* info = fmt.info;
    /* NOTE(review): extract() is defined elsewhere; it is expected to advance
       info past the stored pointer so that info then points to the
       substitution string appended by parse() - confirm */
    pcre* code = extract<pcre*>(info);
    long length;

    if (fmt.flags & left_flag) length = -fmt.width;
    else length = fmt.width;
    regsubst(code, input, cursor, length, info, fmt.prec);
    return 0;
}
/*
 * Output side of the %#/regex/subst/ pseudo converter: re-writes the output
 * buffer in place, starting at its beginning, by calling regsubst().
 *
 * fmt.info   compiled pattern followed by the substitution string
 * fmt.width  number of bytes to process; with the '-' flag (left_flag)
 *            it is passed negated so that regsubst() processes the end
 *            of the buffer instead of the beginning
 * fmt.prec   passed to regsubst() as the maximal number of substitutions
 *
 * Always returns true.
 */
bool RegexpConverter::
printPseudo(const StreamFormat& fmt, StreamBuffer& output)
{
    /* re-write output buffer */
    const char* info = fmt.info;
    /* NOTE(review): extract() is defined elsewhere; it is expected to advance
       info past the stored pointer so that info then points to the
       substitution string appended by parse() - confirm */
    pcre* code = extract<pcre*>(info);
    long length;

    if (fmt.flags & left_flag) length = -fmt.width;
    else length = fmt.width;
    regsubst(code, output, 0, length, info, fmt.prec);
    return true;
}
RegisterConverter (RegexpConverter, "/");