regsub converter added
This commit is contained in:
@ -528,7 +528,7 @@ In input, the next byte or bytes must match the checksum.
|
||||
</dl>
|
||||
|
||||
<a name="regex"></a>
|
||||
<h2>12. Regular Expresion STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
|
||||
<h2>13. Regular Expression STRING Converter (<code>%/<em>regex</em>/</code>)</h2>
|
||||
<p>
|
||||
This input-only format matches <a target="ex"
|
||||
href="http://www.pcre.org/" >Perl compatible regular expressions (PCRE)</a>.
|
||||
@ -569,9 +569,60 @@ Example: <code>%.1/<title>(.*)<\/title>/</code> returns
|
||||
the title of an HTML page, skips anything before the
|
||||
<code><title></code> tag and leaves anything after the
|
||||
<code></title></code> tag in the input buffer.
|
||||
</p>
|
||||
<a name="regsub"></a>
|
||||
<h2>14. Regular Expression Substitution Pseudo-Converter (<code>%#/<em>regex</em>/<em>subst</em>/</code>)</h2>
|
||||
<p>
|
||||
This is a variant of the previous converter (note the <code>#</code>)
|
||||
but instead of returning the matching string,
|
||||
it can be used as a pre-processor for input or
|
||||
as a post-processor for output.
|
||||
</p>
|
||||
<p>
|
||||
Any match of the <em>regex</em> is replaced by the string <em>subst</em> with any
|
||||
<code>&</code> or <code>\0</code> in <em>subst</em> replaced with the match itself and any
|
||||
<code>\1</code> through <code>\9</code> with the corresponding sub-expressions.
|
||||
To get a literal <code>&</code> or <code>\</code> in the substitution write
|
||||
<code>\&</code> or <code>\\</code>.
|
||||
</p>
|
||||
<p>
|
||||
If <em>width</em> is specified, it limits the number of characters processed.
|
||||
If the <code>-</code> flag is used (i.e. <em>width</em> looks like a negative number)
|
||||
only the last <em>width</em> characters are processed, else the first.
|
||||
Without <em>width</em> all available characters are processed.
|
||||
</p>
|
||||
<p>
|
||||
If <em>prec</em> is specified, it limits the number of times the substitution is applied.
|
||||
Without <em>prec</em>, the substitution is applied as often as possible.
|
||||
</p>
|
||||
<p>
|
||||
In input this converter pre-processes data received from the device before
|
||||
other converters after this one read it.
|
||||
Converters before this one will see unmodified input.
|
||||
Thus place this converter before those whose input should be pre-processed.
|
||||
</p>
|
||||
<p>
|
||||
In output it post-processes data already formatted by other converters before this one
|
||||
before sending it to the device.
|
||||
Converters after this one will send their output unmodified.
|
||||
Thus place this converter after those whose output should be post-processed.
|
||||
</p>
|
||||
<p>
|
||||
Examples:<br>
|
||||
<code>%#-10.2/ab/X/</code> replaces the string <code>ab</code> with <code>X</code>
|
||||
at most 2 times in the last 10 characters.
|
||||
(<code>abcabcabcabc</code> becomes <code>abcXcXcabc</code>)<br>
|
||||
<code>%#/..\B/&:/</code> writes <code>:</code> after every second character
|
||||
which is not at the end of a word.
|
||||
(<code>0b19353134</code> becomes <code>0b:19:35:31:34</code>)<br>
|
||||
<code>%#/://</code> removes all <code>:</code>.
|
||||
(<code>0b:19:35:31:34</code> becomes <code>0b19353134</code>)<br>
|
||||
<code>%#/([^+-]*)([+-])/\2\1/</code> moves a postfix sign to the front.
|
||||
(<code>1.23-</code> becomes <code>-1.23</code>)<br>
|
||||
|
||||
</p>
|
||||
<a name="mantexp"></a>
|
||||
<h2>13. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
|
||||
<h2>15. MantissaExponent DOUBLE converter (<code>%m</code>)</h2>
|
||||
<p>
|
||||
This exotic and experimental format matches numbers in the format
|
||||
<i>[sign] mantissa sign exponent</i>, e.g. <code>+123-4</code> meaning
|
||||
@ -591,7 +642,7 @@ the usual way (always sign, left justified, space instead of + sign).
|
||||
Flags <code>#</code> and <code>0</code> are unsupported.
|
||||
</p>
|
||||
<a name="timestamp"></a>
|
||||
<h2>14. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
|
||||
<h2>16. Timestamp DOUBLE converter (<code>%T(<em>timeformat</em>)</code>)</h2>
|
||||
<p>
|
||||
This format reads or writes timestamps and converts them to a double number.
|
||||
The value represents the number of seconds since 1970 (the UNIX epoch).
|
||||
|
@ -132,6 +132,7 @@ div div div a {list-style-type:circle;}
|
||||
<a target="_parent" href="formats.html#bcd" title="Binary coded decimal LONG converter">%D</a>
|
||||
<a target="_parent" href="formats.html#chksum" title="Checksum pseudo converter">%<<em>checksum</em>></a>
|
||||
<a target="_parent" href="formats.html#regex" title="Perl regular expression STRING converter">%/<em>regex</em>/</a>
|
||||
<a target="_parent" href="formats.html#regsub" title="Perl regular expression substitution pseudo converter">%#/<em>regex</em>/<em>subst</em>/</a>
|
||||
<a target="_parent" href="formats.html#mantexp" title="MantissaExponent DOUBLE converter">%m</a>
|
||||
<a target="_parent" href="formats.html#timestamp" title="Timestamp DOUBLE converter">%T</a>
|
||||
</div>
|
||||
|
@ -37,15 +37,17 @@
|
||||
|
||||
class RegexpConverter : public StreamFormatConverter
|
||||
{
|
||||
int parse (const StreamFormat&, StreamBuffer&, const char*&, bool);
|
||||
int scanString(const StreamFormat&, const char*, char*, size_t);
|
||||
int parse (const StreamFormat& fmt, StreamBuffer&, const char*&, bool);
|
||||
int scanString(const StreamFormat& fmt, const char*, char*, size_t);
|
||||
int scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor);
|
||||
bool printPseudo(const StreamFormat& fmt, StreamBuffer& output);
|
||||
};
|
||||
|
||||
int RegexpConverter::
|
||||
parse(const StreamFormat& fmt, StreamBuffer& info,
|
||||
const char*& source, bool scanFormat)
|
||||
{
|
||||
if (!scanFormat)
|
||||
if (!scanFormat && !(fmt.flags & alt_flag))
|
||||
{
|
||||
error("Format conversion %%/regexp/ is only allowed in input formats\n");
|
||||
return false;
|
||||
@ -55,28 +57,24 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
|
||||
error("Subexpression index %d too big (>9)\n", fmt.prec);
|
||||
return false;
|
||||
}
|
||||
if (fmt.flags & (left_flag|space_flag|zero_flag|alt_flag))
|
||||
{
|
||||
error("Use of modifiers '-', ' ', '0', '#'"
|
||||
"not allowed with %%/regexp/ conversion\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
StreamBuffer pattern;
|
||||
while (*source != '/')
|
||||
{
|
||||
if (!*source) {
|
||||
error("Missing closing '/' after %%/ format conversion\n");
|
||||
error("Missing closing '/' after %%/%s format conversion\n", pattern());
|
||||
return false;
|
||||
}
|
||||
if (*source == esc) {
|
||||
source++;
|
||||
pattern.print("\\x%02x", *source++ & 0xFF);
|
||||
pattern.append('\\');
|
||||
continue;
|
||||
}
|
||||
pattern.append(*source++);
|
||||
}
|
||||
source++;
|
||||
debug("regexp = \"%s\"\n", pattern());
|
||||
|
||||
const char* errormsg;
|
||||
int eoffset;
|
||||
pcre* code = pcre_compile(pattern(), 0,
|
||||
@ -87,6 +85,29 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
|
||||
return false;
|
||||
}
|
||||
info.append(&code, sizeof(code));
|
||||
|
||||
if (fmt.flags & alt_flag)
|
||||
{
|
||||
StreamBuffer subst;
|
||||
while (*source != '/')
|
||||
{
|
||||
if (!*source) {
|
||||
error("Missing closing '/' after %%#/%s/%s format conversion\n", pattern(), subst());
|
||||
return false;
|
||||
}
|
||||
if (*source == esc) {
|
||||
source++;
|
||||
subst.append('\\');
|
||||
if (*source <= 9) subst.append('0'+*source++);
|
||||
continue;
|
||||
}
|
||||
subst.append(*source++);
|
||||
}
|
||||
source++;
|
||||
debug("subst = \"%s\"\n", subst());
|
||||
info.append(subst).append('\0');
|
||||
return pseudo_format;
|
||||
}
|
||||
return string_format;
|
||||
}
|
||||
|
||||
@ -94,32 +115,143 @@ int RegexpConverter::
|
||||
scanString(const StreamFormat& fmt, const char* input,
|
||||
char* value, size_t maxlen)
|
||||
{
|
||||
pcre* code;
|
||||
size_t len;
|
||||
int ovector[30];
|
||||
int rc;
|
||||
int subexpr = 0;
|
||||
unsigned int l;
|
||||
|
||||
memcpy (&code, fmt.info, sizeof(code));
|
||||
const char* info = fmt.info;
|
||||
pcre* code = extract<pcre*>(info);
|
||||
int length = fmt.width > 0 ? fmt.width : strlen(input);
|
||||
int subexpr = fmt.prec > 0 ? fmt.prec : 0;
|
||||
|
||||
len = fmt.width > 0 ? fmt.width : strlen(input);
|
||||
subexpr = fmt.prec > 0 ? fmt.prec : 0;
|
||||
rc = pcre_exec(code, NULL, input, len, 0, 0, ovector, 30);
|
||||
if (rc < 1) return -1;
|
||||
if (fmt.flags & skip_flag) return ovector[1];
|
||||
len = ovector[subexpr*2+1] - ovector[subexpr*2];
|
||||
if (len >= maxlen) {
|
||||
debug("input = \"%s\"\n", input);
|
||||
debug("length=%d\n", length);
|
||||
|
||||
rc = pcre_exec(code, NULL, input, length, 0, 0, ovector, 30);
|
||||
debug("pcre_exec match \"%.*s\" result = %d\n", length, input, rc);
|
||||
if ((subexpr && rc <= subexpr) || rc < 0)
|
||||
{
|
||||
/* error or no match or not enough sub-expressions */
|
||||
return -1;
|
||||
}
|
||||
if (fmt.flags & skip_flag) return ovector[subexpr*2+1];
|
||||
|
||||
l = ovector[subexpr*2+1] - ovector[subexpr*2];
|
||||
if (l >= maxlen) {
|
||||
if (!(fmt.flags & sign_flag)) {
|
||||
error("Regexp: Matching string \"%s\" too long (%d>%d bytes). You may want to try the + flag: \"%%+/.../\"\n",
|
||||
StreamBuffer(input+ovector[subexpr*2], len).expand()(),
|
||||
(int)len, (int)maxlen-1);
|
||||
error("Regexp: Matching string \"%s\" too long (%d>%ld bytes). You may want to try the + flag: \"%%+/.../\"\n",
|
||||
StreamBuffer(input + ovector[subexpr*2],l).expand()(),
|
||||
l, (long)maxlen-1);
|
||||
return -1;
|
||||
}
|
||||
len = maxlen-1;
|
||||
l = maxlen-1;
|
||||
}
|
||||
memcpy(value, input+ovector[subexpr*2], len);
|
||||
value[len]=0;
|
||||
return ovector[1];
|
||||
memcpy(value, input + ovector[subexpr*2], l);
|
||||
value[l] = '\0';
|
||||
return ovector[1]; /* consume input until end of match */;
|
||||
}
|
||||
|
||||
/*
 * Apply a regular expression substitution in place to a window of `buffer`.
 *
 * code    compiled PCRE pattern to search for
 * buffer  buffer that is modified in place
 * start   offset in `buffer` where the window begins
 * length  window size: 0 means everything from `start` to the end of the
 *         buffer, a positive value means the first `length` bytes after
 *         `start`, a negative value means the last `-length` bytes of the
 *         buffer; the size is clipped to what is available after `start`
 * subst   replacement text; `&` and `\0` expand to the whole match,
 *         `\1`..`\9` to the corresponding sub-expression, `\&` and `\\`
 *         to a literal `&` and `\`
 * max     maximum number of substitutions; 0 or a negative value means
 *         no limit
 */
static void regsubst(pcre* code, StreamBuffer& buffer, long start, long length, const char* subst, int max)
{
    int rc, l, c, r, rl, n = 0;
    int groups;
    int ovector[30];
    StreamBuffer s;

    /* work out the window [start, start+length) to operate on */
    if (length == 0)
    {
        length = buffer.length() - start;
    }
    else if (length < 0)
    {
        /* negative length: use the last -length bytes */
        length = -length;
        if (length > buffer.length() - start)
            length = buffer.length() - start;
        start = buffer.length() - length;
    }
    else
    {
        if (length > buffer.length() - start)
            length = buffer.length() - start;
    }
    debug("regsubst buffer=\"%s\", start=%ld, length=%ld, subst = \"%s\", max = %d\n",
        buffer.expand()(), start, length, subst, max);

    /* c is the offset inside the window up to which the text is final */
    for (c = 0; c < length; )
    {
        rc = pcre_exec(code, NULL, buffer(start+c), length-c, 0, 0, ovector, 30);
        debug("pcre_exec match \"%.*s\" result = %d\n", (int)length-c, buffer(start+c), rc);

        if (rc < 0 || (max && n++ == max))
            return; /* no match or maximum substitutions reached */

        /* pcre_exec returns 0 when ovector is too small for all groups;
           the first 10 pairs (all that \0..\9 can address) are still filled */
        groups = rc > 0 ? rc : 10;

        l = ovector[1] - ovector[0];
        debug("start = \"%s\"\n", buffer(start+c));
        debug("match = \"%.*s\"\n", l, buffer(start+c+ovector[0]));
        for (r = 1; r < rc; r++)
            debug("sub%d = \"%.*s\"\n", r, ovector[r*2+1]-ovector[r*2], buffer(start+c+ovector[r*2]));
        debug("rest = \"%s\"\n", buffer(start+c+ovector[1]));

        /* expand & and \N references in a fresh copy of subst */
        s = subst;
        debug("subs = \"%s\"\n", s.expand()());
        for (r = 0; r < s.length(); r++)
        {
            debug("check \"%s\"\n", s(r));
            if (s[r] == '\\')
            {
                unsigned char ch = s[r+1];
                if (ch >= '0' && ch <= '9')
                {
                    int idx = (ch - '0')*2;
                    if (idx/2 < groups && ovector[idx] >= 0)
                    {
                        rl = ovector[idx+1] - ovector[idx];
                        debug("replace \\%d: \"%.*s\"\n", idx/2, rl, buffer(start+c+ovector[idx]));
                        s.replace(r, 2, buffer(start+c+ovector[idx]), rl);
                    }
                    else
                    {
                        /* group does not exist or did not take part in the
                           match: its offsets are not usable, expand to nothing */
                        rl = 0;
                        s.remove(r, 2);
                    }
                    /* together with the loop's r++ this skips the inserted text */
                    r += rl - 1;
                }
                else if (ch == '\\' || ch == '&')
                {
                    /* drop the escape; the literal character is now at r and
                       the loop's r++ steps past it without expanding it */
                    s.remove(r, 1);
                }
            }
            else if (s[r] == '&')
            {
                debug("replace &: \"%.*s\"\n", l, buffer(start+c+ovector[0]));
                s.replace(r, 1, buffer(start+c+ovector[0]), l);
                r += l - 1;
            }
            else continue;
            debug("subs = \"%s\"\n", s());
        }

        buffer.replace(start+c+ovector[0], l, s);
        length += s.length() - l;

        /* continue behind the replacement: skip the unmatched text in front
           of the match as well as the inserted text, so that the replacement
           itself is never searched again */
        c += ovector[0] + s.length();

        /* an empty match does not consume input; step over one character
           so the same position is not matched forever */
        if (l == 0)
            c++;
    }
}
|
||||
|
||||
int RegexpConverter::
|
||||
scanPseudo(const StreamFormat& fmt, StreamBuffer& input, long& cursor)
|
||||
{
|
||||
/* re-write input buffer */
|
||||
const char* info = fmt.info;
|
||||
pcre* code;
|
||||
long length;
|
||||
StreamBuffer subst;
|
||||
|
||||
code = extract<pcre*>(info);
|
||||
if (fmt.flags & left_flag) length = -fmt.width;
|
||||
else length = fmt.width;
|
||||
regsubst(code, input, cursor, length, info, fmt.prec);
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool RegexpConverter::
|
||||
printPseudo(const StreamFormat& fmt, StreamBuffer& output)
|
||||
{
|
||||
/* re-write output buffer */
|
||||
const char* info = fmt.info;
|
||||
pcre* code;
|
||||
long length;
|
||||
StreamBuffer subst;
|
||||
|
||||
code = extract<pcre*>(info);
|
||||
if (fmt.flags & left_flag) length = -fmt.width;
|
||||
else length = fmt.width;
|
||||
regsubst(code, output, 0, length, info, fmt.prec);
|
||||
return true;
|
||||
}
|
||||
|
||||
RegisterConverter (RegexpConverter, "/");
|
||||
|
Reference in New Issue
Block a user