diff --git a/doc/formats.html b/doc/formats.html
index 5fdf49a..425603e 100644
--- a/doc/formats.html
+++ b/doc/formats.html
@@ -564,8 +564,8 @@ in architecture specific RELEASE.Common.arch files.
If the regular expression is not anchored, i.e. does not start with
^
, leading non-matching input is skipped.
A maximum of width bytes is matched, if specified.
-If precision is given, it specifies the sub-expression whose match
-is retuned.
+If precision is given, it specifies the sub-expression in ()
+whose match is returned.
Otherwise the complete match is returned.
In any case, the complete match is consumed from the input buffer.
If the expression contains a /
it must be escaped like \/
.
@@ -586,12 +586,19 @@ as a post-processor for output.
Matches of the regex are replaced by the string subst with all
-&
or \0
in subst replaced with the match itself and all
-\1
through \9
replaced with the match of the corresponding sub-expression.
+&
in subst replaced with the match itself and all
+\1
through \9
replaced with the match of the corresponding
+sub-expression if such a sub-expression exists.
+
+Due to limitations of the parser, \1
and \x01
are the same
+which makes it difficult to use literal bytes with values lower than 10 in subst.
+Therefore \0
always means a literal byte (incompatible change from earlier versions!)
+and \1
through \9
mean literal bytes if they are larger than
+the number of sub-expressions.
+
+
To get a literal &
or \
or /
in the substitution write
\&
or \\
or \/
.
-There is no way to specify literal bytes with values less or equal to 9 in the
-substitution!
If width is specified, it limits the number of characters processed.
diff --git a/src/RegexpConverter.cc b/src/RegexpConverter.cc
index 542a978..a49df4f 100644
--- a/src/RegexpConverter.cc
+++ b/src/RegexpConverter.cc
@@ -32,7 +32,6 @@
run-time leak.
- A maximum of 9 subexpressions is supported. Only one of them can
be the result of the match.
- - vxWorks and maybe other OS don't have a PCRE library. Provide one?
*/
class RegexpConverter : public StreamFormatConverter
@@ -54,9 +53,9 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
}
if (fmt.prec > 9)
{
- error("Subexpression index %d too big (>9)\n", fmt.prec);
+ error("Sub-expression index %d too big (>9)\n", fmt.prec);
return false;
- }
+ }
StreamBuffer pattern;
while (*source != '/')
@@ -81,22 +80,30 @@ parse(const StreamFormat& fmt, StreamBuffer& info,
}
source++;
debug("regexp = \"%s\"\n", pattern.expand()());
-
+
const char* errormsg;
int eoffset;
- pcre* code = pcre_compile(pattern(), 0,
- &errormsg, &eoffset, NULL);
+ int nsubexpr;
+
+ pcre* code = pcre_compile(pattern(), 0, &errormsg, &eoffset, NULL);
if (!code)
{
error("%s after \"%s\"\n", errormsg, pattern.expand(0, eoffset)());
return false;
}
+ pcre_fullinfo(code, NULL, PCRE_INFO_CAPTURECOUNT, &nsubexpr);
+ if (fmt.prec > nsubexpr)
+ {
+ error("Sub-expression index is %d but pattern has only %d sub-expression\n", fmt.prec, nsubexpr);
+ return false;
+ }
info.append(&code, sizeof(code));
if (fmt.flags & alt_flag)
{
StreamBuffer subst;
- debug("check for subst in \"%s\"\n", StreamBuffer(source).expand()());
+
+ debug("check for subst in \"%s\"\n", StreamBuffer(source).expand()());
while (*source != '/')
{
if (!*source) {
@@ -122,15 +129,15 @@ scanString(const StreamFormat& fmt, const char* input,
int ovector[30];
int rc;
unsigned int l;
-
+
const char* info = fmt.info;
pcre* code = extract