Files
pvAccess/pvAccessApp/utils/wildcharMatcher.cpp
2011-06-07 08:31:41 -04:00

274 lines
8.9 KiB
C++

/*
* wildcharMatcher.cpp
*
* Created on: Nov 4, 2010
* Author: Miha Vitorovic
*/
#include <pv/wildcharMatcher.h>
#include <iostream>
using std::cout;
using namespace epics::pvData;
namespace epics {
namespace pvAccess {
/** When true, the matcher prints diagnostic output to std::cout. */
const bool WCM_DEBUG = false;

/** Set parser state: initial state. */
const int WCM_INITIAL = 0;

/** Set parser state: final state. */
const int WCM_FINAL = 2;

/** Set parser state: error state. */
const int WCM_ERROR = 99;

/** Token: any character (except control, unless escaped). */
const int WCM_TOKEN_CHAR = 0;

/** Token: end of set, ']'. */
const int WCM_TOKEN_END = 1;

/** Token: negation, '!'. */
const int WCM_TOKEN_NOT = 2;

/** Token: range specification, '-'. */
const int WCM_TOKEN_MINUS = 3;

/**
 * Next-state table used while parsing a set. The row is the current
 * state and the column is the token just read, so
 * TRANSITIONS[1][3] == 5 reads as: in state 1, on token 3
 * (WCM_TOKEN_MINUS), go to state 5.
 */
const int TRANSITIONS[][4] = {
    /*            CHAR       END        NOT        MINUS    */
    /* 0 */ { 1,         WCM_FINAL, 3,         4         },
    /* 1 */ { 1,         WCM_FINAL, WCM_ERROR, 5         },
    /* 2 */ { WCM_ERROR, WCM_ERROR, WCM_ERROR, WCM_ERROR },
    /* 3 */ { 1,         WCM_FINAL, WCM_ERROR, 4         },
    /* 4 */ { 6,         WCM_ERROR, WCM_ERROR, WCM_ERROR },
    /* 5 */ { 6,         WCM_FINAL, WCM_ERROR, WCM_ERROR },
    /* 6 */ { 1,         WCM_FINAL, WCM_ERROR, WCM_ERROR }
};
/**
 * Classifies one pattern character as a set-parser token.
 *
 * @param ch character to classify
 * @return WCM_TOKEN_END for ']', WCM_TOKEN_NOT for '!',
 *         WCM_TOKEN_MINUS for '-', and WCM_TOKEN_CHAR for anything else
 */
int getToken(const char ch) {
    if(ch==']') return WCM_TOKEN_END;
    if(ch=='!') return WCM_TOKEN_NOT;
    if(ch=='-') return WCM_TOKEN_MINUS;
    return WCM_TOKEN_CHAR;
}
/**
 * Tests whether a character is accepted by a set such as [abc], [a-z]
 * or [!a-z].
 *
 * Parsing is driven by the TRANSITIONS table: each character read from
 * the pattern is turned into a token, the current state decides how the
 * token is compared against ch, and the table yields the next state.
 * A backslash makes the following character a plain WCM_TOKEN_CHAR.
 *
 * @param pattern pattern containing the set
 * @param offset  index of the first character of the set body, i.e. the
 *                character after the opening '['
 * @param ch      character to test
 * @return true if ch is accepted by the set (taking a leading '!' into
 *         account); false if it is not accepted or the set is invalid,
 *         including a set that is not closed before the pattern ends
 */
bool testSet(const String pattern, int offset, const char ch) {
    const int n = pattern.length();
    int state = WCM_INITIAL;
    int nextToken = WCM_TOKEN_CHAR;
    char nextChar = ' ';
    char ch1 = ' '; // last plain character seen, lower bound of a range
    bool found = false;
    bool negate = false;

    while(!found) {
        // Every path that reads the closing ] returns from inside the
        // loop, so getting here past the end of the pattern means the
        // set was never closed. Treat it like any other invalid pattern
        // instead of re-processing a stale token indefinitely.
        if(offset>=n) return false;

        nextChar = pattern.at(offset);
        if(nextChar=='\\') {
            // An escape sequence is two characters; a backslash with
            // nothing after it is an invalid pattern.
            if(offset+1>=n) return false;
            nextChar = pattern.at(offset+1);
            nextToken = WCM_TOKEN_CHAR;
            offset++;
        }
        else {
            nextToken = getToken(nextChar);
        }

        switch(state) {
        case WCM_INITIAL:
            if(nextToken==WCM_TOKEN_NOT) {
                negate = true;
                break;
            }
            // No break, states 0, 1, 3, 6 have same next condition.
        case 1:
            if(nextToken==WCM_TOKEN_END) return found^negate;
            // No break, continue with the plain character comparison.
        case 3:
        case 6:
            if(nextToken==WCM_TOKEN_CHAR) {
                found = (ch==nextChar);
                ch1 = nextChar;
            }
            break;
        case 4:
            // condition [-a...
            found = (ch<=nextChar);
            break;
        case 5:
            // condition ...a-z...
            if(nextToken==WCM_TOKEN_CHAR)
                found = ((ch>=ch1)&&(ch<=nextChar));
            // condition ...a-]
            if(nextToken==WCM_TOKEN_END) found = (ch>=ch1);
            break;
        }

        if(WCM_DEBUG) {
            cout<<"( "<<state<<" -> "<<TRANSITIONS[state][nextToken]
                    <<" ) token = "<<nextToken<<" char = "<<nextChar
                    <<", found = "<<found<<", negate = "<<negate;
        }

        // Lookup next state in transition table and check for valid pattern
        state = TRANSITIONS[state][nextToken];
        // An invalid pattern is reported as "no match" rather than as
        // an exception.
        if(state==WCM_ERROR) return false;
        if(state==WCM_FINAL) return found^negate;
        offset++;
    }
    return found^negate;
}
bool parse(const String pattern, const int ofp, const String str,
const int ofs) {
int lp = pattern.length();
int ls = str.length();
int ip = ofp; // index into pattern string
int is = ofs; // index into test string
char chp, chs;
if(WCM_DEBUG) {
if((ip>-1)&&(is>-1)&&(ip<lp)&&(is<ls)) {
cout<<"parse: "<<pattern.substr(ip)<<" "<<str.substr(is);
}
}
// Match happens only, if we parse both strings exactly to the end
while(ip<lp) {
chp = pattern.at(ip);
if(WCM_DEBUG) {
if((ip>-1)&&(is>-1)&&(ip<lp)&&(is<ls)) {
cout<<pattern.substr(ip)<<" "<<str.substr(is);
}
}
switch(chp) {
case '[':
{
// Each set must be close with a ], otherwise it is invalid.
int end = pattern.find(']', ip);
if(end==-1) return false;
// Is this set followed by a *
bool isWildchar = ((end+1)<lp)&&(pattern.at(end+1)=='*');
if(is<ls)
chs = str.at(is);
else
return parse(pattern, end+2, str, is);
// Does this character match
bool thisChar = testSet(pattern, ip+1, chs);
// Check for single character match only if there is no
// * at the end.
if(!thisChar&&!isWildchar) return false; // Return only if this character does not match
if(isWildchar) {
// If this character does not match, maybe this set
// can be skipped entirely
if(!thisChar) {
ip = end+2;
break;
}
// Special case when this character matches, although
// it should not: a[a-z]*z == az
if(parse(pattern, end+2, str, is)) return true;
// Try to match next character
if(parse(pattern, ip, str, is+1)) return true;
}
// Single character matched, set was processed, since
// no * was at the end.
ip = end+1;
is++;
break;
}
case '?':
// Obvious
ip++;
is++;
break;
case '*':
{
// Trailing asterisk means that string matches till the end.
// Also, checks if this is last char in the string
if(ip+1==lp) return true;
// Skip the *
do {
ip++;
chp = pattern.at(ip);
} while((ip+1<lp)&&(chp=='*'));
// But perform a special check and solve it by recursing
// from new position
if(chp=='?'&&parse(pattern, ip, str, is)) return true;
// Iterate through all possible matches in the test string
int i = is;
while(i<ls) {
// Stupid brute force, but isn't as bad as it seems.
// Try all possible matches in the test string.
if(parse(pattern, ip, str, i)) return true;
i++;
}
break;
}
default:
// Literal match
if(is==ls||pattern.at(ip)!=str.at(is)) return false;
ip++;
is++;
}
}
// There could be several * at the end of the pattern, although the
// test string is at the end.
while((ip<lp)&&((pattern.at(ip))=='*'))
ip++;
// Same condition as with while loop
return (is==ls)&&(ip==lp);
}
/**
 * Checks whether a whole string matches a whole wildcard pattern.
 *
 * @param pattern wildcard pattern
 * @param str     string to test
 * @return result of parse() started at the beginning of both strings
 */
bool match(const String pattern, const String str) {
    const int patternStart = 0;
    const int stringStart = 0;
    return parse(pattern, patternStart, str, stringStart);
}
}
}