Com: add epicsStrSimilarity()
This commit is contained in:
@@ -23,9 +23,11 @@
|
||||
#include <errno.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "epicsAssert.h"
|
||||
#include "epicsStdio.h"
|
||||
#include "cantProceed.h"
|
||||
#include "epicsString.h"
|
||||
#include "epicsMath.h"
|
||||
|
||||
/* Deprecated, use epicsStrnRawFromEscaped() instead */
|
||||
int dbTranslateEscape(char *dst, const char *src)
|
||||
@@ -358,3 +360,71 @@ unsigned int epicsMemHash(const char *str, size_t length, unsigned int seed)
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
/* Compute a normalized, case-aware Levenshtein similarity.
 *
 * https://en.wikipedia.org/wiki/Levenshtein_distance
 *
 * The classic algorithm is modified to give half weight to a
 * substitution which only changes letter case.  To stay in integer
 * arithmetic all normal weights are multiplied by two (insert, delete
 * and substitute each cost 2), and a case-only substitution costs 1.
 *
 * Returns 1.0 when A and B are identical, decreasing to 0.0 when the
 * weighted distance reaches its maximum of 2*max(strlen(A), strlen(B)).
 * Returns -1.0 if either argument is NULL or the work buffers
 * can not be allocated.
 */
double epicsStrSimilarity(const char *A, const char *B)
{
    double ret = 0;
    size_t lA, lB, a, b;
    size_t norm;
    size_t *dist0 = NULL, *dist1 = NULL, *stemp;

    /* strlen(NULL) is undefined, report as an error instead */
    if(!A || !B)
        return -1.0;

    lA = strlen(A);
    lB = strlen(B);

    /* max number of edits to change A into B is max(lA, lB) */
    norm = lA > lB ? lA : lB;
    /* take into account our weighting */
    norm *= 2u;

    /* Only two rows of the distance matrix are ever needed:
     * dist0 is the previous row, dist1 the row being filled in.
     */
    dist0 = calloc(1+lB, sizeof(*dist0));
    dist1 = calloc(1+lB, sizeof(*dist1));
    if(!dist0 || !dist1) {
        ret = -1.0;
        goto done;
    }

    /* cost of building the first b characters of B from nothing */
    for(b=0; b<1+lB; b++)
        dist0[b] = 2*b;

    for(a=0; a<lA; a++) {
        /* cost of deleting the first a+1 characters of A */
        dist1[0] = 2*(a+1);

        for(b=0; b<lB; b++) {
            size_t delcost = dist0[b+1] + 2,
                   inscost = dist1[b] + 2,
                   subcost = dist0[b],
                   mincost = delcost;
            /* <ctype.h> functions require a value representable as
             * unsigned char (or EOF).  Plain char may be signed, so
             * bytes >= 0x80 must not be passed through directly.
             */
            unsigned char ca = (unsigned char)A[a],
                          cb = (unsigned char)B[b];

            /* +1 for any difference, +1 more if not merely a case change */
            if(ca!=cb)
                subcost++;
            if(toupper(ca)!=toupper(cb))
                subcost++;

            if(mincost > inscost)
                mincost = inscost;
            if(mincost > subcost)
                mincost = subcost;

            dist1[b+1] = mincost;
        }

        /* the row just completed becomes the previous row */
        stemp = dist0;
        dist0 = dist1;
        dist1 = stemp;
    }

    /* dist0[lB] never exceeds norm.  Two empty strings are identical. */
    ret = norm ? (norm - dist0[lB]) / (double)norm : 1.0;
done:
    free(dist0);
    free(dist1);
    return ret;
}
|
||||
|
||||
@@ -41,6 +41,16 @@ LIBCOM_API char * epicsStrtok_r(char *s, const char *delim, char **lasts);
|
||||
LIBCOM_API unsigned int epicsStrHash(const char *str, unsigned int seed);
|
||||
LIBCOM_API unsigned int epicsMemHash(const char *str, size_t length,
|
||||
unsigned int seed);
|
||||
/** Compare two strings and return a number in the range [0.0, 1.0] or -1.0 on error.
 *
 * Computes a normalized edit (Levenshtein) distance representing the similarity
 * between two strings.  A substitution which differs only in letter case is
 * counted as half of an edit.
 *
 * @param A First nil-terminated string.
 * @param B Second nil-terminated string.
 * @returns 1.0 when A and B are identical, down to 0.0 when A and B are unrelated,
 *          or < 0.0 on error (eg. memory allocation failure).
 *
 * @since UNRELEASED
 */
|
||||
LIBCOM_API double epicsStrSimilarity(const char *A, const char *B);
|
||||
|
||||
/* dbTranslateEscape is deprecated, use epicsStrnRawFromEscaped instead */
|
||||
LIBCOM_API int dbTranslateEscape(char *s, const char *ct);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
#include "epicsUnitTest.h"
|
||||
#include "epicsString.h"
|
||||
#include "epicsMath.h"
|
||||
#include "testMain.h"
|
||||
|
||||
static
|
||||
@@ -74,6 +75,35 @@ void testGlob(void) {
|
||||
testOk1(epicsStrGlobMatch("hello","*"));
|
||||
}
|
||||
|
||||
/* Check epicsStrSimilarity() against hand computed expectations.
 * Each TEST() is one testOk(), which passes when the result is
 * within 0.01 of EXPECT.
 */
static
void testDistance(void) {
    double dist;
    testDiag("testDistance()");

/* Wrapped in do{}while(0) so that TEST() acts as a single statement.
 * EXPECT is cast as it is passed through varargs to a %f conversion.
 */
#define TEST(EXPECT, A, B) do { \
        dist = epicsStrSimilarity(A, B); \
        testOk(fabs(dist-(EXPECT))<0.01, "distance \"%s\", \"%s\" %f ~= %f", A, B, dist, (double)(EXPECT)); \
    } while(0)

    TEST(1.00, "", "");
    TEST(1.00, "A", "A");
    TEST(0.00, "A", "B");
    TEST(1.00, "exact", "exact");
    TEST(0.90, "10 second", "10 seconds");
    TEST(0.71, "Passive", "Pensive");
    TEST(0.11, "10 second", "Pensive");
    TEST(0.97, "Set output to IVOV", "Set output To IVOV");

    /* we modify Levenshtein to give half weight to case insensitive matches */

    /* totally unrelated except for 'i' ~= 'I' */
    TEST(0.06, "Passive", "I/O Intr");
    TEST(0.06, "I/O Intr", "Pensive");
    /* 3x case-only subst. (weight 1 each), max distance 2*strlen("YES") */
    TEST(0.50, "YES", "yes");
    TEST(0.00, "YES", "NO");
    /* 2x case-only subst. */
    TEST(0.67, "YES", "Yes");
    /* 1x full subst. (weight 2) */
    TEST(0.67, "Tes", "yes");
#undef TEST
}
|
||||
|
||||
MAIN(epicsStringTest)
|
||||
{
|
||||
const char * const empty = "";
|
||||
@@ -88,7 +118,7 @@ MAIN(epicsStringTest)
|
||||
char *s;
|
||||
int status;
|
||||
|
||||
testPlan(387);
|
||||
testPlan(401);
|
||||
|
||||
testChars();
|
||||
|
||||
@@ -284,5 +314,7 @@ MAIN(epicsStringTest)
|
||||
testOk(result[1] == 'g', " Terminator char got '%c'", result[1]);
|
||||
testOk(result[status] == 0, " 0-terminated");
|
||||
|
||||
testDistance();
|
||||
|
||||
return testDone();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user