From 745c3f552e7cbd13e07cacdd7577468cc3349a6f Mon Sep 17 00:00:00 2001
From: Michael Davidsaver
Date: Fri, 5 Feb 2021 07:05:22 -0800
Subject: [PATCH] Com: add epicsStrSimilarity()

---
 modules/libcom/src/misc/epicsString.c | 70 +++++++++++++++++++++++++++
 modules/libcom/src/misc/epicsString.h | 10 ++++
 modules/libcom/test/epicsStringTest.c | 34 ++++++++++++-
 3 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/modules/libcom/src/misc/epicsString.c b/modules/libcom/src/misc/epicsString.c
index 766af20b6..e18d9214a 100644
--- a/modules/libcom/src/misc/epicsString.c
+++ b/modules/libcom/src/misc/epicsString.c
@@ -23,9 +23,11 @@
 #include <string.h>
 #include <ctype.h>
 
+#include "epicsAssert.h"
 #include "epicsStdio.h"
 #include "cantProceed.h"
 #include "epicsString.h"
+#include "epicsMath.h"
 
 /* Deprecated, use epicsStrnRawFromEscaped() instead */
 int dbTranslateEscape(char *dst, const char *src)
@@ -358,3 +360,71 @@ unsigned int epicsMemHash(const char *str, size_t length, unsigned int seed)
     }
     return hash;
 }
+
+/* Compute normalized Levenshtein distance
+ *
+ * https://en.wikipedia.org/wiki/Levenshtein_distance
+ *
+ * We modify this to give half weight to case insensitive substitution.
+ * All normal integer weights are multiplied by two, with case
+ * insensitive added in as one.
+ */
+double epicsStrSimilarity(const char *A, const char *B)
+{
+    double ret = 0;
+    size_t lA, lB, a, b;
+    size_t norm;
+    size_t *dist0, *dist1, *stemp;
+
+    lA = strlen(A);
+    lB = strlen(B);
+
+    /* max number of edits to change A into B is max(lA, lB) */
+    norm = lA > lB ? lA : lB;
+    /* take into account our weighting */
+    norm *= 2u;
+
+    dist0 = calloc(1+lB, sizeof(*dist0));
+    dist1 = calloc(1+lB, sizeof(*dist1));
+    if(!dist0 || !dist1) {
+        ret = -1.0;
+        goto done;
+    }
+
+    for(b=0; b<1+lB; b++)
+        dist0[b] = 2*b;
+
+    for(a=0; a<lA; a++) {
+        dist1[0] = 2*(a+1);
+
+        for(b=0; b<lB; b++) {
+            size_t delcost = dist0[b+1] + 2,
+                   inscost = dist1[b] + 2,
+                   subcost = dist0[b],
+                   mincost = delcost;
+            char ca = A[a], cb = B[b];
+
+            if(ca!=cb)
+                subcost++;
+            if(toupper((int)ca)!=toupper((int)cb))
+                subcost++;
+
+            if(mincost > inscost)
+                mincost = inscost;
+            if(mincost > subcost)
+                mincost = subcost;
+
+            dist1[b+1] = mincost;
+        }
+
+        stemp = dist0;
+        dist0 = dist1;
+        dist1 = stemp;
+    }
+
+    ret = norm ? (norm - dist0[lB]) / (double)norm : 1.0;
+done:
+    free(dist0);
+    free(dist1);
+    return ret;
+}
diff --git a/modules/libcom/src/misc/epicsString.h b/modules/libcom/src/misc/epicsString.h
index 2ef675bc9..3c08e5277 100644
--- a/modules/libcom/src/misc/epicsString.h
+++ b/modules/libcom/src/misc/epicsString.h
@@ -41,6 +41,16 @@
 LIBCOM_API char * epicsStrtok_r(char *s, const char *delim, char **lasts);
 LIBCOM_API unsigned int epicsStrHash(const char *str, unsigned int seed);
 LIBCOM_API unsigned int epicsMemHash(const char *str, size_t length, unsigned int seed);
+/** Compare two strings and return a number in the range [0.0, 1.0] or -1.0 on error.
+ *
+ * Computes a normalized edit distance representing the similarity between two strings.
+ *
+ * @returns 1.0 when A and B are identical, down to 0.0 when A and B are unrelated,
+ *          or < 0.0 on error.
+ *
+ * @since UNRELEASED
+ */
+LIBCOM_API double epicsStrSimilarity(const char *A, const char *B);
 
 /* dbTranslateEscape is deprecated, use epicsStrnRawFromEscaped instead */
 LIBCOM_API int dbTranslateEscape(char *s, const char *ct);
diff --git a/modules/libcom/test/epicsStringTest.c b/modules/libcom/test/epicsStringTest.c
index 054f0c043..e4e95c998 100644
--- a/modules/libcom/test/epicsStringTest.c
+++ b/modules/libcom/test/epicsStringTest.c
@@ -17,6 +17,7 @@
 
 #include "epicsUnitTest.h"
 #include "epicsString.h"
+#include "epicsMath.h"
 #include "testMain.h"
 
 static
@@ -74,6 +75,35 @@ void testGlob(void) {
     testOk1(epicsStrGlobMatch("hello","*"));
 }
 
+static
+void testDistance(void) {
+    double dist;
+    testDiag("testDistance()");
+
+#define TEST(EXPECT, A, B) dist = epicsStrSimilarity(A, B); testOk(fabs(dist-(EXPECT))<0.01, "distance \"%s\", \"%s\" %f ~= %f", A, B, dist, EXPECT)
+
+    TEST(1.00, "", "");
+    TEST(1.00, "A", "A");
+    TEST(0.00, "A", "B");
+    TEST(1.00, "exact", "exact");
+    TEST(0.90, "10 second", "10 seconds");
+    TEST(0.71, "Passive", "Pensive");
+    TEST(0.11, "10 second", "Pensive");
+    TEST(0.97, "Set output to IVOV", "Set output To IVOV");
+
+    /* we modify Levenshtein to give half weight to case insensitive matches */
+
+    /* totally unrelated except for 'i' ~= 'I' */
+    TEST(0.06, "Passive", "I/O Intr");
+    TEST(0.06, "I/O Intr", "Pensive");
+    /* 2x subst and 1x case subst, max distance 2xlen("YES") */
+    TEST(0.50, "YES", "yes");
+    TEST(0.00, "YES", "NO");
+    TEST(0.67, "YES", "Yes");
+    TEST(0.67, "Tes", "yes");
+#undef TEST
+}
+
 MAIN(epicsStringTest)
 {
     const char * const empty = "";
@@ -88,7 +118,7 @@ MAIN(epicsStringTest)
     char *s;
     int status;
 
-    testPlan(387);
+    testPlan(401);
 
     testChars();
 
@@ -284,5 +314,7 @@ MAIN(epicsStringTest)
     testOk(result[1] == 'g', "  Terminator char got '%c'", result[1]);
     testOk(result[status] == 0, "  0-terminated");
 
+    testDistance();
+
     return testDone();
 }