Com: add epicsStrSimilarity()

This commit is contained in:
Michael Davidsaver
2021-02-05 07:05:22 -08:00
parent 27918cb7a1
commit 745c3f552e
3 changed files with 113 additions and 1 deletions

View File

@@ -23,9 +23,11 @@
#include <errno.h>
#include <ctype.h>
#include "epicsAssert.h"
#include "epicsStdio.h"
#include "cantProceed.h"
#include "epicsString.h"
#include "epicsMath.h"
/* Deprecated, use epicsStrnRawFromEscaped() instead */
int dbTranslateEscape(char *dst, const char *src)
@@ -358,3 +360,71 @@ unsigned int epicsMemHash(const char *str, size_t length, unsigned int seed)
}
return hash;
}
/* Compute normalized Levenshtein distance
 *
 * https://en.wikipedia.org/wiki/Levenshtein_distance
 *
 * We modify this to give half weight to case insensitive substitution.
 * All normal integer weights are multiplied by two, with case
 * insensitive added in as one.
 *
 * Returns (norm - distance)/norm, where norm is twice the length of the
 * longer string: 1.0 for identical strings (including two empty strings),
 * 0.0 when every position must be fully edited.
 * Returns -1.0 if either argument is NULL or a work buffer can not
 * be allocated.
 */
double epicsStrSimilarity(const char *A, const char *B)
{
    double ret = 0;
    size_t lA, lB, a, b;
    size_t norm;
    size_t *dist0 = NULL, *dist1 = NULL, *stemp;

    /* strlen(NULL) is undefined, report it through the error return */
    if(!A || !B)
        return -1.0;

    lA = strlen(A);
    lB = strlen(B);

    /* max number of edits to change A into B is max(lA, lB) */
    norm = lA > lB ? lA : lB;
    /* take into account our weighting */
    norm *= 2u;

    /* Only two rows of the edit matrix are needed at any time:
     * dist0 is the previous row, dist1 the row being filled in.
     */
    dist0 = calloc(1+lB, sizeof(*dist0));
    dist1 = calloc(1+lB, sizeof(*dist1));
    if(!dist0 || !dist1) {
        ret = -1.0;
        goto done;
    }

    /* row zero: turning "" into B[0..b) takes b insertions */
    for(b=0; b<1+lB; b++)
        dist0[b] = 2*b;

    for(a=0; a<lA; a++) {
        /* column zero: turning A[0..a] into "" takes a+1 deletions */
        dist1[0] = 2*(a+1);

        for(b=0; b<lB; b++) {
            size_t delcost = dist0[b+1] + 2,
                   inscost = dist1[b] + 2,
                   subcost = dist0[b],
                   mincost = delcost;
            char ca = A[a], cb = B[b];

            /* exact mismatch costs 1, a mismatch which remains after
             * case folding costs 1 more (2 total)
             */
            if(ca!=cb)
                subcost++;
            /* <ctype.h> functions require a value representable as
             * unsigned char (or EOF).  Plain char may be negative.
             */
            if(toupper((unsigned char)ca)!=toupper((unsigned char)cb))
                subcost++;

            if(mincost > inscost)
                mincost = inscost;
            if(mincost > subcost)
                mincost = subcost;

            dist1[b+1] = mincost;
        }

        /* the row just completed becomes the previous row */
        stemp = dist0;
        dist0 = dist1;
        dist1 = stemp;
    }

    /* norm==0 only when both strings are empty, which are identical */
    ret = norm ? (norm - dist0[lB]) / (double)norm : 1.0;

done:
    free(dist0);
    free(dist1);
    return ret;
}

View File

@@ -41,6 +41,16 @@ LIBCOM_API char * epicsStrtok_r(char *s, const char *delim, char **lasts);
LIBCOM_API unsigned int epicsStrHash(const char *str, unsigned int seed);
LIBCOM_API unsigned int epicsMemHash(const char *str, size_t length,
unsigned int seed);
/** Compare two strings and return a number in the range [0.0, 1.0] or -1.0 on error.
 *
 * Computes a normalized edit distance representing the similarity between two strings.
 * The distance is a Levenshtein distance in which a substitution that differs
 * only by letter case is given half the weight of any other edit.
 *
 * @param A First nul-terminated string.
 * @param B Second nul-terminated string.
 *
 * @returns 1.0 when A and B are identical, down to 0.0 when A and B are unrelated,
 *          or < 0.0 on error (eg. failure to allocate working memory).
 *
 * @since UNRELEASED
 */
LIBCOM_API double epicsStrSimilarity(const char *A, const char *B);
/* dbTranslateEscape is deprecated, use epicsStrnRawFromEscaped instead */
LIBCOM_API int dbTranslateEscape(char *s, const char *ct);

View File

@@ -17,6 +17,7 @@
#include "epicsUnitTest.h"
#include "epicsString.h"
#include "epicsMath.h"
#include "testMain.h"
static
@@ -74,6 +75,35 @@ void testGlob(void) {
testOk1(epicsStrGlobMatch("hello","*"));
}
static
void testDistance(void) {
    /* Each entry pairs two strings with the similarity score expected
     * from epicsStrSimilarity().  A check passes when the computed
     * score is within 0.01 of the expectation.
     */
    static const struct {
        double expect;
        const char *lhs;
        const char *rhs;
    } cases[] = {
        {1.00, "", ""},
        {1.00, "A", "A"},
        {0.00, "A", "B"},
        {1.00, "exact", "exact"},
        {0.90, "10 second", "10 seconds"},
        {0.71, "Passive", "Pensive"},
        {0.11, "10 second", "Pensive"},
        {0.97, "Set output to IVOV", "Set output To IVOV"},
        /* Case-only differences carry half the weight of other edits. */
        /* Nothing in common apart from 'i' ~= 'I' */
        {0.06, "Passive", "I/O Intr"},
        {0.06, "I/O Intr", "Pensive"},
        /* three case-only substitutions against a max. distance of 2*len("YES") */
        {0.50, "YES", "yes"},
        {0.00, "YES", "NO"},
        {0.67, "YES", "Yes"},
        {0.67, "Tes", "yes"},
    };
    size_t i;

    testDiag("testDistance()");

    for(i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
        double score = epicsStrSimilarity(cases[i].lhs, cases[i].rhs);

        testOk(fabs(score - cases[i].expect) < 0.01,
               "distance \"%s\", \"%s\" %f ~= %f",
               cases[i].lhs, cases[i].rhs, score, cases[i].expect);
    }
}
MAIN(epicsStringTest)
{
const char * const empty = "";
@@ -88,7 +118,7 @@ MAIN(epicsStringTest)
char *s;
int status;
testPlan(387);
testPlan(401);
testChars();
@@ -284,5 +314,7 @@ MAIN(epicsStringTest)
testOk(result[1] == 'g', " Terminator char got '%c'", result[1]);
testOk(result[status] == 0, " 0-terminated");
testDistance();
return testDone();
}