From: Andrew Jeffery
Date: Sat, 20 Feb 2016 11:03:04 +0000 (+1030)
Subject: strgrp: Use ratio of hypotenuse for consistent comparisons
X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=44c0274ac8f2a16c981e706d4386f899c5e206f5

strgrp: Use ratio of hypotenuse for consistent comparisons

Ensure comparing filter results is sensible by using a consistent
calculation. Note that the cosine similarity measurement doesn't yet
conform and this can give spurious results that are not detected by the
test suite.
---

diff --git a/ccan/strgrp/strgrp.c b/ccan/strgrp/strgrp.c
index 85d9765c..111e592e 100644
--- a/ccan/strgrp/strgrp.c
+++ b/ccan/strgrp/strgrp.c
@@ -117,13 +117,11 @@ should_grp_score_cos(const struct strgrp *const ctx,
 static inline bool
 should_grp_score_len(const struct strgrp *const ctx,
         const struct strgrp_grp *const grp, const char *const str) {
-    const size_t strl = strlen(str);
-    const size_t keyl = grp->key_len;
-    double sr = strl / keyl;
-    if (1 < sr) {
-        sr = 1 / sr;
-    }
-    return ctx->threshold <= sr;
+    const double lstr = (double) strlen(str);
+    const double lkey = (double) grp->key_len;
+    const double lmin = (lstr > lkey) ? lkey : lstr;
+    const double s = sqrt((2 * lmin * lmin) / (1.0 * lstr * lstr + lkey * lkey));
+    return ctx->threshold <= s;
 }
 
 /* Scoring - Longest Common Subsequence[2]
@@ -172,7 +170,10 @@ lcs(const char *const a, const char *const b) {
 static inline double
 nlcs(const char *const a, const char *const b) {
     const double lcss = lcs(a, b);
-    return 2 * lcss / (strlen(a) + strlen(b));
+    const double la = (double) strlen(a);
+    const double lb = (double) strlen(b);
+    const double s = sqrt((2 * lcss * lcss) / (la * la + lb * lb));
+    return s;
 }
 
 static inline double