]> git.ozlabs.org Git - ccan/commitdiff
strgrp: Add cosine fudge-curve to unify filter comparison spaces
author Andrew Jeffery <andrew@aj.id.au>
Sat, 20 Feb 2016 10:52:41 +0000 (21:22 +1030)
committer Andrew Jeffery <andrew@aj.id.au>
Fri, 25 Mar 2016 12:39:32 +0000 (23:09 +1030)
If we are to use should_grp_score_cos(x,y) as a filter then the following
relationship must hold (from least to most expensive):

        should_grp_score_len(x,y)
                >= should_grp_score_cos(x,y)
                >= grp_score(x)

should_grp_score_cos(x,y) wasn't holding up its part of the bargain, so
real data was used to generate a fudge curve to bring
should_grp_score_cos(x,y) results into the same space. Really this is a
terrible hack and the problem needs more thought. Evaluation of
should_grp_score_cos(x,y)'s performance benefit (given the relaxation of
the filter under the fudge curve) is sorely needed.

ccan/strgrp/_info
ccan/strgrp/strgrp.c

index 3c78b35cf476d599dfa998f38d5eac3b4d73a940..2b88ea7b94eb34cb7ccec58d49e4179b6aacbeac 100644 (file)
  * License: LGPL
  * Author: Andrew Jeffery <andrew@aj.id.au>
  *
- * Ccanlint:
- *    tests_pass FAIL
- *    tests_pass_without_features FAIL
- *
  * Example:
  *     FILE *f;
  *     char *buf;
index 12119a1f276c7e4c4a9776cf061d1b8dfc77eddf..bab8d334a99eb6b0216c22cb806ea61f5b77b624 100644 (file)
@@ -108,10 +108,18 @@ strcossim(const int16_t ref[CHAR_N_VALUES], const int16_t key[CHAR_N_VALUES]) {
 
 /* Low-cost filter functions */
 
+static inline double
+cossim_correction(const double s)
+{
+    return -((s - 0.5) * (s - 0.5)) + 0.33;
+}
+
 static inline bool
 should_grp_score_cos(const struct strgrp *const ctx,
         struct strgrp_grp *const grp, const char *const str) {
-    return ctx->threshold <= strcossim(ctx->pop, grp->pop);
+    const double s1 = strcossim(ctx->pop, grp->pop);
+    const double s2 = s1 + cossim_correction(s1);
+    return ctx->threshold <= s2;
 }
 
 static inline bool