From: Andrew Jeffery <andrew@aj.id.au>
Date: Thu, 24 Sep 2015 12:49:39 +0000 (+0930)
Subject: strgrp: Add cosine similarity filter
X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=e95575c4ad0485eef9edf562dfdbc34f8deb4b76

strgrp: Add cosine similarity filter

The cosine similarity measure[1] (O(m + n)) contributes a decent runtime
reduction when used as a filter prior to execution of more expensive
algorithms such as LCS[2] (O(m * n)).

A private test set of 3500 strings was used to quantify the improvement.
The shape of the test set is described by Python's Pandas module as:

    >>> frames.apply(len).describe()
    count    3500.000000
    mean       47.454286
    std        14.980197
    min        10.000000
    25%        37.000000
    50%        45.000000
    75%        61.000000
    max       109.000000
    dtype: float64
    >>>

The tests were performed on a lightly loaded Lenovo X201s stocked with a
Intel Core i7 L 640 @ 2.13GHz CPU with 3.7 GiB of memory. The test was
compiled with GCC 4.9.3:

    $ gcc --version
    gcc (Gentoo 4.9.3 p1.0, pie-0.6.2) 4.9.3
    Copyright (C) 2015 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions.  There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Using the test program outlined below, ten runs for input set sizes
incrementing in batches of 500 were taken prior to filtering with cosine
similarity:

    500:  0.61, 0.25, 0.08, 0.07, 0.07, 0.07, 0.09, 0.07, 0.07, 0.07
    1000: 0.33, 0.32, 0.34, 0.32, 0.32, 0.33, 0.32, 0.32, 0.34, 0.32
    1500: 0.72, 1.53, 0.72, 0.70, 0.72, 0.70, 0.72, 0.71, 1.46, 0.71
    2000: 1.22, 1.20, 1.22, 1.98, 1.20, 1.20, 1.22, 1.94, 1.20, 1.20
    2500: 1.97, 2.72, 1.94, 2.33, 2.44, 1.94, 2.82, 1.93, 1.94, 2.69
    3000: 2.69, 3.41, 2.66, 3.38, 2.67, 3.42, 2.63, 3.44, 2.65, 3.39
    3500: 4.18, 3.65, 4.21, 4.21, 3.56, 4.21, 4.16, 3.63, 4.20, 4.13

After adding the cosine similarity filter the runtimes became:

    500:  0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02
    1000: 0.08, 0.07, 0.07, 0.07, 0.08, 0.07, 0.07, 0.08, 0.07, 0.07
    1500: 0.16, 0.16, 0.15, 0.16, 0.16, 0.15, 0.15, 0.15, 0.16, 0.16
    2000: 0.26, 0.26, 0.25, 0.26, 0.26, 0.26, 0.25, 0.26, 0.26, 0.26
    2500: 0.41, 0.41, 0.41, 0.40, 0.42, 0.42, 0.42, 0.41, 0.41, 0.41
    3000: 0.58, 0.56, 0.57, 0.56, 0.58, 0.57, 0.56, 0.56, 0.57, 0.55
    3500: 0.75, 0.74, 0.73, 0.74, 0.74, 0.73, 0.72, 0.75, 0.75, 0.75

As such, on average the runtime improvements are:

    N       Avg Pre         Avg Post        Improvement (Pre / Post)
    500     0.145           0.02            7.25
    1000    0.326           0.073           4.47
    1500    0.869           0.156           5.57
    2000    1.358           0.258           5.26
    2500    2.272           0.412           5.51
    3000    3.034           0.566           5.36
    3500    4.014           0.74            5.42

The test driver is as below, where both it and its dependencies were
compiled with 'CFLAGS=-O2 -fopenmp' and linked with 'LDFLAGS=-fopenmp',
'LDLIBS=-lm':

    $ cat test.c
    #include "config.h"
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "ccan/strgrp/strgrp.h"

    int main(void) {
        FILE *f;
        char *buf;
        struct strgrp *ctx;
        f = fdopen(0, "r");
    #define BUF_SIZE 512
        buf = malloc(BUF_SIZE);
        ctx = strgrp_new(0.85);
        while(fgets(buf, BUF_SIZE, f)) {
            buf[strcspn(buf, "\r\n")] = '\0';
            if (!strgrp_add(ctx, buf, NULL)) {
                printf("Failed to classify %s\n", buf);
            }
        }
        strgrp_free(ctx);
        free(buf);
        fclose(f);
        return 0;
    }

[1] https://en.wikipedia.org/wiki/Cosine_similarity
[2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
---

diff --git a/ccan/strgrp/_info b/ccan/strgrp/_info
index e0b70b40..2b88ea7b 100644
--- a/ccan/strgrp/_info
+++ b/ccan/strgrp/_info
@@ -5,14 +5,15 @@
 /**
  * strgrp - group/cluster similar strings.
  *
- * strgrp uses the Longest Common Subsequence (LCS) algorithm[1] to cluster
- * similar strings. It is governed by a threshold which is compared against
- * the computed normalised LCS length for all known groups.
+ * strgrp clusters similar strings using the Longest Common Subsequence (LCS)
+ * algorithm[1]. Clustering is governed by a threshold value, which is compared
+ * with the normalised LCS scores calculated from the input string and each
+ * existing group.
  *
  * As a coarse and not entirely accurate summary, strgrp takes the following
  * steps:
  *
- * 1. For all known strings, calculate the normalised LCS value against the
+ * 1. For all known groups, calculate the normalised LCS value against the
  * input string
  *
  * 2. Find the maximum normalised LCS value and associated group
@@ -21,37 +22,55 @@
  * threshold add the input string to the group, otherwise create a new group
  *
  * The clustering operation is expensive; LCS on its own is computationally
- * O(mn) on its two input strings and optimally requires O(min(m,n)) memory. In
- * general each input string should be compared against all known strings,
- * giving O(n^2) behaviour of the clustering algorithm on top of the O(mn) LCS
- * similarity measurement.
+ * O(m * n) on its two input strings and optimally requires O(min(m, n))
+ * memory. In general each input string should be compared against all known
+ * strings, giving O(n^2) behaviour of the clustering algorithm on top of the
+ * O(m * n) LCS similarity measurement.
  *
  * strgrp tries to battle this complexity on several fronts:
  *
- * 1. Coarse reduction of the required comparisons. Each group has a 'key',
+ * 1. Caching of input strings and their associated group. By incurring the
+ * cost of a map's string hash function we may eliminate all further search
+ * costs for exact matches, potentially reducing the insertion to a
+ * constant-time operation.
+ *
+ * 2. Coarse reduction of the required comparisons. Each group has a 'key',
  * which is the string that triggered the creation of the group. Input strings
  * are only compared against group keys rather than all known strings, reducing
  * the complexity to the current number of groups rather than all known
  * strings. Note due the pathological case where the number of groups is equal
- * to the number of known strings the algorithm still  has O(n^2) computational
+ * to the number of known strings the algorithm still has O(n^2) computational
  * complexity
  *
- * 2. Elimination of LCS computations that will never breach the configured
- * threshold. This property can be measured from the length of the input
- * strings, and a negative result avoids invoking the O(mn) behaviour of LCS
- *
- * 3. Caching of input strings and their associated group. By incurring the
- * cost of a map's string hash function we may eliminate all calls to the LCS
- * function for exact matches, potentially reducing the insertion to a
- * constant-time operation.
- *
- * 4. Whilst the data dependencies of LCS prevent internally parallel
- * implementations, LCS as a function can be applied in parallel. The code
+ * 3. Whilst the data dependencies of LCS prevent internally parallel
+ * implementations, LCS and other filters can be applied in parallel. The code
  * uses OpenMP to automatically distribute scoring of the input string
  * against group keys across a number of threads.
  *
+ * 4. Elimination of LCS computations that will never breach the configured
+ * threshold. Two measurements are used as rejecting filters (i.e. a failure to
+ * exceed the threshold prevents further measurements being made):
+ *
+ * 4a. Comparsion of string lengths: String lengths must be within given bounds
+ * to satisfy the user-supplied similarity constraint. A negative result avoids
+ * invoking the O(m * n) behaviour of LCS at the cost of O(m + n) in the two
+ * strlen() invocations.
+ *
+ * 4b. Comparison of character distribution: Cosine similarity[2] is used to
+ * measure unordered character similarity between the input strings. The
+ * implementation is again O(m + n), and avoids the O(m * n) behaviour of LCS.
+ *
+ * Performance will vary not only with the number of input strings but
+ * with their lengths and relative similarities. A large number of long input
+ * strings that are relatively similar will give the worst performance.
+ * However to provide some context, strgrp can cluster a real-world test set of
+ * 3500 strings distributed in length between 20 and 100 characters to 85%
+ * similarity on a 4-core 2010-era amd64 laptop in approximately 750ms.
+ *
  * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
  *
+ * [2] https://en.wikipedia.org/wiki/Cosine_similarity
+ *
  * License: LGPL
  * Author: Andrew Jeffery <andrew@aj.id.au>
  *
@@ -110,12 +129,18 @@ int main(int argc, char *argv[]) {
         return 0;
     }
 
-#if HAVE_OPENMP
     if (strcmp(argv[1], "cflags") == 0) {
+#if HAVE_OPENMP
         printf("-fopenmp\n");
+#endif
+        printf("-O2\n");
+        return 0;
+    }
+
+    if (strcmp(argv[1], "libs") == 0) {
+        printf("m\n");
         return 0;
     }
-#endif
 
     return 1;
 }
diff --git a/ccan/strgrp/strgrp.c b/ccan/strgrp/strgrp.c
index bc2cc83a..ba870632 100644
--- a/ccan/strgrp/strgrp.c
+++ b/ccan/strgrp/strgrp.c
@@ -16,8 +16,11 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
 #include <stdbool.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "ccan/darray/darray.h"
@@ -27,6 +30,15 @@
 #include "strgrp.h"
 #include "config.h"
 
+#if HAVE_OPENMP
+#include <omp.h>
+#else
+#define omp_get_max_threads() 1
+#define omp_get_thread_num() 0
+#endif
+
+#define CHAR_N_VALUES (1 << CHAR_BIT)
+
 typedef darray(struct strgrp_grp *) darray_grp;
 typedef darray(struct strgrp_item *) darray_item;
 
@@ -37,6 +49,11 @@ struct grp_score {
     double score;
 };
 
+struct cosvec {
+    const char *str;
+    int16_t pop[CHAR_N_VALUES];
+};
+
 typedef darray(struct grp_score *) darray_score;
 
 struct strgrp {
@@ -45,6 +62,8 @@ struct strgrp {
     unsigned int n_grps;
     darray_grp grps;
     struct grp_score *scores;
+    int32_t n_cosvecs;
+    struct cosvec *cosvecs;
 };
 
 struct strgrp_iter {
@@ -69,6 +88,62 @@ struct strgrp_item {
     void *value;
 };
 
+
+/* String vector cosine similarity[1]
+ *
+ * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/
+ */
+
+static inline void
+strpopcnt(struct cosvec *ctx) {
+    const char *c;
+    memset(ctx->pop, 0, CHAR_N_VALUES * sizeof(int16_t));
+    for(c = ctx->str; *c; c++) {
+        assert(*c >= 0);
+        ctx->pop[(unsigned char)*c]++;
+    }
+}
+
+static inline double
+strcossim(struct cosvec *avec, struct cosvec *bvec) {
+    strpopcnt(avec);
+    strpopcnt(bvec);
+    int32_t saibi = 0;
+    int32_t sai2 = 0;
+    int32_t sbi2 = 0;
+    size_t i;
+    for (i = 0; i < CHAR_N_VALUES; i++) {
+        saibi += avec->pop[i] * bvec->pop[i];
+        sai2 += avec->pop[i] * avec->pop[i];
+        sbi2 += bvec->pop[i] * bvec->pop[i];
+    }
+    return saibi / (sqrt(sai2) * sqrt(sbi2));
+}
+
+/* Low-cost filter functions */
+
+static inline bool
+should_grp_score_cos(const struct strgrp *const ctx, struct cosvec *const avec,
+        struct cosvec *const bvec) {
+    return ctx->threshold <= strcossim(avec, bvec);
+}
+
+static inline bool
+should_grp_score_len(const struct strgrp *const ctx,
+        const struct strgrp_grp *const grp, const char *const str) {
+    const size_t strl = strlen(str);
+    const size_t keyl = grp->key_len;
+    double sr =  strl / keyl;
+    if (1 < sr) {
+        sr = 1 / sr;
+    }
+    return ctx->threshold <= sr;
+}
+
+/* Scoring - Longest Common Subsequence[2]
+ *
+ * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+ */
 #define ROWS 2
 
 static inline int cmi(int i, int j) {
@@ -114,23 +189,13 @@ nlcs(const char *const a, const char *const b) {
     return 2 * lcss / (strlen(a) + strlen(b));
 }
 
-static bool
-should_grp_score(const struct strgrp *const ctx,
-        const struct strgrp_grp *const grp, const char *const str) {
-    const size_t strl = strlen(str);
-    const size_t keyl = grp->key_len;
-    double sr =  strl / keyl;
-    if (1 < sr) {
-        sr = 1 / sr;
-    }
-    return ctx->threshold <= sr;
-}
-
 static inline double
 grp_score(const struct strgrp_grp *const grp, const char *const str) {
     return nlcs(grp->key, str);
 }
 
+/* Structure management */
+
 static struct strgrp_item *
 new_item(tal_t *const tctx, const char *const str, void *const data) {
     struct strgrp_item *i = talz(tctx, struct strgrp_item);
@@ -203,6 +268,9 @@ strgrp_new(const double threshold) {
     struct strgrp *ctx = talz(NULL, struct strgrp);
     ctx->threshold = threshold;
     stringmap_init(ctx->known, NULL);
+    // n threads compare strings
+    ctx->n_cosvecs = 2 * omp_get_max_threads();
+    ctx->cosvecs = tal_arrz(ctx, struct cosvec, ctx->n_cosvecs);
     darray_init(ctx->grps);
     return ctx;
 }
@@ -213,6 +281,16 @@ cache(struct strgrp *const ctx, struct strgrp_grp *const grp,
     *(stringmap_enter(ctx->known, str)) = grp;
 }
 
+static inline struct cosvec *
+tl_avec(struct strgrp *ctx) {
+    return &ctx->cosvecs[omp_get_thread_num()];
+}
+
+static inline struct cosvec *
+tl_bvec(struct strgrp *ctx) {
+    return &ctx->cosvecs[omp_get_thread_num() + 1];
+}
+
 static struct strgrp_grp *
 grp_for(struct strgrp *const ctx, const char *const str) {
     if (!ctx->n_grps) {
@@ -231,8 +309,17 @@ grp_for(struct strgrp *const ctx, const char *const str) {
 #endif
     for (i = 0; i < ctx->n_grps; i++) {
         ctx->scores[i].grp = darray_item(ctx->grps, i);
-        const bool ss = should_grp_score(ctx, ctx->scores[i].grp, str);
-        ctx->scores[i].score = ss ? grp_score(ctx->scores[i].grp, str) : 0;
+        if (should_grp_score_len(ctx, ctx->scores[i].grp, str)) {
+            tl_avec(ctx)->str = ctx->scores[i].grp->key;
+            tl_bvec(ctx)->str = str;
+            if (should_grp_score_cos(ctx, tl_avec(ctx), tl_bvec(ctx))) {
+                ctx->scores[i].score = grp_score(ctx->scores[i].grp, str);
+            } else {
+                ctx->scores[i].score = 0;
+            }
+        } else {
+            ctx->scores[i].score = 0;
+        }
     }
     struct grp_score *max = NULL;
     for (i = 0; i < ctx->n_grps; i++) {
@@ -344,8 +431,6 @@ strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) {
     strgrp_free(ctx);
 }
 
-#include <stdio.h>
-
 static void
 print_item(const struct strgrp_item *item) {
     printf("\t%s\n", item->key);