strgrp: Add cosine similarity filter

author Andrew Jeffery <andrew@aj.id.au>

Thu, 24 Sep 2015 12:49:39 +0000 (22:19 +0930)

committer Andrew Jeffery <andrew@aj.id.au>

Sun, 8 Nov 2015 03:40:27 +0000 (14:10 +1030)
author Andrew Jeffery <andrew@aj.id.au>
Thu, 24 Sep 2015 12:49:39 +0000 (22:19 +0930)
committer Andrew Jeffery <andrew@aj.id.au>
Sun, 8 Nov 2015 03:40:27 +0000 (14:10 +1030)
diff --git a/ccan/strgrp/_info b/ccan/strgrp/_info

index e0b70b401eb90418f8ba8aee647708ad8a54cc0f..2b88ea7b94eb34cb7ccec58d49e4179b6aacbeac 100644 (file)
--- a/ccan/strgrp/_info
+++ b/ccan/strgrp/_info
@@ -5,14 +5,15 @@
  /**
   * strgrp - group/cluster similar strings.
   *
- * strgrp uses the Longest Common Subsequence (LCS) algorithm[1] to cluster
- * similar strings. It is governed by a threshold which is compared against
- * the computed normalised LCS length for all known groups.
+ * strgrp clusters similar strings using the Longest Common Subsequence (LCS)
+ * algorithm[1]. Clustering is governed by a threshold value, which is compared
+ * with the normalised LCS scores calculated from the input string and each
+ * existing group.
   *
   * As a coarse and not entirely accurate summary, strgrp takes the following
   * steps:
   *
- * 1. For all known strings, calculate the normalised LCS value against the
+ * 1. For all known groups, calculate the normalised LCS value against the
   * input string
   *
   * 2. Find the maximum normalised LCS value and associated group
@@ -21,37 +22,55 @@
   * threshold add the input string to the group, otherwise create a new group
   *
   * The clustering operation is expensive; LCS on its own is computationally
- * O(mn) on its two input strings and optimally requires O(min(m,n)) memory. In
- * general each input string should be compared against all known strings,
- * giving O(n^2) behaviour of the clustering algorithm on top of the O(mn) LCS
- * similarity measurement.
+ * O(m * n) on its two input strings and optimally requires O(min(m, n))
+ * memory. In general each input string should be compared against all known
+ * strings, giving O(n^2) behaviour of the clustering algorithm on top of the
+ * O(m * n) LCS similarity measurement.
   *
   * strgrp tries to battle this complexity on several fronts:
   *
- * 1. Coarse reduction of the required comparisons. Each group has a 'key',
+ * 1. Caching of input strings and their associated group. By incurring the
+ * cost of a map's string hash function we may eliminate all further search
+ * costs for exact matches, potentially reducing the insertion to a
+ * constant-time operation.
+ *
+ * 2. Coarse reduction of the required comparisons. Each group has a 'key',
   * which is the string that triggered the creation of the group. Input strings
   * are only compared against group keys rather than all known strings, reducing
   * the complexity to the current number of groups rather than all known
   * strings. Note due the pathological case where the number of groups is equal
- * to the number of known strings the algorithm still  has O(n^2) computational
+ * to the number of known strings the algorithm still has O(n^2) computational
   * complexity
   *
- * 2. Elimination of LCS computations that will never breach the configured
- * threshold. This property can be measured from the length of the input
- * strings, and a negative result avoids invoking the O(mn) behaviour of LCS
- *
- * 3. Caching of input strings and their associated group. By incurring the
- * cost of a map's string hash function we may eliminate all calls to the LCS
- * function for exact matches, potentially reducing the insertion to a
- * constant-time operation.
- *
- * 4. Whilst the data dependencies of LCS prevent internally parallel
- * implementations, LCS as a function can be applied in parallel. The code
+ * 3. Whilst the data dependencies of LCS prevent internally parallel
+ * implementations, LCS and other filters can be applied in parallel. The code
   * uses OpenMP to automatically distribute scoring of the input string
   * against group keys across a number of threads.
   *
+ * 4. Elimination of LCS computations that will never breach the configured
+ * threshold. Two measurements are used as rejecting filters (i.e. a failure to
+ * exceed the threshold prevents further measurements being made):
+ *
+ * 4a. Comparsion of string lengths: String lengths must be within given bounds
+ * to satisfy the user-supplied similarity constraint. A negative result avoids
+ * invoking the O(m * n) behaviour of LCS at the cost of O(m + n) in the two
+ * strlen() invocations.
+ *
+ * 4b. Comparison of character distribution: Cosine similarity[2] is used to
+ * measure unordered character similarity between the input strings. The
+ * implementation is again O(m + n), and avoids the O(m * n) behaviour of LCS.
+ *
+ * Performance will vary not only with the number of input strings but
+ * with their lengths and relative similarities. A large number of long input
+ * strings that are relatively similar will give the worst performance.
+ * However to provide some context, strgrp can cluster a real-world test set of
+ * 3500 strings distributed in length between 20 and 100 characters to 85%
+ * similarity on a 4-core 2010-era amd64 laptop in approximately 750ms.
+ *
   * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
   *
+ * [2] https://en.wikipedia.org/wiki/Cosine_similarity
+ *
   * License: LGPL
   * Author: Andrew Jeffery <andrew@aj.id.au>
   *
@@ -110,12 +129,18 @@ int main(int argc, char *argv[]) {
          return 0;
      }
  
-#if HAVE_OPENMP
      if (strcmp(argv[1], "cflags") == 0) {
+#if HAVE_OPENMP
          printf("-fopenmp\n");
+#endif
+        printf("-O2\n");
+        return 0;
+    }
+
+    if (strcmp(argv[1], "libs") == 0) {
+        printf("m\n");
          return 0;
      }
-#endif
  
      return 1;
  }
diff --git a/ccan/strgrp/strgrp.c b/ccan/strgrp/strgrp.c

index bc2cc83ae12b0672fbf8dedbff9d22a249f8d3b1..ba87063206c9d76d7f51eb106111b101f3c19a2c 100644 (file)
--- a/ccan/strgrp/strgrp.c
+++ b/ccan/strgrp/strgrp.c
@@ -16,8 +16,11 @@
      along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
  #include <assert.h>
+#include <limits.h>
+#include <math.h>
  #include <stdbool.h>
  #include <stdint.h>
+#include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include "ccan/darray/darray.h"
@@ -27,6 +30,15 @@
  #include "strgrp.h"
  #include "config.h"
  
+#if HAVE_OPENMP
+#include <omp.h>
+#else
+#define omp_get_max_threads() 1
+#define omp_get_thread_num() 0
+#endif
+
+#define CHAR_N_VALUES (1 << CHAR_BIT)
+
  typedef darray(struct strgrp_grp *) darray_grp;
  typedef darray(struct strgrp_item *) darray_item;
  
@@ -37,6 +49,11 @@ struct grp_score {
      double score;
  };
  
+struct cosvec {
+    const char *str;
+    int16_t pop[CHAR_N_VALUES];
+};
+
  typedef darray(struct grp_score *) darray_score;
  
  struct strgrp {
@@ -45,6 +62,8 @@ struct strgrp {
      unsigned int n_grps;
      darray_grp grps;
      struct grp_score *scores;
+    int32_t n_cosvecs;
+    struct cosvec *cosvecs;
  };
  
  struct strgrp_iter {
@@ -69,6 +88,62 @@ struct strgrp_item {
      void *value;
  };
  
+
+/* String vector cosine similarity[1]
+ *
+ * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/
+ */
+
+static inline void
+strpopcnt(struct cosvec *ctx) {
+    const char *c;
+    memset(ctx->pop, 0, CHAR_N_VALUES * sizeof(int16_t));
+    for(c = ctx->str; *c; c++) {
+        assert(*c >= 0);
+        ctx->pop[(unsigned char)*c]++;
+    }
+}
+
+static inline double
+strcossim(struct cosvec *avec, struct cosvec *bvec) {
+    strpopcnt(avec);
+    strpopcnt(bvec);
+    int32_t saibi = 0;
+    int32_t sai2 = 0;
+    int32_t sbi2 = 0;
+    size_t i;
+    for (i = 0; i < CHAR_N_VALUES; i++) {
+        saibi += avec->pop[i] * bvec->pop[i];
+        sai2 += avec->pop[i] * avec->pop[i];
+        sbi2 += bvec->pop[i] * bvec->pop[i];
+    }
+    return saibi / (sqrt(sai2) * sqrt(sbi2));
+}
+
+/* Low-cost filter functions */
+
+static inline bool
+should_grp_score_cos(const struct strgrp *const ctx, struct cosvec *const avec,
+        struct cosvec *const bvec) {
+    return ctx->threshold <= strcossim(avec, bvec);
+}
+
+static inline bool
+should_grp_score_len(const struct strgrp *const ctx,
+        const struct strgrp_grp *const grp, const char *const str) {
+    const size_t strl = strlen(str);
+    const size_t keyl = grp->key_len;
+    double sr =  strl / keyl;
+    if (1 < sr) {
+        sr = 1 / sr;
+    }
+    return ctx->threshold <= sr;
+}
+
+/* Scoring - Longest Common Subsequence[2]
+ *
+ * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+ */
  #define ROWS 2
  
  static inline int cmi(int i, int j) {
@@ -114,23 +189,13 @@ nlcs(const char *const a, const char *const b) {
      return 2 * lcss / (strlen(a) + strlen(b));
  }
  
-static bool
-should_grp_score(const struct strgrp *const ctx,
-        const struct strgrp_grp *const grp, const char *const str) {
-    const size_t strl = strlen(str);
-    const size_t keyl = grp->key_len;
-    double sr =  strl / keyl;
-    if (1 < sr) {
-        sr = 1 / sr;
-    }
-    return ctx->threshold <= sr;
-}
-
  static inline double
  grp_score(const struct strgrp_grp *const grp, const char *const str) {
      return nlcs(grp->key, str);
  }
  
+/* Structure management */
+
  static struct strgrp_item *
  new_item(tal_t *const tctx, const char *const str, void *const data) {
      struct strgrp_item *i = talz(tctx, struct strgrp_item);
@@ -203,6 +268,9 @@ strgrp_new(const double threshold) {
      struct strgrp *ctx = talz(NULL, struct strgrp);
      ctx->threshold = threshold;
      stringmap_init(ctx->known, NULL);
+    // n threads compare strings
+    ctx->n_cosvecs = 2 * omp_get_max_threads();
+    ctx->cosvecs = tal_arrz(ctx, struct cosvec, ctx->n_cosvecs);
      darray_init(ctx->grps);
      return ctx;
  }
@@ -213,6 +281,16 @@ cache(struct strgrp *const ctx, struct strgrp_grp *const grp,
      *(stringmap_enter(ctx->known, str)) = grp;
  }
  
+static inline struct cosvec *
+tl_avec(struct strgrp *ctx) {
+    return &ctx->cosvecs[omp_get_thread_num()];
+}
+
+static inline struct cosvec *
+tl_bvec(struct strgrp *ctx) {
+    return &ctx->cosvecs[omp_get_thread_num() + 1];
+}
+
  static struct strgrp_grp *
  grp_for(struct strgrp *const ctx, const char *const str) {
      if (!ctx->n_grps) {
@@ -231,8 +309,17 @@ grp_for(struct strgrp *const ctx, const char *const str) {
  #endif
      for (i = 0; i < ctx->n_grps; i++) {
          ctx->scores[i].grp = darray_item(ctx->grps, i);
-        const bool ss = should_grp_score(ctx, ctx->scores[i].grp, str);
-        ctx->scores[i].score = ss ? grp_score(ctx->scores[i].grp, str) : 0;
+        if (should_grp_score_len(ctx, ctx->scores[i].grp, str)) {
+            tl_avec(ctx)->str = ctx->scores[i].grp->key;
+            tl_bvec(ctx)->str = str;
+            if (should_grp_score_cos(ctx, tl_avec(ctx), tl_bvec(ctx))) {
+                ctx->scores[i].score = grp_score(ctx->scores[i].grp, str);
+            } else {
+                ctx->scores[i].score = 0;
+            }
+        } else {
+            ctx->scores[i].score = 0;
+        }
      }
      struct grp_score *max = NULL;
      for (i = 0; i < ctx->n_grps; i++) {
@@ -344,8 +431,6 @@ strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) {
      strgrp_free(ctx);
  }
  
-#include <stdio.h>
-
  static void
  print_item(const struct strgrp_item *item) {
      printf("\t%s\n", item->key);
author	Andrew Jeffery <andrew@aj.id.au>
	Thu, 24 Sep 2015 12:49:39 +0000 (22:19 +0930)
committer	Andrew Jeffery <andrew@aj.id.au>
	Sun, 8 Nov 2015 03:40:27 +0000 (14:10 +1030)
ccan/strgrp/_info		patch \| blob \| history
ccan/strgrp/strgrp.c		patch \| blob \| history