X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fstrgrp%2Fstrgrp.c;h=bab8d334a99eb6b0216c22cb806ea61f5b77b624;hp=bc2cc83ae12b0672fbf8dedbff9d22a249f8d3b1;hb=e8f7a978bf4eb41c1877958e31a1a7213680335b;hpb=9722c4a494b66b9e06ade1ac3c4a5a9c947eedf3 diff --git a/ccan/strgrp/strgrp.c b/ccan/strgrp/strgrp.c index bc2cc83a..bab8d334 100644 --- a/ccan/strgrp/strgrp.c +++ b/ccan/strgrp/strgrp.c @@ -16,8 +16,11 @@ along with this program. If not, see . */ #include +#include +#include #include #include +#include #include #include #include "ccan/darray/darray.h" @@ -27,6 +30,8 @@ #include "strgrp.h" #include "config.h" +#define CHAR_N_VALUES (1 << CHAR_BIT) + typedef darray(struct strgrp_grp *) darray_grp; typedef darray(struct strgrp_item *) darray_item; @@ -45,6 +50,7 @@ struct strgrp { unsigned int n_grps; darray_grp grps; struct grp_score *scores; + int16_t pop[CHAR_N_VALUES]; }; struct strgrp_iter { @@ -57,6 +63,7 @@ struct strgrp_grp { size_t key_len; darray_item items; int32_t n_items; + int16_t pop[CHAR_N_VALUES]; }; struct strgrp_grp_iter { @@ -69,6 +76,66 @@ struct strgrp_item { void *value; }; + +/* String vector cosine similarity[1] + * + * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/ + */ + +static inline void +strpopcnt(const char *const str, int16_t pop[CHAR_N_VALUES]) { + const char *c; + memset(pop, 0, CHAR_N_VALUES * sizeof(*pop)); + for(c = str; *c; c++) { + assert(*c >= 0); + pop[(unsigned char)*c]++; + } +} + +static inline double +strcossim(const int16_t ref[CHAR_N_VALUES], const int16_t key[CHAR_N_VALUES]) { + int32_t saibi = 0; + int32_t sai2 = 0; + int32_t sbi2 = 0; + size_t i; + for (i = 0; i < CHAR_N_VALUES; i++) { + saibi += ref[i] * key[i]; + sai2 += ref[i] * ref[i]; + sbi2 += key[i] * key[i]; + } + return 1.0 - (2 * acos(saibi / sqrt(sai2 * sbi2)) / M_PI); +} + +/* Low-cost filter functions */ + +static inline double +cossim_correction(const double s) +{ + return -((s - 0.5) * (s - 0.5)) + 0.33; +} + +static inline bool +should_grp_score_cos(const struct strgrp *const ctx, + struct strgrp_grp *const grp, const char *const str) { + const double s1 = strcossim(ctx->pop, grp->pop); + const double s2 = s1 + cossim_correction(s1); + return ctx->threshold <= s2; +} + +static inline bool +should_grp_score_len(const struct strgrp *const ctx, + const struct strgrp_grp *const grp, const char *const str) { + const double lstr = (double) strlen(str); + const double lkey = (double) grp->key_len; + const double lmin = (lstr > lkey) ? lkey : lstr; + const double s = sqrt((2 * lmin * lmin) / (1.0 * lstr * lstr + lkey * lkey)); + return ctx->threshold <= s; +} + +/* Scoring - Longest Common Subsequence[2] + * + * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem + */ #define ROWS 2 static inline int cmi(int i, int j) { @@ -86,9 +153,9 @@ lcs(const char *const a, const char *const b) { int ia, ib; for (ia = (strlen(a) - 1); ia >= 0; ia--) { const char iav = a[ia]; + const int ial = (ia + 1) & 1; // ia last for (ib = lb - 1; ib >= 0; ib--) { const char ibv = b[ib]; - const int ial = (ia + 1) & 1; // ia last const int iac = ia & 1; // ia current const int ibl = ib + 1; // ib last // don't need separate "ib current" as it's just ib @@ -111,19 +178,10 @@ lcs(const char *const a, const char *const b) { static inline double nlcs(const char *const a, const char *const b) { const double lcss = lcs(a, b); - return 2 * lcss / (strlen(a) + strlen(b)); -} - -static bool -should_grp_score(const struct strgrp *const ctx, - const struct strgrp_grp *const grp, const char *const str) { - const size_t strl = strlen(str); - const size_t keyl = grp->key_len; - double sr = strl / keyl; - if (1 < sr) { - sr = 1 / sr; - } - return ctx->threshold <= sr; + const double la = (double) strlen(a); + const double lb = (double) strlen(b); + const double s = sqrt((2 * lcss * lcss) / (la * la + lb * lb)); + return s; } static inline double @@ -131,6 +189,8 @@ grp_score(const struct strgrp_grp *const grp, const char *const str) { return nlcs(grp->key, str); } +/* Structure management */ + static struct strgrp_item * new_item(tal_t *const tctx, const char *const str, void *const data) { struct strgrp_item *i = talz(tctx, struct strgrp_item); @@ -183,6 +243,7 @@ add_grp(struct strgrp *const ctx, const char *const str, if (!b) { return NULL; } + memcpy(b->pop, ctx->pop, sizeof(ctx->pop)); darray_push(ctx->grps, b); ctx->n_grps++; if (ctx->scores) { @@ -203,6 +264,7 @@ strgrp_new(const double threshold) { struct strgrp *ctx = talz(NULL, struct strgrp); ctx->threshold = threshold; stringmap_init(ctx->known, NULL); + // n threads compare strings darray_init(ctx->grps); return ctx; } @@ -215,6 +277,10 @@ cache(struct strgrp *const ctx, struct strgrp_grp *const grp, static struct strgrp_grp * grp_for(struct strgrp *const ctx, const char *const str) { + // Ensure ctx->pop is always populated. Returning null here indicates a new + // group should be created, at which point add_grp() copies ctx->pop into + // the new group's struct. + strpopcnt(str, ctx->pop); if (!ctx->n_grps) { return NULL; } @@ -230,9 +296,14 @@ grp_for(struct strgrp *const ctx, const char *const str) { #pragma omp parallel for schedule(dynamic) #endif for (i = 0; i < ctx->n_grps; i++) { - ctx->scores[i].grp = darray_item(ctx->grps, i); - const bool ss = should_grp_score(ctx, ctx->scores[i].grp, str); - ctx->scores[i].score = ss ? grp_score(ctx->scores[i].grp, str) : 0; + struct strgrp_grp *grp = darray_item(ctx->grps, i); + ctx->scores[i].grp = grp; + ctx->scores[i].score = 0; + if (should_grp_score_len(ctx, grp, str)) { + if (should_grp_score_cos(ctx, grp, str)) { + ctx->scores[i].score = grp_score(grp, str); + } + } } struct grp_score *max = NULL; for (i = 0; i < ctx->n_grps; i++) { @@ -252,6 +323,9 @@ const struct strgrp_grp * strgrp_add(struct strgrp *const ctx, const char *const str, void *const data) { bool inserted = false; + // grp_for() populates the ctx->pop memory. add_grp() copies this memory + // into the strgrp_grp that it creates. It's assumed the ctx->pop memory + // has not been modified between the grp_for() and add_grp() calls. struct strgrp_grp *pick = grp_for(ctx, str); if (pick) { inserted = add_item(pick, str, data); @@ -344,8 +418,6 @@ strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) { strgrp_free(ctx); } -#include - static void print_item(const struct strgrp_item *item) { printf("\t%s\n", item->key);