3 Copyright (C) 2014 Andrew Jeffery <andrew@aj.id.au>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
26 #include "ccan/darray/darray.h"
27 #include "ccan/stringmap/stringmap.h"
28 #include "ccan/tal/tal.h"
29 #include "ccan/tal/str/str.h"
33 #define CHAR_N_VALUES (1 << CHAR_BIT)
35 typedef darray(struct strgrp_grp *) darray_grp;
36 typedef darray(struct strgrp_item *) darray_item;
38 typedef stringmap(struct strgrp_grp *) stringmap_grp;
41 struct strgrp_grp *grp;
45 typedef darray(struct grp_score *) darray_score;
52 struct grp_score *scores;
53 int16_t pop[CHAR_N_VALUES];
57 const struct strgrp *ctx;
66 int16_t pop[CHAR_N_VALUES];
69 struct strgrp_grp_iter {
70 const struct strgrp_grp *grp;
80 /* String vector cosine similarity[1]
82 * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/
86 strpopcnt(const char *const str, int16_t pop[CHAR_N_VALUES]) {
88 memset(pop, 0, CHAR_N_VALUES * sizeof(*pop));
89 for(c = str; *c; c++) {
91 pop[(unsigned char)*c]++;
96 strcossim(const int16_t ref[CHAR_N_VALUES], const int16_t key[CHAR_N_VALUES]) {
101 for (i = 0; i < CHAR_N_VALUES; i++) {
102 saibi += ref[i] * key[i];
103 sai2 += ref[i] * ref[i];
104 sbi2 += key[i] * key[i];
106 return 1.0 - (2 * acos(saibi / sqrt(sai2 * sbi2)) / M_PI);
109 /* Low-cost filter functions */
112 cossim_correction(const double s)
114 return -((s - 0.5) * (s - 0.5)) + 0.33;
118 should_grp_score_cos(const struct strgrp *const ctx,
119 struct strgrp_grp *const grp, const char *const str) {
120 const double s1 = strcossim(ctx->pop, grp->pop);
121 const double s2 = s1 + cossim_correction(s1);
122 return ctx->threshold <= s2;
126 should_grp_score_len(const struct strgrp *const ctx,
127 const struct strgrp_grp *const grp, const char *const str) {
128 const double lstr = (double) strlen(str);
129 const double lkey = (double) grp->key_len;
130 const double lmin = (lstr > lkey) ? lkey : lstr;
131 const double s = sqrt((2 * lmin * lmin) / (1.0 * lstr * lstr + lkey * lkey));
132 return ctx->threshold <= s;
135 /* Scoring - Longest Common Subsequence[2]
137 * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
141 static inline int cmi(int i, int j) {
145 static inline int16_t
146 lcs(const char *const a, const char *const b) {
147 const int lb = strlen(b);
148 const int lbp1 = lb + 1;
149 int16_t *const lookup = calloc(ROWS * lbp1, sizeof(int16_t));
154 for (ia = (strlen(a) - 1); ia >= 0; ia--) {
155 const char iav = a[ia];
156 const int ial = (ia + 1) & 1; // ia last
157 for (ib = lb - 1; ib >= 0; ib--) {
158 const char ibv = b[ib];
159 const int iac = ia & 1; // ia current
160 const int ibl = ib + 1; // ib last
161 // don't need separate "ib current" as it's just ib
163 lookup[cmi(iac, ib)] = 1 + lookup[cmi(ial, ibl)];
165 const int16_t valb = lookup[cmi(ial, ib)];
166 const int16_t vabl = lookup[cmi(iac, ibl)];
167 lookup[cmi(iac, ib)] = (valb > vabl) ? valb : vabl;
171 int16_t result = lookup[0];
179 nlcs(const char *const a, const char *const b) {
180 const double lcss = lcs(a, b);
181 const double la = (double) strlen(a);
182 const double lb = (double) strlen(b);
183 const double s = sqrt((2 * lcss * lcss) / (la * la + lb * lb));
188 grp_score(const struct strgrp_grp *const grp, const char *const str) {
189 return nlcs(grp->key, str);
192 /* Structure management */
194 static struct strgrp_item *
195 new_item(tal_t *const tctx, const char *const str, void *const data) {
196 struct strgrp_item *i = talz(tctx, struct strgrp_item);
200 i->key = tal_strdup(i, str);
206 add_item(struct strgrp_grp *const ctx, const char *const str,
208 struct strgrp_item *i = new_item(ctx, str, data);
212 darray_push(ctx->items, i);
218 free_grp(struct strgrp_grp *grp) {
219 darray_free(grp->items);
222 static struct strgrp_grp *
223 new_grp(tal_t *const tctx, const char *const str, void *const data) {
224 struct strgrp_grp *b = talz(tctx, struct strgrp_grp);
228 b->key = tal_strdup(b, str);
229 b->key_len = strlen(str);
231 darray_init(b->items);
232 tal_add_destructor(b, free_grp);
233 if (!add_item(b, str, data)) {
239 static struct strgrp_grp *
240 add_grp(struct strgrp *const ctx, const char *const str,
242 struct strgrp_grp *b = new_grp(ctx, str, data);
246 memcpy(b->pop, ctx->pop, sizeof(ctx->pop));
247 darray_push(ctx->grps, b);
250 if (!tal_resize(&ctx->scores, ctx->n_grps)) {
254 ctx->scores = tal_arr(ctx, struct grp_score, ctx->n_grps);
263 strgrp_new(const double threshold) {
264 struct strgrp *ctx = talz(NULL, struct strgrp);
265 ctx->threshold = threshold;
266 stringmap_init(ctx->known, NULL);
267 // n threads compare strings
268 darray_init(ctx->grps);
273 cache(struct strgrp *const ctx, struct strgrp_grp *const grp,
274 const char *const str) {
275 *(stringmap_enter(ctx->known, str)) = grp;
278 static struct strgrp_grp *
279 grp_for(struct strgrp *const ctx, const char *const str) {
280 // Ensure ctx->pop is always populated. Returning null here indicates a new
281 // group should be created, at which point add_grp() copies ctx->pop into
282 // the new group's struct.
283 strpopcnt(str, ctx->pop);
288 struct strgrp_grp **const grp = stringmap_lookup(ctx->known, str);
294 // Keep ccanlint happy in reduced feature mode
296 #pragma omp parallel for schedule(dynamic)
298 for (i = 0; i < ctx->n_grps; i++) {
299 struct strgrp_grp *grp = darray_item(ctx->grps, i);
300 ctx->scores[i].grp = grp;
301 ctx->scores[i].score = 0;
302 if (should_grp_score_len(ctx, grp, str)) {
303 if (should_grp_score_cos(ctx, grp, str)) {
304 ctx->scores[i].score = grp_score(grp, str);
308 struct grp_score *max = NULL;
309 for (i = 0; i < ctx->n_grps; i++) {
310 if (!max || ctx->scores[i].score > max->score) {
311 max = &(ctx->scores[i]);
314 return (max && max->score >= ctx->threshold) ? max->grp : NULL;
317 const struct strgrp_grp *
318 strgrp_grp_for(struct strgrp *const ctx, const char *const str) {
319 return grp_for(ctx, str);
322 const struct strgrp_grp *
323 strgrp_add(struct strgrp *const ctx, const char *const str,
325 bool inserted = false;
326 // grp_for() populates the ctx->pop memory. add_grp() copies this memory
327 // into the strgrp_grp that it creates. It's assumed the ctx->pop memory
328 // has not been modified between the grp_for() and add_grp() calls.
329 struct strgrp_grp *pick = grp_for(ctx, str);
331 inserted = add_item(pick, str, data);
333 pick = add_grp(ctx, str, data);
334 inserted = (NULL != pick);
337 assert(NULL != pick);
338 cache(ctx, pick, str);
344 strgrp_iter_new(struct strgrp *const ctx) {
345 struct strgrp_iter *iter = talz(ctx, struct strgrp_iter);
354 const struct strgrp_grp *
355 strgrp_iter_next(struct strgrp_iter *const iter) {
356 return (iter->ctx->n_grps == iter->i) ?
357 NULL : darray_item(iter->ctx->grps, iter->i++);
361 strgrp_iter_free(struct strgrp_iter *const iter) {
365 struct strgrp_grp_iter *
366 strgrp_grp_iter_new(const struct strgrp_grp *const grp) {
367 struct strgrp_grp_iter *iter = talz(grp, struct strgrp_grp_iter);
376 const struct strgrp_item *
377 strgrp_grp_iter_next(struct strgrp_grp_iter *const iter) {
378 return (iter->grp->n_items == iter->i) ?
379 NULL : darray_item(iter->grp->items, iter->i++);
383 strgrp_grp_iter_free(struct strgrp_grp_iter *iter) {
388 strgrp_grp_key(const struct strgrp_grp *const grp) {
393 strgrp_item_key(const struct strgrp_item *const item) {
398 strgrp_item_value(const struct strgrp_item *const item) {
403 strgrp_free(struct strgrp *const ctx) {
404 darray_free(ctx->grps);
405 stringmap_free(ctx->known);
410 strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) {
411 struct strgrp_grp **grp;
412 struct strgrp_item **item;
413 darray_foreach(grp, ctx->grps) {
414 darray_foreach(item, (*grp)->items) {
422 print_item(const struct strgrp_item *item) {
423 printf("\t%s\n", item->key);
427 print_grp(const struct strgrp_grp *const grp) {
428 struct strgrp_item **item;
429 printf("%s:\n", grp->key);
430 darray_foreach(item, grp->items) {
437 strgrp_print(const struct strgrp *const ctx) {
438 struct strgrp_grp **grp;
439 darray_foreach(grp, ctx->grps) {