3 Copyright (C) 2014 Andrew Jeffery <andrew@aj.id.au>
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
26 #include "ccan/darray/darray.h"
27 #include "ccan/stringmap/stringmap.h"
28 #include "ccan/tal/tal.h"
29 #include "ccan/tal/str/str.h"
36 #define omp_get_max_threads() 1
37 #define omp_get_thread_num() 0
40 #define CHAR_N_VALUES (1 << CHAR_BIT)
42 typedef darray(struct strgrp_grp *) darray_grp;
43 typedef darray(struct strgrp_item *) darray_item;
45 typedef stringmap(struct strgrp_grp *) stringmap_grp;
48 struct strgrp_grp *grp;
54 int16_t pop[CHAR_N_VALUES];
57 typedef darray(struct grp_score *) darray_score;
64 struct grp_score *scores;
66 struct cosvec *cosvecs;
70 const struct strgrp *ctx;
81 struct strgrp_grp_iter {
82 const struct strgrp_grp *grp;
92 /* String vector cosine similarity[1]
94 * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/
98 strpopcnt(struct cosvec *ctx) {
100 memset(ctx->pop, 0, CHAR_N_VALUES * sizeof(int16_t));
101 for(c = ctx->str; *c; c++) {
103 ctx->pop[(unsigned char)*c]++;
108 strcossim(struct cosvec *avec, struct cosvec *bvec) {
115 for (i = 0; i < CHAR_N_VALUES; i++) {
116 saibi += avec->pop[i] * bvec->pop[i];
117 sai2 += avec->pop[i] * avec->pop[i];
118 sbi2 += bvec->pop[i] * bvec->pop[i];
120 return saibi / (sqrt(sai2) * sqrt(sbi2));
123 /* Low-cost filter functions */
126 should_grp_score_cos(const struct strgrp *const ctx, struct cosvec *const avec,
127 struct cosvec *const bvec) {
128 return ctx->threshold <= strcossim(avec, bvec);
132 should_grp_score_len(const struct strgrp *const ctx,
133 const struct strgrp_grp *const grp, const char *const str) {
134 const size_t strl = strlen(str);
135 const size_t keyl = grp->key_len;
136 double sr = strl / keyl;
140 return ctx->threshold <= sr;
143 /* Scoring - Longest Common Subsequence[2]
145 * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
149 static inline int cmi(int i, int j) {
153 static inline int16_t
154 lcs(const char *const a, const char *const b) {
155 const int lb = strlen(b);
156 const int lbp1 = lb + 1;
157 int16_t *const lookup = calloc(ROWS * lbp1, sizeof(int16_t));
162 for (ia = (strlen(a) - 1); ia >= 0; ia--) {
163 const char iav = a[ia];
164 for (ib = lb - 1; ib >= 0; ib--) {
165 const char ibv = b[ib];
166 const int ial = (ia + 1) & 1; // ia last
167 const int iac = ia & 1; // ia current
168 const int ibl = ib + 1; // ib last
169 // don't need separate "ib current" as it's just ib
171 lookup[cmi(iac, ib)] = 1 + lookup[cmi(ial, ibl)];
173 const int16_t valb = lookup[cmi(ial, ib)];
174 const int16_t vabl = lookup[cmi(iac, ibl)];
175 lookup[cmi(iac, ib)] = (valb > vabl) ? valb : vabl;
179 int16_t result = lookup[0];
187 nlcs(const char *const a, const char *const b) {
188 const double lcss = lcs(a, b);
189 return 2 * lcss / (strlen(a) + strlen(b));
193 grp_score(const struct strgrp_grp *const grp, const char *const str) {
194 return nlcs(grp->key, str);
197 /* Structure management */
199 static struct strgrp_item *
200 new_item(tal_t *const tctx, const char *const str, void *const data) {
201 struct strgrp_item *i = talz(tctx, struct strgrp_item);
205 i->key = tal_strdup(i, str);
211 add_item(struct strgrp_grp *const ctx, const char *const str,
213 struct strgrp_item *i = new_item(ctx, str, data);
217 darray_push(ctx->items, i);
223 free_grp(struct strgrp_grp *grp) {
224 darray_free(grp->items);
227 static struct strgrp_grp *
228 new_grp(tal_t *const tctx, const char *const str, void *const data) {
229 struct strgrp_grp *b = talz(tctx, struct strgrp_grp);
233 b->key = tal_strdup(b, str);
234 b->key_len = strlen(str);
236 darray_init(b->items);
237 tal_add_destructor(b, free_grp);
238 if (!add_item(b, str, data)) {
244 static struct strgrp_grp *
245 add_grp(struct strgrp *const ctx, const char *const str,
247 struct strgrp_grp *b = new_grp(ctx, str, data);
251 darray_push(ctx->grps, b);
254 if (!tal_resize(&ctx->scores, ctx->n_grps)) {
258 ctx->scores = tal_arr(ctx, struct grp_score, ctx->n_grps);
267 strgrp_new(const double threshold) {
268 struct strgrp *ctx = talz(NULL, struct strgrp);
269 ctx->threshold = threshold;
270 stringmap_init(ctx->known, NULL);
271 // n threads compare strings
272 ctx->n_cosvecs = 2 * omp_get_max_threads();
273 ctx->cosvecs = tal_arrz(ctx, struct cosvec, ctx->n_cosvecs);
274 darray_init(ctx->grps);
279 cache(struct strgrp *const ctx, struct strgrp_grp *const grp,
280 const char *const str) {
281 *(stringmap_enter(ctx->known, str)) = grp;
284 static inline struct cosvec *
285 tl_avec(struct strgrp *ctx) {
286 return &ctx->cosvecs[omp_get_thread_num()];
289 static inline struct cosvec *
290 tl_bvec(struct strgrp *ctx) {
291 return &ctx->cosvecs[omp_get_thread_num() + 1];
294 static struct strgrp_grp *
295 grp_for(struct strgrp *const ctx, const char *const str) {
300 struct strgrp_grp **const grp = stringmap_lookup(ctx->known, str);
306 // Keep ccanlint happy in reduced feature mode
308 #pragma omp parallel for schedule(dynamic)
310 for (i = 0; i < ctx->n_grps; i++) {
311 ctx->scores[i].grp = darray_item(ctx->grps, i);
312 if (should_grp_score_len(ctx, ctx->scores[i].grp, str)) {
313 tl_avec(ctx)->str = ctx->scores[i].grp->key;
314 tl_bvec(ctx)->str = str;
315 if (should_grp_score_cos(ctx, tl_avec(ctx), tl_bvec(ctx))) {
316 ctx->scores[i].score = grp_score(ctx->scores[i].grp, str);
318 ctx->scores[i].score = 0;
321 ctx->scores[i].score = 0;
324 struct grp_score *max = NULL;
325 for (i = 0; i < ctx->n_grps; i++) {
326 if (!max || ctx->scores[i].score > max->score) {
327 max = &(ctx->scores[i]);
330 return (max && max->score >= ctx->threshold) ? max->grp : NULL;
333 const struct strgrp_grp *
334 strgrp_grp_for(struct strgrp *const ctx, const char *const str) {
335 return grp_for(ctx, str);
338 const struct strgrp_grp *
339 strgrp_add(struct strgrp *const ctx, const char *const str,
341 bool inserted = false;
342 struct strgrp_grp *pick = grp_for(ctx, str);
344 inserted = add_item(pick, str, data);
346 pick = add_grp(ctx, str, data);
347 inserted = (NULL != pick);
350 assert(NULL != pick);
351 cache(ctx, pick, str);
357 strgrp_iter_new(struct strgrp *const ctx) {
358 struct strgrp_iter *iter = talz(ctx, struct strgrp_iter);
367 const struct strgrp_grp *
368 strgrp_iter_next(struct strgrp_iter *const iter) {
369 return (iter->ctx->n_grps == iter->i) ?
370 NULL : darray_item(iter->ctx->grps, iter->i++);
374 strgrp_iter_free(struct strgrp_iter *const iter) {
378 struct strgrp_grp_iter *
379 strgrp_grp_iter_new(const struct strgrp_grp *const grp) {
380 struct strgrp_grp_iter *iter = talz(grp, struct strgrp_grp_iter);
389 const struct strgrp_item *
390 strgrp_grp_iter_next(struct strgrp_grp_iter *const iter) {
391 return (iter->grp->n_items == iter->i) ?
392 NULL : darray_item(iter->grp->items, iter->i++);
396 strgrp_grp_iter_free(struct strgrp_grp_iter *iter) {
401 strgrp_grp_key(const struct strgrp_grp *const grp) {
406 strgrp_item_key(const struct strgrp_item *const item) {
411 strgrp_item_value(const struct strgrp_item *const item) {
416 strgrp_free(struct strgrp *const ctx) {
417 darray_free(ctx->grps);
418 stringmap_free(ctx->known);
423 strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) {
424 struct strgrp_grp **grp;
425 struct strgrp_item **item;
426 darray_foreach(grp, ctx->grps) {
427 darray_foreach(item, (*grp)->items) {
435 print_item(const struct strgrp_item *item) {
436 printf("\t%s\n", item->key);
440 print_grp(const struct strgrp_grp *const grp) {
441 struct strgrp_item **item;
442 printf("%s:\n", grp->key);
443 darray_foreach(item, grp->items) {
450 strgrp_print(const struct strgrp *const ctx) {
451 struct strgrp_grp **grp;
452 darray_foreach(grp, ctx->grps) {