#include "strgrp.h"
#include "config.h"
-#if HAVE_OPENMP
-#include <omp.h>
-#else
-#define omp_get_max_threads() 1
-#define omp_get_thread_num() 0
-#endif
-
#define CHAR_N_VALUES (1 << CHAR_BIT)
typedef darray(struct strgrp_grp *) darray_grp;
double score;
};
-struct cosvec {
- const char *str;
- int16_t pop[CHAR_N_VALUES];
-};
-
typedef darray(struct grp_score *) darray_score;
struct strgrp {
unsigned int n_grps;
darray_grp grps;
struct grp_score *scores;
- int32_t n_cosvecs;
- struct cosvec *cosvecs;
+ int16_t pop[CHAR_N_VALUES];
};
struct strgrp_iter {
size_t key_len;
darray_item items;
int32_t n_items;
+ int16_t pop[CHAR_N_VALUES];
};
struct strgrp_grp_iter {
*/
static inline void
-strpopcnt(struct cosvec *ctx) {
+strpopcnt(const char *const str, int16_t pop[CHAR_N_VALUES]) {
const char *c;
- memset(ctx->pop, 0, CHAR_N_VALUES * sizeof(int16_t));
- for(c = ctx->str; *c; c++) {
+ memset(pop, 0, CHAR_N_VALUES * sizeof(*pop));
+ for(c = str; *c; c++) {
assert(*c >= 0);
- ctx->pop[(unsigned char)*c]++;
+ pop[(unsigned char)*c]++;
}
}
static inline double
-strcossim(struct cosvec *avec, struct cosvec *bvec) {
- strpopcnt(avec);
- strpopcnt(bvec);
+strcossim(const int16_t ref[CHAR_N_VALUES], const int16_t key[CHAR_N_VALUES]) {
int32_t saibi = 0;
int32_t sai2 = 0;
int32_t sbi2 = 0;
size_t i;
for (i = 0; i < CHAR_N_VALUES; i++) {
- saibi += avec->pop[i] * bvec->pop[i];
- sai2 += avec->pop[i] * avec->pop[i];
- sbi2 += bvec->pop[i] * bvec->pop[i];
+ saibi += ref[i] * key[i];
+ sai2 += ref[i] * ref[i];
+ sbi2 += key[i] * key[i];
}
- return saibi / (sqrt(sai2) * sqrt(sbi2));
+ return 1.0 - (2 * acos(saibi / sqrt(sai2 * sbi2)) / M_PI);
}
/* Low-cost filter functions */
+static inline double
+cossim_correction(const double s)
+{
+ return -((s - 0.5) * (s - 0.5)) + 0.33;
+}
+
static inline bool
-should_grp_score_cos(const struct strgrp *const ctx, struct cosvec *const avec,
- struct cosvec *const bvec) {
- return ctx->threshold <= strcossim(avec, bvec);
+should_grp_score_cos(const struct strgrp *const ctx,
+ struct strgrp_grp *const grp, const char *const str) {
+ const double s1 = strcossim(ctx->pop, grp->pop);
+ const double s2 = s1 + cossim_correction(s1);
+ return ctx->threshold <= s2;
}
static inline bool
should_grp_score_len(const struct strgrp *const ctx,
const struct strgrp_grp *const grp, const char *const str) {
- const size_t strl = strlen(str);
- const size_t keyl = grp->key_len;
- double sr = strl / keyl;
- if (1 < sr) {
- sr = 1 / sr;
- }
- return ctx->threshold <= sr;
+ const double lstr = (double) strlen(str);
+ const double lkey = (double) grp->key_len;
+ const double lmin = (lstr > lkey) ? lkey : lstr;
+ const double s = sqrt((2 * lmin * lmin) / (1.0 * lstr * lstr + lkey * lkey));
+ return ctx->threshold <= s;
}
/* Scoring - Longest Common Subsequence[2]
int ia, ib;
for (ia = (strlen(a) - 1); ia >= 0; ia--) {
const char iav = a[ia];
+ const int ial = (ia + 1) & 1; // ia last
for (ib = lb - 1; ib >= 0; ib--) {
const char ibv = b[ib];
- const int ial = (ia + 1) & 1; // ia last
const int iac = ia & 1; // ia current
const int ibl = ib + 1; // ib last
// don't need separate "ib current" as it's just ib
static inline double
nlcs(const char *const a, const char *const b) {
const double lcss = lcs(a, b);
- return 2 * lcss / (strlen(a) + strlen(b));
+ const double la = (double) strlen(a);
+ const double lb = (double) strlen(b);
+ const double s = sqrt((2 * lcss * lcss) / (la * la + lb * lb));
+ return s;
}
static inline double
if (!b) {
return NULL;
}
+ memcpy(b->pop, ctx->pop, sizeof(ctx->pop));
darray_push(ctx->grps, b);
ctx->n_grps++;
if (ctx->scores) {
ctx->threshold = threshold;
stringmap_init(ctx->known, NULL);
// n threads compare strings
- ctx->n_cosvecs = 2 * omp_get_max_threads();
- ctx->cosvecs = tal_arrz(ctx, struct cosvec, ctx->n_cosvecs);
darray_init(ctx->grps);
return ctx;
}
*(stringmap_enter(ctx->known, str)) = grp;
}
-static inline struct cosvec *
-tl_avec(struct strgrp *ctx) {
- return &ctx->cosvecs[omp_get_thread_num()];
-}
-
-static inline struct cosvec *
-tl_bvec(struct strgrp *ctx) {
- return &ctx->cosvecs[omp_get_thread_num() + 1];
-}
-
static struct strgrp_grp *
grp_for(struct strgrp *const ctx, const char *const str) {
+ // Ensure ctx->pop is always populated. Returning null here indicates a new
+ // group should be created, at which point add_grp() copies ctx->pop into
+ // the new group's struct.
+ strpopcnt(str, ctx->pop);
if (!ctx->n_grps) {
return NULL;
}
#pragma omp parallel for schedule(dynamic)
#endif
for (i = 0; i < ctx->n_grps; i++) {
- ctx->scores[i].grp = darray_item(ctx->grps, i);
- if (should_grp_score_len(ctx, ctx->scores[i].grp, str)) {
- tl_avec(ctx)->str = ctx->scores[i].grp->key;
- tl_bvec(ctx)->str = str;
- if (should_grp_score_cos(ctx, tl_avec(ctx), tl_bvec(ctx))) {
- ctx->scores[i].score = grp_score(ctx->scores[i].grp, str);
- } else {
- ctx->scores[i].score = 0;
+ struct strgrp_grp *grp = darray_item(ctx->grps, i);
+ ctx->scores[i].grp = grp;
+ ctx->scores[i].score = 0;
+ if (should_grp_score_len(ctx, grp, str)) {
+ if (should_grp_score_cos(ctx, grp, str)) {
+ ctx->scores[i].score = grp_score(grp, str);
}
- } else {
- ctx->scores[i].score = 0;
}
}
struct grp_score *max = NULL;
strgrp_add(struct strgrp *const ctx, const char *const str,
void *const data) {
bool inserted = false;
+ // grp_for() populates the ctx->pop memory. add_grp() copies this memory
+ // into the strgrp_grp that it creates. It's assumed the ctx->pop memory
+ // has not been modified between the grp_for() and add_grp() calls.
struct strgrp_grp *pick = grp_for(ctx, str);
if (pick) {
inserted = add_item(pick, str, data);