along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
+#include <limits.h>
+#include <math.h>
#include <stdbool.h>
#include <stdint.h>
+#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ccan/darray/darray.h"
#include "strgrp.h"
#include "config.h"
+#define CHAR_N_VALUES (1 << CHAR_BIT)
+
typedef darray(struct strgrp_grp *) darray_grp;
typedef darray(struct strgrp_item *) darray_item;
unsigned int n_grps;
darray_grp grps;
struct grp_score *scores;
+ int16_t pop[CHAR_N_VALUES];
};
struct strgrp_iter {
size_t key_len;
darray_item items;
int32_t n_items;
+ int16_t pop[CHAR_N_VALUES];
};
struct strgrp_grp_iter {
void *value;
};
+
+/* String vector cosine similarity[1]
+ *
+ * [1] http://blog.nishtahir.com/2015/09/19/fuzzy-string-matching-using-cosine-similarity/
+ */
+
+static inline void
+strpopcnt(const char *const str, int16_t pop[CHAR_N_VALUES]) {
+ const char *c;
+ memset(pop, 0, CHAR_N_VALUES * sizeof(*pop));
+ for(c = str; *c; c++) {
+ assert(*c >= 0);
+ pop[(unsigned char)*c]++;
+ }
+}
+
+static inline double
+strcossim(const int16_t ref[CHAR_N_VALUES], const int16_t key[CHAR_N_VALUES]) {
+ int32_t saibi = 0;
+ int32_t sai2 = 0;
+ int32_t sbi2 = 0;
+ size_t i;
+ for (i = 0; i < CHAR_N_VALUES; i++) {
+ saibi += ref[i] * key[i];
+ sai2 += ref[i] * ref[i];
+ sbi2 += key[i] * key[i];
+ }
+ return saibi / (sqrt(sai2) * sqrt(sbi2));
+}
+
+/* Low-cost filter functions */
+
+static inline bool
+should_grp_score_cos(const struct strgrp *const ctx,
+ struct strgrp_grp *const grp, const char *const str) {
+ return ctx->threshold <= strcossim(ctx->pop, grp->pop);
+}
+
+static inline bool
+should_grp_score_len(const struct strgrp *const ctx,
+ const struct strgrp_grp *const grp, const char *const str) {
+ const size_t strl = strlen(str);
+ const size_t keyl = grp->key_len;
+ double sr = strl / keyl;
+ if (1 < sr) {
+ sr = 1 / sr;
+ }
+ return ctx->threshold <= sr;
+}
+
+/* Scoring - Longest Common Subsequence[2]
+ *
+ * [2] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+ */
#define ROWS 2
static inline int cmi(int i, int j) {
return 2 * lcss / (strlen(a) + strlen(b));
}
-static bool
-should_grp_score(const struct strgrp *const ctx,
- const struct strgrp_grp *const grp, const char *const str) {
- const size_t strl = strlen(str);
- const size_t keyl = grp->key_len;
- double sr = strl / keyl;
- if (1 < sr) {
- sr = 1 / sr;
- }
- return ctx->threshold <= sr;
-}
-
static inline double
grp_score(const struct strgrp_grp *const grp, const char *const str) {
return nlcs(grp->key, str);
}
+/* Structure management */
+
static struct strgrp_item *
new_item(tal_t *const tctx, const char *const str, void *const data) {
struct strgrp_item *i = talz(tctx, struct strgrp_item);
if (!b) {
return NULL;
}
+ memcpy(b->pop, ctx->pop, sizeof(ctx->pop));
darray_push(ctx->grps, b);
ctx->n_grps++;
if (ctx->scores) {
struct strgrp *ctx = talz(NULL, struct strgrp);
ctx->threshold = threshold;
stringmap_init(ctx->known, NULL);
+ // n threads compare strings
darray_init(ctx->grps);
return ctx;
}
static struct strgrp_grp *
grp_for(struct strgrp *const ctx, const char *const str) {
+ // Ensure ctx->pop is always populated. Returning null here indicates a new
+ // group should be created, at which point add_grp() copies ctx->pop into
+ // the new group's struct.
+ strpopcnt(str, ctx->pop);
if (!ctx->n_grps) {
return NULL;
}
#pragma omp parallel for schedule(dynamic)
#endif
for (i = 0; i < ctx->n_grps; i++) {
- ctx->scores[i].grp = darray_item(ctx->grps, i);
- const bool ss = should_grp_score(ctx, ctx->scores[i].grp, str);
- ctx->scores[i].score = ss ? grp_score(ctx->scores[i].grp, str) : 0;
+ struct strgrp_grp *grp = darray_item(ctx->grps, i);
+ ctx->scores[i].grp = grp;
+ ctx->scores[i].score = 0;
+ if (should_grp_score_len(ctx, grp, str)) {
+ if (should_grp_score_cos(ctx, grp, str)) {
+ ctx->scores[i].score = grp_score(grp, str);
+ }
+ }
}
struct grp_score *max = NULL;
for (i = 0; i < ctx->n_grps; i++) {
strgrp_add(struct strgrp *const ctx, const char *const str,
void *const data) {
bool inserted = false;
+ // grp_for() populates the ctx->pop memory. add_grp() copies this memory
+ // into the strgrp_grp that it creates. It's assumed the ctx->pop memory
+ // has not been modified between the grp_for() and add_grp() calls.
struct strgrp_grp *pick = grp_for(ctx, str);
if (pick) {
inserted = add_item(pick, str, data);
strgrp_free(ctx);
}
-#include <stdio.h>
-
static void
print_item(const struct strgrp_item *item) {
printf("\t%s\n", item->key);