#include "config.h" #include #include /** * strgrp - group/cluster similar strings. * * strgrp clusters similar strings using the Longest Common Subsequence (LCS) * algorithm[1]. Clustering is governed by a threshold value, which is compared * with the normalised LCS scores calculated from the input string and each * existing group. * * As a coarse and not entirely accurate summary, strgrp takes the following * steps: * * 1. For all known groups, calculate the normalised LCS value against the * input string * * 2. Find the maximum normalised LCS value and associated group * * 3. If the calculated maximum normalised LCS value exceeds the configured * threshold add the input string to the group, otherwise create a new group * * The clustering operation is expensive; LCS on its own is computationally * O(m * n) on its two input strings and optimally requires O(min(m, n)) * memory. In general each input string should be compared against all known * strings, giving O(n^2) behaviour of the clustering algorithm on top of the * O(m * n) LCS similarity measurement. * * strgrp tries to battle this complexity on several fronts: * * 1. Caching of input strings and their associated group. By incurring the * cost of a map's string hash function we may eliminate all further search * costs for exact matches, potentially reducing the insertion to a * constant-time operation. * * 2. Coarse reduction of the required comparisons. Each group has a 'key', * which is the string that triggered the creation of the group. Input strings * are only compared against group keys rather than all known strings, reducing * the complexity to the current number of groups rather than all known * strings. Note due the pathological case where the number of groups is equal * to the number of known strings the algorithm still has O(n^2) computational * complexity * * 3. Whilst the data dependencies of LCS prevent internally parallel * implementations, LCS and other filters can be applied in parallel. The code * uses OpenMP to automatically distribute scoring of the input string * against group keys across a number of threads. * * 4. Elimination of LCS computations that will never breach the configured * threshold. Two measurements are used as rejecting filters (i.e. a failure to * exceed the threshold prevents further measurements being made): * * 4a. Comparsion of string lengths: String lengths must be within given bounds * to satisfy the user-supplied similarity constraint. A negative result avoids * invoking the O(m * n) behaviour of LCS at the cost of O(m + n) in the two * strlen() invocations. * * 4b. Comparison of character distribution: Cosine similarity[2] is used to * measure unordered character similarity between the input strings. The * implementation is again O(m + n), and avoids the O(m * n) behaviour of LCS. * * Performance will vary not only with the number of input strings but * with their lengths and relative similarities. A large number of long input * strings that are relatively similar will give the worst performance. * However to provide some context, strgrp can cluster a real-world test set of * 3500 strings distributed in length between 20 and 100 characters to 85% * similarity on a 4-core 2010-era amd64 laptop in approximately 750ms. * * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem * * [2] https://en.wikipedia.org/wiki/Cosine_similarity * * License: LGPL * Author: Andrew Jeffery * * Example: * FILE *f; * char *buf; * struct strgrp *ctx; * struct strgrp_iter *iter; * const struct strgrp_grp *grp; * struct strgrp_grp_iter *grp_iter; * const struct strgrp_item *item; * * f = fdopen(0, "r"); * #define BUF_SIZE 512 * buf = malloc(BUF_SIZE); * ctx = strgrp_new(0.85); * while(fgets(buf, BUF_SIZE, f)) { * buf[strcspn(buf, "\r\n")] = '\0'; * if (!strgrp_add(ctx, buf, NULL)) { * printf("Failed to classify %s\n", buf); * } * } * * // Re-implement something similar to strgrp_print() via API as an * // example * iter = strgrp_iter_new(ctx); * while(NULL != (grp = strgrp_iter_next(iter))) { * printf("%s\n", strgrp_grp_key(grp)); * grp_iter = strgrp_grp_iter_new(grp); * while(NULL != (item = strgrp_grp_iter_next(grp_iter))) { * printf("\t%s\n", strgrp_item_key(item)); * } * strgrp_grp_iter_free(grp_iter); * } * * strgrp_iter_free(iter); * strgrp_free(ctx); * free(buf); * fclose(f); */ int main(int argc, char *argv[]) { if (argc != 2) { return 1; } if (strcmp(argv[1], "depends") == 0) { printf("ccan/darray\n"); printf("ccan/stringmap\n"); printf("ccan/tal\n"); printf("ccan/tal/str\n"); return 0; } if (strcmp(argv[1], "testdepends") == 0) { printf("ccan/str\n"); return 0; } if (strcmp(argv[1], "cflags") == 0) { #if HAVE_OPENMP printf("-fopenmp\n"); #endif printf("-O2\n"); return 0; } if (strcmp(argv[1], "libs") == 0) { printf("m\n"); return 0; } return 1; }