git.ozlabs.org Git - ccan/blob - ccan/strgrp/_info

   1 #include "config.h"
   2 #include <stdio.h>
   3 #include <string.h>
   4
   5 /**
   6  * strgrp - group/cluster similar strings.
   7  *
   8  * strgrp clusters similar strings using the Longest Common Subsequence (LCS)
   9  * algorithm[1]. Clustering is governed by a threshold value, which is compared
  10  * with the normalised LCS scores calculated from the input string and each
  11  * existing group.
  12  *
  13  * As a coarse and not entirely accurate summary, strgrp takes the following
  14  * steps:
  15  *
  16  * 1. For all known groups, calculate the normalised LCS value against the
  17  * input string
  18  *
  19  * 2. Find the maximum normalised LCS value and associated group
  20  *
  21  * 3. If the calculated maximum normalised LCS value exceeds the configured
  22  * threshold add the input string to the group, otherwise create a new group
  23  *
  24  * The clustering operation is expensive; LCS on its own is computationally
  25  * O(m * n) on its two input strings and optimally requires O(min(m, n))
  26  * memory. In general each input string should be compared against all known
  27  * strings, giving O(n^2) behaviour of the clustering algorithm on top of the
  28  * O(m * n) LCS similarity measurement.
  29  *
  30  * strgrp tries to battle this complexity on several fronts:
  31  *
  32  * 1. Caching of input strings and their associated group. By incurring the
  33  * cost of a map's string hash function we may eliminate all further search
  34  * costs for exact matches, potentially reducing the insertion to a
  35  * constant-time operation.
  36  *
  37  * 2. Coarse reduction of the required comparisons. Each group has a 'key',
  38  * which is the string that triggered the creation of the group. Input strings
  39  * are only compared against group keys rather than all known strings, reducing
  40  * the complexity to the current number of groups rather than all known
  41  * strings. Note due the pathological case where the number of groups is equal
  42  * to the number of known strings the algorithm still has O(n^2) computational
  43  * complexity
  44  *
  45  * 3. Whilst the data dependencies of LCS prevent internally parallel
  46  * implementations, LCS and other filters can be applied in parallel. The code
  47  * uses OpenMP to automatically distribute scoring of the input string
  48  * against group keys across a number of threads.
  49  *
  50  * 4. Elimination of LCS computations that will never breach the configured
  51  * threshold. Two measurements are used as rejecting filters (i.e. a failure to
  52  * exceed the threshold prevents further measurements being made):
  53  *
  54  * 4a. Comparsion of string lengths: String lengths must be within given bounds
  55  * to satisfy the user-supplied similarity constraint. A negative result avoids
  56  * invoking the O(m * n) behaviour of LCS at the cost of O(m + n) in the two
  57  * strlen() invocations.
  58  *
  59  * 4b. Comparison of character distribution: Cosine similarity[2] is used to
  60  * measure unordered character similarity between the input strings. The
  61  * implementation is again O(m + n), and avoids the O(m * n) behaviour of LCS.
  62  *
  63  * Performance will vary not only with the number of input strings but
  64  * with their lengths and relative similarities. A large number of long input
  65  * strings that are relatively similar will give the worst performance.
  66  * However to provide some context, strgrp can cluster a real-world test set of
  67  * 3500 strings distributed in length between 20 and 100 characters to 85%
  68  * similarity on a 4-core 2010-era amd64 laptop in approximately 750ms.
  69  *
  70  * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
  71  *
  72  * [2] https://en.wikipedia.org/wiki/Cosine_similarity
  73  *
  74  * License: LGPL
  75  * Author: Andrew Jeffery <andrew@aj.id.au>
  76  *
  77  * Ccanlint:
  78  *    tests_pass FAIL
  79  *    tests_pass_without_features FAIL
  80  *
  81  * Example:
  82  *     FILE *f;
  83  *     char *buf;
  84  *     struct strgrp *ctx;
  85  *     struct strgrp_iter *iter;
  86  *     const struct strgrp_grp *grp;
  87  *     struct strgrp_grp_iter *grp_iter;
  88  *     const struct strgrp_item *item;
  89  *
  90  *     f = fdopen(0, "r");
  91  *     #define BUF_SIZE 512
  92  *     buf = malloc(BUF_SIZE);
  93  *     ctx = strgrp_new(0.85);
  94  *     while(fgets(buf, BUF_SIZE, f)) {
  95  *         buf[strcspn(buf, "\r\n")] = '\0';
  96  *         if (!strgrp_add(ctx, buf, NULL)) {
  97  *             printf("Failed to classify %s\n", buf);
  98  *         }
  99  *     }
 100  *
 101  *     // Re-implement something similar to strgrp_print() via API as an
 102  *     // example
 103  *     iter = strgrp_iter_new(ctx);
 104  *     while(NULL != (grp = strgrp_iter_next(iter))) {
 105  *         printf("%s\n", strgrp_grp_key(grp));
 106  *         grp_iter = strgrp_grp_iter_new(grp);
 107  *         while(NULL != (item = strgrp_grp_iter_next(grp_iter))) {
 108  *              printf("\t%s\n", strgrp_item_key(item));
 109  *         }
 110  *         strgrp_grp_iter_free(grp_iter);
 111  *     }
 112  *
 113  *     strgrp_iter_free(iter);
 114  *     strgrp_free(ctx);
 115  *     free(buf);
 116  *     fclose(f);
 117  */
 118 int main(int argc, char *argv[]) {
 119     if (argc != 2) {
 120         return 1;
 121     }
 122
 123     if (strcmp(argv[1], "depends") == 0) {
 124         printf("ccan/darray\n");
 125         printf("ccan/stringmap\n");
 126         printf("ccan/tal\n");
 127         printf("ccan/tal/str\n");
 128         return 0;
 129     }
 130
 131     if (strcmp(argv[1], "testdepends") == 0) {
 132         printf("ccan/str\n");
 133         return 0;
 134     }
 135
 136     if (strcmp(argv[1], "cflags") == 0) {
 137 #if HAVE_OPENMP
 138         printf("-fopenmp\n");
 139 #endif
 140         printf("-O2\n");
 141         return 0;
 142     }
 143
 144     if (strcmp(argv[1], "libs") == 0) {
 145         printf("m\n");
 146         return 0;
 147     }
 148
 149     return 1;
 150 }