From e0663bef2ec73ed33b22e5a029284296495aab87 Mon Sep 17 00:00:00 2001 From: Kevin Locke Date: Fri, 18 Nov 2016 21:19:25 -0700 Subject: [PATCH] edit_distance: calculate edit distance between strings This commit creates the edit_distance module for calculating various edit distance measures. The edit distance quantifies the similarity between two strings based on the number of modifications necessary to turn one string into the other. There are several edit distance measures which differ in the operations which are permitted and the cost (aka weight) of the operations. This module provides functions for calculating the Longest Common Subsequence (LCS), Levenshtein, and Damerau-Levenshtein (restricted and unrestricted) distances. Weighted versions of these functions can be created by defining cost functions as preprocessor macros when compiling this module. Distances over other array types (e.g. wide strings, integers, structs) can be accomplished by defining the element type and equality test macros. Signed-off-by: Kevin Locke Signed-off-by: David Gibson --- ccan/edit_distance/LICENSE | 1 + ccan/edit_distance/_info | 89 +++++++ ccan/edit_distance/edit_distance-params.h | 126 +++++++++ ccan/edit_distance/edit_distance-private.h | 81 ++++++ ccan/edit_distance/edit_distance.bib | 57 +++++ ccan/edit_distance/edit_distance.c | 116 +++++++++ ccan/edit_distance/edit_distance.h | 131 ++++++++++ ccan/edit_distance/edit_distance_dl.c | 179 +++++++++++++ ccan/edit_distance/edit_distance_lcs.c | 69 +++++ ccan/edit_distance/edit_distance_lev.c | 74 ++++++ ccan/edit_distance/edit_distance_rdl.c | 98 +++++++ ccan/edit_distance/test/api.c | 221 ++++++++++++++++ ccan/edit_distance/test/run-types-short.c | 40 +++ ccan/edit_distance/test/run-types-struct.c | 56 ++++ ccan/edit_distance/test/run-types-wchar.c | 60 +++++ ccan/edit_distance/test/run-weights.c | 282 +++++++++++++++++++++ 16 files changed, 1680 insertions(+) create mode 120000 ccan/edit_distance/LICENSE create mode 100644 ccan/edit_distance/_info create mode 100644 ccan/edit_distance/edit_distance-params.h create mode 100644 ccan/edit_distance/edit_distance-private.h create mode 100644 ccan/edit_distance/edit_distance.bib create mode 100644 ccan/edit_distance/edit_distance.c create mode 100644 ccan/edit_distance/edit_distance.h create mode 100644 ccan/edit_distance/edit_distance_dl.c create mode 100644 ccan/edit_distance/edit_distance_lcs.c create mode 100644 ccan/edit_distance/edit_distance_lev.c create mode 100644 ccan/edit_distance/edit_distance_rdl.c create mode 100644 ccan/edit_distance/test/api.c create mode 100644 ccan/edit_distance/test/run-types-short.c create mode 100644 ccan/edit_distance/test/run-types-struct.c create mode 100644 ccan/edit_distance/test/run-types-wchar.c create mode 100644 ccan/edit_distance/test/run-weights.c diff --git a/ccan/edit_distance/LICENSE b/ccan/edit_distance/LICENSE new file mode 120000 index 00000000..2354d129 --- /dev/null +++ b/ccan/edit_distance/LICENSE @@ -0,0 +1 @@ +../../licenses/BSD-MIT \ No newline at end of file diff --git a/ccan/edit_distance/_info b/ccan/edit_distance/_info new file mode 100644 index 00000000..1955957f --- /dev/null +++ b/ccan/edit_distance/_info @@ -0,0 +1,89 @@ +#include "config.h" +#include +#include + +/** + * edit_distance - calculate the edit distance between two strings + * + * The edit distance quantifies the similarity between two strings based on the + * number of modifications necessary to turn one string into the other. There + * are several edit distance measures which differ in the operations which are + * permitted and the cost (aka weight) of the operations. This module provides + * functions for calculating the Longest Common Subsequence (LCS), Levenshtein, + * and Damerau-Levenshtein (restricted and unrestricted) distances. Weighted + * versions of these functions can be created by defining cost functions as + * preprocessor macros when compiling this module. Distances over other array + * types (e.g. wide strings, integers, structs) can be accomplished by defining + * the element type and equality test macros. + * + * Example: + * #include // UINT_MAX + * #include // fprintf, printf + * #include // EXIT_* + * #include // strlen + * + * #include + * + * const char *counties[] = { + * "Anglesey", + * "Brecknockshire", + * "Cardiganshire", + * "Carmarthenshire", + * "Caernarfonshire", + * "Denbighshire", + * "Flintshire", + * "Glamorgan", + * "Merionethshire", + * "Monmouthshire", + * "Montgomeryshire", + * "Pembrokeshire", + * "Radnorshire", + * NULL + * }; + * + * int main(int argc, char *argv[]) + * { + * if (argc != 2) { + * fprintf(stderr, "Usage: %s \n", argv[0]); + * return EXIT_FAILURE; + * } + * const char *guess = argv[1]; + * + * unsigned int bestdist = UINT_MAX; + * const char *bestcounty = NULL; + * for (const char **pcounty = counties; *pcounty; ++pcounty) { + * const char *county = *pcounty; + * unsigned int dist = edit_distance(guess, strlen(guess), + * county, strlen(county), EDIT_DISTANCE_RDL); + * if (dist < bestdist) { + * bestdist = dist; + * bestcounty = county; + * } + * } + * + * printf("You were probably trying to spell %s\n", bestcounty); + * return EXIT_SUCCESS; + * } + * License: MIT + * Author: Kevin Locke + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) { + return 1; + } + + if (strcmp(argv[1], "depends") == 0) { + return 0; + } + + if (strcmp(argv[1], "testdepends") == 0) { + printf("ccan/array_size\n"); + printf("ccan/build_assert\n"); + printf("ccan/tap\n"); + return 0; + } + + return 1; +} diff --git a/ccan/edit_distance/edit_distance-params.h b/ccan/edit_distance/edit_distance-params.h new file mode 100644 index 00000000..f4846188 --- /dev/null +++ b/ccan/edit_distance/edit_distance-params.h @@ -0,0 +1,126 @@ +/** @file + * Defines the tunable parameters for the edit_distance functions which can + * be changed at module-compile-time. + * + * The macros in this header can be redefined, either by editing this file or + * providing external definitions. This makes it possible to calculate the + * weighted edit distance with arbitrary weighting. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#ifndef CCAN_EDIT_DISTANCE_PARAMS_H +#define CCAN_EDIT_DISTANCE_PARAMS_H + +#ifndef ed_elem +# include /* memchr */ +#endif + +#ifndef ED_HASH_ELEM +# include /* UCHAR_MAX */ +#endif + +#if !defined(ED_DEL_COST) && \ + !defined(ED_INS_COST) && \ + !defined(ED_SUB_COST) && \ + !defined(ED_TRA_COST) +/** Defined if costs are symmetric. + * (i.e. distance(a, b) == distance(b, a) forall a,b) */ +# define ED_COST_IS_SYMMETRIC +#endif + +#ifndef ED_DEL_COST +/** Cost to delete element @p e. */ +# define ED_DEL_COST(e) 1 +/** Defined if the cost to delete is the same for all elements. */ +# define ED_DEL_COST_CONST +#endif + +#ifndef ED_INS_COST +/** Cost to insert element @p e. */ +# define ED_INS_COST(e) 1 +/** Defined if the cost to insert is the same for all elements. */ +# define ED_INS_COST_CONST +#endif + +#ifndef ED_SUB_COST +/** Cost to substitute element @p f for element @p e (i.e. replace @p e with + * @p f ). + * + * All algorithms which use this macro expect that ED_SUB_COST(e, f) + + * ED_SUB_COST(f, g) >= ED_SUB_COST(e, g). They do not check for any + * substitution chains and will not find minimal distances when this assumption + * does not hold. + * + * Although it is possible to set ED_SUB_COST(e, f) >= ED_DEL_COST(e) + + * ED_INS_COST(f). If this is true for all @p e and @p f, consider + * using edit_distance_lcs() instead of edit_distance_lev() which calculates + * the same result more quickly by not considering substitutions which are + * never less costly than delete + insert. + */ +# define ED_SUB_COST(e, f) 1 +/** Defined if the cost to substitute is the same for all element pairs. */ +# define ED_SUB_COST_CONST +#endif + +#ifndef ED_TRA_COST +/** Cost to transpose elements @p ef to @p fe. + * + * @c edit_distance_dl requires 2 * ED_TRA_COST(e, f) >= ED_INS_COST(g) + + * ED_DEL_COST(h) for all @c e,f,g,h. See Trace Theorem 3 of + * Wagner-Lowrance. @cite Wagner75 + */ +# define ED_TRA_COST(e, f) 1 +/** Defined if the cost to transpose is the same for all element pairs. */ +# define ED_TRA_COST_CONST +#endif + +#if defined(DOXYGEN) +/** Optimized array search function for single-array-heavy workloads. + * + * The expression generated by this macro must evaluate to @c true in a boolean + * context when @p elem is contained in the first @p len elements of @p arr and + * @c false otherwise. @p len is guaranteed to be greater than 0. + * + * This macro is currently only used when #ED_INS_COST_CONST and + * #ED_SUB_COST_CONST are defined. + * + * @note + * Only define this macro when the search function is highly optimized and when + * single-element arrays are expected to be passed frequently, because it adds + * a conditional branch to each execution of the distance functions with + * non-empty arguments. + */ +# define ED_HAS_ELEM(arr, elem, len) (memchr(arr, elem, len) != NULL) +#endif + +#ifndef ED_ELEM_EQUAL +/** Are two elements equal? */ +# define ED_ELEM_EQUAL(elem1, elem2) (elem1 == elem2) +#endif + +#if !defined(ED_HASH_ELEM) && !defined(ed_elem) +/** Hash an element to a unique value in [0, ED_HASH_MAX]. + * + * Note: This module does not use a "real" dictionary. Hash collisions are + * not expected nor supported. If elements can not be uniquely mapped to a + * reasonable range, consider implementing a "real" dictionary. + */ +# define ED_HASH_ELEM(e) ((unsigned char)e) +/** Maximum value that can be returned from #ED_HASH_ELEM */ +# define ED_HASH_MAX UCHAR_MAX +/** Can an array of #ED_HASH_MAX ::ed_dist values be stored on the stack? */ +# define ED_HASH_ON_STACK +#endif + +#ifndef ED_STACK_ELEMS +/** Maximum number of elements that will be allocated on the stack. + * + * Note: The threshold does not need to be tight since with + * ED_STACK_ELEMS > ~100 algorithm cost will dominate @c malloc + * cost anyway. + */ +# define ED_STACK_ELEMS 512 +#endif + +#endif diff --git a/ccan/edit_distance/edit_distance-private.h b/ccan/edit_distance/edit_distance-private.h new file mode 100644 index 00000000..30fe9cc7 --- /dev/null +++ b/ccan/edit_distance/edit_distance-private.h @@ -0,0 +1,81 @@ +/** @file + * Private macros and functions for use by the edit_distance implementation. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#ifndef CCAN_EDIT_DISTANCE_PRIVATE_H +#define CCAN_EDIT_DISTANCE_PRIVATE_H + +#include "edit_distance.h" + +/** Unsafe (arguments evaluated multiple times) 3-value minimum. */ +#define ED_MIN2(a, b) ((a) < (b) ? (a) : (b)) + +/** Unsafe (arguments evaluated multiple times) 3-value minimum. */ +#define ED_MIN3(a, b, c) \ + ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) + +/** Swap variables @p a and @p b of a given type. */ +#define ED_SWAP(a, b, Type) \ + do { Type swaptmp = a; a = b; b = swaptmp; } while (0) + +/** Number of elements in triangular matrix (with diagonal). + * @param rc Number of rows (or columns, since square). */ +#define ED_TMAT_SIZE(rc) (((rc) + 1) * (rc) / 2) + +/** Index of element in lower triangular matrix. */ +#define ED_TMAT_IND(r, c) (ED_TMAT_SIZE(r) + c) + +/** + * Calculates non-trivial LCS distance (for internal use). + * @private + * @param src Source array to calculate distance from (smaller when symmetric). + * @param slen Number of elements in @p src to consider (must be > 0). + * @param tgt Target array to calculate distance to. + * @param tlen Number of elements in @p tgt to consider (must be > 0). + * @return LCS distance from @p src[0..slen-1] to @p tgt[0..tlen-1]. + */ +ed_dist edit_distance_lcs(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen); + +/** + * Calculates non-trivial Levenshtein distance (for internal use). + * @private + * @param src Source array to calculate distance from (smaller when symmetric). + * @param slen Number of elements in @p src to consider (must be > 0). + * @param tgt Target array to calculate distance to. + * @param tlen Number of elements in @p tgt to consider (must be > 0). + * @return Levenshtein distance from @p src[0..slen-1] to @p tgt[0..tlen-1]. + */ +ed_dist edit_distance_lev(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen); + +/** + * Calculates non-trivial Restricted Damerau-Levenshtein distance (for internal + * use). + * @private + * @param src Source array to calculate distance from (smaller when symmetric). + * @param slen Number of elements in @p src to consider (must be > 0). + * @param tgt Target array to calculate distance to. + * @param tlen Number of elements in @p tgt to consider (must be > 0). + * @return Restricted Damerau-Levenshtein distance from @p src[0..slen-1] to + * @p tgt[0..tlen-1]. + */ +ed_dist edit_distance_rdl(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen); + +/** + * Calculates non-trivial Damerau-Levenshtein distance (for internal use). + * @private + * @param src Source array to calculate distance from (smaller when symmetric). + * @param slen Number of elements in @p src to consider (must be > 0). + * @param tgt Target array to calculate distance to. + * @param tlen Number of elements in @p tgt to consider (must be > 0). + * @return Damerau-Levenshtein distance from @p src[0..slen-1] to + * @p tgt[0..tlen-1]. + */ +ed_dist edit_distance_dl(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen); + +#endif diff --git a/ccan/edit_distance/edit_distance.bib b/ccan/edit_distance/edit_distance.bib new file mode 100644 index 00000000..9b803147 --- /dev/null +++ b/ccan/edit_distance/edit_distance.bib @@ -0,0 +1,57 @@ +@article{Boytsov11, + doi = {10.1145/1963190.1963191}, + year = {2011}, + month = {may}, + publisher = {Association for Computing Machinery ({ACM})}, + volume = {16}, + pages = {1}, + author = {Leonid Boytsov}, + title = {Indexing methods for approximate dictionary searching}, + journal = {Journal of Experimental Algorithmics} +} +@article{Damerau64, + doi = {10.1145/363958.363994}, + year = {1964}, + month = {mar}, + publisher = {Association for Computing Machinery ({ACM})}, + volume = {7}, + number = {3}, + pages = {171--176}, + author = {Fred J. Damerau}, + title = {A technique for computer detection and correction of spelling errors}, + journal = {Communications of the {ACM}} +} +@inproceedings{Levenshtein66, + title = {Binary codes capable of correcting deletions, insertions and reversals}, + author = {Vladimir I. Levenshtein}, + booktitle = {Soviet Physics Doklady}, + volume = {10}, + number = {8}, + pages = {707--710}, + year = {1966}, + month = {feb} +} +@article{Wagner74, + doi = {10.1145/321796.321811}, + year = {1974}, + month = {jan}, + publisher = {Association for Computing Machinery ({ACM})}, + volume = {21}, + number = {1}, + pages = {168--173}, + author = {Robert A. Wagner and Michael J. Fischer}, + title = {The String-to-String Correction Problem}, + journal = {Journal of the {ACM}} +} +@article{Wagner75, + title = {An Extension of the String-to-String Correction Problem}, + author = {Robert A. Wagner and Roy Lowrance}, + journal = {Journal of the {ACM}}, + volume = {22}, + number = {2}, + pages = {177--183}, + year = {1975}, + month = {apr}, + publisher = {Association for Computing Machinery ({ACM})}, + doi = {10.1145/321879.321880} +} diff --git a/ccan/edit_distance/edit_distance.c b/ccan/edit_distance/edit_distance.c new file mode 100644 index 00000000..82379d0b --- /dev/null +++ b/ccan/edit_distance/edit_distance.c @@ -0,0 +1,116 @@ +/** @file + * Defines shared edit distance functions. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#include "edit_distance-params.h" +#include "edit_distance-private.h" + +ed_dist edit_distance(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen, enum ed_measure measure) +{ + /* Remove common prefix. */ + while (slen > 0 && tlen > 0 && ED_ELEM_EQUAL(src[0], tgt[0])) { + ++src; + ++tgt; + --slen; + --tlen; + } + + /* Remove common suffix. */ + while (slen > 0 && tlen > 0 && + ED_ELEM_EQUAL(src[slen - 1], tgt[tlen - 1])) { + --slen; + --tlen; + } + +#if defined(ED_COST_IS_SYMMETRIC) + /* Use smaller array for src. */ + if (slen > tlen) { + ED_SWAP(src, tgt, const ed_elem *); + ED_SWAP(slen, tlen, ed_size); + } +#endif + + /* Early return when all insertions. */ + if (slen == 0) { +#ifdef ED_INS_COST_CONST + return (ed_dist)tlen *ED_INS_COST(); +#else + ed_dist result = 0; + for (ed_size i = 0; i < tlen; ++i) { + result += ED_INS_COST(tgt[i]); + } + return result; +#endif + } +#ifndef ED_COST_IS_SYMMETRIC + /* Early return when all deletions. */ + if (tlen == 0) { +# ifdef ED_DEL_COST_CONST + return (ed_dist)slen *ED_DEL_COST(); +# else + ed_dist result = 0; + for (ed_size i = 0; i < slen; ++i) { + result += ED_DEL_COST(src[i]); + } + return result; +# endif + } +#endif + +#if defined(ED_HAS_ELEM) && \ + defined(ED_INS_COST_CONST) && \ + defined(ED_SUB_COST_CONST) + /* Fast search for single-element source. */ + if (slen == 1) { + /* Always tlen - 1 inserts (of non-src elements). */ + ed_dist result = (ed_dist)(tlen - 1) * ED_INS_COST(); + if (!ED_HAS_ELEM(tgt, src[0], tlen)) { + if (measure == EDIT_DISTANCE_LCS) { + /* If src is not present, delete + insert. */ + result += ED_DEL_COST(src[0]) + ED_INS_COST(); + } else { + /* If src is not present, substitute it out. */ + result += ED_SUB_COST(,); + } + } + return result; + } +#endif + +#if !defined(ED_COST_IS_SYMMETRIC) && \ + defined(ED_HAS_ELEM) && \ + defined(ED_DEL_COST_CONST) && \ + defined(ED_SUB_COST_CONST) + /* Fast search for single-element target. */ + if (tlen == 1) { + /* Always slen - 1 deletes (of non-tgt elements). */ + ed_dist result = (ed_dist)(slen - 1) * ED_DEL_COST(); + if (!ED_HAS_ELEM(src, tgt[0], slen)) { + if (measure == EDIT_DISTANCE_LCS) { + /* If tgt is not present, delete + insert. */ + result += ED_INS_COST(tgt[0]) + ED_DEL_COST(); + } else { + /* If tgt is not present, substitute it out. */ + result += ED_SUB_COST(,); + } + } + return result; + } +#endif + + switch (measure) { + case EDIT_DISTANCE_LCS: + return edit_distance_lcs(src, slen, tgt, tlen); + case EDIT_DISTANCE_LEV: + return edit_distance_lev(src, slen, tgt, tlen); + case EDIT_DISTANCE_RDL: + return edit_distance_rdl(src, slen, tgt, tlen); + case EDIT_DISTANCE_DL: + return edit_distance_dl(src, slen, tgt, tlen); + } + + return (ed_dist)-1; +} diff --git a/ccan/edit_distance/edit_distance.h b/ccan/edit_distance/edit_distance.h new file mode 100644 index 00000000..982b9dce --- /dev/null +++ b/ccan/edit_distance/edit_distance.h @@ -0,0 +1,131 @@ +/** @file + * Main header file for edit_distance which defines the module API. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#ifndef CCAN_EDIT_DISTANCE_H +#define CCAN_EDIT_DISTANCE_H + +#include /* size_t */ + +#ifndef ed_dist +/** + * ed_dist - Type in which the edit distance is expressed. + * + * The name @c ed_dist can be defined to another algebraic type (or this + * typedef can be edited) when compiling this module. + */ +typedef unsigned int ed_dist; +#endif + +#ifndef ed_elem +/** + * ed_elem - Element type of arrays for which the edit distance is calculated. + * + * Changing this type often requires changing #ED_HAS_ELEM, #ED_HASH_ELEM, and + * #ED_HASH_MAX. It requires changing #ED_ELEM_EQUAL for non-primitive types. + * These can be changed in edit_distance-params.h or defined externally. + */ +typedef char ed_elem; +#endif + +#ifndef ed_size +/** + * ed_size - Type in which the array size is expressed. + * + * The name @c ed_size can be defined to another algebraic type (or this + * typedef can be edited) when compiling this module. If @c ed_size is a + * signed type, the caller must ensure passed values are non-negative. + * @c ed_size must be large enough to hold @c max(slen,tlen)+2, it does not + * need to be large enough to hold @c slen*tlen. + * + * Note: @c ed_size is only likely to have a noticeable impact in + * edit_distance_dl() which maintains an array of @c #ED_HASH_MAX+1 ::ed_size + * values. + */ +typedef unsigned int ed_size; +#endif + +/** + * ed_measure - Supported edit distance measures. + */ +enum ed_measure { + /** + * Longest Common Subsequence (LCS) distance. + * + * The LCS distance is an edit distance measured by the number of + * insert and delete operations necessary to turn @p src into @p tgt. + * + * This implementation uses an iterative version of the Wagner-Fischer + * algorithm @cite Wagner74 which requires O(slen * tlen) + * time and min(slen, tlen) + 1 space. + */ + EDIT_DISTANCE_LCS = 1, + /** + * Levenshtein distance. + * + * The Levenshtein distance is an edit distance measured by the number + * of insert, delete, and substitute operations necessary to turn + * @p src into @p tgt as described by Vladimir Levenshtein. + * @cite Levenshtein66 + * + * This implementation uses a modified version of the Wagner-Fischer + * algorithm @cite Wagner74 which requires O(slen * tlen) + * time and only min(slen, tlen) + 1 space. + */ + EDIT_DISTANCE_LEV, + /** + * Restricted Damerau-Levenshtein distance (aka Optimal String + * Alignment distance). + * + * The Restricted Damerau-Levenshtein distance is an edit distance + * measured by the number of insert, delete, substitute, and transpose + * operations necessary to turn @p src into @p tgt with the restriction + * that no substring is edited more than once (equivalently, that no + * edits overlap). @cite Boytsov11 + * + * This implementation uses a modified version of the Wagner-Fischer + * algorithm @cite Wagner74 which requires O(slen * tlen) + * time and only 2 * min(slen, tlen) + 2 space. + */ + EDIT_DISTANCE_RDL, + /** + * Damerau-Levenshtein distance. + * + * The Damerau-Levenshtein distance is an edit distance measured by the + * number of insert, delete, substitute, and transpose operations + * necessary to turn @p src into @p tgt . @cite Damerau64 + * + * This implementation uses the Lowrance-Wagner algorithm @cite Wagner75 + * which takes O(slen * tlen) time and + * O(slen * tlen) space. + */ + EDIT_DISTANCE_DL, +}; + +/** + * edit_distance - Calculates the edit distance between two arrays. + * + * @param src Source array to calculate distance from. + * @param slen Number of elements in @p src to consider. + * @param tgt Target array to calculate distance to. + * @param tlen Number of elements in @p tgt to consider. + * @param measure Edit distance measure to calculate. + * @return Edit distance from @p src[0..slen-1] to @p tgt[0..tlen-1]. (i.e. + * The minimal sum of the weights of the operations necessary to convert + * @p src[0..slen-1] into @p tgt[0..tlen-1].) + * + * @code + * Example: + * const char *source = "kitten"; + * const char *target = "sitting"; + * assert(edit_distance(source, strlen(source), + * target, strlen(target), + * EDIT_DISTANCE_DL) == 3); + * Example_End: @endcode + */ +ed_dist edit_distance(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen, + enum ed_measure measure); +#endif diff --git a/ccan/edit_distance/edit_distance_dl.c b/ccan/edit_distance/edit_distance_dl.c new file mode 100644 index 00000000..ed690547 --- /dev/null +++ b/ccan/edit_distance/edit_distance_dl.c @@ -0,0 +1,179 @@ +/** @file + * Defines Damerau-Levenshtein distance functions. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#include /* free, malloc */ + +#include "edit_distance.h" +#include "edit_distance-params.h" +#include "edit_distance-private.h" + +ed_dist edit_distance_dl(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen) +{ + /* Optimization: Avoid malloc when distance matrix can fit on the stack. + */ + ed_dist stackdist[ED_STACK_ELEMS]; + + /* Lowrance-Wagner distance matrix, in row-major order. */ + size_t matsize = ((size_t)slen + 2) * (tlen + 2); + ed_dist *distmem = matsize <= ED_STACK_ELEMS ? stackdist : + malloc(matsize * sizeof(ed_dist)); + ed_dist *dist = distmem; + +#ifdef ED_HASH_ON_STACK + ed_size lasttgt[ED_HASH_MAX + 1] = { 0 }; +#else + ed_size *lasttgt = calloc(ED_HASH_MAX + 1, sizeof(ed_size)); +#endif + + /* Upper bound on distance between strings. */ + ed_dist maxdist = 0; + +#ifdef ED_DEL_COST_CONST + maxdist += (ed_dist)slen *ED_DEL_COST(); +#else + /* Lower-triangular matrix of deletion costs. + * delcost[i2, i1] is cost to delete src[i1..i2-1]. + * delcost[i, i] is 0. */ + ed_dist *delcost = malloc(ED_TMAT_SIZE(slen + 1) * sizeof(ed_dist)); + ed_dist *delcostitr = delcost; + ed_dist *delcostprevitr = delcost; + *delcostitr++ = 0; + for (ed_size i2 = 1; i2 <= slen; ++i2) { + ed_dist costi2 = ED_DEL_COST(src[i2 - 1]); + for (ed_size i1 = 0; i1 < i2; ++i1) { + *delcostitr++ = *delcostprevitr++ + costi2; + } + *delcostitr++ = 0; + } + maxdist += delcost[ED_TMAT_IND(slen, 0)]; +#endif + +#ifdef ED_INS_COST_CONST + maxdist += (ed_dist)tlen *ED_INS_COST(); +#else + /* Lower-triangular matrix of insertion costs. + * inscost[j2, j1] is cost to insert tgt[j1..j2-1]. + * inscost[j, j] is 0. */ + ed_dist *inscost = malloc(ED_TMAT_SIZE(tlen + 1) * sizeof(ed_dist)); + ed_dist *inscostitr = inscost; + ed_dist *inscostprevitr = inscost; + *inscostitr++ = 0; + for (ed_size j2 = 1; j2 <= tlen; ++j2) { + ed_dist costj2 = ED_INS_COST(tgt[j2 - 1]); + for (ed_size j1 = 0; j1 < j2; ++j1) { + *inscostitr++ = *inscostprevitr++ + costj2; + } + *inscostitr++ = 0; + } + maxdist += inscost[ED_TMAT_IND(tlen, 0)]; +#endif + + /* Initialize first row with maximal cost */ + for (ed_size i = 0; i < slen + 2; ++i) { + dist[i] = maxdist; + } + + /* Position dist to match other algorithms. dist[-1] will be maxdist */ + dist += slen + 3; + + /* Initialize row with cost to delete src[0..i-1] */ + dist[-1] = maxdist; + dist[0] = 0; + for (ed_size i = 1; i <= slen; ++i) { + dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]); + } + + for (ed_size j = 1; j <= tlen; ++j) { + /* Largest y < i such that src[y] = tgt[j] */ + ed_size lastsrc = 0; + ed_dist *prevdist = dist; + dist += slen + 2; + dist[-1] = maxdist; + dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]); + + /* Loop invariant: dist[i] is the edit distance between first j + * elements of tgt and first i elements of src. + * + * Loop invariant: lasttgt[ED_HASH_ELEM(c)] holds the largest + * x < j such that tgt[x-1] = c or 0 if no such x exists. + */ + for (ed_size i = 1; i <= slen; ++i) { + ed_size i1 = lastsrc; + ed_size j1 = lasttgt[ED_HASH_ELEM(src[i - 1])]; + + if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) { + /* Same as tgt upto j-2, src upto i-2. */ + dist[i] = prevdist[i - 1]; + lastsrc = i; + } else { + /* Insertion is tgt upto j-2, src upto i-1 + * + insert tgt[j-1] */ + ed_dist insdist = + prevdist[i] + ED_INS_COST(tgt[j - 1]); + + /* Deletion is tgt upto j-1, src upto i-2 + * + delete src[i-1] */ + ed_dist deldist = + dist[i - 1] + ED_DEL_COST(src[i - 1]); + + /* Substitution is tgt upto j-2, src upto i-2 + * + substitute tgt[j-1] for src[i-1] */ + ed_dist subdist = prevdist[i - 1] + + ED_SUB_COST(src[i - 1], tgt[j - 1]); + + /* Use best distance available */ + dist[i] = ED_MIN3(insdist, deldist, subdist); + + ed_dist swpdist = + distmem[(size_t)j1 * (slen + 2) + i1]; +#ifdef ED_INS_COST_CONST + swpdist += + (ed_dist)(j - j1 - 1) * ED_INS_COST(); +#else + swpdist += inscost[ED_TMAT_IND(j - 1, j1)]; +#endif +#ifdef ED_TRA_COST_CONST + swpdist += ED_TRA_COST(,); +#else + if (i1 > 0) { + swpdist += + ED_TRA_COST(src[i1 - 1], + src[i - 1]); + } +#endif +#ifdef ED_DEL_COST_CONST + swpdist += + (ed_dist)(i - i1 - 1) * ED_DEL_COST(); +#else + swpdist += delcost[ED_TMAT_IND(i - 1, i1)]; +#endif + + dist[i] = ED_MIN2(dist[i], swpdist); + } + } + + lasttgt[ED_HASH_ELEM(tgt[j - 1])] = j; + } + +#ifndef ED_HASH_ON_STACK + free(lasttgt); +#endif + +#ifndef ED_DEL_COST_CONST + free(delcost); +#endif + +#ifndef ED_INS_COST_CONST + free(inscost); +#endif + + ed_dist total = dist[slen]; + if (distmem != stackdist) { + free(distmem); + } + return total; +} diff --git a/ccan/edit_distance/edit_distance_lcs.c b/ccan/edit_distance/edit_distance_lcs.c new file mode 100644 index 00000000..e612c73c --- /dev/null +++ b/ccan/edit_distance/edit_distance_lcs.c @@ -0,0 +1,69 @@ +/** @file + * Defines Longest Common Subsequence distance functions. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#include /* free, malloc */ + +#include "edit_distance.h" +#include "edit_distance-params.h" +#include "edit_distance-private.h" + +ed_dist edit_distance_lcs(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen) +{ + /* Optimization: Avoid malloc when row of distance matrix can fit on + * the stack. + */ + ed_dist stackdist[ED_STACK_ELEMS]; + + /* One row of the Wagner-Fischer distance matrix. */ + ed_dist *dist = slen < ED_STACK_ELEMS ? stackdist : + malloc((slen + 1) * sizeof(ed_dist)); + + /* Initialize row with cost to delete src[0..i-1] */ + dist[0] = 0; + for (ed_size i = 1; i <= slen; ++i) { + dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]); + } + + for (ed_size j = 1; j <= tlen; ++j) { + /* Value for dist[j-1][i-1] (one row up, one col left). */ + ed_dist diagdist = dist[0]; + dist[0] = dist[0] + ED_INS_COST(tgt[j - 1]); + + /* Loop invariant: dist[i] is the edit distance between first j + * elements of tgt and first i elements of src. + */ + for (ed_size i = 1; i <= slen; ++i) { + ed_dist nextdiagdist = dist[i]; + + if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) { + /* Same as tgt upto j-2, src upto i-2. */ + dist[i] = diagdist; + } else { + /* Insertion is tgt upto j-2, src upto i-1 + * + insert tgt[j-1] */ + ed_dist insdist = + dist[i] + ED_INS_COST(tgt[j - 1]); + + /* Deletion is tgt upto j-1, src upto i-2 + * + delete src[i-1] */ + ed_dist deldist = + dist[i - 1] + ED_DEL_COST(src[i - 1]); + + /* Use best distance available */ + dist[i] = ED_MIN2(insdist, deldist); + } + + diagdist = nextdiagdist; + } + } + + ed_dist total = dist[slen]; + if (dist != stackdist) { + free(dist); + } + return total; +} diff --git a/ccan/edit_distance/edit_distance_lev.c b/ccan/edit_distance/edit_distance_lev.c new file mode 100644 index 00000000..7638c6dc --- /dev/null +++ b/ccan/edit_distance/edit_distance_lev.c @@ -0,0 +1,74 @@ +/** @file + * Defines Levenshtein distance functions. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#include /* free, malloc */ + +#include "edit_distance.h" +#include "edit_distance-params.h" +#include "edit_distance-private.h" + +ed_dist edit_distance_lev(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen) +{ + /* Optimization: Avoid malloc when row of distance matrix can fit on + * the stack. + */ + ed_dist stackdist[ED_STACK_ELEMS]; + + /* One row of the Wagner-Fischer distance matrix. */ + ed_dist *dist = slen < ED_STACK_ELEMS ? stackdist : + malloc((slen + 1) * sizeof(ed_dist)); + + /* Initialize row with cost to delete src[0..i-1] */ + dist[0] = 0; + for (ed_size i = 1; i <= slen; ++i) { + dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]); + } + + for (ed_size j = 1; j <= tlen; ++j) { + /* Value for dist[j-1][i-1] (one row up, one col left). */ + ed_dist diagdist = dist[0]; + dist[0] = dist[0] + ED_INS_COST(tgt[j - 1]); + + /* Loop invariant: dist[i] is the edit distance between first j + * elements of tgt and first i elements of src. + */ + for (ed_size i = 1; i <= slen; ++i) { + ed_dist nextdiagdist = dist[i]; + + if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) { + /* Same as tgt upto j-2, src upto i-2. */ + dist[i] = diagdist; + } else { + /* Insertion is tgt upto j-2, src upto i-1 + * + insert tgt[j-1] */ + ed_dist insdist = + dist[i] + ED_INS_COST(tgt[j - 1]); + + /* Deletion is tgt upto j-1, src upto i-2 + * + delete src[i-1] */ + ed_dist deldist = + dist[i - 1] + ED_DEL_COST(src[i - 1]); + + /* Substitution is tgt upto j-2, src upto i-2 + * + substitute tgt[j-1] for src[i-1] */ + ed_dist subdist = diagdist + + ED_SUB_COST(src[i - 1], tgt[j - 1]); + + /* Use best distance available */ + dist[i] = ED_MIN3(insdist, deldist, subdist); + } + + diagdist = nextdiagdist; + } + } + + ed_dist total = dist[slen]; + if (dist != stackdist) { + free(dist); + } + return total; +} diff --git a/ccan/edit_distance/edit_distance_rdl.c b/ccan/edit_distance/edit_distance_rdl.c new file mode 100644 index 00000000..76ba3ce2 --- /dev/null +++ b/ccan/edit_distance/edit_distance_rdl.c @@ -0,0 +1,98 @@ +/** @file + * Defines Restricted Damerau-Levenshtein distance functions. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ +#include /* free, malloc */ + +#include "edit_distance.h" +#include "edit_distance-params.h" +#include "edit_distance-private.h" + +ed_dist edit_distance_rdl(const ed_elem *src, ed_size slen, + const ed_elem *tgt, ed_size tlen) +{ + /* Optimization: Avoid malloc when required rows of distance matrix can + * fit on the stack. + */ + ed_dist stackdist[ED_STACK_ELEMS]; + + /* Two rows of the Wagner-Fischer distance matrix. */ + ed_dist *distmem, *dist, *prevdist; + if (slen < ED_STACK_ELEMS / 2) { + distmem = stackdist; + dist = distmem; + prevdist = distmem + slen + 1; + } else { + distmem = malloc((slen + 1) * sizeof(ed_dist) * 2); + dist = distmem; + prevdist = distmem + slen + 1; + } + + /* Initialize row with cost to delete src[0..i-1] */ + dist[0] = 0; + for (ed_size i = 1; i <= slen; ++i) { + dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]); + } + + for (ed_size j = 1; j <= tlen; ++j) { + /* Value for dist[j-2][i-1] (two rows up, one col left). */ + /* Note: dist[0] is not initialized when j == 1, var unused. */ + ed_dist diagdist1 = prevdist[0]; + /* Value for dist[j-2][i-2] (two rows up, two cols left). + * Initialization value only used to placate GCC. */ + ed_dist diagdist2 = 0; + + ED_SWAP(dist, prevdist, ed_dist *); + + dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]); + + /* Loop invariant: dist[i] is the edit distance between first j + * elements of tgt and first i elements of src. + */ + for (ed_size i = 1; i <= slen; ++i) { + ed_dist nextdiagdist = dist[i]; + + if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) { + /* Same as tgt upto j-2, src upto i-2. */ + dist[i] = prevdist[i - 1]; + } else { + /* Insertion is tgt upto j-2, src upto i-1 + * + insert tgt[j-1] */ + ed_dist insdist = + prevdist[i] + ED_INS_COST(tgt[j - 1]); + + /* Deletion is tgt upto j-1, src upto i-2 + * + delete src[i-1] */ + ed_dist deldist = + dist[i - 1] + ED_DEL_COST(src[i - 1]); + + /* Substitution is tgt upto j-2, src upto i-2 + * + substitute tgt[j-1] for src[i-1] */ + ed_dist subdist = prevdist[i - 1] + + ED_SUB_COST(src[i - 1], tgt[j - 1]); + + /* Use best distance available */ + dist[i] = ED_MIN3(insdist, deldist, subdist); + + if (j > 1 && i > 1 && + ED_ELEM_EQUAL(src[i - 2], tgt[j - 1]) && + ED_ELEM_EQUAL(src[i - 1], tgt[j - 2])) { + ed_dist tradist = diagdist2 + + ED_TRA_COST(src[j - 2], src[j - 1]); + dist[i] = ED_MIN2(dist[i], tradist); + } + } + + diagdist2 = diagdist1; + diagdist1 = nextdiagdist; + } + } + + ed_dist total = dist[slen]; + if (distmem != stackdist) { + free(distmem); + } + return total; +} diff --git a/ccan/edit_distance/test/api.c b/ccan/edit_distance/test/api.c new file mode 100644 index 00000000..d365ea12 --- /dev/null +++ b/ccan/edit_distance/test/api.c @@ -0,0 +1,221 @@ +/** @file + * Runnable public API tests of the edit_distance module. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ + +#include + +#include +#include +#include + +#define edit_distance_lcs_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_LCS) +#define edit_distance_lev_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_LEV) +#define edit_distance_rdl_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_RDL) +#define edit_distance_dl_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_DL) + +static void test_lcs(void) +{ + /* Trivial cases */ + ok1(edit_distance_lcs_static("", "") == 0); + ok1(edit_distance_lcs_static("a", "") == 1); + ok1(edit_distance_lcs_static("", "a") == 1); + ok1(edit_distance_lcs_static("a", "a") == 0); + ok1(edit_distance_lcs_static("a", "b") == 2); + ok1(edit_distance_lcs_static("b", "a") == 2); + + /* Trivial search cases */ + ok1(edit_distance_lcs_static("a", "bcdef") == 6); + ok1(edit_distance_lcs_static("a", "bcadef") == 5); + ok1(edit_distance_lcs_static("acdef", "b") == 6); + ok1(edit_distance_lcs_static("abcdef", "b") == 5); + + /* Common prefix with single-char distance */ + ok1(edit_distance_lcs_static("aa", "ab") == 2); + ok1(edit_distance_lcs_static("ab", "aa") == 2); + + /* Common suffix with single-char distance */ + ok1(edit_distance_lcs_static("aa", "ba") == 2); + ok1(edit_distance_lcs_static("ba", "aa") == 2); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_lcs_static("ab", "ba") == 2); + ok1(edit_distance_lcs_static("abc", "de") == 5); + ok1(edit_distance_lcs_static("abc", "def") == 6); + + /* (Restricted) Transposition */ + ok1(edit_distance_lcs_static("abcd", "ecbf") == 6); + + /* (Unrestricted) Transposition */ + ok1(edit_distance_lcs_static("ca", "abc") == 3); + + /* Insert + Delete + Sub */ + ok1(edit_distance_lcs_static("abcde", "xcdef") == 4); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_lcs_static("aabcc", "bccdd") == 4); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_lcs_static("bccdd", "aabcc") == 4); +} + +static void test_lev(void) +{ + /* Trivial cases */ + ok1(edit_distance_lev_static("", "") == 0); + ok1(edit_distance_lev_static("a", "") == 1); + ok1(edit_distance_lev_static("", "a") == 1); + ok1(edit_distance_lev_static("a", "a") == 0); + ok1(edit_distance_lev_static("a", "b") == 1); + ok1(edit_distance_lev_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_lev_static("a", "bcdef") == 5); + ok1(edit_distance_lev_static("a", "bcadef") == 5); + ok1(edit_distance_lev_static("acdef", "b") == 5); + ok1(edit_distance_lev_static("abcdef", "b") == 5); + + /* Common prefix with single-char distance */ + ok1(edit_distance_lev_static("aa", "ab") == 1); + ok1(edit_distance_lev_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_lev_static("aa", "ba") == 1); + ok1(edit_distance_lev_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_lev_static("ab", "ba") == 2); + ok1(edit_distance_lev_static("abc", "de") == 3); + ok1(edit_distance_lev_static("abc", "def") == 3); + + /* (Restricted) Transposition */ + ok1(edit_distance_lev_static("abcd", "ecbf") == 4); + + /* (Unrestricted) Transposition */ + ok1(edit_distance_lev_static("ca", "abc") == 3); + + /* Insert + Delete + Sub */ + ok1(edit_distance_lev_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_lev_static("aabcc", "bccdd") == 4); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_lev_static("bccdd", "aabcc") == 4); +} + +static void test_rdl(void) +{ + /* Trivial cases */ + ok1(edit_distance_rdl_static("", "") == 0); + ok1(edit_distance_rdl_static("a", "") == 1); + ok1(edit_distance_rdl_static("", "a") == 1); + ok1(edit_distance_rdl_static("a", "a") == 0); + ok1(edit_distance_rdl_static("a", "b") == 1); + ok1(edit_distance_rdl_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_rdl_static("a", "bcdef") == 5); + ok1(edit_distance_rdl_static("a", "bcadef") == 5); + ok1(edit_distance_rdl_static("acdef", "b") == 5); + ok1(edit_distance_rdl_static("abcdef", "b") == 5); + + /* Common prefix with single-char distance */ + ok1(edit_distance_rdl_static("aa", "ab") == 1); + ok1(edit_distance_rdl_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_rdl_static("aa", "ba") == 1); + ok1(edit_distance_rdl_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_rdl_static("ab", "ba") == 1); + ok1(edit_distance_rdl_static("abc", "de") == 3); + ok1(edit_distance_rdl_static("abc", "def") == 3); + + /* (Restricted) Transposition */ + ok1(edit_distance_rdl_static("abcd", "ecbf") == 3); + + /* (Unrestricted) Transposition */ + ok1(edit_distance_rdl_static("ca", "abc") == 3); + + /* Insert + Delete + Sub */ + ok1(edit_distance_rdl_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_rdl_static("aabcc", "bccdd") == 4); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_rdl_static("bccdd", "aabcc") == 4); +} + +static void test_dl(void) +{ + /* Trivial cases */ + ok1(edit_distance_dl_static("", "") == 0); + ok1(edit_distance_dl_static("a", "") == 1); + ok1(edit_distance_dl_static("", "a") == 1); + ok1(edit_distance_dl_static("a", "a") == 0); + ok1(edit_distance_dl_static("a", "b") == 1); + ok1(edit_distance_dl_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_dl_static("a", "bcdef") == 5); + ok1(edit_distance_dl_static("a", "bcadef") == 5); + ok1(edit_distance_dl_static("acdef", "b") == 5); + ok1(edit_distance_dl_static("abcdef", "b") == 5); + + /* Common prefix with single-char distance */ + ok1(edit_distance_dl_static("aa", "ab") == 1); + ok1(edit_distance_dl_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_dl_static("aa", "ba") == 1); + ok1(edit_distance_dl_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_dl_static("ab", "ba") == 1); + ok1(edit_distance_dl_static("abc", "de") == 3); + ok1(edit_distance_dl_static("abc", "def") == 3); + + /* (Restricted) Transposition */ + ok1(edit_distance_dl_static("abcd", "ecbf") == 3); + + /* (Unrestricted) Transposition */ + ok1(edit_distance_dl_static("ca", "abc") == 2); + + /* Insert + Delete + Sub */ + ok1(edit_distance_dl_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_dl_static("aabcc", "bccdd") == 4); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_dl_static("bccdd", "aabcc") == 4); +} + +int main(void) +{ + plan_tests(89); + + test_lcs(); + test_lev(); + test_rdl(); + test_dl(); + + /* Unsupported edit distance measure */ + enum ed_measure badmeasure = (enum ed_measure)-1; + ok1(edit_distance("ab", 2, "ba", 2, badmeasure) == (ed_dist)-1); + + return exit_status(); +} diff --git a/ccan/edit_distance/test/run-types-short.c b/ccan/edit_distance/test/run-types-short.c new file mode 100644 index 00000000..80922038 --- /dev/null +++ b/ccan/edit_distance/test/run-types-short.c @@ -0,0 +1,40 @@ +/** @file + * Runnable tests for the edit_distance module using custom element and + * distance types. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ + +#include /* USHRT_MAX */ + +#include +#include + +#define ed_elem unsigned short +#define ed_dist short +#define ED_HASH_ELEM(e) e +#define ED_HASH_MAX USHRT_MAX + +#include +#include +#include +#include +#include + +int main(void) +{ + const unsigned short src[] = { 0, 1, 2, 3 }; + const unsigned short tgt[] = { 4, 2, 1, 5 }; + ed_size slen = ARRAY_SIZE(src); + ed_size tlen = ARRAY_SIZE(tgt); + + plan_tests(4); + + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3); + + return exit_status(); +} diff --git a/ccan/edit_distance/test/run-types-struct.c b/ccan/edit_distance/test/run-types-struct.c new file mode 100644 index 00000000..477ec5f2 --- /dev/null +++ b/ccan/edit_distance/test/run-types-struct.c @@ -0,0 +1,56 @@ +/** @file + * Runnable tests for the edit_distance module using custom element and + * distance types. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ + +#include /* UCHAR_MAX */ + +#include +#include + +struct color16 { + unsigned char r:5; + unsigned char g:6; + unsigned char b:5; +}; + +#define ed_elem struct color16 +#define ED_ELEM_EQUAL(e, f) (e.r == f.r && e.g == f.g && e.b == f.b) +#define ED_HASH_ELEM(e) ((e.r << 11) | (e.g << 5) | e.b) +#define ED_HASH_MAX USHRT_MAX + +#include +#include +#include +#include +#include + +int main(void) +{ + const struct color16 src[] = { + {0, 0, 0}, + {1, 1, 1}, + {2, 2, 2}, + {3, 3, 3} + }; + const struct color16 tgt[] = { + {4, 4, 4}, + {2, 2, 2}, + {1, 1, 1}, + {5, 5, 5} + }; + ed_size slen = ARRAY_SIZE(src); + ed_size tlen = ARRAY_SIZE(tgt); + + plan_tests(4); + + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3); + + return exit_status(); +} diff --git a/ccan/edit_distance/test/run-types-wchar.c b/ccan/edit_distance/test/run-types-wchar.c new file mode 100644 index 00000000..3123a2b1 --- /dev/null +++ b/ccan/edit_distance/test/run-types-wchar.c @@ -0,0 +1,60 @@ +/** @file + * Runnable tests for the edit_distance module using custom element and + * distance types. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ + +#include /* UCHAR_MAX */ +#include /* wmemchr */ + +#include +#include + +#define ed_elem wchar_t +#define ed_dist double +#define ED_HAS_ELEM(arr, elem, len) (wmemchr(arr, elem, len) != NULL) +/* Since we only use ASCII characters */ +#define ED_HASH_ELEM(e) (unsigned char)e +#define ED_HASH_MAX UCHAR_MAX +#define ED_HASH_ON_STACK + +#include +#include +#include +#include +#include + +int main(void) +{ + const wchar_t src[] = L"abcd"; + const wchar_t tgt[] = L"ecbf"; + ed_size slen = ARRAY_SIZE(src) - 1; + ed_size tlen = ARRAY_SIZE(tgt) - 1; + + plan_tests(16); + + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6.0); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4.0); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3.0); + ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3.0); + + /* Test empty strings work as expected */ + ok1(edit_distance(src, slen, tgt, 0, EDIT_DISTANCE_LCS) == 4.0); + ok1(edit_distance(src, 0, tgt, tlen, EDIT_DISTANCE_LCS) == 4.0); + + /* Test ED_HAS_ELEM works as expected */ + ok1(edit_distance(L"c", 1, tgt, tlen, EDIT_DISTANCE_LCS) == 3.0); + ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_LCS) == 5.0); + ok1(edit_distance(src, slen, L"c", 1, EDIT_DISTANCE_LCS) == 3.0); + ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_LCS) == 5.0); + ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_LEV) == 4.0); + ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_LEV) == 4.0); + ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_RDL) == 4.0); + ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_RDL) == 4.0); + ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_DL) == 4.0); + ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_DL) == 4.0); + + return exit_status(); +} diff --git a/ccan/edit_distance/test/run-weights.c b/ccan/edit_distance/test/run-weights.c new file mode 100644 index 00000000..f6b470d4 --- /dev/null +++ b/ccan/edit_distance/test/run-weights.c @@ -0,0 +1,282 @@ +/** @file + * Runnable tests for the edit_distance module using custom costs/weights. + * + * @copyright 2016 Kevin Locke + * MIT license - see LICENSE file for details + */ + +#include + +#include +#include + +#define ED_DEL_COST(e) (e == 'a' ? 2 : 1) +#define ED_INS_COST(e) (e == 'b' ? 2 : 1) +#define ED_SUB_COST(e, f) (f == 'c' && e == 'd' ? 3 : 1) +#define ED_TRA_COST(e, f) (e == 'e' && f == 'f' ? 3 : 1) + +#include +#include +#include +#include +#include + +#define edit_distance_lcs_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_LCS) +#define edit_distance_lev_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_LEV) +#define edit_distance_rdl_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_RDL) +#define edit_distance_dl_static(arr1, arr2) \ + edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \ + EDIT_DISTANCE_DL) + +static void test_lcs(void) +{ + /* Trivial cases */ + ok1(edit_distance_lcs_static("", "") == 0); + ok1(edit_distance_lcs_static("a", "") == 2); + ok1(edit_distance_lcs_static("", "a") == 1); + ok1(edit_distance_lcs_static("a", "a") == 0); + ok1(edit_distance_lcs_static("a", "b") == 4); + ok1(edit_distance_lcs_static("b", "a") == 2); + + /* Trivial search cases */ + ok1(edit_distance_lcs_static("a", "bcdef") == 8); + ok1(edit_distance_lcs_static("a", "bcadef") == 6); + ok1(edit_distance_lcs_static("acdef", "b") == 8); + ok1(edit_distance_lcs_static("abcdef", "b") == 6); + + /* Common prefix with single-char distance */ + ok1(edit_distance_lcs_static("aa", "ab") == 4); + ok1(edit_distance_lcs_static("ab", "aa") == 2); + + /* Common suffix with single-char distance */ + ok1(edit_distance_lcs_static("aa", "ba") == 4); + ok1(edit_distance_lcs_static("ba", "aa") == 2); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_lcs_static("ab", "ba") == 3); + ok1(edit_distance_lcs_static("abc", "de") == 6); + ok1(edit_distance_lcs_static("abc", "def") == 7); + ok1(edit_distance_lcs_static("de", "bdef") == 3); + + /* Transposition + Insert */ + ok1(edit_distance_lcs_static("ca", "abc") == 4); + + /* Insert + Delete + Sub */ + ok1(edit_distance_lcs_static("abcde", "xcdef") == 5); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_lcs_static("aabcc", "bccdd") == 6); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_lcs_static("bccdd", "aabcc") == 4); +} + +static void test_lev(void) +{ + /* Trivial cases */ + ok1(edit_distance_lev_static("", "") == 0); + ok1(edit_distance_lev_static("a", "") == 2); + ok1(edit_distance_lev_static("", "a") == 1); + ok1(edit_distance_lev_static("a", "a") == 0); + ok1(edit_distance_lev_static("a", "b") == 1); + ok1(edit_distance_lev_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_lev_static("a", "bcdef") == 5); + ok1(edit_distance_lev_static("a", "bcadef") == 6); + ok1(edit_distance_lev_static("acdef", "b") == 5); + ok1(edit_distance_lev_static("abcdef", "b") == 6); + + /* Common prefix with single-char distance */ + ok1(edit_distance_lev_static("aa", "ab") == 1); + ok1(edit_distance_lev_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_lev_static("aa", "ba") == 1); + ok1(edit_distance_lev_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_lev_static("ab", "ba") == 2); + ok1(edit_distance_lev_static("abc", "de") == 3); + ok1(edit_distance_lev_static("abc", "def") == 3); + ok1(edit_distance_lev_static("de", "bdef") == 3); + + /* Transposition + Insert */ + ok1(edit_distance_lev_static("ca", "abc") == 3); + + /* Insert + Delete + Sub */ + ok1(edit_distance_lev_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_lev_static("aabcc", "bccdd") == 5); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_lev_static("bccdd", "aabcc") == 4); +} + +static void test_rdl(void) +{ + /* Trivial cases */ + ok1(edit_distance_rdl_static("", "") == 0); + ok1(edit_distance_rdl_static("a", "") == 2); + ok1(edit_distance_rdl_static("", "a") == 1); + ok1(edit_distance_rdl_static("a", "a") == 0); + ok1(edit_distance_rdl_static("a", "b") == 1); + ok1(edit_distance_rdl_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_rdl_static("a", "bcdef") == 5); + ok1(edit_distance_rdl_static("a", "bcadef") == 6); + ok1(edit_distance_rdl_static("acdef", "b") == 5); + ok1(edit_distance_rdl_static("abcdef", "b") == 6); + + /* Common prefix with single-char distance */ + ok1(edit_distance_rdl_static("aa", "ab") == 1); + ok1(edit_distance_rdl_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_rdl_static("aa", "ba") == 1); + ok1(edit_distance_rdl_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_rdl_static("ab", "ba") == 1); + ok1(edit_distance_rdl_static("abc", "de") == 3); + ok1(edit_distance_rdl_static("abc", "def") == 3); + ok1(edit_distance_rdl_static("de", "bdef") == 3); + + /* Transposition + Insert */ + ok1(edit_distance_rdl_static("ca", "abc") == 3); + + /* Transpose Weight */ + ok1(edit_distance_rdl_static("ef", "fe") == 2); + ok1(edit_distance_rdl_static("fe", "ef") == 1); + + /* Insert + Delete + Sub */ + ok1(edit_distance_rdl_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_rdl_static("aabcc", "bccdd") == 5); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_rdl_static("bccdd", "aabcc") == 4); +} + +static void test_dl(void) +{ + /* Trivial cases */ + ok1(edit_distance_dl_static("", "") == 0); + ok1(edit_distance_dl_static("a", "") == 2); + ok1(edit_distance_dl_static("", "a") == 1); + ok1(edit_distance_dl_static("a", "a") == 0); + ok1(edit_distance_dl_static("a", "b") == 1); + ok1(edit_distance_dl_static("b", "a") == 1); + + /* Trivial search cases */ + ok1(edit_distance_dl_static("a", "bcdef") == 5); + ok1(edit_distance_dl_static("a", "bcadef") == 6); + ok1(edit_distance_dl_static("acdef", "b") == 5); + ok1(edit_distance_dl_static("abcdef", "b") == 6); + + /* Common prefix with single-char distance */ + ok1(edit_distance_dl_static("aa", "ab") == 1); + ok1(edit_distance_dl_static("ab", "aa") == 1); + + /* Common suffix with single-char distance */ + ok1(edit_distance_dl_static("aa", "ba") == 1); + ok1(edit_distance_dl_static("ba", "aa") == 1); + + /* Non-optimized cases (require Wagner-Fischer matrix) */ + ok1(edit_distance_dl_static("ab", "ba") == 1); + ok1(edit_distance_dl_static("abc", "de") == 3); + ok1(edit_distance_dl_static("abc", "def") == 3); + ok1(edit_distance_dl_static("de", "bdef") == 3); + + /* Transposition + Insert */ + ok1(edit_distance_dl_static("ca", "abc") == 3); + + /* Transpose Weight */ + ok1(edit_distance_dl_static("ef", "fe") == 2); + ok1(edit_distance_dl_static("fe", "ef") == 1); + + /* Insert + Delete + Sub */ + ok1(edit_distance_dl_static("abcde", "xcdef") == 3); + + /* Distance depends on multiple deletions in final row. */ + ok1(edit_distance_dl_static("aabcc", "bccdd") == 5); + + /* Distance depends on multiple insertions in final column. */ + ok1(edit_distance_dl_static("bccdd", "aabcc") == 4); +} + +/* Test edit_distance calculation around the stack threshold to ensure memory + * is allocated and freed correctly and stack overflow does not occur. + * + * Note: This test is done when ED_COST_IS_SYMMETRIC is not defined so that + * tgt can be small to make the test run quickly (with ED_COST_IS_SYMMETRIC the + * min length would need to be above the threshold). + */ +static void test_mem_use(void) +{ + char tgt[] = "BC"; + char src[ED_STACK_ELEMS + 1]; + for (size_t i = 0; i < ED_STACK_ELEMS; ++i) { + src[i] = (char)('A' + (i % 26)); + } + src[ED_STACK_ELEMS] = '\0'; + + for (ed_size tlen = 1; tlen < 3; ++tlen) { + ed_size slen = ED_STACK_ELEMS; + /* Above threshold, causes allocation */ + ok(edit_distance_lcs(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_lcs(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_lev(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_lev(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_rdl(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_rdl(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_dl(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_dl(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + + /* Below threshold, no allocation */ + --slen; + ok(edit_distance_lcs(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_lcs(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_lev(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_lev(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_rdl(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_rdl(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + ok(edit_distance_dl(src, slen, tgt, tlen) == slen - tlen, + "edit_distance_dl(\"%.3s..., %u, \"%.*s\", %u) == %u", + src, slen, (int)tlen, tgt, tlen, slen - tlen); + } +} + +int main(void) +{ + plan_tests(109); + + test_lcs(); + test_lev(); + test_rdl(); + test_dl(); + + test_mem_use(); + + /* Unsupported edit distance measure */ + enum ed_measure badmeasure = (enum ed_measure)-1; + ok1(edit_distance("ab", 2, "ba", 2, badmeasure) == (ed_dist)-1); + + return exit_status(); +} -- 2.39.2