--- /dev/null
+../../licenses/BSD-MIT
\ No newline at end of file
--- /dev/null
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * edit_distance - calculate the edit distance between two strings
+ *
+ * The edit distance quantifies the similarity between two strings based on the
+ * number of modifications necessary to turn one string into the other. There
+ * are several edit distance measures which differ in the operations which are
+ * permitted and the cost (aka weight) of the operations. This module provides
+ * functions for calculating the Longest Common Subsequence (LCS), Levenshtein,
+ * and Damerau-Levenshtein (restricted and unrestricted) distances. Weighted
+ * versions of these functions can be created by defining cost functions as
+ * preprocessor macros when compiling this module. Distances over other array
+ * types (e.g. wide strings, integers, structs) can be accomplished by defining
+ * the element type and equality test macros.
+ *
+ * Example:
+ * #include <limits.h> // UINT_MAX
+ * #include <stdio.h> // fprintf, printf
+ * #include <stdlib.h> // EXIT_*
+ * #include <string.h> // strlen
+ *
+ * #include <ccan/edit_distance/edit_distance.h>
+ *
+ * const char *counties[] = {
+ * "Anglesey",
+ * "Brecknockshire",
+ * "Cardiganshire",
+ * "Carmarthenshire",
+ * "Caernarfonshire",
+ * "Denbighshire",
+ * "Flintshire",
+ * "Glamorgan",
+ * "Merionethshire",
+ * "Monmouthshire",
+ * "Montgomeryshire",
+ * "Pembrokeshire",
+ * "Radnorshire",
+ * NULL
+ * };
+ *
+ * int main(int argc, char *argv[])
+ * {
+ * if (argc != 2) {
+ * fprintf(stderr, "Usage: %s <guess>\n", argv[0]);
+ * return EXIT_FAILURE;
+ * }
+ * const char *guess = argv[1];
+ *
+ * unsigned int bestdist = UINT_MAX;
+ * const char *bestcounty = NULL;
+ * for (const char **pcounty = counties; *pcounty; ++pcounty) {
+ * const char *county = *pcounty;
+ * unsigned int dist = edit_distance(guess, strlen(guess),
+ * county, strlen(county), EDIT_DISTANCE_RDL);
+ * if (dist < bestdist) {
+ * bestdist = dist;
+ * bestcounty = county;
+ * }
+ * }
+ *
+ * printf("You were probably trying to spell %s\n", bestcounty);
+ * return EXIT_SUCCESS;
+ * }
+ * License: MIT
+ * Author: Kevin Locke <kevin@kevinlocke.name>
+ */
+int main(int argc, char *argv[])
+{
+ /* Expect exactly one argument */
+ if (argc != 2) {
+ return 1;
+ }
+
+ if (strcmp(argv[1], "depends") == 0) {
+ return 0;
+ }
+
+ if (strcmp(argv[1], "testdepends") == 0) {
+ printf("ccan/array_size\n");
+ printf("ccan/build_assert\n");
+ printf("ccan/tap\n");
+ return 0;
+ }
+
+ return 1;
+}
--- /dev/null
+/** @file
+ * Defines the tunable parameters for the edit_distance functions which can
+ * be changed at module-compile-time.
+ *
+ * The macros in this header can be redefined, either by editing this file or
+ * providing external definitions. This makes it possible to calculate the
+ * weighted edit distance with arbitrary weighting.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#ifndef CCAN_EDIT_DISTANCE_PARAMS_H
+#define CCAN_EDIT_DISTANCE_PARAMS_H
+
+#ifndef ed_elem
+# include <string.h> /* memchr */
+#endif
+
+#ifndef ED_HASH_ELEM
+# include <limits.h> /* UCHAR_MAX */
+#endif
+
+#if !defined(ED_DEL_COST) && \
+ !defined(ED_INS_COST) && \
+ !defined(ED_SUB_COST) && \
+ !defined(ED_TRA_COST)
+/** Defined if costs are symmetric.
+ * (i.e. <code>distance(a, b) == distance(b, a) forall a,b</code>) */
+# define ED_COST_IS_SYMMETRIC
+#endif
+
+#ifndef ED_DEL_COST
+/** Cost to delete element @p e. */
+# define ED_DEL_COST(e) 1
+/** Defined if the cost to delete is the same for all elements. */
+# define ED_DEL_COST_CONST
+#endif
+
+#ifndef ED_INS_COST
+/** Cost to insert element @p e. */
+# define ED_INS_COST(e) 1
+/** Defined if the cost to insert is the same for all elements. */
+# define ED_INS_COST_CONST
+#endif
+
+#ifndef ED_SUB_COST
+/** Cost to substitute element @p f for element @p e (i.e. replace @p e with
+ * @p f ).
+ *
+ * All algorithms which use this macro expect that <code>ED_SUB_COST(e, f) +
+ * ED_SUB_COST(f, g) >= ED_SUB_COST(e, g)</code>. They do not check for any
+ * substitution chains and will not find minimal distances when this assumption
+ * does not hold.
+ *
+ * Although it is possible to set <code>ED_SUB_COST(e, f) >= ED_DEL_COST(e) +
+ * ED_INS_COST(f)</code>. If this is true for all @p e and @p f, consider
+ * using edit_distance_lcs() instead of edit_distance_lev() which calculates
+ * the same result more quickly by not considering substitutions which are
+ * never less costly than delete + insert.
+ */
+# define ED_SUB_COST(e, f) 1
+/** Defined if the cost to substitute is the same for all element pairs. */
+# define ED_SUB_COST_CONST
+#endif
+
+#ifndef ED_TRA_COST
+/** Cost to transpose elements @p ef to @p fe.
+ *
+ * @c edit_distance_dl requires <code>2 * ED_TRA_COST(e, f) >= ED_INS_COST(g) +
+ * ED_DEL_COST(h)</code> for all @c e,f,g,h. See Trace Theorem 3 of
+ * Wagner-Lowrance. @cite Wagner75
+ */
+# define ED_TRA_COST(e, f) 1
+/** Defined if the cost to transpose is the same for all element pairs. */
+# define ED_TRA_COST_CONST
+#endif
+
+#if defined(DOXYGEN)
+/** Optimized array search function for single-array-heavy workloads.
+ *
+ * The expression generated by this macro must evaluate to @c true in a boolean
+ * context when @p elem is contained in the first @p len elements of @p arr and
+ * @c false otherwise. @p len is guaranteed to be greater than 0.
+ *
+ * This macro is currently only used when #ED_INS_COST_CONST and
+ * #ED_SUB_COST_CONST are defined.
+ *
+ * @note
+ * Only define this macro when the search function is highly optimized and when
+ * single-element arrays are expected to be passed frequently, because it adds
+ * a conditional branch to each execution of the distance functions with
+ * non-empty arguments.
+ */
+# define ED_HAS_ELEM(arr, elem, len) (memchr(arr, elem, len) != NULL)
+#endif
+
+#ifndef ED_ELEM_EQUAL
+/** Are two elements equal? */
+# define ED_ELEM_EQUAL(elem1, elem2) (elem1 == elem2)
+#endif
+
+#if !defined(ED_HASH_ELEM) && !defined(ed_elem)
+/** Hash an element to a unique value in <code>[0, ED_HASH_MAX]</code>.
+ *
+ * Note: This module does not use a "real" dictionary. Hash collisions are
+ * not expected nor supported. If elements can not be uniquely mapped to a
+ * reasonable range, consider implementing a "real" dictionary.
+ */
+# define ED_HASH_ELEM(e) ((unsigned char)e)
+/** Maximum value that can be returned from #ED_HASH_ELEM */
+# define ED_HASH_MAX UCHAR_MAX
+/** Can an array of #ED_HASH_MAX ::ed_dist values be stored on the stack? */
+# define ED_HASH_ON_STACK
+#endif
+
+#ifndef ED_STACK_ELEMS
+/** Maximum number of elements that will be allocated on the stack.
+ *
+ * Note: The threshold does not need to be tight since with
+ * <code>ED_STACK_ELEMS > ~100</code> algorithm cost will dominate @c malloc
+ * cost anyway.
+ */
+# define ED_STACK_ELEMS 512
+#endif
+
+#endif
--- /dev/null
+/** @file
+ * Private macros and functions for use by the edit_distance implementation.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#ifndef CCAN_EDIT_DISTANCE_PRIVATE_H
+#define CCAN_EDIT_DISTANCE_PRIVATE_H
+
+#include "edit_distance.h"
+
+/** Unsafe (arguments evaluated multiple times) 3-value minimum. */
+#define ED_MIN2(a, b) ((a) < (b) ? (a) : (b))
+
+/** Unsafe (arguments evaluated multiple times) 3-value minimum. */
+#define ED_MIN3(a, b, c) \
+ ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
+
+/** Swap variables @p a and @p b of a given type. */
+#define ED_SWAP(a, b, Type) \
+ do { Type swaptmp = a; a = b; b = swaptmp; } while (0)
+
+/** Number of elements in triangular matrix (with diagonal).
+ * @param rc Number of rows (or columns, since square). */
+#define ED_TMAT_SIZE(rc) (((rc) + 1) * (rc) / 2)
+
+/** Index of element in lower triangular matrix. */
+#define ED_TMAT_IND(r, c) (ED_TMAT_SIZE(r) + c)
+
+/**
+ * Calculates non-trivial LCS distance (for internal use).
+ * @private
+ * @param src Source array to calculate distance from (smaller when symmetric).
+ * @param slen Number of elements in @p src to consider (must be > 0).
+ * @param tgt Target array to calculate distance to.
+ * @param tlen Number of elements in @p tgt to consider (must be > 0).
+ * @return LCS distance from @p src[0..slen-1] to @p tgt[0..tlen-1].
+ */
+ed_dist edit_distance_lcs(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen);
+
+/**
+ * Calculates non-trivial Levenshtein distance (for internal use).
+ * @private
+ * @param src Source array to calculate distance from (smaller when symmetric).
+ * @param slen Number of elements in @p src to consider (must be > 0).
+ * @param tgt Target array to calculate distance to.
+ * @param tlen Number of elements in @p tgt to consider (must be > 0).
+ * @return Levenshtein distance from @p src[0..slen-1] to @p tgt[0..tlen-1].
+ */
+ed_dist edit_distance_lev(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen);
+
+/**
+ * Calculates non-trivial Restricted Damerau-Levenshtein distance (for internal
+ * use).
+ * @private
+ * @param src Source array to calculate distance from (smaller when symmetric).
+ * @param slen Number of elements in @p src to consider (must be > 0).
+ * @param tgt Target array to calculate distance to.
+ * @param tlen Number of elements in @p tgt to consider (must be > 0).
+ * @return Restricted Damerau-Levenshtein distance from @p src[0..slen-1] to
+ * @p tgt[0..tlen-1].
+ */
+ed_dist edit_distance_rdl(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen);
+
+/**
+ * Calculates non-trivial Damerau-Levenshtein distance (for internal use).
+ * @private
+ * @param src Source array to calculate distance from (smaller when symmetric).
+ * @param slen Number of elements in @p src to consider (must be > 0).
+ * @param tgt Target array to calculate distance to.
+ * @param tlen Number of elements in @p tgt to consider (must be > 0).
+ * @return Damerau-Levenshtein distance from @p src[0..slen-1] to
+ * @p tgt[0..tlen-1].
+ */
+ed_dist edit_distance_dl(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen);
+
+#endif
--- /dev/null
+@article{Boytsov11,
+ doi = {10.1145/1963190.1963191},
+ year = {2011},
+ month = {may},
+ publisher = {Association for Computing Machinery ({ACM})},
+ volume = {16},
+ pages = {1},
+ author = {Leonid Boytsov},
+ title = {Indexing methods for approximate dictionary searching},
+ journal = {Journal of Experimental Algorithmics}
+}
+@article{Damerau64,
+ doi = {10.1145/363958.363994},
+ year = {1964},
+ month = {mar},
+ publisher = {Association for Computing Machinery ({ACM})},
+ volume = {7},
+ number = {3},
+ pages = {171--176},
+ author = {Fred J. Damerau},
+ title = {A technique for computer detection and correction of spelling errors},
+ journal = {Communications of the {ACM}}
+}
+@inproceedings{Levenshtein66,
+ title = {Binary codes capable of correcting deletions, insertions and reversals},
+ author = {Vladimir I. Levenshtein},
+ booktitle = {Soviet Physics Doklady},
+ volume = {10},
+ number = {8},
+ pages = {707--710},
+ year = {1966},
+ month = {feb}
+}
+@article{Wagner74,
+ doi = {10.1145/321796.321811},
+ year = {1974},
+ month = {jan},
+ publisher = {Association for Computing Machinery ({ACM})},
+ volume = {21},
+ number = {1},
+ pages = {168--173},
+ author = {Robert A. Wagner and Michael J. Fischer},
+ title = {The String-to-String Correction Problem},
+ journal = {Journal of the {ACM}}
+}
+@article{Wagner75,
+ title = {An Extension of the String-to-String Correction Problem},
+ author = {Robert A. Wagner and Roy Lowrance},
+ journal = {Journal of the {ACM}},
+ volume = {22},
+ number = {2},
+ pages = {177--183},
+ year = {1975},
+ month = {apr},
+ publisher = {Association for Computing Machinery ({ACM})},
+ doi = {10.1145/321879.321880}
+}
--- /dev/null
+/** @file
+ * Defines shared edit distance functions.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#include "edit_distance-params.h"
+#include "edit_distance-private.h"
+
+ed_dist edit_distance(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen, enum ed_measure measure)
+{
+ /* Remove common prefix. */
+ while (slen > 0 && tlen > 0 && ED_ELEM_EQUAL(src[0], tgt[0])) {
+ ++src;
+ ++tgt;
+ --slen;
+ --tlen;
+ }
+
+ /* Remove common suffix. */
+ while (slen > 0 && tlen > 0 &&
+ ED_ELEM_EQUAL(src[slen - 1], tgt[tlen - 1])) {
+ --slen;
+ --tlen;
+ }
+
+#if defined(ED_COST_IS_SYMMETRIC)
+ /* Use smaller array for src. */
+ if (slen > tlen) {
+ ED_SWAP(src, tgt, const ed_elem *);
+ ED_SWAP(slen, tlen, ed_size);
+ }
+#endif
+
+ /* Early return when all insertions. */
+ if (slen == 0) {
+#ifdef ED_INS_COST_CONST
+ return (ed_dist)tlen *ED_INS_COST();
+#else
+ ed_dist result = 0;
+ for (ed_size i = 0; i < tlen; ++i) {
+ result += ED_INS_COST(tgt[i]);
+ }
+ return result;
+#endif
+ }
+#ifndef ED_COST_IS_SYMMETRIC
+ /* Early return when all deletions. */
+ if (tlen == 0) {
+# ifdef ED_DEL_COST_CONST
+ return (ed_dist)slen *ED_DEL_COST();
+# else
+ ed_dist result = 0;
+ for (ed_size i = 0; i < slen; ++i) {
+ result += ED_DEL_COST(src[i]);
+ }
+ return result;
+# endif
+ }
+#endif
+
+#if defined(ED_HAS_ELEM) && \
+ defined(ED_INS_COST_CONST) && \
+ defined(ED_SUB_COST_CONST)
+ /* Fast search for single-element source. */
+ if (slen == 1) {
+ /* Always tlen - 1 inserts (of non-src elements). */
+ ed_dist result = (ed_dist)(tlen - 1) * ED_INS_COST();
+ if (!ED_HAS_ELEM(tgt, src[0], tlen)) {
+ if (measure == EDIT_DISTANCE_LCS) {
+ /* If src is not present, delete + insert. */
+ result += ED_DEL_COST(src[0]) + ED_INS_COST();
+ } else {
+ /* If src is not present, substitute it out. */
+ result += ED_SUB_COST(,);
+ }
+ }
+ return result;
+ }
+#endif
+
+#if !defined(ED_COST_IS_SYMMETRIC) && \
+ defined(ED_HAS_ELEM) && \
+ defined(ED_DEL_COST_CONST) && \
+ defined(ED_SUB_COST_CONST)
+ /* Fast search for single-element target. */
+ if (tlen == 1) {
+ /* Always slen - 1 deletes (of non-tgt elements). */
+ ed_dist result = (ed_dist)(slen - 1) * ED_DEL_COST();
+ if (!ED_HAS_ELEM(src, tgt[0], slen)) {
+ if (measure == EDIT_DISTANCE_LCS) {
+ /* If tgt is not present, delete + insert. */
+ result += ED_INS_COST(tgt[0]) + ED_DEL_COST();
+ } else {
+ /* If tgt is not present, substitute it out. */
+ result += ED_SUB_COST(,);
+ }
+ }
+ return result;
+ }
+#endif
+
+ switch (measure) {
+ case EDIT_DISTANCE_LCS:
+ return edit_distance_lcs(src, slen, tgt, tlen);
+ case EDIT_DISTANCE_LEV:
+ return edit_distance_lev(src, slen, tgt, tlen);
+ case EDIT_DISTANCE_RDL:
+ return edit_distance_rdl(src, slen, tgt, tlen);
+ case EDIT_DISTANCE_DL:
+ return edit_distance_dl(src, slen, tgt, tlen);
+ }
+
+ return (ed_dist)-1;
+}
--- /dev/null
+/** @file
+ * Main header file for edit_distance which defines the module API.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#ifndef CCAN_EDIT_DISTANCE_H
+#define CCAN_EDIT_DISTANCE_H
+
+#include <stddef.h> /* size_t */
+
+#ifndef ed_dist
+/**
+ * ed_dist - Type in which the edit distance is expressed.
+ *
+ * The name @c ed_dist can be defined to another algebraic type (or this
+ * typedef can be edited) when compiling this module.
+ */
+typedef unsigned int ed_dist;
+#endif
+
+#ifndef ed_elem
+/**
+ * ed_elem - Element type of arrays for which the edit distance is calculated.
+ *
+ * Changing this type often requires changing #ED_HAS_ELEM, #ED_HASH_ELEM, and
+ * #ED_HASH_MAX. It requires changing #ED_ELEM_EQUAL for non-primitive types.
+ * These can be changed in edit_distance-params.h or defined externally.
+ */
+typedef char ed_elem;
+#endif
+
+#ifndef ed_size
+/**
+ * ed_size - Type in which the array size is expressed.
+ *
+ * The name @c ed_size can be defined to another algebraic type (or this
+ * typedef can be edited) when compiling this module. If @c ed_size is a
+ * signed type, the caller must ensure passed values are non-negative.
+ * @c ed_size must be large enough to hold @c max(slen,tlen)+2, it does not
+ * need to be large enough to hold @c slen*tlen.
+ *
+ * Note: @c ed_size is only likely to have a noticeable impact in
+ * edit_distance_dl() which maintains an array of @c #ED_HASH_MAX+1 ::ed_size
+ * values.
+ */
+typedef unsigned int ed_size;
+#endif
+
+/**
+ * ed_measure - Supported edit distance measures.
+ */
+enum ed_measure {
+ /**
+ * Longest Common Subsequence (LCS) distance.
+ *
+ * The LCS distance is an edit distance measured by the number of
+ * insert and delete operations necessary to turn @p src into @p tgt.
+ *
+ * This implementation uses an iterative version of the Wagner-Fischer
+ * algorithm @cite Wagner74 which requires <code>O(slen * tlen)</code>
+ * time and <code>min(slen, tlen) + 1</code> space.
+ */
+ EDIT_DISTANCE_LCS = 1,
+ /**
+ * Levenshtein distance.
+ *
+ * The Levenshtein distance is an edit distance measured by the number
+ * of insert, delete, and substitute operations necessary to turn
+ * @p src into @p tgt as described by Vladimir Levenshtein.
+ * @cite Levenshtein66
+ *
+ * This implementation uses a modified version of the Wagner-Fischer
+ * algorithm @cite Wagner74 which requires <code>O(slen * tlen)</code>
+ * time and only <code>min(slen, tlen) + 1</code> space.
+ */
+ EDIT_DISTANCE_LEV,
+ /**
+ * Restricted Damerau-Levenshtein distance (aka Optimal String
+ * Alignment distance).
+ *
+ * The Restricted Damerau-Levenshtein distance is an edit distance
+ * measured by the number of insert, delete, substitute, and transpose
+ * operations necessary to turn @p src into @p tgt with the restriction
+ * that no substring is edited more than once (equivalently, that no
+ * edits overlap). @cite Boytsov11
+ *
+ * This implementation uses a modified version of the Wagner-Fischer
+ * algorithm @cite Wagner74 which requires <code>O(slen * tlen)</code>
+ * time and only <code>2 * min(slen, tlen) + 2</code> space.
+ */
+ EDIT_DISTANCE_RDL,
+ /**
+ * Damerau-Levenshtein distance.
+ *
+ * The Damerau-Levenshtein distance is an edit distance measured by the
+ * number of insert, delete, substitute, and transpose operations
+ * necessary to turn @p src into @p tgt . @cite Damerau64
+ *
+ * This implementation uses the Lowrance-Wagner algorithm @cite Wagner75
+ * which takes <code>O(slen * tlen)</code> time and
+ * <code>O(slen * tlen)</code> space.
+ */
+ EDIT_DISTANCE_DL,
+};
+
+/**
+ * edit_distance - Calculates the edit distance between two arrays.
+ *
+ * @param src Source array to calculate distance from.
+ * @param slen Number of elements in @p src to consider.
+ * @param tgt Target array to calculate distance to.
+ * @param tlen Number of elements in @p tgt to consider.
+ * @param measure Edit distance measure to calculate.
+ * @return Edit distance from @p src[0..slen-1] to @p tgt[0..tlen-1]. (i.e.
+ * The minimal sum of the weights of the operations necessary to convert
+ * @p src[0..slen-1] into @p tgt[0..tlen-1].)
+ *
+ * @code
+ * Example:
+ * const char *source = "kitten";
+ * const char *target = "sitting";
+ * assert(edit_distance(source, strlen(source),
+ * target, strlen(target),
+ * EDIT_DISTANCE_DL) == 3);
+ * Example_End: @endcode
+ */
+ed_dist edit_distance(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen,
+ enum ed_measure measure);
+#endif
--- /dev/null
+/** @file
+ * Defines Damerau-Levenshtein distance functions.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#include <stdlib.h> /* free, malloc */
+
+#include "edit_distance.h"
+#include "edit_distance-params.h"
+#include "edit_distance-private.h"
+
+ed_dist edit_distance_dl(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen)
+{
+ /* Optimization: Avoid malloc when distance matrix can fit on the stack.
+ */
+ ed_dist stackdist[ED_STACK_ELEMS];
+
+ /* Lowrance-Wagner distance matrix, in row-major order. */
+ size_t matsize = ((size_t)slen + 2) * (tlen + 2);
+ ed_dist *distmem = matsize <= ED_STACK_ELEMS ? stackdist :
+ malloc(matsize * sizeof(ed_dist));
+ ed_dist *dist = distmem;
+
+#ifdef ED_HASH_ON_STACK
+ ed_size lasttgt[ED_HASH_MAX + 1] = { 0 };
+#else
+ ed_size *lasttgt = calloc(ED_HASH_MAX + 1, sizeof(ed_size));
+#endif
+
+ /* Upper bound on distance between strings. */
+ ed_dist maxdist = 0;
+
+#ifdef ED_DEL_COST_CONST
+ maxdist += (ed_dist)slen *ED_DEL_COST();
+#else
+ /* Lower-triangular matrix of deletion costs.
+ * delcost[i2, i1] is cost to delete src[i1..i2-1].
+ * delcost[i, i] is 0. */
+ ed_dist *delcost = malloc(ED_TMAT_SIZE(slen + 1) * sizeof(ed_dist));
+ ed_dist *delcostitr = delcost;
+ ed_dist *delcostprevitr = delcost;
+ *delcostitr++ = 0;
+ for (ed_size i2 = 1; i2 <= slen; ++i2) {
+ ed_dist costi2 = ED_DEL_COST(src[i2 - 1]);
+ for (ed_size i1 = 0; i1 < i2; ++i1) {
+ *delcostitr++ = *delcostprevitr++ + costi2;
+ }
+ *delcostitr++ = 0;
+ }
+ maxdist += delcost[ED_TMAT_IND(slen, 0)];
+#endif
+
+#ifdef ED_INS_COST_CONST
+ maxdist += (ed_dist)tlen *ED_INS_COST();
+#else
+ /* Lower-triangular matrix of insertion costs.
+ * inscost[j2, j1] is cost to insert tgt[j1..j2-1].
+ * inscost[j, j] is 0. */
+ ed_dist *inscost = malloc(ED_TMAT_SIZE(tlen + 1) * sizeof(ed_dist));
+ ed_dist *inscostitr = inscost;
+ ed_dist *inscostprevitr = inscost;
+ *inscostitr++ = 0;
+ for (ed_size j2 = 1; j2 <= tlen; ++j2) {
+ ed_dist costj2 = ED_INS_COST(tgt[j2 - 1]);
+ for (ed_size j1 = 0; j1 < j2; ++j1) {
+ *inscostitr++ = *inscostprevitr++ + costj2;
+ }
+ *inscostitr++ = 0;
+ }
+ maxdist += inscost[ED_TMAT_IND(tlen, 0)];
+#endif
+
+ /* Initialize first row with maximal cost */
+ for (ed_size i = 0; i < slen + 2; ++i) {
+ dist[i] = maxdist;
+ }
+
+ /* Position dist to match other algorithms. dist[-1] will be maxdist */
+ dist += slen + 3;
+
+ /* Initialize row with cost to delete src[0..i-1] */
+ dist[-1] = maxdist;
+ dist[0] = 0;
+ for (ed_size i = 1; i <= slen; ++i) {
+ dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
+ }
+
+ for (ed_size j = 1; j <= tlen; ++j) {
+ /* Largest y < i such that src[y] = tgt[j] */
+ ed_size lastsrc = 0;
+ ed_dist *prevdist = dist;
+ dist += slen + 2;
+ dist[-1] = maxdist;
+ dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]);
+
+ /* Loop invariant: dist[i] is the edit distance between first j
+ * elements of tgt and first i elements of src.
+ *
+ * Loop invariant: lasttgt[ED_HASH_ELEM(c)] holds the largest
+ * x < j such that tgt[x-1] = c or 0 if no such x exists.
+ */
+ for (ed_size i = 1; i <= slen; ++i) {
+ ed_size i1 = lastsrc;
+ ed_size j1 = lasttgt[ED_HASH_ELEM(src[i - 1])];
+
+ if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
+ /* Same as tgt upto j-2, src upto i-2. */
+ dist[i] = prevdist[i - 1];
+ lastsrc = i;
+ } else {
+ /* Insertion is tgt upto j-2, src upto i-1
+ * + insert tgt[j-1] */
+ ed_dist insdist =
+ prevdist[i] + ED_INS_COST(tgt[j - 1]);
+
+ /* Deletion is tgt upto j-1, src upto i-2
+ * + delete src[i-1] */
+ ed_dist deldist =
+ dist[i - 1] + ED_DEL_COST(src[i - 1]);
+
+ /* Substitution is tgt upto j-2, src upto i-2
+ * + substitute tgt[j-1] for src[i-1] */
+ ed_dist subdist = prevdist[i - 1] +
+ ED_SUB_COST(src[i - 1], tgt[j - 1]);
+
+ /* Use best distance available */
+ dist[i] = ED_MIN3(insdist, deldist, subdist);
+
+ ed_dist swpdist =
+ distmem[(size_t)j1 * (slen + 2) + i1];
+#ifdef ED_INS_COST_CONST
+ swpdist +=
+ (ed_dist)(j - j1 - 1) * ED_INS_COST();
+#else
+ swpdist += inscost[ED_TMAT_IND(j - 1, j1)];
+#endif
+#ifdef ED_TRA_COST_CONST
+ swpdist += ED_TRA_COST(,);
+#else
+ if (i1 > 0) {
+ swpdist +=
+ ED_TRA_COST(src[i1 - 1],
+ src[i - 1]);
+ }
+#endif
+#ifdef ED_DEL_COST_CONST
+ swpdist +=
+ (ed_dist)(i - i1 - 1) * ED_DEL_COST();
+#else
+ swpdist += delcost[ED_TMAT_IND(i - 1, i1)];
+#endif
+
+ dist[i] = ED_MIN2(dist[i], swpdist);
+ }
+ }
+
+ lasttgt[ED_HASH_ELEM(tgt[j - 1])] = j;
+ }
+
+#ifndef ED_HASH_ON_STACK
+ free(lasttgt);
+#endif
+
+#ifndef ED_DEL_COST_CONST
+ free(delcost);
+#endif
+
+#ifndef ED_INS_COST_CONST
+ free(inscost);
+#endif
+
+ ed_dist total = dist[slen];
+ if (distmem != stackdist) {
+ free(distmem);
+ }
+ return total;
+}
--- /dev/null
+/** @file
+ * Defines Longest Common Subsequence distance functions.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#include <stdlib.h> /* free, malloc */
+
+#include "edit_distance.h"
+#include "edit_distance-params.h"
+#include "edit_distance-private.h"
+
+ed_dist edit_distance_lcs(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen)
+{
+ /* Optimization: Avoid malloc when row of distance matrix can fit on
+ * the stack.
+ */
+ ed_dist stackdist[ED_STACK_ELEMS];
+
+ /* One row of the Wagner-Fischer distance matrix. */
+ ed_dist *dist = slen < ED_STACK_ELEMS ? stackdist :
+ malloc((slen + 1) * sizeof(ed_dist));
+
+ /* Initialize row with cost to delete src[0..i-1] */
+ dist[0] = 0;
+ for (ed_size i = 1; i <= slen; ++i) {
+ dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
+ }
+
+ for (ed_size j = 1; j <= tlen; ++j) {
+ /* Value for dist[j-1][i-1] (one row up, one col left). */
+ ed_dist diagdist = dist[0];
+ dist[0] = dist[0] + ED_INS_COST(tgt[j - 1]);
+
+ /* Loop invariant: dist[i] is the edit distance between first j
+ * elements of tgt and first i elements of src.
+ */
+ for (ed_size i = 1; i <= slen; ++i) {
+ ed_dist nextdiagdist = dist[i];
+
+ if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
+ /* Same as tgt upto j-2, src upto i-2. */
+ dist[i] = diagdist;
+ } else {
+ /* Insertion is tgt upto j-2, src upto i-1
+ * + insert tgt[j-1] */
+ ed_dist insdist =
+ dist[i] + ED_INS_COST(tgt[j - 1]);
+
+ /* Deletion is tgt upto j-1, src upto i-2
+ * + delete src[i-1] */
+ ed_dist deldist =
+ dist[i - 1] + ED_DEL_COST(src[i - 1]);
+
+ /* Use best distance available */
+ dist[i] = ED_MIN2(insdist, deldist);
+ }
+
+ diagdist = nextdiagdist;
+ }
+ }
+
+ ed_dist total = dist[slen];
+ if (dist != stackdist) {
+ free(dist);
+ }
+ return total;
+}
--- /dev/null
+/** @file
+ * Defines Levenshtein distance functions.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#include <stdlib.h> /* free, malloc */
+
+#include "edit_distance.h"
+#include "edit_distance-params.h"
+#include "edit_distance-private.h"
+
+ed_dist edit_distance_lev(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen)
+{
+ /* Optimization: Avoid malloc when row of distance matrix can fit on
+ * the stack.
+ */
+ ed_dist stackdist[ED_STACK_ELEMS];
+
+ /* One row of the Wagner-Fischer distance matrix. */
+ ed_dist *dist = slen < ED_STACK_ELEMS ? stackdist :
+ malloc((slen + 1) * sizeof(ed_dist));
+
+ /* Initialize row with cost to delete src[0..i-1] */
+ dist[0] = 0;
+ for (ed_size i = 1; i <= slen; ++i) {
+ dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
+ }
+
+ for (ed_size j = 1; j <= tlen; ++j) {
+ /* Value for dist[j-1][i-1] (one row up, one col left). */
+ ed_dist diagdist = dist[0];
+ dist[0] = dist[0] + ED_INS_COST(tgt[j - 1]);
+
+ /* Loop invariant: dist[i] is the edit distance between first j
+ * elements of tgt and first i elements of src.
+ */
+ for (ed_size i = 1; i <= slen; ++i) {
+ ed_dist nextdiagdist = dist[i];
+
+ if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
+ /* Same as tgt upto j-2, src upto i-2. */
+ dist[i] = diagdist;
+ } else {
+ /* Insertion is tgt upto j-2, src upto i-1
+ * + insert tgt[j-1] */
+ ed_dist insdist =
+ dist[i] + ED_INS_COST(tgt[j - 1]);
+
+ /* Deletion is tgt upto j-1, src upto i-2
+ * + delete src[i-1] */
+ ed_dist deldist =
+ dist[i - 1] + ED_DEL_COST(src[i - 1]);
+
+ /* Substitution is tgt upto j-2, src upto i-2
+ * + substitute tgt[j-1] for src[i-1] */
+ ed_dist subdist = diagdist +
+ ED_SUB_COST(src[i - 1], tgt[j - 1]);
+
+ /* Use best distance available */
+ dist[i] = ED_MIN3(insdist, deldist, subdist);
+ }
+
+ diagdist = nextdiagdist;
+ }
+ }
+
+ ed_dist total = dist[slen];
+ if (dist != stackdist) {
+ free(dist);
+ }
+ return total;
+}
--- /dev/null
+/** @file
+ * Defines Restricted Damerau-Levenshtein distance functions.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+#include <stdlib.h> /* free, malloc */
+
+#include "edit_distance.h"
+#include "edit_distance-params.h"
+#include "edit_distance-private.h"
+
+ed_dist edit_distance_rdl(const ed_elem *src, ed_size slen,
+ const ed_elem *tgt, ed_size tlen)
+{
+ /* Optimization: Avoid malloc when required rows of distance matrix can
+ * fit on the stack.
+ */
+ ed_dist stackdist[ED_STACK_ELEMS];
+
+ /* Two rows of the Wagner-Fischer distance matrix. */
+ ed_dist *distmem, *dist, *prevdist;
+ if (slen < ED_STACK_ELEMS / 2) {
+ distmem = stackdist;
+ dist = distmem;
+ prevdist = distmem + slen + 1;
+ } else {
+ distmem = malloc((slen + 1) * sizeof(ed_dist) * 2);
+ dist = distmem;
+ prevdist = distmem + slen + 1;
+ }
+
+ /* Initialize row with cost to delete src[0..i-1] */
+ dist[0] = 0;
+ for (ed_size i = 1; i <= slen; ++i) {
+ dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
+ }
+
+ for (ed_size j = 1; j <= tlen; ++j) {
+ /* Value for dist[j-2][i-1] (two rows up, one col left). */
+ /* Note: dist[0] is not initialized when j == 1, var unused. */
+ ed_dist diagdist1 = prevdist[0];
+ /* Value for dist[j-2][i-2] (two rows up, two cols left).
+ * Initialization value only used to placate GCC. */
+ ed_dist diagdist2 = 0;
+
+ ED_SWAP(dist, prevdist, ed_dist *);
+
+ dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]);
+
+ /* Loop invariant: dist[i] is the edit distance between first j
+ * elements of tgt and first i elements of src.
+ */
+ for (ed_size i = 1; i <= slen; ++i) {
+ ed_dist nextdiagdist = dist[i];
+
+ if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
+ /* Same as tgt upto j-2, src upto i-2. */
+ dist[i] = prevdist[i - 1];
+ } else {
+ /* Insertion is tgt upto j-2, src upto i-1
+ * + insert tgt[j-1] */
+ ed_dist insdist =
+ prevdist[i] + ED_INS_COST(tgt[j - 1]);
+
+ /* Deletion is tgt upto j-1, src upto i-2
+ * + delete src[i-1] */
+ ed_dist deldist =
+ dist[i - 1] + ED_DEL_COST(src[i - 1]);
+
+ /* Substitution is tgt upto j-2, src upto i-2
+ * + substitute tgt[j-1] for src[i-1] */
+ ed_dist subdist = prevdist[i - 1] +
+ ED_SUB_COST(src[i - 1], tgt[j - 1]);
+
+ /* Use best distance available */
+ dist[i] = ED_MIN3(insdist, deldist, subdist);
+
+ if (j > 1 && i > 1 &&
+ ED_ELEM_EQUAL(src[i - 2], tgt[j - 1]) &&
+ ED_ELEM_EQUAL(src[i - 1], tgt[j - 2])) {
+ ed_dist tradist = diagdist2 +
+ ED_TRA_COST(src[j - 2], src[j - 1]);
+ dist[i] = ED_MIN2(dist[i], tradist);
+ }
+ }
+
+ diagdist2 = diagdist1;
+ diagdist1 = nextdiagdist;
+ }
+ }
+
+ ed_dist total = dist[slen];
+ if (distmem != stackdist) {
+ free(distmem);
+ }
+ return total;
+}
--- /dev/null
+/** @file
+ * Runnable public API tests of the edit_distance module.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+
+#include <string.h>
+
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/edit_distance/edit_distance.h>
+#include <ccan/tap/tap.h>
+
+#define edit_distance_lcs_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_LCS)
+#define edit_distance_lev_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_LEV)
+#define edit_distance_rdl_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_RDL)
+#define edit_distance_dl_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_DL)
+
+static void test_lcs(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_lcs_static("", "") == 0);
+ ok1(edit_distance_lcs_static("a", "") == 1);
+ ok1(edit_distance_lcs_static("", "a") == 1);
+ ok1(edit_distance_lcs_static("a", "a") == 0);
+ ok1(edit_distance_lcs_static("a", "b") == 2);
+ ok1(edit_distance_lcs_static("b", "a") == 2);
+
+ /* Trivial search cases */
+ ok1(edit_distance_lcs_static("a", "bcdef") == 6);
+ ok1(edit_distance_lcs_static("a", "bcadef") == 5);
+ ok1(edit_distance_lcs_static("acdef", "b") == 6);
+ ok1(edit_distance_lcs_static("abcdef", "b") == 5);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_lcs_static("aa", "ab") == 2);
+ ok1(edit_distance_lcs_static("ab", "aa") == 2);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_lcs_static("aa", "ba") == 2);
+ ok1(edit_distance_lcs_static("ba", "aa") == 2);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_lcs_static("ab", "ba") == 2);
+ ok1(edit_distance_lcs_static("abc", "de") == 5);
+ ok1(edit_distance_lcs_static("abc", "def") == 6);
+
+ /* (Restricted) Transposition */
+ ok1(edit_distance_lcs_static("abcd", "ecbf") == 6);
+
+ /* (Unrestricted) Transposition */
+ ok1(edit_distance_lcs_static("ca", "abc") == 3);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_lcs_static("abcde", "xcdef") == 4);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_lcs_static("aabcc", "bccdd") == 4);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_lcs_static("bccdd", "aabcc") == 4);
+}
+
+static void test_lev(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_lev_static("", "") == 0);
+ ok1(edit_distance_lev_static("a", "") == 1);
+ ok1(edit_distance_lev_static("", "a") == 1);
+ ok1(edit_distance_lev_static("a", "a") == 0);
+ ok1(edit_distance_lev_static("a", "b") == 1);
+ ok1(edit_distance_lev_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_lev_static("a", "bcdef") == 5);
+ ok1(edit_distance_lev_static("a", "bcadef") == 5);
+ ok1(edit_distance_lev_static("acdef", "b") == 5);
+ ok1(edit_distance_lev_static("abcdef", "b") == 5);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_lev_static("aa", "ab") == 1);
+ ok1(edit_distance_lev_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_lev_static("aa", "ba") == 1);
+ ok1(edit_distance_lev_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_lev_static("ab", "ba") == 2);
+ ok1(edit_distance_lev_static("abc", "de") == 3);
+ ok1(edit_distance_lev_static("abc", "def") == 3);
+
+ /* (Restricted) Transposition */
+ ok1(edit_distance_lev_static("abcd", "ecbf") == 4);
+
+ /* (Unrestricted) Transposition */
+ ok1(edit_distance_lev_static("ca", "abc") == 3);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_lev_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_lev_static("aabcc", "bccdd") == 4);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_lev_static("bccdd", "aabcc") == 4);
+}
+
+static void test_rdl(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_rdl_static("", "") == 0);
+ ok1(edit_distance_rdl_static("a", "") == 1);
+ ok1(edit_distance_rdl_static("", "a") == 1);
+ ok1(edit_distance_rdl_static("a", "a") == 0);
+ ok1(edit_distance_rdl_static("a", "b") == 1);
+ ok1(edit_distance_rdl_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_rdl_static("a", "bcdef") == 5);
+ ok1(edit_distance_rdl_static("a", "bcadef") == 5);
+ ok1(edit_distance_rdl_static("acdef", "b") == 5);
+ ok1(edit_distance_rdl_static("abcdef", "b") == 5);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_rdl_static("aa", "ab") == 1);
+ ok1(edit_distance_rdl_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_rdl_static("aa", "ba") == 1);
+ ok1(edit_distance_rdl_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_rdl_static("ab", "ba") == 1);
+ ok1(edit_distance_rdl_static("abc", "de") == 3);
+ ok1(edit_distance_rdl_static("abc", "def") == 3);
+
+ /* (Restricted) Transposition */
+ ok1(edit_distance_rdl_static("abcd", "ecbf") == 3);
+
+ /* (Unrestricted) Transposition */
+ ok1(edit_distance_rdl_static("ca", "abc") == 3);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_rdl_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_rdl_static("aabcc", "bccdd") == 4);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_rdl_static("bccdd", "aabcc") == 4);
+}
+
+static void test_dl(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_dl_static("", "") == 0);
+ ok1(edit_distance_dl_static("a", "") == 1);
+ ok1(edit_distance_dl_static("", "a") == 1);
+ ok1(edit_distance_dl_static("a", "a") == 0);
+ ok1(edit_distance_dl_static("a", "b") == 1);
+ ok1(edit_distance_dl_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_dl_static("a", "bcdef") == 5);
+ ok1(edit_distance_dl_static("a", "bcadef") == 5);
+ ok1(edit_distance_dl_static("acdef", "b") == 5);
+ ok1(edit_distance_dl_static("abcdef", "b") == 5);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_dl_static("aa", "ab") == 1);
+ ok1(edit_distance_dl_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_dl_static("aa", "ba") == 1);
+ ok1(edit_distance_dl_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_dl_static("ab", "ba") == 1);
+ ok1(edit_distance_dl_static("abc", "de") == 3);
+ ok1(edit_distance_dl_static("abc", "def") == 3);
+
+ /* (Restricted) Transposition */
+ ok1(edit_distance_dl_static("abcd", "ecbf") == 3);
+
+ /* (Unrestricted) Transposition */
+ ok1(edit_distance_dl_static("ca", "abc") == 2);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_dl_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_dl_static("aabcc", "bccdd") == 4);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_dl_static("bccdd", "aabcc") == 4);
+}
+
+int main(void)
+{
+ plan_tests(89);
+
+ test_lcs();
+ test_lev();
+ test_rdl();
+ test_dl();
+
+ /* Unsupported edit distance measure */
+ enum ed_measure badmeasure = (enum ed_measure)-1;
+ ok1(edit_distance("ab", 2, "ba", 2, badmeasure) == (ed_dist)-1);
+
+ return exit_status();
+}
--- /dev/null
+/** @file
+ * Runnable tests for the edit_distance module using custom element and
+ * distance types.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+
+#include <limits.h> /* USHRT_MAX */
+
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+
+#define ed_elem unsigned short
+#define ed_dist short
+#define ED_HASH_ELEM(e) e
+#define ED_HASH_MAX USHRT_MAX
+
+#include <ccan/edit_distance/edit_distance.c>
+#include <ccan/edit_distance/edit_distance_dl.c>
+#include <ccan/edit_distance/edit_distance_lcs.c>
+#include <ccan/edit_distance/edit_distance_lev.c>
+#include <ccan/edit_distance/edit_distance_rdl.c>
+
+int main(void)
+{
+ const unsigned short src[] = { 0, 1, 2, 3 };
+ const unsigned short tgt[] = { 4, 2, 1, 5 };
+ ed_size slen = ARRAY_SIZE(src);
+ ed_size tlen = ARRAY_SIZE(tgt);
+
+ plan_tests(4);
+
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3);
+
+ return exit_status();
+}
--- /dev/null
+/** @file
+ * Runnable tests for the edit_distance module using custom element and
+ * distance types.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+
+#include <limits.h> /* UCHAR_MAX */
+
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+
+struct color16 {
+ unsigned char r:5;
+ unsigned char g:6;
+ unsigned char b:5;
+};
+
+#define ed_elem struct color16
+#define ED_ELEM_EQUAL(e, f) (e.r == f.r && e.g == f.g && e.b == f.b)
+#define ED_HASH_ELEM(e) ((e.r << 11) | (e.g << 5) | e.b)
+#define ED_HASH_MAX USHRT_MAX
+
+#include <ccan/edit_distance/edit_distance.c>
+#include <ccan/edit_distance/edit_distance_dl.c>
+#include <ccan/edit_distance/edit_distance_lcs.c>
+#include <ccan/edit_distance/edit_distance_lev.c>
+#include <ccan/edit_distance/edit_distance_rdl.c>
+
+int main(void)
+{
+ const struct color16 src[] = {
+ {0, 0, 0},
+ {1, 1, 1},
+ {2, 2, 2},
+ {3, 3, 3}
+ };
+ const struct color16 tgt[] = {
+ {4, 4, 4},
+ {2, 2, 2},
+ {1, 1, 1},
+ {5, 5, 5}
+ };
+ ed_size slen = ARRAY_SIZE(src);
+ ed_size tlen = ARRAY_SIZE(tgt);
+
+ plan_tests(4);
+
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3);
+
+ return exit_status();
+}
--- /dev/null
+/** @file
+ * Runnable tests for the edit_distance module using custom element and
+ * distance types.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+
+#include <limits.h> /* UCHAR_MAX */
+#include <wchar.h> /* wmemchr */
+
+#include <ccan/array_size/array_size.h>
+#include <ccan/tap/tap.h>
+
+#define ed_elem wchar_t
+#define ed_dist double
+#define ED_HAS_ELEM(arr, elem, len) (wmemchr(arr, elem, len) != NULL)
+/* Since we only use ASCII characters */
+#define ED_HASH_ELEM(e) (unsigned char)e
+#define ED_HASH_MAX UCHAR_MAX
+#define ED_HASH_ON_STACK
+
+#include <ccan/edit_distance/edit_distance.c>
+#include <ccan/edit_distance/edit_distance_dl.c>
+#include <ccan/edit_distance/edit_distance_lcs.c>
+#include <ccan/edit_distance/edit_distance_lev.c>
+#include <ccan/edit_distance/edit_distance_rdl.c>
+
+int main(void)
+{
+ const wchar_t src[] = L"abcd";
+ const wchar_t tgt[] = L"ecbf";
+ ed_size slen = ARRAY_SIZE(src) - 1;
+ ed_size tlen = ARRAY_SIZE(tgt) - 1;
+
+ plan_tests(16);
+
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LCS) == 6.0);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_LEV) == 4.0);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_RDL) == 3.0);
+ ok1(edit_distance(src, slen, tgt, tlen, EDIT_DISTANCE_DL) == 3.0);
+
+ /* Test empty strings work as expected */
+ ok1(edit_distance(src, slen, tgt, 0, EDIT_DISTANCE_LCS) == 4.0);
+ ok1(edit_distance(src, 0, tgt, tlen, EDIT_DISTANCE_LCS) == 4.0);
+
+ /* Test ED_HAS_ELEM works as expected */
+ ok1(edit_distance(L"c", 1, tgt, tlen, EDIT_DISTANCE_LCS) == 3.0);
+ ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_LCS) == 5.0);
+ ok1(edit_distance(src, slen, L"c", 1, EDIT_DISTANCE_LCS) == 3.0);
+ ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_LCS) == 5.0);
+ ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_LEV) == 4.0);
+ ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_LEV) == 4.0);
+ ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_RDL) == 4.0);
+ ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_RDL) == 4.0);
+ ok1(edit_distance(L"z", 1, tgt, tlen, EDIT_DISTANCE_DL) == 4.0);
+ ok1(edit_distance(src, slen, L"z", 1, EDIT_DISTANCE_DL) == 4.0);
+
+ return exit_status();
+}
--- /dev/null
+/** @file
+ * Runnable tests for the edit_distance module using custom costs/weights.
+ *
+ * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
+ * MIT license - see LICENSE file for details
+ */
+
+#include <string.h>
+
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/tap/tap.h>
+
+#define ED_DEL_COST(e) (e == 'a' ? 2 : 1)
+#define ED_INS_COST(e) (e == 'b' ? 2 : 1)
+#define ED_SUB_COST(e, f) (f == 'c' && e == 'd' ? 3 : 1)
+#define ED_TRA_COST(e, f) (e == 'e' && f == 'f' ? 3 : 1)
+
+#include <ccan/edit_distance/edit_distance.c>
+#include <ccan/edit_distance/edit_distance_dl.c>
+#include <ccan/edit_distance/edit_distance_lcs.c>
+#include <ccan/edit_distance/edit_distance_lev.c>
+#include <ccan/edit_distance/edit_distance_rdl.c>
+
+#define edit_distance_lcs_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_LCS)
+#define edit_distance_lev_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_LEV)
+#define edit_distance_rdl_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_RDL)
+#define edit_distance_dl_static(arr1, arr2) \
+ edit_distance(arr1, sizeof arr1 - 1, arr2, sizeof arr2 - 1, \
+ EDIT_DISTANCE_DL)
+
+static void test_lcs(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_lcs_static("", "") == 0);
+ ok1(edit_distance_lcs_static("a", "") == 2);
+ ok1(edit_distance_lcs_static("", "a") == 1);
+ ok1(edit_distance_lcs_static("a", "a") == 0);
+ ok1(edit_distance_lcs_static("a", "b") == 4);
+ ok1(edit_distance_lcs_static("b", "a") == 2);
+
+ /* Trivial search cases */
+ ok1(edit_distance_lcs_static("a", "bcdef") == 8);
+ ok1(edit_distance_lcs_static("a", "bcadef") == 6);
+ ok1(edit_distance_lcs_static("acdef", "b") == 8);
+ ok1(edit_distance_lcs_static("abcdef", "b") == 6);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_lcs_static("aa", "ab") == 4);
+ ok1(edit_distance_lcs_static("ab", "aa") == 2);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_lcs_static("aa", "ba") == 4);
+ ok1(edit_distance_lcs_static("ba", "aa") == 2);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_lcs_static("ab", "ba") == 3);
+ ok1(edit_distance_lcs_static("abc", "de") == 6);
+ ok1(edit_distance_lcs_static("abc", "def") == 7);
+ ok1(edit_distance_lcs_static("de", "bdef") == 3);
+
+ /* Transposition + Insert */
+ ok1(edit_distance_lcs_static("ca", "abc") == 4);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_lcs_static("abcde", "xcdef") == 5);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_lcs_static("aabcc", "bccdd") == 6);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_lcs_static("bccdd", "aabcc") == 4);
+}
+
+static void test_lev(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_lev_static("", "") == 0);
+ ok1(edit_distance_lev_static("a", "") == 2);
+ ok1(edit_distance_lev_static("", "a") == 1);
+ ok1(edit_distance_lev_static("a", "a") == 0);
+ ok1(edit_distance_lev_static("a", "b") == 1);
+ ok1(edit_distance_lev_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_lev_static("a", "bcdef") == 5);
+ ok1(edit_distance_lev_static("a", "bcadef") == 6);
+ ok1(edit_distance_lev_static("acdef", "b") == 5);
+ ok1(edit_distance_lev_static("abcdef", "b") == 6);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_lev_static("aa", "ab") == 1);
+ ok1(edit_distance_lev_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_lev_static("aa", "ba") == 1);
+ ok1(edit_distance_lev_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_lev_static("ab", "ba") == 2);
+ ok1(edit_distance_lev_static("abc", "de") == 3);
+ ok1(edit_distance_lev_static("abc", "def") == 3);
+ ok1(edit_distance_lev_static("de", "bdef") == 3);
+
+ /* Transposition + Insert */
+ ok1(edit_distance_lev_static("ca", "abc") == 3);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_lev_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_lev_static("aabcc", "bccdd") == 5);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_lev_static("bccdd", "aabcc") == 4);
+}
+
+static void test_rdl(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_rdl_static("", "") == 0);
+ ok1(edit_distance_rdl_static("a", "") == 2);
+ ok1(edit_distance_rdl_static("", "a") == 1);
+ ok1(edit_distance_rdl_static("a", "a") == 0);
+ ok1(edit_distance_rdl_static("a", "b") == 1);
+ ok1(edit_distance_rdl_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_rdl_static("a", "bcdef") == 5);
+ ok1(edit_distance_rdl_static("a", "bcadef") == 6);
+ ok1(edit_distance_rdl_static("acdef", "b") == 5);
+ ok1(edit_distance_rdl_static("abcdef", "b") == 6);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_rdl_static("aa", "ab") == 1);
+ ok1(edit_distance_rdl_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_rdl_static("aa", "ba") == 1);
+ ok1(edit_distance_rdl_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_rdl_static("ab", "ba") == 1);
+ ok1(edit_distance_rdl_static("abc", "de") == 3);
+ ok1(edit_distance_rdl_static("abc", "def") == 3);
+ ok1(edit_distance_rdl_static("de", "bdef") == 3);
+
+ /* Transposition + Insert */
+ ok1(edit_distance_rdl_static("ca", "abc") == 3);
+
+ /* Transpose Weight */
+ ok1(edit_distance_rdl_static("ef", "fe") == 2);
+ ok1(edit_distance_rdl_static("fe", "ef") == 1);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_rdl_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_rdl_static("aabcc", "bccdd") == 5);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_rdl_static("bccdd", "aabcc") == 4);
+}
+
+static void test_dl(void)
+{
+ /* Trivial cases */
+ ok1(edit_distance_dl_static("", "") == 0);
+ ok1(edit_distance_dl_static("a", "") == 2);
+ ok1(edit_distance_dl_static("", "a") == 1);
+ ok1(edit_distance_dl_static("a", "a") == 0);
+ ok1(edit_distance_dl_static("a", "b") == 1);
+ ok1(edit_distance_dl_static("b", "a") == 1);
+
+ /* Trivial search cases */
+ ok1(edit_distance_dl_static("a", "bcdef") == 5);
+ ok1(edit_distance_dl_static("a", "bcadef") == 6);
+ ok1(edit_distance_dl_static("acdef", "b") == 5);
+ ok1(edit_distance_dl_static("abcdef", "b") == 6);
+
+ /* Common prefix with single-char distance */
+ ok1(edit_distance_dl_static("aa", "ab") == 1);
+ ok1(edit_distance_dl_static("ab", "aa") == 1);
+
+ /* Common suffix with single-char distance */
+ ok1(edit_distance_dl_static("aa", "ba") == 1);
+ ok1(edit_distance_dl_static("ba", "aa") == 1);
+
+ /* Non-optimized cases (require Wagner-Fischer matrix) */
+ ok1(edit_distance_dl_static("ab", "ba") == 1);
+ ok1(edit_distance_dl_static("abc", "de") == 3);
+ ok1(edit_distance_dl_static("abc", "def") == 3);
+ ok1(edit_distance_dl_static("de", "bdef") == 3);
+
+ /* Transposition + Insert */
+ ok1(edit_distance_dl_static("ca", "abc") == 3);
+
+ /* Transpose Weight */
+ ok1(edit_distance_dl_static("ef", "fe") == 2);
+ ok1(edit_distance_dl_static("fe", "ef") == 1);
+
+ /* Insert + Delete + Sub */
+ ok1(edit_distance_dl_static("abcde", "xcdef") == 3);
+
+ /* Distance depends on multiple deletions in final row. */
+ ok1(edit_distance_dl_static("aabcc", "bccdd") == 5);
+
+ /* Distance depends on multiple insertions in final column. */
+ ok1(edit_distance_dl_static("bccdd", "aabcc") == 4);
+}
+
+/* Test edit_distance calculation around the stack threshold to ensure memory
+ * is allocated and freed correctly and stack overflow does not occur.
+ *
+ * Note: This test is done when ED_COST_IS_SYMMETRIC is not defined so that
+ * tgt can be small to make the test run quickly (with ED_COST_IS_SYMMETRIC the
+ * min length would need to be above the threshold).
+ */
+static void test_mem_use(void)
+{
+ char tgt[] = "BC";
+ char src[ED_STACK_ELEMS + 1];
+ for (size_t i = 0; i < ED_STACK_ELEMS; ++i) {
+ src[i] = (char)('A' + (i % 26));
+ }
+ src[ED_STACK_ELEMS] = '\0';
+
+ for (ed_size tlen = 1; tlen < 3; ++tlen) {
+ ed_size slen = ED_STACK_ELEMS;
+ /* Above threshold, causes allocation */
+ ok(edit_distance_lcs(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_lcs(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_lev(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_lev(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_rdl(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_rdl(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_dl(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_dl(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+
+ /* Below threshold, no allocation */
+ --slen;
+ ok(edit_distance_lcs(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_lcs(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_lev(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_lev(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_rdl(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_rdl(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ ok(edit_distance_dl(src, slen, tgt, tlen) == slen - tlen,
+ "edit_distance_dl(\"%.3s..., %u, \"%.*s\", %u) == %u",
+ src, slen, (int)tlen, tgt, tlen, slen - tlen);
+ }
+}
+
+int main(void)
+{
+ plan_tests(109);
+
+ test_lcs();
+ test_lev();
+ test_rdl();
+ test_dl();
+
+ test_mem_use();
+
+ /* Unsupported edit distance measure */
+ enum ed_measure badmeasure = (enum ed_measure)-1;
+ ok1(edit_distance("ab", 2, "ba", 2, badmeasure) == (ed_dist)-1);
+
+ return exit_status();
+}