2 * Defines Damerau-Levenshtein distance functions.
4 * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
5 * MIT license - see LICENSE file for details
7 #include <stdlib.h> /* free, malloc */
9 #include "edit_distance.h"
10 #include "edit_distance-params.h"
11 #include "edit_distance-private.h"
13 ed_dist edit_distance_dl(const ed_elem *src, ed_size slen,
14 const ed_elem *tgt, ed_size tlen)
16 /* Optimization: Avoid malloc when distance matrix can fit on the stack.
18 ed_dist stackdist[ED_STACK_DIST_VALS];
20 /* Lowrance-Wagner distance matrix, in row-major order. */
21 size_t matsize = ((size_t)slen + 2) * (tlen + 2);
22 ed_dist *distmem = matsize <= ED_STACK_DIST_VALS ? stackdist :
23 malloc(matsize * sizeof(ed_dist));
24 ed_dist *dist = distmem;
26 #ifdef ED_HASH_ON_STACK
27 ed_size lasttgt[ED_HASH_MAX + 1] = { 0 };
29 ed_size *lasttgt = calloc(ED_HASH_MAX + 1, sizeof(ed_size));
32 /* Upper bound on distance between strings. */
35 #ifdef ED_DEL_COST_CONST
36 maxdist += (ed_dist)slen *ED_DEL_COST();
38 /* Lower-triangular matrix of deletion costs.
39 * delcost[i2, i1] is cost to delete src[i1..i2-1].
40 * delcost[i, i] is 0. */
41 ed_dist *delcost = malloc(ED_TMAT_SIZE(slen + 1) * sizeof(ed_dist));
42 ed_dist *delcostitr = delcost;
43 ed_dist *delcostprevitr = delcost;
45 for (ed_size i2 = 1; i2 <= slen; ++i2) {
46 ed_dist costi2 = ED_DEL_COST(src[i2 - 1]);
47 for (ed_size i1 = 0; i1 < i2; ++i1) {
48 *delcostitr++ = *delcostprevitr++ + costi2;
52 maxdist += delcost[ED_TMAT_IND(slen, 0)];
55 #ifdef ED_INS_COST_CONST
56 maxdist += (ed_dist)tlen *ED_INS_COST();
58 /* Lower-triangular matrix of insertion costs.
59 * inscost[j2, j1] is cost to insert tgt[j1..j2-1].
60 * inscost[j, j] is 0. */
61 ed_dist *inscost = malloc(ED_TMAT_SIZE(tlen + 1) * sizeof(ed_dist));
62 ed_dist *inscostitr = inscost;
63 ed_dist *inscostprevitr = inscost;
65 for (ed_size j2 = 1; j2 <= tlen; ++j2) {
66 ed_dist costj2 = ED_INS_COST(tgt[j2 - 1]);
67 for (ed_size j1 = 0; j1 < j2; ++j1) {
68 *inscostitr++ = *inscostprevitr++ + costj2;
72 maxdist += inscost[ED_TMAT_IND(tlen, 0)];
75 /* Initialize first row with maximal cost */
76 for (ed_size i = 0; i < slen + 2; ++i) {
80 /* Position dist to match other algorithms. dist[-1] will be maxdist */
83 /* Initialize row with cost to delete src[0..i-1] */
86 for (ed_size i = 1; i <= slen; ++i) {
87 dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
90 for (ed_size j = 1; j <= tlen; ++j) {
91 /* Largest y < i such that src[y] = tgt[j] */
93 ed_dist *prevdist = dist;
96 dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]);
98 /* Loop invariant: dist[i] is the edit distance between first j
99 * elements of tgt and first i elements of src.
101 * Loop invariant: lasttgt[ED_HASH_ELEM(c)] holds the largest
102 * x < j such that tgt[x-1] = c or 0 if no such x exists.
104 for (ed_size i = 1; i <= slen; ++i) {
105 ed_size i1 = lastsrc;
106 ed_size j1 = lasttgt[ED_HASH_ELEM(src[i - 1])];
108 if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
109 /* Same as tgt upto j-2, src upto i-2. */
110 dist[i] = prevdist[i - 1];
113 /* Insertion is tgt upto j-2, src upto i-1
114 * + insert tgt[j-1] */
116 prevdist[i] + ED_INS_COST(tgt[j - 1]);
118 /* Deletion is tgt upto j-1, src upto i-2
119 * + delete src[i-1] */
121 dist[i - 1] + ED_DEL_COST(src[i - 1]);
123 /* Substitution is tgt upto j-2, src upto i-2
124 * + substitute tgt[j-1] for src[i-1] */
125 ed_dist subdist = prevdist[i - 1] +
126 ED_SUB_COST(src[i - 1], tgt[j - 1]);
128 /* Use best distance available */
129 dist[i] = ED_MIN3(insdist, deldist, subdist);
132 distmem[(size_t)j1 * (slen + 2) + i1];
133 #ifdef ED_INS_COST_CONST
135 (ed_dist)(j - j1 - 1) * ED_INS_COST();
137 swpdist += inscost[ED_TMAT_IND(j - 1, j1)];
139 #ifdef ED_TRA_COST_CONST
140 swpdist += ED_TRA_COST(,);
144 ED_TRA_COST(src[i1 - 1],
148 #ifdef ED_DEL_COST_CONST
150 (ed_dist)(i - i1 - 1) * ED_DEL_COST();
152 swpdist += delcost[ED_TMAT_IND(i - 1, i1)];
155 dist[i] = ED_MIN2(dist[i], swpdist);
159 lasttgt[ED_HASH_ELEM(tgt[j - 1])] = j;
162 #ifndef ED_HASH_ON_STACK
166 #ifndef ED_DEL_COST_CONST
170 #ifndef ED_INS_COST_CONST
174 ed_dist total = dist[slen];
175 if (distmem != stackdist) {