]> git.ozlabs.org Git - ccan/blob - ccan/edit_distance/edit_distance_dl.c
base64: fix for unsigned chars (e.g. ARM).
[ccan] / ccan / edit_distance / edit_distance_dl.c
1 /** @file
2  * Defines Damerau-Levenshtein distance functions.
3  *
4  * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
5  *            MIT license - see LICENSE file for details
6  */
7 #include <stdlib.h>             /* free, malloc */
8
9 #include "edit_distance.h"
10 #include "edit_distance-params.h"
11 #include "edit_distance-private.h"
12
13 ed_dist edit_distance_dl(const ed_elem *src, ed_size slen,
14                          const ed_elem *tgt, ed_size tlen)
15 {
16         /* Optimization: Avoid malloc when distance matrix can fit on the stack.
17          */
18         ed_dist stackdist[ED_STACK_DIST_VALS];
19
20         /* Lowrance-Wagner distance matrix, in row-major order. */
21         size_t matsize = ((size_t)slen + 2) * (tlen + 2);
22         ed_dist *distmem = matsize <= ED_STACK_DIST_VALS ? stackdist :
23             malloc(matsize * sizeof(ed_dist));
24         ed_dist *dist = distmem;
25
26 #ifdef ED_HASH_ON_STACK
27         ed_size lasttgt[ED_HASH_MAX + 1] = { 0 };
28 #else
29         ed_size *lasttgt = calloc(ED_HASH_MAX + 1, sizeof(ed_size));
30 #endif
31
32         /* Upper bound on distance between strings. */
33         ed_dist maxdist = 0;
34
35 #ifdef ED_DEL_COST_CONST
36         maxdist += (ed_dist)slen *ED_DEL_COST();
37 #else
38         /* Lower-triangular matrix of deletion costs.
39          * delcost[i2, i1] is cost to delete src[i1..i2-1].
40          * delcost[i, i] is 0. */
41         ed_dist *delcost = malloc(ED_TMAT_SIZE(slen + 1) * sizeof(ed_dist));
42         ed_dist *delcostitr = delcost;
43         ed_dist *delcostprevitr = delcost;
44         ed_size i2, i1;
45         *delcostitr++ = 0;
46         for (i2 = 1; i2 <= slen; ++i2) {
47                 ed_dist costi2 = ED_DEL_COST(src[i2 - 1]);
48                 for (i1 = 0; i1 < i2; ++i1) {
49                         *delcostitr++ = *delcostprevitr++ + costi2;
50                 }
51                 *delcostitr++ = 0;
52         }
53         maxdist += delcost[ED_TMAT_IND(slen, 0)];
54 #endif
55
56 #ifdef ED_INS_COST_CONST
57         maxdist += (ed_dist)tlen *ED_INS_COST();
58 #else
59         /* Lower-triangular matrix of insertion costs.
60          * inscost[j2, j1] is cost to insert tgt[j1..j2-1].
61          * inscost[j, j] is 0. */
62         ed_dist *inscost = malloc(ED_TMAT_SIZE(tlen + 1) * sizeof(ed_dist));
63         ed_dist *inscostitr = inscost;
64         ed_dist *inscostprevitr = inscost;
65         ed_size j2, j1;
66         *inscostitr++ = 0;
67         for (j2 = 1; j2 <= tlen; ++j2) {
68                 ed_dist costj2 = ED_INS_COST(tgt[j2 - 1]);
69                 for (j1 = 0; j1 < j2; ++j1) {
70                         *inscostitr++ = *inscostprevitr++ + costj2;
71                 }
72                 *inscostitr++ = 0;
73         }
74         maxdist += inscost[ED_TMAT_IND(tlen, 0)];
75 #endif
76
77         /* Initialize first row with maximal cost */
78         ed_size i, j;
79         for (i = 0; i < slen + 2; ++i) {
80                 dist[i] = maxdist;
81         }
82
83         /* Position dist to match other algorithms.  dist[-1] will be maxdist */
84         dist += slen + 3;
85
86         /* Initialize row with cost to delete src[0..i-1] */
87         dist[-1] = maxdist;
88         dist[0] = 0;
89         for (i = 1; i <= slen; ++i) {
90                 dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
91         }
92
93         for (j = 1; j <= tlen; ++j) {
94                 /* Largest y < i such that src[y] = tgt[j] */
95                 ed_size lastsrc = 0;
96                 ed_dist *prevdist = dist;
97                 dist += slen + 2;
98                 dist[-1] = maxdist;
99                 dist[0] = prevdist[0] + ED_INS_COST(tgt[j - 1]);
100
101                 /* Loop invariant: dist[i] is the edit distance between first j
102                  * elements of tgt and first i elements of src.
103                  *
104                  * Loop invariant: lasttgt[ED_HASH_ELEM(c)] holds the largest
105                  * x < j such that tgt[x-1] = c or 0 if no such x exists.
106                  */
107                 ed_size i;
108                 for (i = 1; i <= slen; ++i) {
109                         ed_size i1 = lastsrc;
110                         ed_size j1 = lasttgt[ED_HASH_ELEM(src[i - 1])];
111
112                         if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
113                                 /* Same as tgt upto j-2, src upto i-2. */
114                                 dist[i] = prevdist[i - 1];
115                                 lastsrc = i;
116                         } else {
117                                 /* Insertion is tgt upto j-2, src upto i-1
118                                  * + insert tgt[j-1] */
119                                 ed_dist insdist =
120                                     prevdist[i] + ED_INS_COST(tgt[j - 1]);
121
122                                 /* Deletion is tgt upto j-1, src upto i-2
123                                  * + delete src[i-1] */
124                                 ed_dist deldist =
125                                     dist[i - 1] + ED_DEL_COST(src[i - 1]);
126
127                                 /* Substitution is tgt upto j-2, src upto i-2
128                                  * + substitute tgt[j-1] for src[i-1] */
129                                 ed_dist subdist = prevdist[i - 1] +
130                                     ED_SUB_COST(src[i - 1], tgt[j - 1]);
131
132                                 /* Use best distance available */
133                                 dist[i] = ED_MIN3(insdist, deldist, subdist);
134
135                                 ed_dist swpdist =
136                                     distmem[(size_t)j1 * (slen + 2) + i1];
137 #ifdef ED_INS_COST_CONST
138                                 swpdist +=
139                                     (ed_dist)(j - j1 - 1) * ED_INS_COST();
140 #else
141                                 swpdist += inscost[ED_TMAT_IND(j - 1, j1)];
142 #endif
143 #ifdef ED_TRA_COST_CONST
144                                 swpdist += ED_TRA_COST(,);
145 #else
146                                 if (i1 > 0) {
147                                         swpdist +=
148                                             ED_TRA_COST(src[i1 - 1],
149                                                         src[i - 1]);
150                                 }
151 #endif
152 #ifdef ED_DEL_COST_CONST
153                                 swpdist +=
154                                     (ed_dist)(i - i1 - 1) * ED_DEL_COST();
155 #else
156                                 swpdist += delcost[ED_TMAT_IND(i - 1, i1)];
157 #endif
158
159                                 dist[i] = ED_MIN2(dist[i], swpdist);
160                         }
161                 }
162
163                 lasttgt[ED_HASH_ELEM(tgt[j - 1])] = j;
164         }
165
166 #ifndef ED_HASH_ON_STACK
167         free(lasttgt);
168 #endif
169
170 #ifndef ED_DEL_COST_CONST
171         free(delcost);
172 #endif
173
174 #ifndef ED_INS_COST_CONST
175         free(inscost);
176 #endif
177
178         ed_dist total = dist[slen];
179         if (distmem != stackdist) {
180                 free(distmem);
181         }
182         return total;
183 }