]> git.ozlabs.org Git - ccan/blob - ccan/edit_distance/edit_distance_lcs.c
edit_distance: calculate edit distance between strings
[ccan] / ccan / edit_distance / edit_distance_lcs.c
1 /** @file
2  * Defines Longest Common Subsequence distance functions.
3  *
4  * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
5  *            MIT license - see LICENSE file for details
6  */
7 #include <stdlib.h>             /* free, malloc */
8
9 #include "edit_distance.h"
10 #include "edit_distance-params.h"
11 #include "edit_distance-private.h"
12
13 ed_dist edit_distance_lcs(const ed_elem *src, ed_size slen,
14                           const ed_elem *tgt, ed_size tlen)
15 {
16         /* Optimization: Avoid malloc when row of distance matrix can fit on
17          * the stack.
18          */
19         ed_dist stackdist[ED_STACK_ELEMS];
20
21         /* One row of the Wagner-Fischer distance matrix. */
22         ed_dist *dist = slen < ED_STACK_ELEMS ? stackdist :
23             malloc((slen + 1) * sizeof(ed_dist));
24
25         /* Initialize row with cost to delete src[0..i-1] */
26         dist[0] = 0;
27         for (ed_size i = 1; i <= slen; ++i) {
28                 dist[i] = dist[i - 1] + ED_DEL_COST(src[i - 1]);
29         }
30
31         for (ed_size j = 1; j <= tlen; ++j) {
32                 /* Value for dist[j-1][i-1] (one row up, one col left). */
33                 ed_dist diagdist = dist[0];
34                 dist[0] = dist[0] + ED_INS_COST(tgt[j - 1]);
35
36                 /* Loop invariant: dist[i] is the edit distance between first j
37                  * elements of tgt and first i elements of src.
38                  */
39                 for (ed_size i = 1; i <= slen; ++i) {
40                         ed_dist nextdiagdist = dist[i];
41
42                         if (ED_ELEM_EQUAL(src[i - 1], tgt[j - 1])) {
43                                 /* Same as tgt upto j-2, src upto i-2. */
44                                 dist[i] = diagdist;
45                         } else {
46                                 /* Insertion is tgt upto j-2, src upto i-1
47                                  * + insert tgt[j-1] */
48                                 ed_dist insdist =
49                                     dist[i] + ED_INS_COST(tgt[j - 1]);
50
51                                 /* Deletion is tgt upto j-1, src upto i-2
52                                  * + delete src[i-1] */
53                                 ed_dist deldist =
54                                     dist[i - 1] + ED_DEL_COST(src[i - 1]);
55
56                                 /* Use best distance available */
57                                 dist[i] = ED_MIN2(insdist, deldist);
58                         }
59
60                         diagdist = nextdiagdist;
61                 }
62         }
63
64         ed_dist total = dist[slen];
65         if (dist != stackdist) {
66                 free(dist);
67         }
68         return total;
69 }