2 * Defines the tunable parameters for the edit_distance functions which can
3 * be changed at module-compile-time.
5 * The macros in this header can be redefined, either by editing this file or
6 * providing external definitions. This makes it possible to calculate the
7 * weighted edit distance with arbitrary weighting.
9 * @copyright 2016 Kevin Locke <kevin@kevinlocke.name>
10 * MIT license - see LICENSE file for details
12 #ifndef CCAN_EDIT_DISTANCE_PARAMS_H
13 #define CCAN_EDIT_DISTANCE_PARAMS_H
16 # include <string.h> /* memchr */
20 # include <limits.h> /* UCHAR_MAX */
23 #if !defined(ED_DEL_COST) && \
24 !defined(ED_INS_COST) && \
25 !defined(ED_SUB_COST) && \
27 /** Defined if costs are symmetric.
28 * (i.e. <code>distance(a, b) == distance(b, a) forall a,b</code>) */
29 # define ED_COST_IS_SYMMETRIC
33 /** Cost to delete element @p e. */
34 # define ED_DEL_COST(e) 1
35 /** Defined if the cost to delete is the same for all elements. */
36 # define ED_DEL_COST_CONST
40 /** Cost to insert element @p e. */
41 # define ED_INS_COST(e) 1
42 /** Defined if the cost to insert is the same for all elements. */
43 # define ED_INS_COST_CONST
47 /** Cost to substitute element @p f for element @p e (i.e. replace @p e with
50 * All algorithms which use this macro expect that <code>ED_SUB_COST(e, f) +
51 * ED_SUB_COST(f, g) >= ED_SUB_COST(e, g)</code>. They do not check for any
52 * substitution chains and will not find minimal distances when this assumption
55 * Although it is possible to set <code>ED_SUB_COST(e, f) >= ED_DEL_COST(e) +
56 * ED_INS_COST(f)</code>. If this is true for all @p e and @p f, consider
57 * using edit_distance_lcs() instead of edit_distance_lev() which calculates
58 * the same result more quickly by not considering substitutions which are
59 * never less costly than delete + insert.
61 # define ED_SUB_COST(e, f) 1
62 /** Defined if the cost to substitute is the same for all element pairs. */
63 # define ED_SUB_COST_CONST
67 /** Cost to transpose elements @p ef to @p fe.
69 * @c edit_distance_dl requires <code>2 * ED_TRA_COST(e, f) >= ED_INS_COST(g) +
70 * ED_DEL_COST(h)</code> for all @c e,f,g,h. See Trace Theorem 3 of
71 * Wagner-Lowrance. @cite Wagner75
73 # define ED_TRA_COST(e, f) 1
74 /** Defined if the cost to transpose is the same for all element pairs. */
75 # define ED_TRA_COST_CONST
79 /** Optimized array search function for single-array-heavy workloads.
81 * The expression generated by this macro must evaluate to @c true in a boolean
82 * context when @p elem is contained in the first @p len elements of @p arr and
83 * @c false otherwise. @p len is guaranteed to be greater than 0.
85 * This macro is currently only used when #ED_INS_COST_CONST and
86 * #ED_SUB_COST_CONST are defined.
89 * Only define this macro when the search function is highly optimized and when
90 * single-element arrays are expected to be passed frequently, because it adds
91 * a conditional branch to each execution of the distance functions with
92 * non-empty arguments.
94 # define ED_HAS_ELEM(arr, elem, len) (memchr(arr, elem, len) != NULL)
98 /** Are two elements equal? */
99 # define ED_ELEM_EQUAL(elem1, elem2) (elem1 == elem2)
102 #if !defined(ED_HASH_ELEM) && !defined(ed_elem)
103 /** Hash an element to a unique value in <code>[0, ED_HASH_MAX]</code>.
105 * Note: This module does not use a "real" dictionary. Hash collisions are
106 * not expected nor supported. If elements can not be uniquely mapped to a
107 * reasonable range, consider implementing a "real" dictionary.
109 # define ED_HASH_ELEM(e) ((unsigned char)e)
110 /** Maximum value that can be returned from #ED_HASH_ELEM */
111 # define ED_HASH_MAX UCHAR_MAX
112 /** Can an array of #ED_HASH_MAX ::ed_dist values be stored on the stack?
113 * @see #ED_STACK_DIST_VALS
115 # define ED_HASH_ON_STACK
118 #ifndef ED_STACK_DIST_VALS
119 /** Maximum number of ::ed_dist values that will be allocated on the stack.
121 * The edit distance algorithms which use a dynamic programming (all currently
122 * supported algorithms) can store intermediate values on the stack to avoid
123 * the overhead of calling @c malloc. This macro defines the maximum number
124 * of intermediate distance values which can be stored on the stack for a
125 * single function call. It should be large enough to cover common cases,
126 * where possible, and small enough to avoid overflowing the stack or frame
127 * size limits. The algorithms have the following requirements:
129 * - ed_measure::EDIT_DISTANCE_LCS and ed_measure::EDIT_DISTANCE_LEV require
130 * @c min(slen,tlen) values when #ED_COST_IS_SYMMETRIC is defined, @c slen
132 * - ed_measure::EDIT_DISTANCE_RDL requires @c 2*min(slen,tlen) values when
133 * #ED_COST_IS_SYMMETRIC is defined, @c 2*slen otherwise.
134 * - ed_measure::EDIT_DISTANCE_DL requires @c slen*tlen values (in addition to
135 * the #ED_HASH_MAX values stored on the stack if #ED_HASH_ON_STACK is
138 * This value does not need to be a tight bound, since when @c slen*tlen is
139 * greater than around 10,000 the algorithm cost will dominate the @c malloc
142 * @see #ED_HASH_ON_STACK
144 # define ED_STACK_DIST_VALS 512