2 * Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
34 unsigned char *cur; /* End of string; insertion point for new bytes */
35 unsigned char *end; /* End of buffer */
36 unsigned char *start; /* Beginning of string */
39 /* sb is evaluated multiple times in these macros. */
40 #define sb_size(sb) ((size_t)((sb)->cur - (sb)->start))
41 #define sb_avail(sb) ((size_t)((sb)->end - (sb)->cur))
43 /* sb and need may be evaluated multiple times. */
44 #define sb_need(sb, need) do { \
45 if (sb_avail(sb) < (need)) \
46 if (sb_grow(sb, need) != 0) \
50 static int sb_init(SB *sb)
52 sb->start = malloc(17);
53 if (sb->start == NULL)
56 sb->end = sb->start + 16;
60 static int sb_grow(SB *sb, size_t need)
62 size_t length = sb->cur - sb->start;
63 size_t alloc = sb->end - sb->start;
68 } while (alloc < length + need);
70 tmp = realloc(sb->start, alloc + 1);
74 sb->cur = tmp + length;
75 sb->end = tmp + alloc;
79 static int sb_putc(SB *sb, unsigned char c)
89 static int sb_write(SB *sb, const void *data, size_t size)
92 memcpy(sb->cur, data, size);
100 static void sb_return(SB *sb, void **data_out, size_t *length_out)
104 *data_out = sb->start;
108 *length_out = sb->cur - sb->start;
111 static void sb_discard(SB *sb, void **data_out, size_t *length_out)
121 * The first byte in a patch is the "patch type", which indicates how the
122 * patch is formatted. This keeps the patch format flexible while retaining
123 * backward compatibility. Patches produced with an older version of
124 * the library can be applied with a newer version.
127 * Contains nothing more than the content of the new text.
130 * A string of copy, skip, and insert instructions for generating the new
131 * string from the old.
133 * copy(size): Copy @size bytes from old to new.
134 * skip(size): Skip @size bytes of old.
135 * insert(text): Insert @text into new.
137 * The syntax is as follows:
139 * copy: instruction_byte(1) size
140 * skip: instruction_byte(2) size
141 * insert: instruction_byte(3) size $size*OCTET
143 * 0 <= size_param_length <= 4
144 * instruction_byte(op) = op | size_param_length << 2
145 * size: $size_param_length*OCTET
146 * -- size is an unsigned integer encoded in big endian.
147 * -- However, if size_param_length is 0, the operation size is 1.
149 * Simply put, an instruction starts with an opcode and operation size.
150 * An insert instruction is followed by the bytes to be inserted.
152 #define PT_LITERAL 10
159 static unsigned int bytes_needed_for_size(uint32_t size)
163 else if (size <= 0xFF)
165 else if (size <= 0xFFFF)
167 else if (size <= 0xFFFFFF)
177 * BDELTA_MEMORY: Memory allocation failed
179 static BDELTAcode csi32_emit_op(SB *patch_out, int op, uint32_t size, const char **new_)
182 unsigned int size_param_length;
186 assert(op >= 1 && op <= 3);
190 size_param_length = bytes_needed_for_size(size);
192 need = 1 + size_param_length;
195 sb_need(patch_out, need);
197 *patch_out->cur++ = (unsigned int)op | size_param_length << 2;
198 for (i = size_param_length, tmp = size; i-- > 0; tmp >>= 8)
199 patch_out->cur[i] = tmp & 0xFF;
200 patch_out->cur += size_param_length;
209 memcpy(patch_out->cur, *new_, size);
210 patch_out->cur += size;
220 return BDELTA_MEMORY;
224 * On success, returns 1, advances *sp past the parsed text, and sets *op_out and *size_out.
225 * On error or EOF, returns 0.
227 static int csi32_parse_op(
228 const unsigned char **sp, const unsigned char *e,
229 int *op_out, uint32_t *size_out)
231 const unsigned char *s = *sp;
234 unsigned int size_param_length;
240 size_param_length = *s >> 2;
242 if (op == 0 || size_param_length > 4)
245 if (size_param_length == 0) {
248 if ((size_t)(e - s) < size_param_length)
251 for (i = 0; i < size_param_length; i++) {
257 /* Make sure insert data fits in the patch, but don't consume it. */
258 if (op == OP_INSERT && (size_t)(e - s) < size)
268 * bdelta uses the algorithm described in:
270 * Myers, E. (1986). An O(ND) Difference Algorithm and Its Variations.
271 * Retrieved from http://www.xmailserver.org/diff2.pdf
273 * The pseudocode in Myers' paper (Figure 2) uses an array called V,
274 * where (V[k], V[k] - k) is the endpoint of the furthest-reaching
275 * D-path ending on diagonal k.
277 * The structure below holds the V array for every iteration of the outer loop.
278 * Because each iteration produces D+1 values, a triangle is formed:
281 * -5 -4 -3 -2 -1 0 1 2 3 4 5
282 * ----------------------------------
286 * | \ skip 1, then copy 1
288 * D | / insert 1, then copy 2
290 * | \ skip 1, then copy 1
295 * @data will literally contain: 0 0 1 2 2 3 3 4 5 5 3 4 5 7 7 3 4 5 7
297 * To convert this to an edit script, we first climb back to the top,
298 * using the same procedure as was used when the triangle was generated:
300 * If k = -D, climb right (the only way we can go).
301 * If k = +D, climb left (the only way we can go).
302 * Otherwise, favor the greater number.
303 * If the numbers are the same, climb left.
305 * Finally, we convert the descent to the solution to a patch script:
307 * The top number n corresponds to:
310 * A descent left from a to b corresponds to:
314 * A descent right from a to b corresponds to:
323 uint32_t *solution_ptr;
330 * BDELTA_MEMORY: Memory allocation failed
331 * BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
333 static BDELTAcode build_triangle(
334 const char *old, uint32_t old_size,
335 const char *new_, uint32_t new_size,
337 Triangle *triangle_out)
342 uint32_t *vprev; /* position within previous row */
343 uint32_t *v; /* position within current row */
344 uint32_t *vcur; /* beginning of current row */
345 size_t data_alloc = 16;
347 memset(triangle_out, 0, sizeof(*triangle_out));
349 data = malloc(data_alloc * sizeof(*data));
351 return BDELTA_MEMORY;
353 /* Allow dmax < 0 to mean "no limit". */
355 d_max = old_size + new_size;
358 * Compute the farthest-reaching 0-path so the loop after this
359 * will have a "previous" row to start with.
361 for (x = 0; x < old_size && x < new_size && old[x] == new_[x]; )
364 if (x >= old_size && x >= new_size) {
365 /* Strings are equal, so return a triangle with one row (a dot). */
366 assert(x == old_size && x == new_size);
367 triangle_out->data = data;
368 triangle_out->solution_d = 0;
369 triangle_out->solution_k = 0;
370 triangle_out->solution_ptr = data;
377 * Here is the core of the Myers diff algorithm.
379 * This is a direct translation of the pseudocode in Myers' paper,
380 * with implementation-specific adaptations:
382 * * Every V array is preserved per iteration of the outer loop.
383 * This is necessary so we can determine the actual patch, not just
384 * the length of the shortest edit string. See the coment above
385 * the definition of Triangle for an in-depth explanation.
387 * * Array items are stored consecutively so as to not waste space.
389 * * The buffer holding the V arrays is expanded dynamically.
391 for (d = 1; d <= d_max; d++, vprev = vcur, vcur = v) {
392 /* Ensure that the buffer has enough room for this row. */
393 if ((size_t)(v - data + d + 1) > data_alloc) {
394 size_t vprev_idx = vprev - data;
395 size_t v_idx = v - data;
396 size_t vcur_idx = vcur - data;
401 } while ((size_t)(v - data + d + 1) > data_alloc);
403 tmp = realloc(data, data_alloc * sizeof(*data));
406 return BDELTA_MEMORY;
410 /* Relocate pointers to the buffer we just expanded. */
411 vprev = data + vprev_idx;
413 vcur = data + vcur_idx;
416 for (k = -d; k <= d; k += 2, vprev++) {
417 if (k == -d || (k != d && vprev[-1] < vprev[0]))
422 while (x < old_size && y < new_size && old[x] == new_[y])
425 if (x >= old_size && y >= new_size) {
426 /* Shortest edit string found. */
427 assert(x == old_size && y == new_size);
428 triangle_out->data = data;
429 triangle_out->solution_d = d;
430 triangle_out->solution_k = k;
431 triangle_out->solution_ptr = v - 1;
438 return BDELTA_INTERNAL_DMAX_EXCEEDED;
442 * Trace a solution back to the top, returning a string of instructions
443 * for descending from the top to the solution.
445 * An instruction is one of the following:
449 * 0: Finished. You should be at the solution now.
451 * If memory allocation fails, this function will return NULL.
453 static signed char *climb_triangle(const Triangle *triangle)
455 signed char *descent;
459 assert(triangle->solution_d >= 0);
461 descent = malloc(triangle->solution_d + 1);
464 d = triangle->solution_d;
465 k = triangle->solution_k;
466 p = triangle->solution_ptr;
470 if (k == -d || (k != d && *(p-d-1) < *(p-d))) {
487 * Generate the actual patch, given data produced by build_triangle and
488 * climb_triangle. new_ is needed for the content of the insertions.
490 * See the comment above the definition of Triangle. It concisely documents
491 * how a descent down the triangle corresponds to a patch script.
493 * The resulting patch, including the patch type byte, is appended to patch_out.
498 * BDELTA_MEMORY: Memory allocation failed
500 static BDELTAcode descent_to_patch(
501 const signed char *descent,
502 const Triangle *triangle,
503 const char *new_, uint32_t new_size,
506 const char *new_end = new_ + new_size;
507 uint32_t *p = triangle->data;
513 uint32_t pending_length = 0;
514 uint32_t copy_length;
516 if (sb_putc(patch_out, PT_CSI32) != 0)
517 return BDELTA_MEMORY;
520 if (csi32_emit_op(patch_out, OP_COPY, *p, &new_) != BDELTA_OK)
521 return BDELTA_MEMORY;
524 for (; *descent != 0; descent++, p = p2) {
530 current_op = OP_INSERT;
532 copy_length = *p2 - *p;
538 current_op = OP_SKIP;
540 copy_length = *p2 - *p - 1;
543 if (pending_op == current_op) {
546 if (pending_op != 0) {
547 if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
548 return BDELTA_MEMORY;
550 pending_op = current_op;
554 if (copy_length > 0) {
555 if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
556 return BDELTA_MEMORY;
558 if (csi32_emit_op(patch_out, OP_COPY, copy_length, &new_) != BDELTA_OK)
559 return BDELTA_MEMORY;
562 assert(d == triangle->solution_d);
563 assert(k == triangle->solution_k);
564 assert(p == triangle->solution_ptr);
566 /* Emit the last pending op, unless it's a skip. */
567 if (pending_op != 0 && pending_op != OP_SKIP) {
568 if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
569 return BDELTA_MEMORY;
572 assert(new_ == new_end);
577 * Generate a patch using Myers' O(ND) algorithm.
579 * The patch is appended to @patch_out, which must be initialized before calling.
584 * BDELTA_MEMORY: Memory allocation failed
585 * BDELTA_INTERNAL_INPUTS_TOO_LARGE: Input sizes are too large
586 * BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
588 static BDELTAcode diff_myers(
589 const char *old, size_t old_size,
590 const char *new_, size_t new_size,
594 signed char *descent;
597 /* Make sure old_size + new_size does not overflow int or uint32_t. */
598 if (old_size >= UINT32_MAX ||
599 new_size >= UINT32_MAX - old_size ||
600 old_size >= (unsigned int)INT_MAX ||
601 new_size >= (unsigned int)INT_MAX - old_size)
602 return BDELTA_INTERNAL_INPUTS_TOO_LARGE;
604 rc = build_triangle(old, old_size, new_, new_size, 1000, &triangle);
608 descent = climb_triangle(&triangle);
612 if (descent_to_patch(descent, &triangle, new_, new_size, patch_out) != BDELTA_OK)
623 return BDELTA_MEMORY;
626 BDELTAcode bdelta_diff(
627 const void *old, size_t old_size,
628 const void *new_, size_t new_size,
629 void **patch_out, size_t *patch_size_out)
633 if (sb_init(&patch) != 0)
637 goto emit_new_literally;
639 if (diff_myers(old, old_size, new_, new_size, &patch) != BDELTA_OK)
640 goto emit_new_literally;
642 if (sb_size(&patch) > new_size) {
644 * A literal copy of new is no longer than this patch.
645 * All that for nothing.
647 goto emit_new_literally;
651 * Verify that patch, when applied to old, produces the correct text.
652 * If it doesn't, it's a bug, but fall back to a simple emit
653 * to avert data corruption.
663 patch.start, patch.cur - patch.start,
664 &result, &result_size
669 correct = (result_size == new_size &&
670 memcmp(result, new_, new_size) == 0);
684 goto emit_new_literally;
688 sb_return(&patch, patch_out, patch_size_out);
692 if (patch.cur != patch.start) {
694 if (sb_init(&patch) != 0)
697 if (sb_putc(&patch, PT_LITERAL) != 0 || sb_write(&patch, new_, new_size) != 0)
699 sb_return(&patch, patch_out, patch_size_out);
703 sb_discard(&patch, patch_out, patch_size_out);
704 return BDELTA_MEMORY;
711 * BDELTA_PATCH_INVALID: Patch is malformed
712 * BDELTA_PATCH_MISMATCH: Old string is too small
713 * BDELTA_MEMORY: Memory allocation failed
715 static BDELTAcode patch_csi32(
716 const unsigned char *o, const unsigned char *oe,
717 const unsigned char *p, const unsigned char *pe,
723 while (csi32_parse_op(&p, pe, &op, &size)) {
724 if ((op == OP_COPY || op == OP_SKIP) && (size_t)(oe - o) < size) {
725 /* Copy or skip instruction exceeds length of old string. */
726 return BDELTA_PATCH_MISMATCH;
728 if (op == OP_COPY || op == OP_INSERT)
729 sb_need(new_out, size);
732 case OP_COPY: /* Copy @size bytes from old string. */
733 memcpy(new_out->cur, o, size);
734 new_out->cur += size;
738 case OP_SKIP: /* Skip @size bytes of old string. */
742 case OP_INSERT: /* Insert @size new bytes (from the patch script). */
743 memcpy(new_out->cur, p, size);
744 new_out->cur += size;
753 return BDELTA_PATCH_INVALID;
758 return BDELTA_MEMORY;
761 BDELTAcode bdelta_patch(
762 const void *old, size_t old_size,
763 const void *patch, size_t patch_size,
764 void **new_out, size_t *new_size_out)
766 const unsigned char *o = old;
767 const unsigned char *oe = o + old_size;
768 const unsigned char *p = patch;
769 const unsigned char *pe = p + patch_size;
773 if (sb_init(&result) != 0) {
779 rc = BDELTA_PATCH_INVALID;
785 if (sb_write(&result, p, pe - p) != 0) {
792 rc = patch_csi32(o, oe, p, pe, &result);
798 rc = BDELTA_PATCH_INVALID;
802 sb_return(&result, new_out, new_size_out);
806 sb_discard(&result, new_out, new_size_out);
810 const char *bdelta_strerror(BDELTAcode code)
816 return "Could not allocate memory";
817 case BDELTA_PATCH_INVALID:
818 return "Patch is invalid";
819 case BDELTA_PATCH_MISMATCH:
820 return "Patch applied to wrong data";
822 case BDELTA_INTERNAL_DMAX_EXCEEDED:
823 return "Difference threshold exceeded (internal error)";
824 case BDELTA_INTERNAL_INPUTS_TOO_LARGE:
825 return "Inputs are too large (internal error)";
828 return "Invalid error code";
832 void bdelta_perror(const char *s, BDELTAcode code)
834 if (s != NULL && *s != '\0')
835 fprintf(stderr, "%s: %s\n", s, bdelta_strerror(code));
837 fprintf(stderr, "%s\n", bdelta_strerror(code));