Find the longest match in the suffix-sorted array

Fixes #2

The current implementation performs a binary search through the
suffix-sorted array to find byte sequence matches between old and new
files. However, the algorithm is not optimal when it repeatedly matches
very short strings, leading to performance issues as reported in issue
 #2.

This commit changes the algorithm to consider *all* matches encountered
during the binary search and choose the longest of these matches.

The overall performance impact is yet to be determined, but it appears
to yield a small percentage increase in diff creation time (expected)
and a large percentage *decrease* for the case of diffing the files from
issue #2. In my testing, the creation time there decreases from approx
64 minutes to 4.5 seconds.

Signed-off-by: Patrick McCarty <patrick.mccarty@intel.com>
This commit is contained in:
Patrick McCarty
2017-12-19 15:15:39 -08:00
parent d039492824
commit 85b1c5345b

View File

@@ -88,27 +88,58 @@ static int64_t matchlen(u_char *old, int64_t oldsize, u_char *new,
return i;
}
int64_t max_len = 0;
/**
* Finds the longest matching array of bytes between the OLD and NEW file. The
* old file is suffix-sorted; the suffix-sorted array is stored at I, and
* indices to search between are indicated by ST (start) and EN (end). Returns
* the length of the match, and POS is updated to the position of the match
* within OLD.
*/
static int64_t search(int64_t *I, u_char *old, int64_t oldsize,
u_char *new, int64_t newsize, int64_t st, int64_t en,
int64_t *pos)
{
int64_t x, y;
/* Initialize max_len for the binary search */
if (st == 0 && en == oldsize) {
max_len = matchlen(old, oldsize, new, newsize);
*pos = I[st];
}
/* The binary search terminates here when "en" and "st" are adjacent
* indices in the suffix-sorted array. */
if (en - st < 2) {
x = matchlen(old + I[st], oldsize - I[st], new, newsize);
y = matchlen(old + I[en], oldsize - I[en], new, newsize);
if (x > y) {
if (x > max_len) {
max_len = x;
*pos = I[st];
return x;
} else {
*pos = I[en];
return y;
}
y = matchlen(old + I[en], oldsize - I[en], new, newsize);
if (y > max_len) {
max_len = y;
*pos = I[en];
}
return max_len;
}
x = st + (en - st) / 2;
if (memcmp(old + I[x], new, MIN(oldsize - I[x], newsize)) < 0) {
int64_t length = MIN(oldsize - I[x], newsize);
u_char *oldoffset = old + I[x];
/* This match *could* be the longest one, so check for that here */
int64_t tmp = matchlen(oldoffset, length, new, length);
if (tmp > max_len) {
max_len = tmp;
*pos = I[x];
}
/* Determine how to continue the binary search */
if (memcmp(oldoffset, new, length) < 0) {
return search(I, old, oldsize, new, newsize, x, en, pos);
} else {
return search(I, old, oldsize, new, newsize, st, x, pos);