From 702a89034cf7185725901e4a82cba9234858a04d Mon Sep 17 00:00:00 2001 From: Wayne Davison Date: Sun, 26 Feb 2006 00:29:34 +0000 Subject: [PATCH] An improved way to hash the checksum data (used by the sender). --- dynamic_hash.diff | 200 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 dynamic_hash.diff diff --git a/dynamic_hash.diff b/dynamic_hash.diff new file mode 100644 index 0000000..4b09aea --- /dev/null +++ b/dynamic_hash.diff @@ -0,0 +1,200 @@ +This patch makes the processing of large really large files more efficient +by making sure that the sender's hash table is large enough to hold all the +checksum entries without being overloaded. It also makes the hasing of +normal sized files use slightly less memory than before. + +http://lists.samba.org/archive/rsync/2005-March/011875.html + +--- old/match.c ++++ new/match.c +@@ -26,11 +26,6 @@ extern int append_mode; + + int updating_basis_file; + +-typedef unsigned short tag; +- +-#define TABLESIZE (1<<16) +-#define NULL_TAG (-1) +- + static int false_alarms; + static int tag_hits; + static int matches; +@@ -42,47 +37,36 @@ static int total_matches; + + extern struct stats stats; + +-struct target { +- tag t; +- int32 i; +-}; +- +-static struct target *targets; +- +-static int32 *tag_table; +- +-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF) +-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16) +- +-static int compare_targets(struct target *t1,struct target *t2) +-{ +- return (int)t1->t - (int)t2->t; +-} ++static int32 tablesize, tablealloc; ++static int32 *sum_table; + ++#define gettag2(s1,s2) gettag((s1) + ((s2)<<16)) ++#define gettag(sum) ((sum)%tablesize) + + static void build_hash_table(struct sum_struct *s) + { +- int32 i; ++ int32 i, t; + +- if (!tag_table) +- tag_table = new_array(int32, TABLESIZE); ++ /* Dynamically calculate the hash table size so that the hash load ++ * is always about 80%. This number must be odd or s2 will not be ++ * able to span the entire set. */ ++ tablesize = (s->count/8) * 10 + 11; ++ if (tablesize < 65537) ++ tablesize = 65537; /* a prime number */ ++ if (tablesize > tablealloc) { ++ tablealloc = tablesize; ++ sum_table = realloc_array(sum_table, int32, tablealloc); ++ if (!sum_table) ++ out_of_memory("build_hash_table"); ++ } + +- targets = new_array(struct target, s->count); +- if (!tag_table || !targets) +- out_of_memory("build_hash_table"); ++ memset(sum_table, 0xFF, tablesize * sizeof (int32)); + + for (i = 0; i < s->count; i++) { +- targets[i].i = i; +- targets[i].t = gettag(s->sums[i].sum1); ++ t = gettag(s->sums[i].sum1); ++ s->sums[i].chain = sum_table[t]; ++ sum_table[t] = i; + } +- +- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets); +- +- for (i = 0; i < TABLESIZE; i++) +- tag_table[i] = NULL_TAG; +- +- for (i = s->count; i-- > 0; ) +- tag_table[targets[i].t] = i; + } + + +@@ -176,20 +160,16 @@ static void hash_search(int f,struct sum + } + + do { +- tag t = gettag2(s1,s2); ++ int32 i, t = gettag2(s1,s2); + int done_csum2 = 0; +- int32 j = tag_table[t]; + + if (verbose > 4) + rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum); + +- if (j == NULL_TAG) +- goto null_tag; +- + sum = (s1 & 0xffff) | (s2 << 16); + tag_hits++; +- do { +- int32 l, i = targets[j].i; ++ for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) { ++ int32 l; + + if (sum != s->sums[i].sum1) + continue; +@@ -205,9 +185,10 @@ static void hash_search(int f,struct sum + && !(s->sums[i].flags & SUMFLG_SAME_OFFSET)) + continue; + +- if (verbose > 3) +- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n", +- (double)offset,(double)j,(double)i,sum); ++ if (verbose > 3) { ++ rprintf(FINFO,"potential match at %.0f i=%ld sum=%08x\n", ++ (double)offset, (long)i, sum); ++ } + + if (!done_csum2) { + map = (schar *)map_ptr(buf,offset,l); +@@ -224,23 +205,23 @@ static void hash_search(int f,struct sum + * one with an identical offset, so we prefer that over + * the following want_i optimization. */ + if (updating_basis_file) { +- do { +- int32 i2 = targets[j].i; ++ int32 i2; ++ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) { + if (s->sums[i2].offset != offset) + continue; + if (i2 != i) { + if (sum != s->sums[i2].sum1) +- break; ++ continue; + if (memcmp(sum2, s->sums[i2].sum2, + s->s2length) != 0) +- break; ++ continue; + i = i2; + } + /* This chunk was at the same offset on + * both the sender and the receiver. */ + s->sums[i].flags |= SUMFLG_SAME_OFFSET; + goto set_want_i; +- } while (++j < s->count && targets[j].t == t); ++ } + } + + /* we've found a match, but now check to see +@@ -266,9 +247,8 @@ static void hash_search(int f,struct sum + s2 = sum >> 16; + matches++; + break; +- } while (++j < s->count && targets[j].t == t); ++ } + +- null_tag: + backup = offset - last_match; + /* We sometimes read 1 byte prior to last_match... */ + if (backup < 0) +@@ -375,11 +355,6 @@ void match_sums(int f, struct sum_struct + rprintf(FINFO,"sending file_sum\n"); + write_buf(f,file_sum,MD4_SUM_LENGTH); + +- if (targets) { +- free(targets); +- targets=NULL; +- } +- + if (verbose > 2) + rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n", + false_alarms, tag_hits, matches); +--- old/rsync.h ++++ new/rsync.h +@@ -560,6 +560,7 @@ struct sum_buf { + OFF_T offset; /**< offset in file of this chunk */ + int32 len; /**< length of chunk of file */ + uint32 sum1; /**< simple checksum */ ++ int32 chain; /**< hash-table chaining */ + short flags; /**< flag bits */ + char sum2[SUM_LENGTH]; /**< checksum */ + }; +--- old/sender.c ++++ new/sender.c +@@ -92,6 +92,7 @@ static struct sum_struct *receive_sums(i + + s->sums[i].offset = offset; + s->sums[i].flags = 0; ++ s->sums[i].chain = 0; + + if (i == s->count-1 && s->remainder != 0) + s->sums[i].len = s->remainder; -- 2.34.1