X-Git-Url: https://mattmccutchen.net/rsync/rsync-patches.git/blobdiff_plain/e1a060026ac0b80fb080dc17494a2036ad21397e..6e9495c76ccda5a38bb1fa68b19ff22e8c7a30ad:/dynamic_hash.diff diff --git a/dynamic_hash.diff b/dynamic_hash.diff index d5b7e56..af065c1 100644 --- a/dynamic_hash.diff +++ b/dynamic_hash.diff @@ -1,32 +1,44 @@ This patch makes the processing of large really large files more efficient by making sure that the sender's hash table is large enough to hold all the -checksum entries without being overloaded. Unfortunately, the code adds a -modulus calculation for (up to) every byte of the source file, which slows -down the code for normal file sizes (e.g. 4 CPU seconds slower on a Pentium -III when copying a 65 MB file without very much matching data). +checksum entries without being overloaded. -This was udapted for the latest codebase from a patch written by Shachar +Updated to use the current hashtable method when possible, and the new +hashtable method (which requires a modulus calculation for up to every byte +of the source file) only on large files that need a larger hashtable size. +This avoids slowing down files that don't need the extra-large hashtable. + +This was updated for the latest codebase from a patch written by Shachar Shemesh. +To use this patch, run these commands for a successful build: + + patch -p1 >16) -+#define SUM2HASH(sum) ((sum)%tablesize) + #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) + #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) - static void build_hash_table(struct sum_struct *s) +-static int32 build_hash_table(struct sum_struct *s, int32 start) ++#define BIG_SUM2HASH(sum) ((sum)%tablesize) ++ ++static void build_hash_table(struct sum_struct *s) { - int32 i; -+ uint32 prior_size = tablesize; +- int32 i, end = s->count; ++ static uint32 alloc_size; ++ int32 i; - if (!hash_table) { - hash_table = new_array(int32, TABLESIZE); @@ -34,32 +46,104 @@ Shemesh. + * for big files is about 80%. This number must be odd or s2 will + * not be able to span the entire set. */ + tablesize = (uint32)(s->count/8) * 10 + 11; -+ if (tablesize < 65537) -+ tablesize = 65537; /* a prime number */ -+ if (tablesize != prior_size) { ++ if (tablesize < TRADITIONAL_TABLESIZE) ++ tablesize = TRADITIONAL_TABLESIZE; ++ if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) { + if (hash_table) + free(hash_table); + hash_table = new_array(int32, tablesize); if (!hash_table) out_of_memory("build_hash_table"); ++ alloc_size = tablesize; } - memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); +- +- if (end - start > TABLESIZE*8/10) +- end = start + TABLESIZE*8/10; +- +- for (i = start; i < end; i++) { +- uint32 t = SUM2HASH(s->sums[i].sum1); +- s->sums[i].chain = hash_table[t]; +- hash_table[t] = i; +- } + memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]); - for (i = 0; i < s->count; i++) { - uint32 t = SUM2HASH(s->sums[i].sum1); -@@ -162,11 +169,11 @@ static void hash_search(int f,struct sum +- if (verbose > 2) { +- rprintf(FINFO, "built hash table for entries %ld - %ld\n", +- (long)start, (long)end - 1); ++ if (tablesize == TRADITIONAL_TABLESIZE) { ++ for (i = 0; i < s->count; i++) { ++ uint32 t = SUM2HASH(s->sums[i].sum1); ++ s->sums[i].chain = hash_table[t]; ++ hash_table[t] = i; ++ } ++ } else { ++ for (i = 0; i < s->count; i++) { ++ uint32 t = BIG_SUM2HASH(s->sums[i].sum1); ++ s->sums[i].chain = hash_table[t]; ++ hash_table[t] = i; ++ } + } +- +- return end; + } + + +@@ -130,8 +141,8 @@ static void matched(int f, struct sum_st + static void hash_search(int f,struct sum_struct *s, + struct map_struct *buf, OFF_T len) + { +- OFF_T offset, end, reset = 0; +- int32 k, want_i, backup, sum_pos = 0; ++ OFF_T offset, end; ++ int32 k, want_i, backup; + char sum2[SUM_LENGTH]; + uint32 s1, s2, sum; + int more; +@@ -169,24 +180,21 @@ static void hash_search(int f,struct sum + int done_csum2 = 0; + int32 i; + +- if (offset >= reset) { +- sum_pos = build_hash_table(s, sum_pos); +- if (sum_pos == s->count) +- reset = len; +- else +- reset = sum_pos * s->blength; +- } +- + if (verbose > 4) { + rprintf(FINFO, "offset=%.0f sum=%04x%04x\n", (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); } - i = hash_table[SUM2HASH2(s1,s2)]; -+ sum = (s1 & 0xffff) | (s2 << 16); -+ i = hash_table[SUM2HASH(sum)]; - if (i < 0) - goto null_hash; +- if (i < 0) +- goto null_hash; ++ if (tablesize == TRADITIONAL_TABLESIZE) { ++ if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0) ++ goto null_hash; ++ sum = (s1 & 0xffff) | (s2 << 16); ++ } else { ++ sum = (s1 & 0xffff) | (s2 << 16); ++ if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0) ++ goto null_hash; ++ } - sum = (s1 & 0xffff) | (s2 << 16); hash_hits++; do { int32 l; +@@ -354,6 +362,11 @@ void match_sums(int f, struct sum_struct + } + + if (len > 0 && s->count > 0) { ++ build_hash_table(s); ++ ++ if (verbose > 2) ++ rprintf(FINFO,"built hash table\n"); ++ + hash_search(f, s, buf, len); + + if (verbose > 2)