From 84495354e9d02c3c1d87229516f97b393f15d590 Mon Sep 17 00:00:00 2001 From: Wayne Davison Date: Tue, 9 Oct 2007 06:17:07 +0000 Subject: [PATCH] Improved to avoid a hashtable slowdown for files that don't need the extra-large hashtable. --- dynamic_hash.diff | 69 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/dynamic_hash.diff b/dynamic_hash.diff index f9a41d5..3388bd9 100644 --- a/dynamic_hash.diff +++ b/dynamic_hash.diff @@ -1,9 +1,11 @@ This patch makes the processing of large really large files more efficient by making sure that the sender's hash table is large enough to hold all the -checksum entries without being overloaded. Unfortunately, the code adds a -modulus calculation for (up to) every byte of the source file, which slows -down the code for normal file sizes (e.g. 4 CPU seconds slower on a Pentium -III when copying a 65 MB file without very much matching data). +checksum entries without being overloaded. + +Updated to use the current hashtable method when possible, and the new +hashtable method (which requires a modulus calculation for up to every byte +of the source file) only on large files that need a larger hashtable size. +This avoids slowing down files that don't need the extra-large hashtable. This was updated for the latest codebase from a patch written by Shachar Shemesh. @@ -16,23 +18,25 @@ To use this patch, run these commands for a successful build: --- old/match.c +++ new/match.c -@@ -40,24 +40,31 @@ static int total_matches; +@@ -39,29 +39,50 @@ static int total_matches; extern struct stats stats; -#define TABLESIZE (1<<16) -- ++#define TRADITIONAL_TABLESIZE (1<<16) + +static uint32 tablesize; static int32 *hash_table; --#define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) --#define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) -+#define SUM2HASH(sum) ((sum)%tablesize) + #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) + #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) ++#define BIG_SUM2HASH(sum) ((sum)%tablesize) ++ static void build_hash_table(struct sum_struct *s) { ++ static uint32 alloc_size; int32 i; -+ uint32 prior_size = tablesize; - if (!hash_table) { - hash_table = new_array(int32, TABLESIZE); @@ -40,30 +44,55 @@ To use this patch, run these commands for a successful build: + * for big files is about 80%. This number must be odd or s2 will + * not be able to span the entire set. */ + tablesize = (uint32)(s->count/8) * 10 + 11; -+ if (tablesize < 65537) -+ tablesize = 65537; /* a prime number */ -+ if (tablesize != prior_size) { ++ if (tablesize < TRADITIONAL_TABLESIZE) ++ tablesize = TRADITIONAL_TABLESIZE; ++ if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) { + if (hash_table) + free(hash_table); + hash_table = new_array(int32, tablesize); if (!hash_table) out_of_memory("build_hash_table"); ++ alloc_size = tablesize; } - memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); + memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]); - for (i = 0; i < s->count; i++) { - uint32 t = SUM2HASH(s->sums[i].sum1); -@@ -165,11 +172,11 @@ static void hash_search(int f,struct sum +- for (i = 0; i < s->count; i++) { +- uint32 t = SUM2HASH(s->sums[i].sum1); +- s->sums[i].chain = hash_table[t]; +- hash_table[t] = i; ++ if (tablesize == TRADITIONAL_TABLESIZE) { ++ for (i = 0; i < s->count; i++) { ++ uint32 t = SUM2HASH(s->sums[i].sum1); ++ s->sums[i].chain = hash_table[t]; ++ hash_table[t] = i; ++ } ++ } else { ++ for (i = 0; i < s->count; i++) { ++ uint32 t = BIG_SUM2HASH(s->sums[i].sum1); ++ s->sums[i].chain = hash_table[t]; ++ hash_table[t] = i; ++ } + } + } + +@@ -164,11 +185,16 @@ static void hash_search(int f,struct sum (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); } - i = hash_table[SUM2HASH2(s1,s2)]; -+ sum = (s1 & 0xffff) | (s2 << 16); -+ i = hash_table[SUM2HASH(sum)]; - if (i < 0) - goto null_hash; +- if (i < 0) +- goto null_hash; ++ if (tablesize == TRADITIONAL_TABLESIZE) { ++ if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0) ++ goto null_hash; ++ sum = (s1 & 0xffff) | (s2 << 16); ++ } else { ++ sum = (s1 & 0xffff) | (s2 << 16); ++ if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0) ++ goto null_hash; ++ } - sum = (s1 & 0xffff) | (s2 << 16); hash_hits++; -- 2.34.1