This patch makes the processing of really large files more efficient by making sure that the sender's hash table is large enough to hold all the checksum entries without being overloaded. It has been updated to use the current hashtable method when possible, and the new hashtable method (which requires a modulus calculation for up to every byte of the source file) only on large files that need a larger hashtable size. This avoids slowing down the processing of files that don't need the extra-large hashtable. This was updated for the latest codebase from a patch written by Shachar Shemesh. To use this patch, run these commands for a successful build: patch -p1 >16) -static int32 build_hash_table(struct sum_struct *s, int32 start) +#define BIG_SUM2HASH(sum) ((sum)%tablesize) + +static void build_hash_table(struct sum_struct *s) { - int32 i, end = s->count; + static uint32 alloc_size; + int32 i; - if (!hash_table) { - hash_table = new_array(int32, TABLESIZE); + /* Dynamically calculate the hash table size so that the hash load + * for big files is about 80%. This number must be odd or s2 will + * not be able to span the entire set. 
*/ + tablesize = (uint32)(s->count/8) * 10 + 11; + if (tablesize < TRADITIONAL_TABLESIZE) + tablesize = TRADITIONAL_TABLESIZE; + if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) { + if (hash_table) + free(hash_table); + hash_table = new_array(int32, tablesize); if (!hash_table) out_of_memory("build_hash_table"); + alloc_size = tablesize; } - memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); - - if (end - start > TABLESIZE*8/10) - end = start + TABLESIZE*8/10; - - for (i = start; i < end; i++) { - uint32 t = SUM2HASH(s->sums[i].sum1); - s->sums[i].chain = hash_table[t]; - hash_table[t] = i; - } + memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]); - if (verbose > 2) { - rprintf(FINFO, "built hash table for entries %ld - %ld\n", - (long)start, (long)end - 1); + if (tablesize == TRADITIONAL_TABLESIZE) { + for (i = 0; i < s->count; i++) { + uint32 t = SUM2HASH(s->sums[i].sum1); + s->sums[i].chain = hash_table[t]; + hash_table[t] = i; + } + } else { + for (i = 0; i < s->count; i++) { + uint32 t = BIG_SUM2HASH(s->sums[i].sum1); + s->sums[i].chain = hash_table[t]; + hash_table[t] = i; + } } - - return end; } @@ -130,8 +141,8 @@ static void matched(int f, struct sum_st static void hash_search(int f,struct sum_struct *s, struct map_struct *buf, OFF_T len) { - OFF_T offset, end, reset = 0; - int32 k, want_i, backup, sum_pos = 0; + OFF_T offset, end; + int32 k, want_i, backup; char sum2[SUM_LENGTH]; uint32 s1, s2, sum; int more; @@ -169,21 +180,21 @@ static void hash_search(int f,struct sum int done_csum2 = 0; int32 i; - if (offset >= reset) { - sum_pos = build_hash_table(s, sum_pos); - reset = sum_pos * s->blength; - } - if (verbose > 4) { rprintf(FINFO, "offset=%.0f sum=%04x%04x\n", (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); } - i = hash_table[SUM2HASH2(s1,s2)]; - if (i < 0) - goto null_hash; + if (tablesize == TRADITIONAL_TABLESIZE) { + if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0) + goto null_hash; + sum = (s1 & 0xffff) | (s2 << 16); + } 
else { + sum = (s1 & 0xffff) | (s2 << 16); + if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0) + goto null_hash; + } - sum = (s1 & 0xffff) | (s2 << 16); hash_hits++; do { int32 l; @@ -351,6 +362,11 @@ void match_sums(int f, struct sum_struct } if (len > 0 && s->count > 0) { + build_hash_table(s); + + if (verbose > 2) + rprintf(FINFO,"built hash table\n"); + hash_search(f, s, buf, len); if (verbose > 2)