| 1 | This patch makes the processing of really large files more efficient |
| 2 | by making sure that the sender's hash table is large enough to hold all the |
| 3 | checksum entries without being overloaded. Unfortunately, the code adds a |
| 4 | modulus calculation for (up to) every byte of the source file, which slows |
| 5 | down the code for normal file sizes (e.g. 4 CPU seconds slower on a Pentium |
| 6 | III when copying a 65 MB file without very much matching data). |
| 7 | |
| 8 | This was updated for the latest codebase from a patch written by Shachar |
| 9 | Shemesh. |
| 10 | |
| 11 | --- old/match.c |
| 12 | +++ new/match.c |
| 13 | @@ -37,24 +37,31 @@ static int total_matches; |
| 14 | |
| 15 | extern struct stats stats; |
| 16 | |
| 17 | -#define TABLESIZE (1<<16) |
| 18 | - |
| 19 | +static uint32 tablesize; |
| 20 | static int32 *hash_table; |
| 21 | |
| 22 | -#define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) |
| 23 | -#define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) |
| 24 | +#define SUM2HASH(sum) ((sum)%tablesize) |
| 25 | |
| 26 | static void build_hash_table(struct sum_struct *s) |
| 27 | { |
| 28 | int32 i; |
| 29 | + uint32 prior_size = tablesize; |
| 30 | |
| 31 | - if (!hash_table) { |
| 32 | - hash_table = new_array(int32, TABLESIZE); |
| 33 | + /* Dynamically calculate the hash table size so that the hash load |
| 34 | + * for big files is about 80%. This number must be odd or s2 will |
| 35 | + * not be able to span the entire set. */ |
| 36 | + tablesize = (uint32)(s->count/8) * 10 + 11; |
| 37 | + if (tablesize < 65537) |
| 38 | + tablesize = 65537; /* a prime number */ |
| 39 | + if (tablesize != prior_size) { |
| 40 | + if (hash_table) |
| 41 | + free(hash_table); |
| 42 | + hash_table = new_array(int32, tablesize); |
| 43 | if (!hash_table) |
| 44 | out_of_memory("build_hash_table"); |
| 45 | } |
| 46 | |
| 47 | - memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); |
| 48 | + memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]); |
| 49 | |
| 50 | for (i = 0; i < s->count; i++) { |
| 51 | uint32 t = SUM2HASH(s->sums[i].sum1); |
| 52 | @@ -162,11 +169,11 @@ static void hash_search(int f,struct sum |
| 53 | (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); |
| 54 | } |
| 55 | |
| 56 | - i = hash_table[SUM2HASH2(s1,s2)]; |
| 57 | + sum = (s1 & 0xffff) | (s2 << 16); |
| 58 | + i = hash_table[SUM2HASH(sum)]; |
| 59 | if (i < 0) |
| 60 | goto null_hash; |
| 61 | |
| 62 | - sum = (s1 & 0xffff) | (s2 << 16); |
| 63 | hash_hits++; |
| 64 | do { |
| 65 | int32 l; |