Commit | Line | Data |
---|---|---|
702a8903 WD |
1 | This patch makes the processing of really large files more efficient |
2 | by making sure that the sender's hash table is large enough to hold all the | |
84495354 WD |
3 | checksum entries without being overloaded. |
4 | ||
5 | Updated to use the current hashtable method when possible, and the new | |
6 | hashtable method (which requires a modulus calculation for up to every byte | |
7 | of the source file) only on large files that need a larger hashtable size. | |
8 | This avoids slowing down files that don't need the extra-large hashtable. | |
702a8903 | 9 | |
c62af8d8 | 10 | This was updated for the latest codebase from a patch written by Shachar |
e1a06002 | 11 | Shemesh. |
702a8903 | 12 | |
03019e41 WD |
13 | To use this patch, run these commands for a successful build: |
14 | ||
15 | patch -p1 <patches/dynamic_hash.diff | |
16 | ./configure (optional if already run) | |
17 | make | |
18 | ||
702a8903 WD |
19 | --- old/match.c |
20 | +++ new/match.c | |
0df0745f | 21 | @@ -39,40 +39,51 @@ static int total_matches; |
702a8903 WD |
22 | |
23 | extern struct stats stats; | |
24 | ||
e1a06002 | 25 | -#define TABLESIZE (1<<16) |
84495354 WD |
26 | +#define TRADITIONAL_TABLESIZE (1<<16) |
27 | ||
c541912f | 28 | +static uint32 tablesize; |
e1a06002 | 29 | static int32 *hash_table; |
702a8903 | 30 | |
84495354 WD |
31 | #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) |
32 | #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) | |
702a8903 | 33 | |
0df0745f | 34 | -static int32 build_hash_table(struct sum_struct *s, int32 start) |
84495354 WD |
35 | +#define BIG_SUM2HASH(sum) ((sum)%tablesize) |
36 | + | |
0df0745f | 37 | +static void build_hash_table(struct sum_struct *s) |
702a8903 | 38 | { |
0df0745f | 39 | - int32 i, end = s->count; |
84495354 | 40 | + static uint32 alloc_size; |
0df0745f | 41 | + int32 i; |
702a8903 | 42 | |
e1a06002 WD |
43 | - if (!hash_table) { |
44 | - hash_table = new_array(int32, TABLESIZE); | |
702a8903 | 45 | + /* Dynamically calculate the hash table size so that the hash load |
2b1e5f60 WD |
46 | + * for big files is about 80%. This number must be odd or s2 will |
47 | + * not be able to span the entire set. */ | |
c541912f | 48 | + tablesize = (uint32)(s->count/8) * 10 + 11; |
84495354 WD |
49 | + if (tablesize < TRADITIONAL_TABLESIZE) |
50 | + tablesize = TRADITIONAL_TABLESIZE; | |
51 | + if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) { | |
e1a06002 WD |
52 | + if (hash_table) |
53 | + free(hash_table); | |
54 | + hash_table = new_array(int32, tablesize); | |
55 | if (!hash_table) | |
56 | out_of_memory("build_hash_table"); | |
84495354 | 57 | + alloc_size = tablesize; |
702a8903 WD |
58 | } |
59 | ||
e1a06002 | 60 | - memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); |
0df0745f WD |
61 | - |
62 | - if (end - start > TABLESIZE*8/10) | |
63 | - end = start + TABLESIZE*8/10; | |
64 | - | |
65 | - for (i = start; i < end; i++) { | |
84495354 WD |
66 | - uint32 t = SUM2HASH(s->sums[i].sum1); |
67 | - s->sums[i].chain = hash_table[t]; | |
68 | - hash_table[t] = i; | |
0df0745f WD |
69 | - } |
70 | + memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]); | |
71 | ||
72 | - if (verbose > 2) { | |
73 | - rprintf(FINFO, "built hash table for entries %ld - %ld\n", | |
74 | - (long)start, (long)end - 1); | |
84495354 WD |
75 | + if (tablesize == TRADITIONAL_TABLESIZE) { |
76 | + for (i = 0; i < s->count; i++) { | |
77 | + uint32 t = SUM2HASH(s->sums[i].sum1); | |
78 | + s->sums[i].chain = hash_table[t]; | |
79 | + hash_table[t] = i; | |
80 | + } | |
81 | + } else { | |
82 | + for (i = 0; i < s->count; i++) { | |
83 | + uint32 t = BIG_SUM2HASH(s->sums[i].sum1); | |
84 | + s->sums[i].chain = hash_table[t]; | |
85 | + hash_table[t] = i; | |
86 | + } | |
87 | } | |
0df0745f WD |
88 | - |
89 | - return end; | |
84495354 WD |
90 | } |
91 | ||
0df0745f WD |
92 | |
93 | @@ -130,8 +141,8 @@ static void matched(int f, struct sum_st | |
94 | static void hash_search(int f,struct sum_struct *s, | |
95 | struct map_struct *buf, OFF_T len) | |
96 | { | |
97 | - OFF_T offset, end, reset = 0; | |
98 | - int32 k, want_i, backup, sum_pos = 0; | |
99 | + OFF_T offset, end; | |
100 | + int32 k, want_i, backup; | |
101 | char sum2[SUM_LENGTH]; | |
102 | uint32 s1, s2, sum; | |
103 | int more; | |
104 | @@ -169,21 +180,21 @@ static void hash_search(int f,struct sum | |
105 | int done_csum2 = 0; | |
106 | int32 i; | |
107 | ||
108 | - if (offset >= reset) { | |
109 | - sum_pos = build_hash_table(s, sum_pos); | |
110 | - reset = sum_pos * s->blength; | |
111 | - } | |
112 | - | |
113 | if (verbose > 4) { | |
114 | rprintf(FINFO, "offset=%.0f sum=%04x%04x\n", | |
e1a06002 WD |
115 | (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); |
116 | } | |
702a8903 | 117 | |
e1a06002 | 118 | - i = hash_table[SUM2HASH2(s1,s2)]; |
84495354 WD |
119 | - if (i < 0) |
120 | - goto null_hash; | |
121 | + if (tablesize == TRADITIONAL_TABLESIZE) { | |
122 | + if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0) | |
123 | + goto null_hash; | |
124 | + sum = (s1 & 0xffff) | (s2 << 16); | |
125 | + } else { | |
126 | + sum = (s1 & 0xffff) | (s2 << 16); | |
127 | + if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0) | |
128 | + goto null_hash; | |
129 | + } | |
1db27b7c WD |
130 | |
131 | - sum = (s1 & 0xffff) | (s2 << 16); | |
e1a06002 | 132 | hash_hits++; |
1db27b7c | 133 | do { |
e1a06002 | 134 | int32 l; |
0df0745f WD |
135 | @@ -351,6 +362,11 @@ void match_sums(int f, struct sum_struct |
136 | } | |
137 | ||
138 | if (len > 0 && s->count > 0) { | |
139 | + build_hash_table(s); | |
140 | + | |
141 | + if (verbose > 2) | |
142 | + rprintf(FINFO,"built hash table\n"); | |
143 | + | |
144 | hash_search(f, s, buf, len); | |
145 | ||
146 | if (verbose > 2) |