Fixed a few bugs.
[rsync/rsync-patches.git] / dynamic_hash.diff
CommitLineData
702a8903
WD
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
66d2440b 3checksum entries without being overloaded. It also makes the hashing of
702a8903
WD
4normal-sized files use slightly less memory than before.
5
2b1e5f60 6An extended version of a patch by Shachar Shemesh.
702a8903
WD
7
8--- old/match.c
9+++ new/match.c
1db27b7c 10@@ -26,63 +26,47 @@ extern int append_mode;
702a8903
WD
11
12 int updating_basis_file;
13
14-typedef unsigned short tag;
15-
16-#define TABLESIZE (1<<16)
17-#define NULL_TAG (-1)
18-
19 static int false_alarms;
1db27b7c
WD
20-static int tag_hits;
21+static int hash_hits;
702a8903 22 static int matches;
1db27b7c
WD
23 static int64 data_transfer;
24
25 static int total_false_alarms;
26-static int total_tag_hits;
27+static int total_hash_hits;
28 static int total_matches;
702a8903
WD
29
30 extern struct stats stats;
31
32-struct target {
33- tag t;
34- int32 i;
35-};
36-
37-static struct target *targets;
38-
39-static int32 *tag_table;
40-
41-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
42-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
43-
44-static int compare_targets(struct target *t1,struct target *t2)
45-{
46- return (int)t1->t - (int)t2->t;
47-}
c541912f 48+static uint32 tablesize;
702a8903
WD
49+static int32 *sum_table;
50
1db27b7c 51+#define GETTAG(sum) ((sum)%tablesize)
702a8903
WD
52
53 static void build_hash_table(struct sum_struct *s)
54 {
c541912f
WD
55 int32 i;
56+ uint32 prior_size = tablesize;
702a8903
WD
57
58- if (!tag_table)
59- tag_table = new_array(int32, TABLESIZE);
60+ /* Dynamically calculate the hash table size so that the hash load
2b1e5f60
WD
61+ * for big files is about 80%. This number must be odd or s2 will
62+ * not be able to span the entire set. */
c541912f 63+ tablesize = (uint32)(s->count/8) * 10 + 11;
702a8903
WD
64+ if (tablesize < 65537)
65+ tablesize = 65537; /* a prime number */
2b1e5f60
WD
66+ if (tablesize != prior_size) {
67+ free(sum_table);
68+ sum_table = new_array(int32, tablesize);
702a8903
WD
69+ if (!sum_table)
70+ out_of_memory("build_hash_table");
71+ }
72
73- targets = new_array(struct target, s->count);
74- if (!tag_table || !targets)
75- out_of_memory("build_hash_table");
4d89650c 76+ memset(sum_table, 0xFF, tablesize * sizeof sum_table[0]);
702a8903
WD
77
78 for (i = 0; i < s->count; i++) {
79- targets[i].i = i;
80- targets[i].t = gettag(s->sums[i].sum1);
1db27b7c 81+ uint32 t = GETTAG(s->sums[i].sum1);
702a8903
WD
82+ s->sums[i].chain = sum_table[t];
83+ sum_table[t] = i;
84 }
85-
86- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
87-
88- for (i = 0; i < TABLESIZE; i++)
89- tag_table[i] = NULL_TAG;
90-
91- for (i = s->count; i-- > 0; )
92- tag_table[targets[i].t] = i;
93 }
94
95
1db27b7c 96@@ -176,20 +160,21 @@ static void hash_search(int f,struct sum
702a8903
WD
97 }
98
99 do {
100- tag t = gettag2(s1,s2);
702a8903
WD
101 int done_csum2 = 0;
102- int32 j = tag_table[t];
c541912f 103+ int32 i;
1db27b7c 104+ uint32 t;
702a8903
WD
105
106 if (verbose > 4)
107 rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
108
109- if (j == NULL_TAG)
110- goto null_tag;
1db27b7c
WD
111+ t = GETTAG(sum);
112+ i = sum_table[t];
113+ if (i < 0)
114+ goto null_hash;
115
116- sum = (s1 & 0xffff) | (s2 << 16);
117- tag_hits++;
118+ hash_hits++;
119 do {
702a8903 120- int32 l, i = targets[j].i;
702a8903
WD
121+ int32 l;
122
123 if (sum != s->sums[i].sum1)
124 continue;
1db27b7c 125@@ -205,9 +190,11 @@ static void hash_search(int f,struct sum
702a8903
WD
126 && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
127 continue;
128
129- if (verbose > 3)
130- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
131- (double)offset,(double)j,(double)i,sum);
132+ if (verbose > 3) {
4d89650c
WD
133+ rprintf(FINFO,
134+ "potential match at %.0f i=%ld sum=%08x\n",
702a8903
WD
135+ (double)offset, (long)i, sum);
136+ }
137
138 if (!done_csum2) {
139 map = (schar *)map_ptr(buf,offset,l);
1db27b7c 140@@ -224,8 +211,8 @@ static void hash_search(int f,struct sum
702a8903
WD
141 * one with an identical offset, so we prefer that over
142 * the following want_i optimization. */
143 if (updating_basis_file) {
144- do {
145- int32 i2 = targets[j].i;
146+ int32 i2;
147+ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
148 if (s->sums[i2].offset != offset)
149 continue;
150 if (i2 != i) {
1db27b7c 151@@ -240,7 +227,7 @@ static void hash_search(int f,struct sum
702a8903
WD
152 * both the sender and the receiver. */
153 s->sums[i].flags |= SUMFLG_SAME_OFFSET;
154 goto set_want_i;
155- } while (++j < s->count && targets[j].t == t);
156+ }
157 }
158
159 /* we've found a match, but now check to see
1db27b7c 160@@ -266,9 +253,9 @@ static void hash_search(int f,struct sum
702a8903
WD
161 s2 = sum >> 16;
162 matches++;
163 break;
164- } while (++j < s->count && targets[j].t == t);
1db27b7c 165+ } while ((i = s->sums[i].chain) >= 0);
702a8903
WD
166
167- null_tag:
1db27b7c 168+ null_hash:
702a8903
WD
169 backup = offset - last_match;
170 /* We sometimes read 1 byte prior to last_match... */
171 if (backup < 0)
1db27b7c
WD
172@@ -287,6 +274,7 @@ static void hash_search(int f,struct sum
173 s2 += s1;
174 } else
175 --k;
176+ sum = (s1 & 0xffff) | (s2 << 16);
177
178 /* By matching early we avoid re-reading the
179 data 3 times in the case where a token
180@@ -323,7 +311,7 @@ void match_sums(int f, struct sum_struct
181
182 last_match = 0;
183 false_alarms = 0;
184- tag_hits = 0;
185+ hash_hits = 0;
186 matches = 0;
187 data_transfer = 0;
188
189@@ -375,16 +363,11 @@ void match_sums(int f, struct sum_struct
702a8903
WD
190 rprintf(FINFO,"sending file_sum\n");
191 write_buf(f,file_sum,MD4_SUM_LENGTH);
192
193- if (targets) {
194- free(targets);
195- targets=NULL;
196- }
197-
198 if (verbose > 2)
1db27b7c
WD
199- rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
200- false_alarms, tag_hits, matches);
201+ rprintf(FINFO, "false_alarms=%d hash_hits=%d matches=%d\n",
202+ false_alarms, hash_hits, matches);
203
204- total_tag_hits += tag_hits;
205+ total_hash_hits += hash_hits;
206 total_false_alarms += false_alarms;
207 total_matches += matches;
208 stats.literal_data += data_transfer;
209@@ -396,8 +379,7 @@ void match_report(void)
210 return;
211
212 rprintf(FINFO,
213- "total: matches=%d tag_hits=%d false_alarms=%d data=%.0f\n",
214- total_matches,total_tag_hits,
215- total_false_alarms,
216+ "total: matches=%d hash_hits=%d false_alarms=%d data=%.0f\n",
217+ total_matches, total_hash_hits, total_false_alarms,
218 (double)stats.literal_data);
219 }
702a8903
WD
220--- old/rsync.h
221+++ new/rsync.h
222@@ -560,6 +560,7 @@ struct sum_buf {
223 OFF_T offset; /**< offset in file of this chunk */
224 int32 len; /**< length of chunk of file */
225 uint32 sum1; /**< simple checksum */
2b1e5f60 226+ int32 chain; /**< next hash-table collision */
702a8903
WD
227 short flags; /**< flag bits */
228 char sum2[SUM_LENGTH]; /**< checksum */
229 };