An improved way to hash the checksum data (used by the sender).
[rsync/rsync-patches.git] / dynamic_hash.diff
CommitLineData
702a8903
WD
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
3checksum entries without being overloaded. It also makes the hashing of
4normal-sized files use slightly less memory than before.
5
6http://lists.samba.org/archive/rsync/2005-March/011875.html
7
8--- old/match.c
9+++ new/match.c
10@@ -26,11 +26,6 @@ extern int append_mode;
11
12 int updating_basis_file;
13
14-typedef unsigned short tag;
15-
16-#define TABLESIZE (1<<16)
17-#define NULL_TAG (-1)
18-
19 static int false_alarms;
20 static int tag_hits;
21 static int matches;
22@@ -42,47 +37,36 @@ static int total_matches;
23
24 extern struct stats stats;
25
26-struct target {
27- tag t;
28- int32 i;
29-};
30-
31-static struct target *targets;
32-
33-static int32 *tag_table;
34-
35-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
36-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
37-
38-static int compare_targets(struct target *t1,struct target *t2)
39-{
40- return (int)t1->t - (int)t2->t;
41-}
42+static int32 tablesize, tablealloc;
43+static int32 *sum_table;
44
45+#define gettag2(s1,s2) gettag((s1) + ((s2)<<16))
46+#define gettag(sum) ((sum)%tablesize)
47
48 static void build_hash_table(struct sum_struct *s)
49 {
50- int32 i;
51+ int32 i, t;
52
53- if (!tag_table)
54- tag_table = new_array(int32, TABLESIZE);
55+ /* Dynamically calculate the hash table size so that the hash load
56+ * is always about 80%. This number must be odd or s2 will not be
57+ * able to span the entire set. */
58+ tablesize = (s->count/8) * 10 + 11;
59+ if (tablesize < 65537)
60+ tablesize = 65537; /* a prime number */
61+ if (tablesize > tablealloc) {
62+ tablealloc = tablesize;
63+ sum_table = realloc_array(sum_table, int32, tablealloc);
64+ if (!sum_table)
65+ out_of_memory("build_hash_table");
66+ }
67
68- targets = new_array(struct target, s->count);
69- if (!tag_table || !targets)
70- out_of_memory("build_hash_table");
71+ memset(sum_table, 0xFF, tablesize * sizeof (int32));
72
73 for (i = 0; i < s->count; i++) {
74- targets[i].i = i;
75- targets[i].t = gettag(s->sums[i].sum1);
76+ t = gettag(s->sums[i].sum1);
77+ s->sums[i].chain = sum_table[t];
78+ sum_table[t] = i;
79 }
80-
81- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
82-
83- for (i = 0; i < TABLESIZE; i++)
84- tag_table[i] = NULL_TAG;
85-
86- for (i = s->count; i-- > 0; )
87- tag_table[targets[i].t] = i;
88 }
89
90
91@@ -176,20 +160,16 @@ static void hash_search(int f,struct sum
92 }
93
94 do {
95- tag t = gettag2(s1,s2);
96+ int32 i, t = gettag2(s1,s2);
97 int done_csum2 = 0;
98- int32 j = tag_table[t];
99
100 if (verbose > 4)
101 rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
102
103- if (j == NULL_TAG)
104- goto null_tag;
105-
106 sum = (s1 & 0xffff) | (s2 << 16);
107 tag_hits++;
108- do {
109- int32 l, i = targets[j].i;
110+ for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) {
111+ int32 l;
112
113 if (sum != s->sums[i].sum1)
114 continue;
115@@ -205,9 +185,10 @@ static void hash_search(int f,struct sum
116 && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
117 continue;
118
119- if (verbose > 3)
120- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
121- (double)offset,(double)j,(double)i,sum);
122+ if (verbose > 3) {
123+ rprintf(FINFO,"potential match at %.0f i=%ld sum=%08x\n",
124+ (double)offset, (long)i, sum);
125+ }
126
127 if (!done_csum2) {
128 map = (schar *)map_ptr(buf,offset,l);
129@@ -224,23 +205,23 @@ static void hash_search(int f,struct sum
130 * one with an identical offset, so we prefer that over
131 * the following want_i optimization. */
132 if (updating_basis_file) {
133- do {
134- int32 i2 = targets[j].i;
135+ int32 i2;
136+ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
137 if (s->sums[i2].offset != offset)
138 continue;
139 if (i2 != i) {
140 if (sum != s->sums[i2].sum1)
141- break;
142+ continue;
143 if (memcmp(sum2, s->sums[i2].sum2,
144 s->s2length) != 0)
145- break;
146+ continue;
147 i = i2;
148 }
149 /* This chunk was at the same offset on
150 * both the sender and the receiver. */
151 s->sums[i].flags |= SUMFLG_SAME_OFFSET;
152 goto set_want_i;
153- } while (++j < s->count && targets[j].t == t);
154+ }
155 }
156
157 /* we've found a match, but now check to see
158@@ -266,9 +247,8 @@ static void hash_search(int f,struct sum
159 s2 = sum >> 16;
160 matches++;
161 break;
162- } while (++j < s->count && targets[j].t == t);
163+ }
164
165- null_tag:
166 backup = offset - last_match;
167 /* We sometimes read 1 byte prior to last_match... */
168 if (backup < 0)
169@@ -375,11 +355,6 @@ void match_sums(int f, struct sum_struct
170 rprintf(FINFO,"sending file_sum\n");
171 write_buf(f,file_sum,MD4_SUM_LENGTH);
172
173- if (targets) {
174- free(targets);
175- targets=NULL;
176- }
177-
178 if (verbose > 2)
179 rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
180 false_alarms, tag_hits, matches);
181--- old/rsync.h
182+++ new/rsync.h
183@@ -560,6 +560,7 @@ struct sum_buf {
184 OFF_T offset; /**< offset in file of this chunk */
185 int32 len; /**< length of chunk of file */
186 uint32 sum1; /**< simple checksum */
187+ int32 chain; /**< hash-table chaining */
188 short flags; /**< flag bits */
189 char sum2[SUM_LENGTH]; /**< checksum */
190 };
191--- old/sender.c
192+++ new/sender.c
193@@ -92,6 +92,7 @@ static struct sum_struct *receive_sums(i
194
195 s->sums[i].offset = offset;
196 s->sums[i].flags = 0;
197+ s->sums[i].chain = 0;
198
199 if (i == s->count-1 && s->remainder != 0)
200 s->sums[i].len = s->remainder;