No need to initialize the chain var of a checksum struct since
[rsync/rsync-patches.git] / dynamic_hash.diff
CommitLineData
702a8903
WD
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
66d2440b 3checksum entries without being overloaded. It also makes the hashing of
702a8903
WD
4normal sized files use slightly less memory than before.
5
2b1e5f60 6An extended version of a patch by Shachar Shemesh.
702a8903
WD
7
8--- old/match.c
9+++ new/match.c
10@@ -26,11 +26,6 @@ extern int append_mode;
11
12 int updating_basis_file;
13
14-typedef unsigned short tag;
15-
16-#define TABLESIZE (1<<16)
17-#define NULL_TAG (-1)
18-
19 static int false_alarms;
20 static int tag_hits;
21 static int matches;
c541912f 22@@ -42,47 +37,37 @@ static int total_matches;
702a8903
WD
23
24 extern struct stats stats;
25
26-struct target {
27- tag t;
28- int32 i;
29-};
30-
31-static struct target *targets;
32-
33-static int32 *tag_table;
34-
35-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
36-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
37-
38-static int compare_targets(struct target *t1,struct target *t2)
39-{
40- return (int)t1->t - (int)t2->t;
41-}
c541912f 42+static uint32 tablesize;
702a8903
WD
43+static int32 *sum_table;
44
45+#define gettag2(s1,s2) gettag((s1) + ((s2)<<16))
46+#define gettag(sum) ((sum)%tablesize)
47
48 static void build_hash_table(struct sum_struct *s)
49 {
c541912f
WD
50 int32 i;
51+ uint32 prior_size = tablesize;
702a8903
WD
52
53- if (!tag_table)
54- tag_table = new_array(int32, TABLESIZE);
55+ /* Dynamically calculate the hash table size so that the hash load
2b1e5f60
WD
56+ * for big files is about 80%. This number must be odd or s2 will
57+ * not be able to span the entire set. */
c541912f 58+ tablesize = (uint32)(s->count/8) * 10 + 11;
702a8903
WD
59+ if (tablesize < 65537)
60+ tablesize = 65537; /* a prime number */
2b1e5f60
WD
61+ if (tablesize != prior_size) {
62+ free(sum_table);
63+ sum_table = new_array(int32, tablesize);
702a8903
WD
64+ if (!sum_table)
65+ out_of_memory("build_hash_table");
66+ }
67
68- targets = new_array(struct target, s->count);
69- if (!tag_table || !targets)
70- out_of_memory("build_hash_table");
2b1e5f60 71+ memset(sum_table, 0xFF, tablesize * sizeof (sum_table[0]));
702a8903
WD
72
73 for (i = 0; i < s->count; i++) {
74- targets[i].i = i;
75- targets[i].t = gettag(s->sums[i].sum1);
c541912f 76+ uint32 t = gettag(s->sums[i].sum1);
702a8903
WD
77+ s->sums[i].chain = sum_table[t];
78+ sum_table[t] = i;
79 }
80-
81- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
82-
83- for (i = 0; i < TABLESIZE; i++)
84- tag_table[i] = NULL_TAG;
85-
86- for (i = s->count; i-- > 0; )
87- tag_table[targets[i].t] = i;
88 }
89
90
c541912f 91@@ -176,20 +161,17 @@ static void hash_search(int f,struct sum
702a8903
WD
92 }
93
94 do {
95- tag t = gettag2(s1,s2);
c541912f 96+ uint32 t = gettag2(s1,s2);
702a8903
WD
97 int done_csum2 = 0;
98- int32 j = tag_table[t];
c541912f 99+ int32 i;
702a8903
WD
100
101 if (verbose > 4)
102 rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
103
104- if (j == NULL_TAG)
105- goto null_tag;
106-
107 sum = (s1 & 0xffff) | (s2 << 16);
108 tag_hits++;
109- do {
110- int32 l, i = targets[j].i;
111+ for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) {
112+ int32 l;
113
114 if (sum != s->sums[i].sum1)
115 continue;
c541912f 116@@ -205,9 +187,10 @@ static void hash_search(int f,struct sum
702a8903
WD
117 && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
118 continue;
119
120- if (verbose > 3)
121- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
122- (double)offset,(double)j,(double)i,sum);
123+ if (verbose > 3) {
124+ rprintf(FINFO,"potential match at %.0f i=%ld sum=%08x\n",
125+ (double)offset, (long)i, sum);
126+ }
127
128 if (!done_csum2) {
129 map = (schar *)map_ptr(buf,offset,l);
c541912f 130@@ -224,23 +207,23 @@ static void hash_search(int f,struct sum
702a8903
WD
131 * one with an identical offset, so we prefer that over
132 * the following want_i optimization. */
133 if (updating_basis_file) {
134- do {
135- int32 i2 = targets[j].i;
136+ int32 i2;
137+ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
138 if (s->sums[i2].offset != offset)
139 continue;
140 if (i2 != i) {
141 if (sum != s->sums[i2].sum1)
142- break;
143+ continue;
144 if (memcmp(sum2, s->sums[i2].sum2,
145 s->s2length) != 0)
146- break;
147+ continue;
148 i = i2;
149 }
150 /* This chunk was at the same offset on
151 * both the sender and the receiver. */
152 s->sums[i].flags |= SUMFLG_SAME_OFFSET;
153 goto set_want_i;
154- } while (++j < s->count && targets[j].t == t);
155+ }
156 }
157
158 /* we've found a match, but now check to see
c541912f 159@@ -266,9 +249,8 @@ static void hash_search(int f,struct sum
702a8903
WD
160 s2 = sum >> 16;
161 matches++;
162 break;
163- } while (++j < s->count && targets[j].t == t);
164+ }
165
166- null_tag:
167 backup = offset - last_match;
168 /* We sometimes read 1 byte prior to last_match... */
169 if (backup < 0)
c541912f 170@@ -375,11 +357,6 @@ void match_sums(int f, struct sum_struct
702a8903
WD
171 rprintf(FINFO,"sending file_sum\n");
172 write_buf(f,file_sum,MD4_SUM_LENGTH);
173
174- if (targets) {
175- free(targets);
176- targets=NULL;
177- }
178-
179 if (verbose > 2)
180 rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
181 false_alarms, tag_hits, matches);
182--- old/rsync.h
183+++ new/rsync.h
184@@ -560,6 +560,7 @@ struct sum_buf {
185 OFF_T offset; /**< offset in file of this chunk */
186 int32 len; /**< length of chunk of file */
187 uint32 sum1; /**< simple checksum */
2b1e5f60 188+ int32 chain; /**< next hash-table collision */
702a8903
WD
189 short flags; /**< flag bits */
190 char sum2[SUM_LENGTH]; /**< checksum */
191 };