Improved a few minor things.
[rsync/rsync-patches.git] / dynamic_hash.diff
... / ...
CommitLineData
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
3checksum entries without being overloaded. It also makes the hashing of
4normal sized files use slightly less memory than before.
5
6An extended version of a patch by Shachar Shemesh.
7
8--- old/match.c
9+++ new/match.c
10@@ -26,11 +26,6 @@ extern int append_mode;
11
12 int updating_basis_file;
13
14-typedef unsigned short tag;
15-
16-#define TABLESIZE (1<<16)
17-#define NULL_TAG (-1)
18-
19 static int false_alarms;
20 static int tag_hits;
21 static int matches;
22@@ -42,47 +37,37 @@ static int total_matches;
23
24 extern struct stats stats;
25
26-struct target {
27- tag t;
28- int32 i;
29-};
30-
31-static struct target *targets;
32-
33-static int32 *tag_table;
34-
35-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
36-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
37-
38-static int compare_targets(struct target *t1,struct target *t2)
39-{
40- return (int)t1->t - (int)t2->t;
41-}
42+static uint32 tablesize;
43+static int32 *sum_table;
44
45+#define gettag2(s1,s2) gettag(((s1) & 0xFFFF) | ((s2)<<16)) /* hash the same 32-bit value that is compared against sum1 */
46+#define gettag(sum) ((sum)%tablesize)
47
48 static void build_hash_table(struct sum_struct *s)
49 {
50 int32 i;
51+ uint32 prior_size = tablesize;
52
53- if (!tag_table)
54- tag_table = new_array(int32, TABLESIZE);
55+ /* Dynamically calculate the hash table size so that the hash load
56+ * for big files is about 80%. This number must be odd or s2 will
57+ * not be able to span the entire set. */
58+ tablesize = (uint32)(s->count/8) * 10 + 11;
59+ if (tablesize < 65537)
60+ tablesize = 65537; /* a prime number */
61+ if (tablesize != prior_size) {
62+ free(sum_table);
63+ sum_table = new_array(int32, tablesize);
64+ if (!sum_table)
65+ out_of_memory("build_hash_table");
66+ }
67
68- targets = new_array(struct target, s->count);
69- if (!tag_table || !targets)
70- out_of_memory("build_hash_table");
71+ memset(sum_table, 0xFF, tablesize * sizeof sum_table[0]);
72
73 for (i = 0; i < s->count; i++) {
74- targets[i].i = i;
75- targets[i].t = gettag(s->sums[i].sum1);
76+ uint32 t = gettag(s->sums[i].sum1);
77+ s->sums[i].chain = sum_table[t];
78+ sum_table[t] = i;
79 }
80-
81- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
82-
83- for (i = 0; i < TABLESIZE; i++)
84- tag_table[i] = NULL_TAG;
85-
86- for (i = s->count; i-- > 0; )
87- tag_table[targets[i].t] = i;
88 }
89
90
91@@ -176,20 +161,17 @@ static void hash_search(int f,struct sum
92 }
93
94 do {
95- tag t = gettag2(s1,s2);
96+ uint32 t = gettag2(s1,s2);
97 int done_csum2 = 0;
98- int32 j = tag_table[t];
99+ int32 i;
100
101 if (verbose > 4)
102 rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
103
104- if (j == NULL_TAG)
105- goto null_tag;
106-
107 sum = (s1 & 0xffff) | (s2 << 16);
108 tag_hits++;
109- do {
110- int32 l, i = targets[j].i;
111+ for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) {
112+ int32 l;
113
114 if (sum != s->sums[i].sum1)
115 continue;
116@@ -205,9 +187,11 @@ static void hash_search(int f,struct sum
117 && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
118 continue;
119
120- if (verbose > 3)
121- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
122- (double)offset,(double)j,(double)i,sum);
123+ if (verbose > 3) {
124+ rprintf(FINFO,
125+ "potential match at %.0f i=%ld sum=%08x\n",
126+ (double)offset, (long)i, sum);
127+ }
128
129 if (!done_csum2) {
130 map = (schar *)map_ptr(buf,offset,l);
131@@ -224,8 +208,8 @@ static void hash_search(int f,struct sum
132 * one with an identical offset, so we prefer that over
133 * the following want_i optimization. */
134 if (updating_basis_file) {
135- do {
136- int32 i2 = targets[j].i;
137+ int32 i2;
138+ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
139 if (s->sums[i2].offset != offset)
140 continue;
141 if (i2 != i) {
142@@ -240,7 +224,7 @@ static void hash_search(int f,struct sum
143 * both the sender and the receiver. */
144 s->sums[i].flags |= SUMFLG_SAME_OFFSET;
145 goto set_want_i;
146- } while (++j < s->count && targets[j].t == t);
147+ }
148 }
149
150 /* we've found a match, but now check to see
151@@ -266,9 +250,8 @@ static void hash_search(int f,struct sum
152 s2 = sum >> 16;
153 matches++;
154 break;
155- } while (++j < s->count && targets[j].t == t);
156+ }
157
158- null_tag:
159 backup = offset - last_match;
160 /* We sometimes read 1 byte prior to last_match... */
161 if (backup < 0)
162@@ -375,11 +358,6 @@ void match_sums(int f, struct sum_struct
163 rprintf(FINFO,"sending file_sum\n");
164 write_buf(f,file_sum,MD4_SUM_LENGTH);
165
166- if (targets) {
167- free(targets);
168- targets=NULL;
169- }
170-
171 if (verbose > 2)
172 rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
173 false_alarms, tag_hits, matches);
174--- old/rsync.h
175+++ new/rsync.h
176@@ -560,6 +560,7 @@ struct sum_buf {
177 OFF_T offset; /**< offset in file of this chunk */
178 int32 len; /**< length of chunk of file */
179 uint32 sum1; /**< simple checksum */
180+ int32 chain; /**< next hash-table collision */
181 short flags; /**< flag bits */
182 char sum2[SUM_LENGTH]; /**< checksum */
183 };