Improved a few minor things.
[rsync/rsync-patches.git] / dynamic_hash.diff
CommitLineData
702a8903
WD
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
66d2440b 3checksum entries without being overloaded. It also makes the hashing of
702a8903
WD
4normal-sized files use slightly less memory than before.
5
2b1e5f60 6An extended version of a patch by Shachar Shemesh.
702a8903
WD
7
8--- old/match.c
9+++ new/match.c
10@@ -26,11 +26,6 @@ extern int append_mode;
11
12 int updating_basis_file;
13
14-typedef unsigned short tag;
15-
16-#define TABLESIZE (1<<16)
17-#define NULL_TAG (-1)
18-
19 static int false_alarms;
20 static int tag_hits;
21 static int matches;
c541912f 22@@ -42,47 +37,37 @@ static int total_matches;
702a8903
WD
23
24 extern struct stats stats;
25
26-struct target {
27- tag t;
28- int32 i;
29-};
30-
31-static struct target *targets;
32-
33-static int32 *tag_table;
34-
35-#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
36-#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
37-
38-static int compare_targets(struct target *t1,struct target *t2)
39-{
40- return (int)t1->t - (int)t2->t;
41-}
c541912f 42+static uint32 tablesize;
702a8903
WD
43+static int32 *sum_table;
44
45+#define gettag2(s1,s2) gettag((s1) + ((s2)<<16))
46+#define gettag(sum) ((sum)%tablesize)
47
48 static void build_hash_table(struct sum_struct *s)
49 {
c541912f
WD
50 int32 i;
51+ uint32 prior_size = tablesize;
702a8903
WD
52
53- if (!tag_table)
54- tag_table = new_array(int32, TABLESIZE);
55+ /* Dynamically calculate the hash table size so that the hash load
2b1e5f60
WD
56+ * for big files is about 80%. This number must be odd or s2 will
57+ * not be able to span the entire set. */
c541912f 58+ tablesize = (uint32)(s->count/8) * 10 + 11;
702a8903
WD
59+ if (tablesize < 65537)
60+ tablesize = 65537; /* a prime number */
2b1e5f60
WD
61+ if (tablesize != prior_size) {
62+ free(sum_table);
63+ sum_table = new_array(int32, tablesize);
702a8903
WD
64+ if (!sum_table)
65+ out_of_memory("build_hash_table");
66+ }
67
68- targets = new_array(struct target, s->count);
69- if (!tag_table || !targets)
70- out_of_memory("build_hash_table");
4d89650c 71+ memset(sum_table, 0xFF, tablesize * sizeof sum_table[0]);
702a8903
WD
72
73 for (i = 0; i < s->count; i++) {
74- targets[i].i = i;
75- targets[i].t = gettag(s->sums[i].sum1);
c541912f 76+ uint32 t = gettag(s->sums[i].sum1);
702a8903
WD
77+ s->sums[i].chain = sum_table[t];
78+ sum_table[t] = i;
79 }
80-
81- qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
82-
83- for (i = 0; i < TABLESIZE; i++)
84- tag_table[i] = NULL_TAG;
85-
86- for (i = s->count; i-- > 0; )
87- tag_table[targets[i].t] = i;
88 }
89
90
c541912f 91@@ -176,20 +161,17 @@ static void hash_search(int f,struct sum
702a8903
WD
92 }
93
94 do {
95- tag t = gettag2(s1,s2);
c541912f 96+ uint32 t = gettag2(s1,s2);
702a8903
WD
97 int done_csum2 = 0;
98- int32 j = tag_table[t];
c541912f 99+ int32 i;
702a8903
WD
100
101 if (verbose > 4)
102 rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
103
104- if (j == NULL_TAG)
105- goto null_tag;
106-
107 sum = (s1 & 0xffff) | (s2 << 16);
108 tag_hits++;
109- do {
110- int32 l, i = targets[j].i;
111+ for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) {
112+ int32 l;
113
114 if (sum != s->sums[i].sum1)
115 continue;
4d89650c 116@@ -205,9 +187,11 @@ static void hash_search(int f,struct sum
702a8903
WD
117 && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
118 continue;
119
120- if (verbose > 3)
121- rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
122- (double)offset,(double)j,(double)i,sum);
123+ if (verbose > 3) {
4d89650c
WD
124+ rprintf(FINFO,
125+ "potential match at %.0f i=%ld sum=%08x\n",
702a8903
WD
126+ (double)offset, (long)i, sum);
127+ }
128
129 if (!done_csum2) {
130 map = (schar *)map_ptr(buf,offset,l);
4d89650c 131@@ -224,8 +208,8 @@ static void hash_search(int f,struct sum
702a8903
WD
132 * one with an identical offset, so we prefer that over
133 * the following want_i optimization. */
134 if (updating_basis_file) {
135- do {
136- int32 i2 = targets[j].i;
137+ int32 i2;
138+ for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
139 if (s->sums[i2].offset != offset)
140 continue;
141 if (i2 != i) {
4d89650c 142@@ -240,7 +224,7 @@ static void hash_search(int f,struct sum
702a8903
WD
143 * both the sender and the receiver. */
144 s->sums[i].flags |= SUMFLG_SAME_OFFSET;
145 goto set_want_i;
146- } while (++j < s->count && targets[j].t == t);
147+ }
148 }
149
150 /* we've found a match, but now check to see
4d89650c 151@@ -266,9 +250,8 @@ static void hash_search(int f,struct sum
702a8903
WD
152 s2 = sum >> 16;
153 matches++;
154 break;
155- } while (++j < s->count && targets[j].t == t);
156+ }
157
158- null_tag:
159 backup = offset - last_match;
160 /* We sometimes read 1 byte prior to last_match... */
161 if (backup < 0)
4d89650c 162@@ -375,11 +358,6 @@ void match_sums(int f, struct sum_struct
702a8903
WD
163 rprintf(FINFO,"sending file_sum\n");
164 write_buf(f,file_sum,MD4_SUM_LENGTH);
165
166- if (targets) {
167- free(targets);
168- targets=NULL;
169- }
170-
171 if (verbose > 2)
172 rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
173 false_alarms, tag_hits, matches);
174--- old/rsync.h
175+++ new/rsync.h
176@@ -560,6 +560,7 @@ struct sum_buf {
177 OFF_T offset; /**< offset in file of this chunk */
178 int32 len; /**< length of chunk of file */
179 uint32 sum1; /**< simple checksum */
2b1e5f60 180+ int32 chain; /**< next hash-table collision */
702a8903
WD
181 short flags; /**< flag bits */
182 char sum2[SUM_LENGTH]; /**< checksum */
183 };