Allow for more -p info.
[rsync/rsync-patches.git] / dynamic_hash.diff
... / ...
CommitLineData
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
3checksum entries without being overloaded.
4
5Updated to use the current hashtable method when possible, and the new
6hashtable method (which requires a modulus calculation for up to every byte
7of the source file) only on large files that need a larger hashtable size.
8This avoids slowing down files that don't need the extra-large hashtable.
9
10This was updated for the latest codebase from a patch written by Shachar
11Shemesh.
12
13To use this patch, run these commands for a successful build:
14
15 patch -p1 <patches/dynamic_hash.diff
16 ./configure (optional if already run)
17 make
18
19--- old/match.c
20+++ new/match.c
21@@ -39,40 +39,51 @@ static int total_matches;
22
23 extern struct stats stats;
24
25-#define TABLESIZE (1<<16)
26+#define TRADITIONAL_TABLESIZE (1<<16)
27
28+static uint32 tablesize;
29 static int32 *hash_table;
30
31 #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF)
32 #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16)
33
34-static int32 build_hash_table(struct sum_struct *s, int32 start)
35+#define BIG_SUM2HASH(sum) ((sum)%tablesize)
36+
37+static void build_hash_table(struct sum_struct *s)
38 {
39- int32 i, end = s->count;
40+ static uint32 alloc_size;
41+ int32 i;
42
43- if (!hash_table) {
44- hash_table = new_array(int32, TABLESIZE);
45+ /* Dynamically calculate the hash table size so that the hash load
46+ * for big files is about 80%. This number must be odd or s2 will
47+ * not be able to span the entire set. */
48+ tablesize = (uint32)(s->count/8) * 10 + 11;
49+ if (tablesize < TRADITIONAL_TABLESIZE)
50+ tablesize = TRADITIONAL_TABLESIZE;
51+ if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) {
52+ if (hash_table)
53+ free(hash_table);
54+ hash_table = new_array(int32, tablesize);
55 if (!hash_table)
56 out_of_memory("build_hash_table");
57+ alloc_size = tablesize;
58 }
59
60- memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]);
61-
62- if (end - start > TABLESIZE*8/10)
63- end = start + TABLESIZE*8/10;
64-
65- for (i = start; i < end; i++) {
66- uint32 t = SUM2HASH(s->sums[i].sum1);
67- s->sums[i].chain = hash_table[t];
68- hash_table[t] = i;
69- }
70+ memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]);
71
72- if (verbose > 2) {
73- rprintf(FINFO, "built hash table for entries %ld - %ld\n",
74- (long)start, (long)end - 1);
75+ if (tablesize == TRADITIONAL_TABLESIZE) {
76+ for (i = 0; i < s->count; i++) {
77+ uint32 t = SUM2HASH(s->sums[i].sum1);
78+ s->sums[i].chain = hash_table[t];
79+ hash_table[t] = i;
80+ }
81+ } else {
82+ for (i = 0; i < s->count; i++) {
83+ uint32 t = BIG_SUM2HASH(s->sums[i].sum1);
84+ s->sums[i].chain = hash_table[t];
85+ hash_table[t] = i;
86+ }
87 }
88-
89- return end;
90 }
91
92
93@@ -130,8 +141,8 @@ static void matched(int f, struct sum_st
94 static void hash_search(int f,struct sum_struct *s,
95 struct map_struct *buf, OFF_T len)
96 {
97- OFF_T offset, end, reset = 0;
98- int32 k, want_i, backup, sum_pos = 0;
99+ OFF_T offset, end;
100+ int32 k, want_i, backup;
101 char sum2[SUM_LENGTH];
102 uint32 s1, s2, sum;
103 int more;
104@@ -169,24 +180,21 @@ static void hash_search(int f,struct sum
105 int done_csum2 = 0;
106 int32 i;
107
108- if (offset >= reset) {
109- sum_pos = build_hash_table(s, sum_pos);
110- if (sum_pos == s->count)
111- reset = len;
112- else
113- reset = sum_pos * s->blength;
114- }
115-
116 if (verbose > 4) {
117 rprintf(FINFO, "offset=%.0f sum=%04x%04x\n",
118 (double)offset, s2 & 0xFFFF, s1 & 0xFFFF);
119 }
120
121- i = hash_table[SUM2HASH2(s1,s2)];
122- if (i < 0)
123- goto null_hash;
124+ if (tablesize == TRADITIONAL_TABLESIZE) {
125+ if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
126+ goto null_hash;
127+ sum = (s1 & 0xffff) | (s2 << 16);
128+ } else {
129+ sum = (s1 & 0xffff) | (s2 << 16);
130+ if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
131+ goto null_hash;
132+ }
133
134- sum = (s1 & 0xffff) | (s2 << 16);
135 hash_hits++;
136 do {
137 int32 l;
138@@ -354,6 +362,11 @@ void match_sums(int f, struct sum_struct
139 }
140
141 if (len > 0 && s->count > 0) {
142+ build_hash_table(s);
143+
144+ if (verbose > 2)
145+ rprintf(FINFO,"built hash table\n");
146+
147 hash_search(f, s, buf, len);
148
149 if (verbose > 2)