A little more cleanup.
[rsync/rsync-patches.git] / dynamic_hash.diff
CommitLineData
702a8903
WD
1This patch makes the processing of really large files more efficient
2by making sure that the sender's hash table is large enough to hold all the
84495354
WD
3checksum entries without being overloaded.
4
5Updated to use the current hashtable method when possible, and the new
6hashtable method (which requires a modulus calculation for up to every byte
7of the source file) only on large files that need a larger hashtable size.
8This avoids slowing down the processing of files that don't need the extra-large hashtable.
702a8903 9
c62af8d8 10This was updated for the latest codebase from a patch written by Shachar
e1a06002 11Shemesh.
702a8903 12
03019e41
WD
13To use this patch, run these commands for a successful build:
14
15 patch -p1 <patches/dynamic_hash.diff
16 ./configure (optional if already run)
17 make
18
702a8903
WD
19--- old/match.c
20+++ new/match.c
0df0745f 21@@ -39,40 +39,51 @@ static int total_matches;
702a8903
WD
22
23 extern struct stats stats;
24
e1a06002 25-#define TABLESIZE (1<<16)
84495354
WD
26+#define TRADITIONAL_TABLESIZE (1<<16)
27
c541912f 28+static uint32 tablesize;
e1a06002 29 static int32 *hash_table;
702a8903 30
84495354
WD
31 #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF)
32 #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16)
702a8903 33
0df0745f 34-static int32 build_hash_table(struct sum_struct *s, int32 start)
84495354
WD
35+#define BIG_SUM2HASH(sum) ((sum)%tablesize)
36+
0df0745f 37+static void build_hash_table(struct sum_struct *s)
702a8903 38 {
0df0745f 39- int32 i, end = s->count;
84495354 40+ static uint32 alloc_size;
0df0745f 41+ int32 i;
702a8903 42
e1a06002
WD
43- if (!hash_table) {
44- hash_table = new_array(int32, TABLESIZE);
702a8903 45+ /* Dynamically calculate the hash table size so that the hash load
2b1e5f60
WD
46+ * for big files is about 80%. This number must be odd or s2 will
47+ * not be able to span the entire set. */
c541912f 48+ tablesize = (uint32)(s->count/8) * 10 + 11;
84495354
WD
49+ if (tablesize < TRADITIONAL_TABLESIZE)
50+ tablesize = TRADITIONAL_TABLESIZE;
51+ if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) {
e1a06002
WD
52+ if (hash_table)
53+ free(hash_table);
54+ hash_table = new_array(int32, tablesize);
55 if (!hash_table)
56 out_of_memory("build_hash_table");
84495354 57+ alloc_size = tablesize;
702a8903
WD
58 }
59
e1a06002 60- memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]);
0df0745f
WD
61-
62- if (end - start > TABLESIZE*8/10)
63- end = start + TABLESIZE*8/10;
64-
65- for (i = start; i < end; i++) {
84495354
WD
66- uint32 t = SUM2HASH(s->sums[i].sum1);
67- s->sums[i].chain = hash_table[t];
68- hash_table[t] = i;
0df0745f
WD
69- }
70+ memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]);
71
72- if (verbose > 2) {
73- rprintf(FINFO, "built hash table for entries %ld - %ld\n",
74- (long)start, (long)end - 1);
84495354
WD
75+ if (tablesize == TRADITIONAL_TABLESIZE) {
76+ for (i = 0; i < s->count; i++) {
77+ uint32 t = SUM2HASH(s->sums[i].sum1);
78+ s->sums[i].chain = hash_table[t];
79+ hash_table[t] = i;
80+ }
81+ } else {
82+ for (i = 0; i < s->count; i++) {
83+ uint32 t = BIG_SUM2HASH(s->sums[i].sum1);
84+ s->sums[i].chain = hash_table[t];
85+ hash_table[t] = i;
86+ }
87 }
0df0745f
WD
88-
89- return end;
84495354
WD
90 }
91
0df0745f
WD
92
93@@ -130,8 +141,8 @@ static void matched(int f, struct sum_st
94 static void hash_search(int f,struct sum_struct *s,
95 struct map_struct *buf, OFF_T len)
96 {
97- OFF_T offset, end, reset = 0;
98- int32 k, want_i, backup, sum_pos = 0;
99+ OFF_T offset, end;
100+ int32 k, want_i, backup;
101 char sum2[SUM_LENGTH];
102 uint32 s1, s2, sum;
103 int more;
5bf6d6c5 104@@ -169,24 +180,21 @@ static void hash_search(int f,struct sum
0df0745f
WD
105 int done_csum2 = 0;
106 int32 i;
107
108- if (offset >= reset) {
109- sum_pos = build_hash_table(s, sum_pos);
5bf6d6c5
WD
110- if (sum_pos == s->count)
111- reset = len;
112- else
113- reset = sum_pos * s->blength;
0df0745f
WD
114- }
115-
116 if (verbose > 4) {
117 rprintf(FINFO, "offset=%.0f sum=%04x%04x\n",
e1a06002
WD
118 (double)offset, s2 & 0xFFFF, s1 & 0xFFFF);
119 }
702a8903 120
e1a06002 121- i = hash_table[SUM2HASH2(s1,s2)];
84495354
WD
122- if (i < 0)
123- goto null_hash;
124+ if (tablesize == TRADITIONAL_TABLESIZE) {
125+ if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
126+ goto null_hash;
127+ sum = (s1 & 0xffff) | (s2 << 16);
128+ } else {
129+ sum = (s1 & 0xffff) | (s2 << 16);
130+ if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
131+ goto null_hash;
132+ }
1db27b7c
WD
133
134- sum = (s1 & 0xffff) | (s2 << 16);
e1a06002 135 hash_hits++;
1db27b7c 136 do {
e1a06002 137 int32 l;
5bf6d6c5 138@@ -354,6 +362,11 @@ void match_sums(int f, struct sum_struct
0df0745f
WD
139 }
140
141 if (len > 0 && s->count > 0) {
142+ build_hash_table(s);
143+
144+ if (verbose > 2)
145+ rprintf(FINFO,"built hash table\n");
146+
147 hash_search(f, s, buf, len);
148
149 if (verbose > 2)