| 1 | This patch causes the sender to segment its use of the block-finding |
| 2 | hashtable for really large files. This avoids overloading the |
| 3 | hashtable, and the slowdown that ensues from trying to find |
| 4 | blocks in an overloaded hashtable. This does mean that the sender |
| 5 | won't notice some migrations of data past segment boundaries, but since |
| 6 | this only affects files with ~1.6GB or more data, and the blocksize is |
| 7 | already so large that we only find really large sequences of matching |
| 8 | data anyway, I don't consider this to be a big loss. |
| 9 | |
| 10 | I also decreased the MAX_BLOCK_SIZE value to something more reasonable. |
| 11 | |
| 12 | To use this patch, run these commands for a successful build: |
| 13 | |
| 14 | patch -p1 <patches/segment_large_hash.diff |
| 15 | ./configure (optional if already run) |
| 16 | make |
| 17 | |
| 18 | --- old/match.c |
| 19 | +++ new/match.c |
| 20 | @@ -46,9 +46,9 @@ static int32 *hash_table; |
| 21 | #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) |
| 22 | #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16) |
| 23 | |
| 24 | -static void build_hash_table(struct sum_struct *s) |
| 25 | +static int32 build_hash_table(struct sum_struct *s, int32 start) |
| 26 | { |
| 27 | - int32 i; |
| 28 | + int32 i, end = s->count; |
| 29 | |
| 30 | if (!hash_table) { |
| 31 | hash_table = new_array(int32, TABLESIZE); |
| 32 | @@ -58,11 +58,21 @@ static void build_hash_table(struct sum_ |
| 33 | |
| 34 | memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]); |
| 35 | |
| 36 | - for (i = 0; i < s->count; i++) { |
| 37 | + if (end - start > TABLESIZE*8/10) |
| 38 | + end = start + TABLESIZE*8/10; |
| 39 | + |
| 40 | + for (i = start; i < end; i++) { |
| 41 | uint32 t = SUM2HASH(s->sums[i].sum1); |
| 42 | s->sums[i].chain = hash_table[t]; |
| 43 | hash_table[t] = i; |
| 44 | } |
| 45 | + |
| 46 | + if (verbose > 2) { |
| 47 | + rprintf(FINFO, "built hash table for entries %ld - %ld\n", |
| 48 | + (long)start, (long)end - 1); |
| 49 | + } |
| 50 | + |
| 51 | + return end; |
| 52 | } |
| 53 | |
| 54 | |
| 55 | @@ -120,8 +130,8 @@ static void matched(int f, struct sum_st |
| 56 | static void hash_search(int f,struct sum_struct *s, |
| 57 | struct map_struct *buf, OFF_T len) |
| 58 | { |
| 59 | - OFF_T offset, end; |
| 60 | - int32 k, want_i, backup; |
| 61 | + OFF_T offset, end, reset = 0; |
| 62 | + int32 k, want_i, backup, sum_pos = 0; |
| 63 | char sum2[SUM_LENGTH]; |
| 64 | uint32 s1, s2, sum; |
| 65 | int more; |
| 66 | @@ -159,6 +169,11 @@ static void hash_search(int f,struct sum |
| 67 | int done_csum2 = 0; |
| 68 | int32 i; |
| 69 | |
| 70 | + if (offset >= reset) { |
| 71 | + sum_pos = build_hash_table(s, sum_pos); |
| 72 | + reset = sum_pos * s->blength; |
| 73 | + } |
| 74 | + |
| 75 | if (verbose > 4) { |
| 76 | rprintf(FINFO, "offset=%.0f sum=%04x%04x\n", |
| 77 | (double)offset, s2 & 0xFFFF, s1 & 0xFFFF); |
| 78 | @@ -336,11 +351,6 @@ void match_sums(int f, struct sum_struct |
| 79 | } |
| 80 | |
| 81 | if (len > 0 && s->count > 0) { |
| 82 | - build_hash_table(s); |
| 83 | - |
| 84 | - if (verbose > 2) |
| 85 | - rprintf(FINFO,"built hash table\n"); |
| 86 | - |
| 87 | hash_search(f, s, buf, len); |
| 88 | |
| 89 | if (verbose > 2) |
| 90 | --- old/rsync.h |
| 91 | +++ new/rsync.h |
| 92 | @@ -121,7 +121,7 @@ |
| 93 | #define CHUNK_SIZE (32*1024) |
| 94 | #define MAX_MAP_SIZE (256*1024) |
| 95 | #define IO_BUFFER_SIZE (4092) |
| 96 | -#define MAX_BLOCK_SIZE ((int32)1 << 29) |
| 97 | +#define MAX_BLOCK_SIZE ((int32)1 << 17) |
| 98 | |
| 99 | #define IOERR_GENERAL (1<<0) /* For backward compatibility, this must == 1 */ |
| 100 | #define IOERR_VANISHED (1<<1) |