From 195bd906a2bacbce7455405180bfecf60819a60d Mon Sep 17 00:00:00 2001 From: "J.W. Schultz" Date: Thu, 10 Apr 2003 02:04:58 +0000 Subject: [PATCH] - Per-file dynamic block size is now sqrt(file length). - The per-file checksum size is determined according to an algorithm provided by Donovan Baarda which reduces the probability of rsync algorithm corrupting data and falling back using the whole md4 checksums. --- NEWS | 26 +++++++++++- generator.c | 119 +++++++++++++++++++++++++++++++++++++--------------- options.c | 4 +- rsync.h | 2 + 4 files changed, 113 insertions(+), 38 deletions(-) diff --git a/NEWS b/NEWS index 4a9406ce..ec1966d7 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,5 @@ NEWS for rsync version 2.5.7 -Protocol: 26 (unchanged) +Protocol: 27 (changed) Changes since version 2.5.6: ENHANCEMENTS: @@ -8,9 +8,31 @@ Changes since version 2.5.6: Note that --from0 affects the line-ending character for all the --*-from options. (Wayne Davison) + - Length of csum2 is now per-file starting with protocol version + 27. (J.W. Schultz) + + - Per-file dynamic block size is now sqrt(file length). + + The per-file checksum size is determined according + to an algorithm provided by Donovan Baarda which + reduces the probability of rsync algorithm + corrupting data and falling back using the whole md4 + checksums. (J.W. Schultz, Donovan Baarda) + BUG FIXES: - - Fixed a crash bug when interacting with older rsync versions and + - for protocol version >= 27, mdfour_tail() is called when the + block size (including checksum_seed) is a multiple of 64. + Previously it was not called, giving the wrong MD4 checksum. + (Craig Barratt) + + - for protocol version >= 27, a 64 bit bit counter is used in + mdfour.c as required by the RFC. Previously only a 32 bit bit + counter was used, causing incorrect MD4 file checksums for + file sizes >= 512MB - 4. 
(Craig Barratt) + + + - Fixed a crash bug when interacting with older rsync versions and multiple files of the same name are destined for the same dir. (Wayne Davison) diff --git a/generator.c b/generator.c index 740ce668..aca5f7b2 100644 --- a/generator.c +++ b/generator.c @@ -32,7 +32,6 @@ extern int preserve_devices; extern int preserve_hard_links; extern int update_only; extern int opt_ignore_existing; -extern int block_size; extern int csum_length; extern int ignore_times; extern int size_only; @@ -100,24 +99,9 @@ static int skip_file(char *fname, } -/* use a larger block size for really big files */ -static int adapt_block_size(struct file_struct *file, int bsize) -{ - int ret; - - if (bsize != BLOCK_SIZE) return bsize; - - ret = file->length / (10000); /* rough heuristic */ - ret = ret & ~15; /* multiple of 16 */ - if (ret < bsize) ret = bsize; - if (ret > CHUNK_SIZE/2) ret = CHUNK_SIZE/2; - return ret; -} - - /* * NULL sum_struct means we have no checksums - */ + */ void write_sum_head(int f, struct sum_struct *sum) { @@ -133,7 +117,87 @@ void write_sum_head(int f, struct sum_struct *sum) write_int(f, sum->remainder); } +/* + * set (initialize) the size entries in the per-file sum_struct + * calculating dynamic block and checksum sizes. + * + * This is only called from generate_and_send_sums() but is a separate + * function to encapsulate the logic. + * + * The block size is a rounded square root of file length. + * + * The checksum size is determined according to: + * blocksum_bits = BLOCKSUM_BIAS + 2*log2(file_len) - log2(block_len) + * provided by Donovan Baarda which gives a probability of rsync + * algorithm corrupting data and falling back using the whole md4 + * checksums. + * + * This might be made one of several selectable heuristics. 
+ */
+static void sum_sizes_sqroot_baarda(struct sum_struct *sum, uint64 len)
+{
+	extern int block_size;
+	int blength, s2length, b;
+	uint32 c;
+	uint64 l;
+
+	if (block_size) {
+		blength = block_size;
+	} else if (len <= BLOCK_SIZE * BLOCK_SIZE) {
+		blength = BLOCK_SIZE;
+	} else {
+		l = len;
+		c = 1;
+		while (l >>= 2) {
+			c <<= 1;
+		}
+		blength = 0;
+		do {
+			blength |= c;
+			if (len < (uint64)blength * blength) /* widen before multiplying: the int product overflows for large files */
+				blength &= ~c;
+			c >>= 1;
+		} while (c >= 8);	/* round to multiple of 8 */
+		blength = MAX(blength, BLOCK_SIZE);
+	}
+
+	if (remote_version < 27) {
+		s2length = csum_length;
+	} else if (csum_length == SUM_LENGTH) {
+		s2length = SUM_LENGTH;
+	} else {
+		b = BLOCKSUM_BIAS;
+		l = len;
+		while (l >>= 1) {
+			b += 2;
+		}
+		c = blength;
+		while ((c >>= 1) && b) { /* parenthesized so the b guard stops the loop at 0; bare "c >>= 1 && b" parses as c >>= (1 && b) */
+			b--;
+		}
+		s2length = (b + 1 - 32 + 7) / 8; /* add a bit,
+						  * subtract rollsum,
+						  * round up
+						  *    --optimize in compiler--
+						  */
+		s2length = MAX(s2length, csum_length);
+		s2length = MIN(s2length, SUM_LENGTH);
+	}
+
+	sum->flength	= len;
+	sum->blength	= blength;
+	sum->s2length	= s2length;
+	sum->count	= (len + (blength - 1)) / blength;
+	sum->remainder	= (len % blength);
+
+	if (sum->count && verbose > 2) {
+		rprintf(FINFO, "count=%ld rem=%ld blength=%ld s2length=%ld flength=%.0f\n",
+			(long) sum->count, (long) sum->remainder,
+			(long) sum->blength, (long) sum->s2length,
+			(double) sum->flength);
+	}
+}
 
 /**
  * Perhaps we want to just send an empty checksum set for this file,
@@ -163,30 +227,18 @@ static BOOL disable_deltas_p(void)
  *
  * Generate approximately one checksum every block_len bytes.
*/ -static void generate_and_send_sums(struct map_struct *buf, OFF_T len, - int block_len, int f_out) +static void generate_and_send_sums(struct map_struct *buf, OFF_T len, int f_out) { size_t i; struct sum_struct sum; OFF_T offset = 0; - sum.count = (len + (block_len - 1)) / block_len; - sum.remainder = (len % block_len); - sum.blength = block_len; - sum.flength = len; - sum.s2length = csum_length; - /* not needed here sum.sums = NULL; */ - - if (sum.count && verbose > 3) { - rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n", - (long) sum.count, (long) sum.remainder, - (long) sum.blength, (double) sum.flength); - } + sum_sizes_sqroot_baarda(&sum, len); write_sum_head(f_out, &sum); for (i = 0; i < sum.count; i++) { - int n1 = MIN(len, block_len); + int n1 = MIN(len, sum.blength); char *map = map_ptr(buf, offset, n1); uint32 sum1 = get_checksum1(map, n1); char sum2[SUM_LENGTH]; @@ -465,8 +517,7 @@ void recv_generator(char *fname, struct file_list *flist, int i, int f_out) rprintf(FINFO, "generating and sending sums for %d\n", i); write_int(f_out,i); - generate_and_send_sums(buf, st.st_size, - adapt_block_size(file, block_size), f_out); + generate_and_send_sums(buf, st.st_size, f_out); close(fd); if (buf) unmap_file(buf); diff --git a/options.c b/options.c index 772a87fa..14019a23 100644 --- a/options.c +++ b/options.c @@ -77,7 +77,7 @@ int do_progress=0; int keep_partial=0; int safe_symlinks=0; int copy_unsafe_links=0; -int block_size=BLOCK_SIZE; +int block_size=0; int size_only=0; int bwlimit=0; int delete_after=0; @@ -775,7 +775,7 @@ void server_options(char **args,int *argc) if (x != 1) args[ac++] = argstr; - if (block_size != BLOCK_SIZE) { + if (block_size) { snprintf(bsize,sizeof(bsize),"-B%d",block_size); args[ac++] = bsize; } diff --git a/rsync.h b/rsync.h index 0dd3e654..e3cbe570 100644 --- a/rsync.h +++ b/rsync.h @@ -341,6 +341,8 @@ enum logcode {FNONE=0, FERROR=1, FINFO=2, FLOG=3 }; /* the length of the md4 checksum */ #define MD4_SUM_LENGTH 16 
#define SUM_LENGTH 16 +#define SHORT_SUM_LENGTH 2 +#define BLOCKSUM_BIAS 10 #ifndef MAXPATHLEN #define MAXPATHLEN 1024 -- 2.34.1