This patch makes rsync look try to find a basis file for a file that doesn't already have one. Be sure to run "make proto" before "make". --- orig/flist.c 2005-02-13 21:17:16 +++ flist.c 2005-02-13 09:49:22 @@ -330,7 +330,7 @@ void send_file_entry(struct file_struct char fname[MAXPATHLEN]; int l1, l2; - if (f == -1) + if (f < 0) return; if (!file) { @@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l struct file_struct *file; char fbuf[MAXPATHLEN]; - if (!(file = make_file(fname, flist, ALL_FILTERS))) + file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS); + if (!file) return; maybe_emit_filelist_progress(flist); @@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f) clean_flist(flist, relative_paths, 1); - if (f != -1) { + if (f >= 0) { /* Now send the uid/gid list. This was introduced in * protocol version 15 */ recv_uid_list(f, flist); @@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn) return k > 0 && strcmp(fn+k, backup_suffix) == 0; } +struct file_list *get_dirlist(const char *dirname, int ignore_excludes) +{ + struct file_list *dirlist; + char dirbuf[MAXPATHLEN]; + int dlen; + int save_recurse = recurse; + + dlen = strlcpy(dirbuf, dirname, MAXPATHLEN); + if (dlen >= MAXPATHLEN) + return NULL; + + dirlist = flist_new(WITHOUT_HLINK, "get_dirlist"); + recurse = 0; + send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen); + recurse = save_recurse; + + return dirlist; +} + /* This function is used to implement per-directory deletion, and * is used by all the --delete-WHEN options. Note that the fbuf --- orig/generator.c 2005-02-13 05:50:28 +++ generator.c 2005-02-13 21:47:28 @@ -47,6 +47,7 @@ extern int size_only; extern OFF_T max_size; extern int io_timeout; extern int protocol_version; +extern int fuzzy_basis; extern int always_checksum; extern char *partial_dir; extern char *basis_dir[]; @@ -227,6 +228,59 @@ static void generate_and_send_sums(int f unmap_file(mapbuf); } +/* Try to find a filename in the same dir as "fname" with a similar name. */ +static int find_fuzzy(struct file_struct *file, struct file_list *dirlist) +{ + int fname_len, fname_suf_len; + const char *fname_suf, *fname = file->basename; + uint32 lowest_dist = 0x7FFFFFFF; + int j, lowest_j = -1; + + fname_len = strlen(fname); + fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len); + + for (j = 0; j < dirlist->count; j++) { + struct file_struct *fp = dirlist->files[j]; + const char *suf, *name; + int len, suf_len; + uint32 dist; + + if (!S_ISREG(fp->mode) || !fp->length + || fp->flags & FLAG_NO_FUZZY) + continue; + + name = fp->basename; + + if (fp->length == file->length + && fp->modtime == file->modtime) { + if (verbose > 4) { + rprintf(FINFO, + "fuzzy size/modtime match for %s\n", + name); + } + return j; + } + + len = strlen(name); + suf = find_filename_suffix(name, len, &suf_len); + + dist = fuzzy_distance(name, len, fname, fname_len); + /* Add some extra weight to how well the suffixes match. */ + dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) + * 10; + if (verbose > 4) { + rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n", + name, (int)(dist>>16), (int)(dist&0xFFFF)); + } + if (dist <= lowest_dist) { + lowest_dist = dist; + lowest_j = j; + } + } + + return lowest_j; +} + /* Acts on flist->file's ndx'th item, whose name is fname. If a directory, * make sure it exists, and has the right permissions/timestamp info. For @@ -241,6 +295,8 @@ static void recv_generator(char *fname, int f_out, int f_out_name) { static int missing_below = -1; + static char *fuzzy_dirname = NULL; + static struct file_list *fuzzy_dirlist = NULL; int fd = -1, f_copy = -1; STRUCT_STAT st, partial_st; struct file_struct *back_file = NULL; @@ -275,6 +331,16 @@ static void recv_generator(char *fname, statret = -1; stat_errno = ENOENT; } else { + if (fuzzy_basis && S_ISREG(file->mode)) { + char *dn = file->dirname ? file->dirname : "."; + if (fuzzy_dirname != dn) { + if (fuzzy_dirlist) + flist_free(fuzzy_dirlist); + fuzzy_dirname = dn; + fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1); + } + } + statret = link_stat(fname, &st, keep_dirlinks && S_ISDIR(file->mode)); stat_errno = errno; @@ -492,6 +558,24 @@ static void recv_generator(char *fname, } else partialptr = NULL; + if (statret == -1 && fuzzy_basis && dry_run <= 1) { + int j = find_fuzzy(file, fuzzy_dirlist); + if (j >= 0) { + struct file_struct *fp = fuzzy_dirlist->files[j]; + f_name_to(fp, fnamecmpbuf); + if (verbose > 2) { + rprintf(FINFO, "fuzzy basis selected for %s: %s\n", + safe_fname(fname), safe_fname(fnamecmpbuf)); + } + st.st_mode = fp->mode; + st.st_size = fp->length; + st.st_mtime = fp->modtime; + statret = 0; + fnamecmp = fnamecmpbuf; + fnamecmp_type = FNAMECMP_FUZZY; + } + } + if (statret == -1) { if (preserve_hard_links && hard_link_check(file, HL_SKIP)) return; @@ -520,6 +604,8 @@ static void recv_generator(char *fname, if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH) ; + else if (fnamecmp_type == FNAMECMP_FUZZY) + ; else if (unchanged_file(fnamecmp, file, &st)) { if (fnamecmp_type == FNAMECMP_FNAME) set_perms(fname, file, &st, PERMS_REPORT); @@ -540,6 +626,11 @@ prepare_to_open: statret = -1; goto notify_others; } + if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) { + int j = flist_find(fuzzy_dirlist, file); + if (j >= 0) /* don't use updating file as future fuzzy basis */ + fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY; + } /* open the file */ fd = do_open(fnamecmp, O_RDONLY, 0); @@ -594,8 +685,24 @@ notify_others: write_int(f_out, ndx); if (protocol_version >= 29 && inplace && !read_batch) write_byte(f_out, fnamecmp_type); - if (f_out_name >= 0) + if (f_out_name >= 0) { write_byte(f_out_name, fnamecmp_type); + if (fnamecmp_type == FNAMECMP_FUZZY) { + uchar lenbuf[3], *lb = lenbuf; + int len = strlen(fnamecmpbuf); + if (len > 0x7F) { +#if MAXPATHLEN > 0x7FFF + *lb++ = len / 0x10000 + 0x80; + *lb++ = len / 0x100; +#else + *lb++ = len / 0x100 + 0x80; +#endif + } + *lb = len; + write_buf(f_out_name, lenbuf, lb - lenbuf + 1); + write_buf(f_out_name, fnamecmpbuf, len); + } + } if (dry_run || read_batch) return; --- orig/main.c 2005-02-07 20:41:56 +++ main.c 2005-01-14 18:33:15 @@ -44,6 +44,7 @@ extern int keep_dirlinks; extern int preserve_hard_links; extern int protocol_version; extern int recurse; +extern int fuzzy_basis; extern int relative_paths; extern int rsync_port; extern int whole_file; @@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st int pid; int status = 0; int error_pipe[2], name_pipe[2]; - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run; + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis) + && !dry_run; /* The receiving side mustn't obey this, or an existing symlink that * points to an identical file won't be replaced by the referent. */ --- orig/options.c 2005-02-13 05:50:28 +++ options.c 2005-02-13 21:41:41 @@ -89,6 +89,7 @@ int copy_unsafe_links = 0; int size_only = 0; int daemon_bwlimit = 0; int bwlimit = 0; +int fuzzy_basis = 0; size_t bwlimit_writemax = 0; int only_existing = 0; int opt_ignore_existing = 0; @@ -302,6 +303,7 @@ void usage(enum logcode F) rprintf(F," --size-only skip files that match in size\n"); rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n"); rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); + rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n"); rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); @@ -411,6 +413,7 @@ static struct poptOption long_options[] {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, /* TODO: Should this take an optional int giving the compression level? */ {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, @@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg if (!implied_dirs && !am_sender) args[ac++] = "--no-implied-dirs"; + if (fuzzy_basis && am_sender) + args[ac++] = "--fuzzy"; + *argc = ac; return; --- orig/receiver.c 2005-02-11 10:53:14 +++ receiver.c 2005-01-15 21:21:02 @@ -257,6 +257,27 @@ static int receive_data(int f_in, char * } +static void read_gen_name(int fd, char *buf) +{ + int len = read_byte(fd); + if (len & 0x80) { +#if MAXPATHLEN > 32767 + uchar lenbuf[2]; + read_buf(fd, (char *)lenbuf, 2); + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; +#else + len = (len & ~0x80) * 0x100 + read_byte(fd); +#endif + } + if (len >= MAXPATHLEN) { + rprintf(FERROR, "bogus data on generator name pipe\n"); + exit_cleanup(RERR_PROTOCOL); + } + + read_sbuf(fd, buf, len); +} + + static void discard_receive_data(int f_in, OFF_T length) { receive_data(f_in, NULL, -1, 0, NULL, -1, length); @@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis case FNAMECMP_BACKUP: fnamecmp = get_backup_name(fname); break; + case FNAMECMP_FUZZY: + read_gen_name(f_in_name, fnamecmpbuf); + fnamecmp = fnamecmpbuf; + break; default: if (j >= basis_dir_cnt) { rprintf(FERROR, --- orig/rsync.h 2005-02-12 19:54:27 +++ rsync.h 2005-02-13 21:19:16 @@ -60,6 +60,7 @@ #define FLAG_TOP_DIR (1<<0) #define FLAG_HLINK_EOL (1<<1) /* generator only */ #define FLAG_MOUNT_POINT (1<<2) /* sender only */ +#define FLAG_NO_FUZZY (1<<2) /* generator only */ #define FLAG_DEL_HERE (1<<3) /* receiver/generator */ /* update this if you make incompatible changes */ @@ -127,6 +128,7 @@ #define FNAMECMP_FNAME 0x80 #define FNAMECMP_PARTIAL_DIR 0x81 #define FNAMECMP_BACKUP 0x82 +#define FNAMECMP_FUZZY 0x83 /* For calling delete_file() */ #define DEL_DIR (1<<0) --- orig/rsync.yo 2005-02-13 21:51:10 +++ rsync.yo 2005-02-13 21:41:52 @@ -351,6 +351,7 @@ to the detailed description below for a --size-only skip files that match in size --modify-window=NUM compare mod-times with reduced accuracy -T, --temp-dir=DIR create temporary files in directory DIR + -y, --fuzzy find similar file for basis if no dest file --compare-dest=DIR also compare received files relative to DIR --copy-dest=DIR ... and include copies of unchanged files --link-dest=DIR hardlink to files in DIR when unchanged @@ -909,6 +910,16 @@ scratch directory when creating temporar transferred on the receiving side. The default behavior is to create the temporary files in the receiving directory. +dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a +basis file for any destination file that is missing. The current algorithm +looks in the same directory as the destination file for either a file that +has an identical size and modified-time, or a similarly-named file. If +found, rsync uses the fuzzy basis file to try to speed up the transfer. + +Note that the use of the bf(--delete) option might get rid of any potential +fuzzy-match files, so either use bf(--delete-after) or specify some +filename exclusions if you need to prevent this. + dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on the destination machine as an additional hierarchy to compare destination files against doing transfers (if the files are missing in the destination --- orig/util.c 2005-02-11 10:53:15 +++ util.c 2005-02-13 09:44:25 @@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned return malloc(size * num); return realloc(ptr, size * num); } + +/* Take a filename and filename length and return the most significant + * filename suffix we can find. This ignores suffixes such as "~", + * ".bak", ".orig", ".~1~", etc. */ +const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr) +{ + const char *suf, *s; + BOOL had_tilde; + int s_len; + + /* One or more dots at the start aren't a suffix. */ + while (fn_len && *fn == '.') fn++, fn_len--; + + /* Ignore the ~ in a "foo~" filename. */ + if (fn_len > 1 && fn[fn_len-1] == '~') + fn_len--, had_tilde = True; + else + had_tilde = False; + + /* Assume we don't find an suffix. */ + suf = ""; + *len_ptr = 0; + + /* Find the last significant suffix. */ + for (s = fn + fn_len; fn_len > 1; ) { + while (*--s != '.' && s != fn) {} + if (s == fn) + break; + s_len = fn_len - (s - fn); + fn_len = s - fn; + if (s_len == 3) { + if (strcmp(s+1, "bak") == 0 + || strcmp(s+1, "old") == 0) + continue; + } else if (s_len == 4) { + if (strcmp(s+1, "orig") == 0) + continue; + } else if (s_len > 2 && had_tilde + && s[1] == '~' && isdigit(s[2])) + continue; + *len_ptr = s_len; + suf = s; + if (s_len == 1) + break; + /* Determine if the suffix is all digits. */ + for (s++, s_len--; s_len > 0; s++, s_len--) { + if (!isdigit(*s)) + return suf; + } + /* An all-digit suffix may not be that signficant. */ + s = suf; + } + + return suf; +} + +/* This is an implementation of the Levenshtein distance algorithm. It + * was implemented to avoid needing a two-dimensional matrix (to save + * memory). It was also tweaked to try to factor in the ASCII distance + * between changed characters as a minor distance quantity. The normal + * Levenshtein units of distance (each signifying a single change between + * the two strings) are defined as a "UNIT". */ + +#define UNIT (1 << 16) + +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2) +{ + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc; + int32 cost; + int i1, i2; + + if (!len1 || !len2) { + if (!len1) { + s1 = s2; + len1 = len2; + } + for (i1 = 0, cost = 0; i1 < len1; i1++) + cost += s1[i1]; + return (int32)len1 * UNIT + cost; + } + + for (i2 = 0; i2 < len2; i2++) + a[i2] = (i2+1) * UNIT; + + for (i1 = 0; i1 < len1; i1++) { + diag = i1 * UNIT; + above = (i1+1) * UNIT; + for (i2 = 0; i2 < len2; i2++) { + left = a[i2]; + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) { + if (cost < 0) + cost = UNIT - cost; + else + cost = UNIT + cost; + } + diag_inc = diag + cost; + left_inc = left + UNIT + *((uchar*)s1+i1); + above_inc = above + UNIT + *((uchar*)s2+i2); + a[i2] = above = left < above + ? (left_inc < diag_inc ? left_inc : diag_inc) + : (above_inc < diag_inc ? above_inc : diag_inc); + diag = left; + } + } + + return a[len2-1]; +}