The changes to generator.c were greatly simplified, making the patch easier to maintain and fixing the failing test in the testsuite. Lightly tested. Be sure to run "make proto" before "make". --- orig/generator.c 2005-01-25 12:14:14 +++ generator.c 2005-01-19 18:39:15 @@ -47,6 +47,7 @@ extern int size_only; extern OFF_T max_size; extern int io_timeout; extern int protocol_version; +extern int fuzzy_basis; extern int always_checksum; extern char *partial_dir; extern char *basis_dir[]; @@ -227,6 +228,88 @@ static void generate_and_send_sums(int f unmap_file(mapbuf); } +/* Try to find a filename in the same dir as "fname" with a similar name. + * + * TODO: + * - We should be using a cache of names for the current dir, not + * re-reading the destination directory for every file. + * - We must not return an rsync tempfile from the current transfer. + * - If the highest-rated name is not a normal file, we should fall- + * back to the next highest-rated file. + * - We must not return a destination file that is being updated + * during the current transfer, even if we already processed it + * (since the receiver may not be done with it yet). + * - We must weed out any names that a daemon's config has excluded. + */ +static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr) +{ + DIR *d; + struct dirent *di; + char *basename, *dirname, *slash; + char bestname[MAXPATHLEN]; + int suf_len, basename_len; + uint32 lowest_dist = 0x7FFFFFFF; + const char *suf; + + strlcpy(buf, fname, MAXPATHLEN); + if ((slash = strrchr(buf, '/')) != NULL) { + dirname = buf; + *slash = '\0'; + basename = slash + 1; + } else { + basename = buf; + dirname = "."; + } + basename_len = strlen(basename); + + if (!(d = opendir(dirname))) { + rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname); + return -1; + } + if (slash) + *slash = '/'; + + suf = find_filename_suffix(basename, basename_len, &suf_len); + + bestname[0] = '\0'; + while ((di = readdir(d)) != NULL) { + const char *dname_suf, *dname = d_name(di); + uint32 dist; + int dname_len, dname_suf_len; + + if (dname[0] == '.' && (dname[1] == '\0' + || (dname[1] == '.' && dname[2] == '\0'))) + continue; + + dname_len = strlen(dname); + dname_suf = find_filename_suffix(dname, dname_len, &dname_suf_len); + + dist = fuzzy_distance(dname, dname_len, basename, basename_len); + /* Add some extra weight to how well the suffixes match. */ + dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10; + if (verbose > 4) { + rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n", + dname, (int)(dist>>16), (int)(dist&0xFFFF)); + } + if (dist <= lowest_dist) { + strlcpy(bestname, dname, sizeof bestname); + lowest_dist = dist; + } + } + closedir(d); + + /* Found a candidate. */ + if (bestname[0] != '\0') { + strlcpy(basename, bestname, MAXPATHLEN - (basename - buf)); + if (verbose > 2) { + rprintf(FINFO, "fuzzy match %s->%s\n", + safe_fname(fname), buf); + } + return link_stat(buf, st_ptr, 0); + } + return -1; +} + /* * Acts on file number @p i from @p flist, whose name is @p fname. @@ -479,6 +562,15 @@ static void recv_generator(char *fname, } else partialptr = NULL; + if (statret == -1 && fuzzy_basis) { + if (find_fuzzy(fname, fnamecmpbuf, &st) == 0 + && S_ISREG(st.st_mode)) { + statret = 0; + fnamecmp = fnamecmpbuf; + fnamecmp_type = FNAMECMP_FUZZY; + } + } + if (statret == -1) { if (preserve_hard_links && hard_link_check(file, HL_SKIP)) return; @@ -507,6 +599,8 @@ static void recv_generator(char *fname, if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH) ; + else if (fnamecmp_type == FNAMECMP_FUZZY) + ; else if (unchanged_file(fnamecmp, file, &st)) { if (fnamecmp_type == FNAMECMP_FNAME) set_perms(fname, file, &st, PERMS_REPORT); @@ -581,8 +675,24 @@ notify_others: write_int(f_out, i); if (protocol_version >= 29 && inplace && !read_batch) write_byte(f_out, fnamecmp_type); - if (f_out_name >= 0) + if (f_out_name >= 0) { write_byte(f_out_name, fnamecmp_type); + if (fnamecmp_type == FNAMECMP_FUZZY) { + uchar lenbuf[3], *lb = lenbuf; + int len = strlen(fnamecmpbuf); + if (len > 0x7F) { +#if MAXPATHLEN > 0x7FFF + *lb++ = len / 0x10000 + 0x80; + *lb++ = len / 0x100; +#else + *lb++ = len / 0x100 + 0x80; +#endif + } + *lb = len; + write_buf(f_out_name, lenbuf, lb - lenbuf + 1); + write_buf(f_out_name, fnamecmpbuf, len); + } + } if (dry_run || read_batch) return; --- orig/main.c 2005-01-28 19:08:20 +++ main.c 2005-01-14 18:33:15 @@ -49,6 +49,7 @@ extern int keep_dirlinks; extern int preserve_hard_links; extern int protocol_version; extern int recurse; +extern int fuzzy_basis; extern int relative_paths; extern int rsync_port; extern int whole_file; @@ -485,7 +486,8 @@ static int do_recv(int f_in,int f_out,st int pid; int status = 0; int error_pipe[2], name_pipe[2]; - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run; + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis) + && !dry_run; /* The receiving side mustn't obey this, or an existing symlink that * points to an identical file won't be replaced by the referent. */ --- orig/options.c 2005-01-28 19:08:20 +++ options.c 2005-01-28 19:31:20 @@ -90,6 +90,7 @@ int copy_unsafe_links = 0; int size_only = 0; int daemon_bwlimit = 0; int bwlimit = 0; +int fuzzy_basis = 0; size_t bwlimit_writemax = 0; int only_existing = 0; int opt_ignore_existing = 0; @@ -303,6 +304,7 @@ void usage(enum logcode F) rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); + rprintf(F," --fuzzy find similar file for basis when no dest file\n"); rprintf(F," -z, --compress compress file data\n"); rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n"); rprintf(F," -f, --filter=RULE add a file-filtering RULE\n"); @@ -408,6 +410,7 @@ static struct poptOption long_options[] {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, /* TODO: Should this take an optional int giving the compression level? */ {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, @@ -1315,6 +1318,9 @@ void server_options(char **args,int *arg if (!implied_dirs && !am_sender) args[ac++] = "--no-implied-dirs"; + if (fuzzy_basis && am_sender) + args[ac++] = "--fuzzy"; + *argc = ac; return; --- orig/receiver.c 2005-01-28 19:08:20 +++ receiver.c 2005-01-15 21:21:02 @@ -252,6 +252,27 @@ static int receive_data(int f_in, char * } +static void read_gen_name(int fd, char *buf) +{ + int len = read_byte(fd); + if (len & 0x80) { +#if MAXPATHLEN > 32767 + uchar lenbuf[2]; + read_buf(fd, (char *)lenbuf, 2); + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; +#else + len = (len & ~0x80) * 0x100 + read_byte(fd); +#endif + } + if (len >= MAXPATHLEN) { + rprintf(FERROR, "bogus data on generator name pipe\n"); + exit_cleanup(RERR_PROTOCOL); + } + + read_sbuf(fd, buf, len); +} + + static void discard_receive_data(int f_in, OFF_T length) { receive_data(f_in, NULL, -1, 0, NULL, -1, length); @@ -390,6 +411,10 @@ int recv_files(int f_in, struct file_lis case FNAMECMP_BACKUP: fnamecmp = get_backup_name(fname); break; + case FNAMECMP_FUZZY: + read_gen_name(f_in_name, fnamecmpbuf); + fnamecmp = fnamecmpbuf; + break; default: if (j >= basis_dir_cnt) { rprintf(FERROR, --- orig/rsync.h 2005-01-28 19:08:20 +++ rsync.h 2005-01-19 18:36:47 @@ -130,6 +130,7 @@ #define FNAMECMP_FNAME 0x80 #define FNAMECMP_PARTIAL_DIR 0x81 #define FNAMECMP_BACKUP 0x82 +#define FNAMECMP_FUZZY 0x83 /* For calling delete_file() */ #define DEL_DIR (1<<0) --- orig/rsync.yo 2005-01-28 17:12:14 +++ rsync.yo 2005-01-28 19:31:36 @@ -365,6 +365,7 @@ verb( --compare-dest=DIR also compare received files relative to DIR --copy-dest=DIR ... and include copies of unchanged files --link-dest=DIR hardlink to files in DIR when unchanged + --fuzzy find similar file for basis when no dest -z, --compress compress file data -C, --cvs-exclude auto-ignore files in the same way CVS does -f, --filter=RULE add a file-filtering RULE @@ -949,6 +950,14 @@ Note that rsync versions prior to 2.6.1 (or implied by -a). You can work-around this bug by avoiding the -o option when sending to an old rsync. +dit(bf(--fuzzy)) This option tells rsync that it should look around for a +basis file for any destination file that is missing. The current algorithm +looks for a similarly-named file in the same directory as the destination +file, and, if found, uses that to try to speed up the transfer. Note that +the use of the --delete option might get rid of any potential fuzzy-match +files, so either use --delete-after or filename exclusions if you need to +prevent this. + dit(bf(-z, --compress)) With this option, rsync compresses any data from the files that it sends to the destination machine. This option is useful on slow connections. The compression method used is the --- orig/util.c 2005-01-28 19:08:20 +++ util.c 2005-01-19 17:30:51 @@ -1213,3 +1213,108 @@ void *_realloc_array(void *ptr, unsigned return malloc(size * num); return realloc(ptr, size * num); } + +/* Take a filename and filename length and return the most significant + * filename suffix we can find. This ignores suffixes such as "~", + * ".bak", ".orig", ".~1~", etc. */ +const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr) +{ + const char *suf, *s; + BOOL had_tilde; + int s_len; + + /* One or more dots at the start aren't a suffix. */ + while (fn_len && *fn == '.') fn++, fn_len--; + + /* Ignore the ~ in a "foo~" filename. */ + if (fn_len > 1 && fn[fn_len-1] == '~') + fn_len--, had_tilde = True; + else + had_tilde = False; + + /* Assume we don't find an suffix. */ + suf = ""; + *len_ptr = 0; + + /* Find the last significant suffix. */ + for (s = fn + fn_len - 1; fn_len > 1; ) { + while (*s != '.' && s != fn) s--; + if (s == fn) + break; + s_len = fn_len - (s - fn); + fn_len = s - fn; + if (s_len == 3) { + if (strcmp(s+1, "bak") == 0 + || strcmp(s+1, "old") == 0) + continue; + } else if (s_len == 4) { + if (strcmp(s+1, "orig") == 0) + continue; + } else if (s_len > 2 && had_tilde + && s[1] == '~' && isdigit(s[2])) + continue; + *len_ptr = s_len; + suf = s; + /* Determine if the suffix is all digits. */ + for (s++, s_len--; s_len > 0; s++, s_len--) { + if (!isdigit(*s)) + return suf; + } + /* An all-digit suffix may not be that signficant. */ + continue; + } + + return suf; +} + +/* This is an implementation of the Levenshtein distance algorithm. It + * was implemented to avoid needing a two-dimensional matrix (to save + * memory). It was also tweaked to try to factor in the ASCII distance + * between changed characters as a minor distance quantity. The normal + * Levenshtein units of distance (each signifying a single change between + * the two strings) are defined as a "UNIT". */ + +#define UNIT (1 << 16) + +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2) +{ + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc; + int32 cost; + int i1, i2; + + if (!len1 || !len2) { + if (!len1) { + s1 = s2; + len1 = len2; + } + for (i1 = 0, cost = 0; i1 < len1; i1++) + cost += s1[i1]; + return (int32)len1 * UNIT + cost; + } + + for (i2 = 0; i2 < len2; i2++) + a[i2] = (i2+1) * UNIT; + + for (i1 = 0; i1 < len1; i1++) { + diag = i1 * UNIT; + above = (i1+1) * UNIT; + for (i2 = 0; i2 < len2; i2++) { + left = a[i2]; + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) { + if (cost < 0) + cost = UNIT - cost; + else + cost = UNIT + cost; + } + diag_inc = diag + cost; + left_inc = left + UNIT + *((uchar*)s1+i1); + above_inc = above + UNIT + *((uchar*)s2+i2); + a[i2] = above = left < above + ? (left_inc < diag_inc ? left_inc : diag_inc) + : (above_inc < diag_inc ? above_inc : diag_inc); + diag = left; + } + } + + return a[len2-1]; +}