The changes to generator.c were greatly simplified, making the patch easier to maintain and fixing the failing test in the testsuite. Lightly tested. Be sure to run "make proto" before "make". --- orig/generator.c 2005-01-01 21:11:00 +++ generator.c 2004-11-27 18:12:57 @@ -44,6 +44,7 @@ extern int size_only; extern OFF_T max_size; extern int io_timeout; extern int protocol_version; +extern int fuzzy_basis; extern int always_checksum; extern char *partial_dir; extern char *basis_dir[]; @@ -239,6 +240,92 @@ static void generate_and_send_sums(int f } +static void split_names(char *fname, char **dirname, char **basename) +{ + char *slash = strrchr(fname, '/'); + if (slash) { + *dirname = fname; + *slash = '\0'; + *basename = slash+1; + } else { + *basename = fname; + *dirname = "."; + } +} + + +static unsigned int measure_name(const char *name, const char *basename, + const char *ext) +{ + int namelen = strlen(name); + int extlen = strlen(ext); + unsigned int score = 0; + + /* Extensions must match */ + if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0) + return 0; + + /* Now score depends on similarity of prefix */ + for (; *name == *basename && *name; name++, basename++) + score++; + return score; +} + + +static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr) +{ + DIR *d; + struct dirent *di; + char *basename, *dirname; + char mangled_name[MAXPATHLEN]; + char bestname[MAXPATHLEN]; + unsigned int bestscore = 0; + const char *ext; + + strlcpy(mangled_name, fname, sizeof mangled_name); + + split_names(mangled_name, &dirname, &basename); + if (!(d = opendir(dirname))) { + rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname); + return -1; + } + + /* Get final extension, eg. .gz; never full basename though. */ + if (!(ext = strrchr(basename + 1, '.'))) + ext = basename + strlen(basename); /* ext = "" */ + + while ((di = readdir(d)) != NULL) { + const char *dname = d_name(di); + unsigned int score; + + if (dname[0] == '.' 
&& (dname[1] == '\0' + || (dname[1] == '.' && dname[2] == '\0'))) + continue; + + score = measure_name(dname, basename, ext); + if (verbose > 4) { + rprintf(FINFO, "[%s] fuzzy score for %s = %u\n", + who_am_i(), dname, score); + } + if (score > bestscore) { + strlcpy(bestname, dname, sizeof bestname); + bestscore = score; + } + } + closedir(d); + + /* Found a candidate. */ + if (bestscore != 0) { + pathjoin(buf, MAXPATHLEN, dirname, bestname); + if (verbose > 2) { + rprintf(FINFO, "[%s] fuzzy match %s->%s\n", + who_am_i(), fname, buf); + } + return link_stat(buf, st_ptr, 0); + } + return -1; +} + /* * Acts on file number @p i from @p flist, whose name is @p fname. @@ -493,6 +580,15 @@ static void recv_generator(char *fname, } else partialptr = NULL; + if (statret == -1 && fuzzy_basis) { + if (find_fuzzy(fname, fnamecmpbuf, &st) == 0 + && S_ISREG(st.st_mode)) { + statret = 0; + fnamecmp = fnamecmpbuf; + fnamecmp_type = FNAMECMP_FUZZY; + } + } + if (statret == -1) { if (preserve_hard_links && hard_link_check(file, HL_SKIP)) return; @@ -521,6 +617,8 @@ static void recv_generator(char *fname, if ((link_dest || copy_dest) && fnamecmp_type != FNAMECMP_FNAME) ; + else if (fnamecmp_type == FNAMECMP_FUZZY) + ; else if (unchanged_file(fnamecmp, file, &st)) { if (fnamecmp_type == FNAMECMP_FNAME) set_perms(fname, file, &st, PERMS_REPORT); @@ -592,8 +690,24 @@ prepare_to_open: notify_others: write_int(f_out, i); - if (f_out_name >= 0) + if (f_out_name >= 0) { write_byte(f_out_name, fnamecmp_type); + if (fnamecmp_type == FNAMECMP_FUZZY) { + uchar lenbuf[3], *lb = lenbuf; + int len = strlen(fnamecmpbuf); + if (len > 0x7F) { +#if MAXPATHLEN > 0x7FFF + *lb++ = len / 0x10000 + 0x80; + *lb++ = len / 0x100; +#else + *lb++ = len / 0x100 + 0x80; +#endif + } + *lb = len; + write_buf(f_out_name, lenbuf, lb - lenbuf + 1); + write_buf(f_out_name, fnamecmpbuf, len); + } + } if (dry_run || read_batch) return; --- orig/main.c 2005-01-01 21:11:00 +++ main.c 2004-11-27 18:13:51 @@ -48,6 +48,7 @@ 
extern int keep_dirlinks; extern int preserve_hard_links; extern int protocol_version; extern int recurse; +extern int fuzzy_basis; extern int relative_paths; extern int rsync_port; extern int whole_file; @@ -463,7 +464,7 @@ static int do_recv(int f_in,int f_out,st int pid; int status = 0; int error_pipe[2], name_pipe[2]; - BOOL need_name_pipe = basis_dir[0] && !dry_run; + BOOL need_name_pipe = (basis_dir[0] || fuzzy_basis) && !dry_run; /* The receiving side mustn't obey this, or an existing symlink that * points to an identical file won't be replaced by the referent. */ --- orig/options.c 2005-01-01 21:11:00 +++ options.c 2004-11-29 01:36:48 @@ -86,6 +86,7 @@ int copy_unsafe_links = 0; int size_only = 0; int daemon_bwlimit = 0; int bwlimit = 0; +int fuzzy_basis = 0; size_t bwlimit_writemax = 0; int delete_after = 0; int only_existing = 0; @@ -288,6 +289,7 @@ void usage(enum logcode F) rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); + rprintf(F," --fuzzy use similar file as basis if basis doesn't exist\n"); rprintf(F," -P equivalent to --partial --progress\n"); rprintf(F," -z, --compress compress file data\n"); rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n"); @@ -384,6 +386,7 @@ static struct poptOption long_options[] {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, /* TODO: Should this take an optional int giving the compression level? */ {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, @@ -958,10 +961,10 @@ int parse_arguments(int *argc, const cha am_server ? 
"server" : "client"); return 0; #endif - if (compare_dest || copy_dest || link_dest) { + if (dest_option || fuzzy_basis) { snprintf(err_buf, sizeof err_buf, "--inplace does not yet work with %s\n", - dest_option); + dest_option ? dest_option : "--fuzzy"); return 0; } } else { @@ -1240,6 +1243,9 @@ void server_options(char **args,int *arg args[ac++] = "--no-relative"; } + if (fuzzy_basis && am_sender) + args[ac++] = "--fuzzy"; + *argc = ac; return; --- orig/receiver.c 2005-01-01 21:11:00 +++ receiver.c 2004-11-27 18:15:01 @@ -323,6 +323,27 @@ static int receive_data(int f_in, char * } +static void read_gen_name(int fd, char *buf) +{ + int len = read_byte(fd); + if (len & 0x80) { +#if MAXPATHLEN > 32767 + uchar lenbuf[2]; + read_buf(fd, (char *)lenbuf, 2); + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; +#else + len = (len & ~0x80) * 0x100 + read_byte(fd); +#endif + } + if (len >= MAXPATHLEN) { + rprintf(FERROR, "bogus data on generator name pipe\n"); + exit_cleanup(RERR_PROTOCOL); + } + + read_sbuf(fd, buf, len); +} + + static void discard_receive_data(int f_in, OFF_T length) { receive_data(f_in, NULL, -1, 0, NULL, -1, length); @@ -453,6 +474,10 @@ int recv_files(int f_in, struct file_lis case FNAMECMP_BACKUP: fnamecmp = get_backup_name(fname); break; + case FNAMECMP_FUZZY: + read_gen_name(f_in_name, fnamecmpbuf); + fnamecmp = fnamecmpbuf; + break; case FNAMECMP_BASIS_DIR: default: pathjoin(fnamecmpbuf, sizeof fnamecmpbuf, --- orig/rsync.h 2005-01-01 21:11:01 +++ rsync.h 2004-11-03 22:53:09 @@ -125,6 +125,7 @@ #define FNAMECMP_FNAME 0x80 #define FNAMECMP_PARTIAL_DIR 0x81 #define FNAMECMP_BACKUP 0x82 +#define FNAMECMP_FUZZY 0x83 /* Log-message categories. FLOG is only used on the daemon side to --- orig/rsync.yo 2005-01-01 21:11:01 +++ rsync.yo 2004-11-27 18:15:22 @@ -358,6 +358,7 @@ verb( --compare-dest=DIR also compare received files relative to DIR --copy-dest=DIR ... 
and include copies of unchanged files --link-dest=DIR hardlink to files in DIR when unchanged + --fuzzy use similar file as basis if basis doesn't exist -P equivalent to --partial --progress -z, --compress compress file data -C, --cvs-exclude auto ignore files in the same way CVS does