Wayne Davison wrote:
I greatly simplified the changes to generator.c, making the patch easier to maintain and fixing the failing test in the testsuite.
Very lightly tested.

--- generator.c	30 Jun 2004 07:27:30 -0000	1.93
+++ generator.c	30 Jun 2004 07:43:46 -0000
@@ -41,6 +41,7 @@ extern int ignore_times;
 extern int size_only;
 extern int io_timeout;
 extern int protocol_version;
+extern int fuzzy;
 extern int always_checksum;
 extern char *compare_dest;
 extern int link_dest;
@@ -257,6 +258,94 @@ static void generate_and_send_sums(struc
 }
 
 
+static void split_names(char *fname, char **dirname, char **basename)
+{
+	char *slash = strrchr(fname, '/');
+	if (slash) {
+		*dirname = fname;
+		*slash = '\0';
+		*basename = slash+1;
+	} else {
+		*basename = fname;
+		*dirname = ".";
+	}
+}
+
+
+static unsigned int measure_name(const char *name, const char *basename,
+				 const char *ext)
+{
+	int namelen = strlen(name);
+	int extlen = strlen(ext);
+	unsigned int score = 0;
+
+	/* Extensions must match */
+	if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0)
+		return 0;
+
+	/* Now score depends on similarity of prefix */
+	for (; *name == *basename && *name; name++, basename++)
+		score++;
+	return score;
+}
+
+
+static int find_fuzzy(char **fname_ptr, char *buf, STRUCT_STAT *st_ptr)
+{
+	DIR *d;
+	struct dirent *di;
+	char *basename, *dirname;
+	char mangled_name[MAXPATHLEN];
+	char bestname[MAXPATHLEN];
+	unsigned int bestscore = 0;
+	const char *ext;
+
+	strlcpy(mangled_name, *fname_ptr, sizeof mangled_name);
+
+	split_names(mangled_name, &dirname, &basename);
+	if (!(d = opendir(dirname))) {
+		rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
+		return -1;
+	}
+
+	/* Get final extension, eg. .gz; never full basename though. */
+	ext = strrchr(basename + 1, '.');
+	if (!ext)
+		ext = basename + strlen(basename); /* ext = "" */
+
+	while ((di = readdir(d)) != NULL) {
+		const char *dname = d_name(di);
+		unsigned int score;
+
+		if (dname[0] == '.' && (dname[1] == '\0'
+		    || (dname[1] == '.' && dname[2] == '\0')))
+			continue;
+
+		score = measure_name(dname, basename, ext);
+		if (verbose > 4) {
+			rprintf(FINFO, "[%s] fuzzy score for %s = %u\n",
+				who_am_i(), dname, score);
+		}
+		if (score > bestscore) {
+			strlcpy(bestname, dname, sizeof bestname);
+			bestscore = score;
+		}
+	}
+	closedir(d);
+
+	/* Found a candidate. */
+	if (bestscore != 0) {
+		pathjoin(buf, MAXPATHLEN, dirname, bestname);
+		if (verbose > 2) {
+			rprintf(FINFO, "[%s] fuzzy match %s->%s\n",
+				who_am_i(), *fname_ptr, buf);
+		}
+		*fname_ptr = buf;
+		return link_stat(buf, st_ptr, 0);
+	}
+	return -1;
+}
+
 
 /*
  * Acts on file number @p i from @p flist, whose name is @p fname.
@@ -267,12 +356,12 @@ static void generate_and_send_sums(struc
  * out. It might be wrong.
  */
 static void recv_generator(char *fname, struct file_struct *file, int i,
-			   int f_out)
+			   int f_out, int f_nameout)
 {
 	int fd;
 	STRUCT_STAT st;
 	struct map_struct *mapbuf;
-	int statret;
+	int statret, fuzzy_file = 0;
 	char *fnamecmp;
 	char fnamecmpbuf[MAXPATHLEN];
 
@@ -431,8 +520,10 @@ static void recv_generator(char *fname,
 		statret = link_stat(fnamecmpbuf, &st, 0);
 		if (!S_ISREG(st.st_mode))
 			statret = -1;
-		if (statret == -1)
+		if (statret < 0) {
 			errno = saveerrno;
+			*fnamecmpbuf = '\0';
+		}
 #if HAVE_LINK
 		else if (link_dest && !dry_run) {
 			if (do_link(fnamecmpbuf, fname) != 0) {
@@ -440,18 +531,30 @@ static void recv_generator(char *fname,
 					rsyserr(FINFO, errno, "link %s => %s",
 						fnamecmpbuf, fname);
 				}
-			}
-			fnamecmp = fnamecmpbuf;
+				fnamecmp = fnamecmpbuf;
+			} else
+				*fnamecmpbuf = '\0';
 		}
 #endif
 		else
 			fnamecmp = fnamecmpbuf;
+	} else
+		*fnamecmpbuf = '\0';
+
+	if (statret == -1 && fuzzy) {
+		statret = find_fuzzy(&fnamecmp, fnamecmpbuf, &st);
+		if (!S_ISREG(st.st_mode))
+			statret = -1;
+		else
+			fuzzy_file = 1;
 	}
 
 	if (statret == -1) {
 		if (preserve_hard_links && hard_link_check(file, HL_SKIP))
 			return;
 		if (errno == ENOENT) {
+			if (f_nameout >= 0)
+				write(f_nameout, "", 1);
 			write_int(f_out,i);
 			if (!dry_run)
 				write_sum_head(f_out, NULL);
@@ -471,37 +574,43 @@ static void recv_generator(char *fname,
 		/* now pretend the file didn't exist */
 		if (preserve_hard_links && hard_link_check(file, HL_SKIP))
 			return;
+		if (f_nameout >= 0)
+			write(f_nameout, "", 1);
 		write_int(f_out,i);
 		if (!dry_run)
 			write_sum_head(f_out, NULL);
 		return;
 	}
 
-	if (opt_ignore_existing && fnamecmp == fname) {
+	if (opt_ignore_existing && !*fnamecmpbuf) {
 		if (verbose > 1)
 			rprintf(FINFO,"%s exists\n",fname);
 		return;
 	}
 
-	if (update_only && fnamecmp == fname
+	if (update_only && !*fnamecmpbuf
 	    && cmp_modtime(st.st_mtime, file->modtime) > 0) {
 		if (verbose > 1)
 			rprintf(FINFO,"%s is newer\n",fname);
 		return;
 	}
 
-	if (skip_file(fname, file, &st)) {
-		if (fnamecmp == fname)
+	if (!fuzzy_file && skip_file(fname, file, &st)) {
+		if (!*fnamecmpbuf)
 			set_perms(fname, file, &st, PERMS_REPORT);
 		return;
 	}
 
 	if (dry_run) {
+		if (f_nameout >= 0)
+			write(f_nameout, "", 1);
 		write_int(f_out,i);
 		return;
 	}
 
 	if (disable_deltas_p()) {
+		if (f_nameout >= 0)
+			write(f_nameout, "", 1);
 		write_int(f_out,i);
 		write_sum_head(f_out, NULL);
 		return;
@@ -516,6 +625,8 @@ static void recv_generator(char *fname,
 		/* pretend the file didn't exist */
 		if (preserve_hard_links && hard_link_check(file, HL_SKIP))
 			return;
+		if (f_nameout >= 0)
+			write(f_nameout, "", 1);
 		write_int(f_out,i);
 		write_sum_head(f_out, NULL);
 		return;
@@ -534,6 +645,8 @@ static void recv_generator(char *fname,
 	if (verbose > 2)
 		rprintf(FINFO, "generating and sending sums for %d\n", i);
 
+	if (f_nameout >= 0)
+		write(f_nameout, fnamecmpbuf, strlen(fnamecmpbuf) + 1);
 	write_int(f_out,i);
 	generate_and_send_sums(mapbuf, st.st_size, f_out);
 
@@ -543,7 +656,8 @@ static void recv_generator(char *fname,
 }
 
 
-void generate_files(int f, struct file_list *flist, char *local_name)
+void generate_files(int f, struct file_list *flist, char *local_name,
+		    int f_nameout)
 {
 	int i;
 	int phase = 0;
@@ -584,7 +698,7 @@ void generate_files(int f, struct file_l
 		}
 
 		recv_generator(local_name ? local_name : f_name_to(file, fbuf),
-			       file, i, f);
+			       file, i, f, f_nameout);
 	}
 
 	phase++;
@@ -601,7 +715,7 @@ void generate_files(int f, struct file_l
 	while ((i = get_redo_num()) != -1) {
 		struct file_struct *file = flist->files[i];
 		recv_generator(local_name ? local_name : f_name_to(file, fbuf),
-			       file, i, f);
+			       file, i, f, f_nameout);
 	}
 
 	phase++;
@@ -620,7 +734,7 @@ void generate_files(int f, struct file_l
 		if (!file->basename || !S_ISDIR(file->mode))
 			continue;
 		recv_generator(local_name ? local_name : f_name(file),
-			       file, i, -1);
+			       file, i, -1, -1);
 	}
 
 	if (verbose > 2)
--- main.c	30 Jun 2004 07:27:30 -0000	1.202
+++ main.c	30 Jun 2004 07:43:47 -0000
@@ -429,7 +429,7 @@ static int do_recv(int f_in,int f_out,st
 {
 	int pid;
 	int status = 0;
-	int error_pipe[2];
+	int error_pipe[2], name_pipe[2];
 
 	if (preserve_hard_links)
 		init_hard_links(flist);
@@ -441,8 +441,8 @@ static int do_recv(int f_in,int f_out,st
 		}
 	}
 
-	if (fd_pair(error_pipe) < 0) {
-		rprintf(FERROR,"error pipe failed in do_recv\n");
+	if (fd_pair(error_pipe) < 0 || fd_pair(name_pipe) < 0) {
+		rprintf(FERROR, "fd_pair() failed in do_recv\n");
 		exit_cleanup(RERR_SOCKETIO);
 	}
 
@@ -450,8 +450,10 @@ static int do_recv(int f_in,int f_out,st
 
 	if ((pid = do_fork()) == 0) {
 		close(error_pipe[0]);
+		close(name_pipe[1]);
 		if (f_in != f_out)
 			close(f_out);
+		set_blocking(name_pipe[0]);
 
 		/* we can't let two processes write to the socket at one time */
 		io_multiplexing_close();
@@ -459,7 +461,7 @@ static int do_recv(int f_in,int f_out,st
 		/* set place to send errors */
 		set_msg_fd_out(error_pipe[1]);
 
-		recv_files(f_in,flist,local_name);
+		recv_files(f_in, flist, local_name, name_pipe[0]);
 		io_flush(FULL_FLUSH);
 		report(f_in);
 
@@ -475,14 +477,16 @@ static int do_recv(int f_in,int f_out,st
 	am_generator = 1;
 
 	close(error_pipe[1]);
+	close(name_pipe[0]);
 	if (f_in != f_out)
 		close(f_in);
+	set_blocking(name_pipe[1]);
 
 	io_start_buffering_out(f_out);
 
 	set_msg_fd_in(error_pipe[0]);
 
-	generate_files(f_out, flist, local_name);
+	generate_files(f_out, flist, local_name, name_pipe[1]);
 
 	get_redo_num(); /* Read final MSG_DONE and any prior messages. */
 	report(-1);
--- options.c	20 Jun 2004 19:47:05 -0000	1.157
+++ options.c	30 Jun 2004 07:43:47 -0000
@@ -94,6 +94,7 @@ int ignore_errors = 0;
 int modify_window = 0;
 int blocking_io = -1;
 int checksum_seed = 0;
+int fuzzy = 0;
 unsigned int block_size = 0;
 
 
@@ -270,6 +271,7 @@ void usage(enum logcode F)
 	rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
 	rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
 	rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
+	rprintf(F," --fuzzy use similar file as basis if basis doesn't exist\n");
 	rprintf(F," -P equivalent to --partial --progress\n");
 	rprintf(F," -z, --compress compress file data\n");
 	rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
@@ -368,6 +370,7 @@ static struct poptOption long_options[]
 	{"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
 	{"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
 	{"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
+	{"fuzzy", 0, POPT_ARG_NONE, &fuzzy, 0, 0, 0 },
 	/* TODO: Should this take an optional int giving the compression level? */
 	{"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
 	{"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
@@ -989,6 +992,9 @@ void server_options(char **args,int *arg
 		}
 	}
 
+	if (fuzzy && am_sender)
+		args[ac++] = "--fuzzy";
+
 	*argc = ac;
 	return;
 
--- receiver.c	30 Jun 2004 07:27:30 -0000	1.84
+++ receiver.c	30 Jun 2004 07:43:47 -0000
@@ -36,7 +36,6 @@ extern int preserve_perms;
 extern int cvs_exclude;
 extern int io_error;
 extern char *tmpdir;
-extern char *compare_dest;
 extern int make_backups;
 extern int do_progress;
 extern char *backup_dir;
@@ -293,14 +292,15 @@ static int receive_data(int f_in,struct
  * main routine for receiver process.
  *
  * Receiver process runs on the same host as the generator process. */
-int recv_files(int f_in,struct file_list *flist,char *local_name)
+int recv_files(int f_in, struct file_list *flist, char *local_name,
+	       int f_name)
 {
 	int fd1,fd2;
 	STRUCT_STAT st;
 	char *fname, fbuf[MAXPATHLEN];
 	char template[MAXPATHLEN];
 	char fnametmp[MAXPATHLEN];
-	char *fnamecmp;
+	char *fnamecmp, *cp;
 	char fnamecmpbuf[MAXPATHLEN];
 	struct map_struct *mapbuf;
 	struct file_struct *file;
@@ -364,19 +364,19 @@ int recv_files(int f_in,struct file_list
 		if (verbose > 2)
 			rprintf(FINFO,"recv_files(%s)\n",fname);
 
-		fnamecmp = fname;
+		for (cp = fnamecmpbuf; ; cp++) {
+			if (read(f_name, cp, 1) <= 0) {
+				rsyserr(FERROR, errno, "fname-pipe read failed");
+				exit_cleanup(RERR_PROTOCOL);
+			}
+			if (!*cp)
+				break;
+		}
+		fnamecmp = *fnamecmpbuf ? fnamecmpbuf : fname;
 
 		/* open the file */
 		fd1 = do_open(fnamecmp, O_RDONLY, 0);
 
-		if (fd1 == -1 && compare_dest != NULL) {
-			/* try the file at compare_dest instead */
-			pathjoin(fnamecmpbuf, sizeof fnamecmpbuf,
-				 compare_dest, fname);
-			fnamecmp = fnamecmpbuf;
-			fd1 = do_open(fnamecmp, O_RDONLY, 0);
-		}
-
 		if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
 			rsyserr(FERROR, errno, "fstat %s failed",
 				full_fname(fnamecmp));
@@ -385,7 +385,7 @@ int recv_files(int f_in,struct file_list
 			continue;
 		}
 
-		if (fd1 != -1 && S_ISDIR(st.st_mode) && fnamecmp == fname) {
+		if (fd1 != -1 && S_ISDIR(st.st_mode) && !*fnamecmpbuf) {
 			/* this special handling for directories
 			 * wouldn't be necessary if robust_rename()
 			 * and the underlying robust_unlink could cope
--- rsync.yo	5 Jun 2004 16:16:30 -0000	1.171
+++ rsync.yo	30 Jun 2004 07:43:48 -0000
@@ -325,6 +325,7 @@ verb(
  -T --temp-dir=DIR create temporary files in directory DIR
  --compare-dest=DIR also compare received files relative to DIR
  --link-dest=DIR create hardlinks to DIR for unchanged files
+ --fuzzy use similar file as basis if basis is gone
  -P equivalent to --partial --progress
  -z, --compress compress file data
  -C, --cvs-exclude auto ignore files in the same way CVS does