From 36366d5c1d0f2576da7f595079c078527b921607 Mon Sep 17 00:00:00 2001 From: Wayne Davison Date: Sun, 13 Feb 2005 10:10:50 +0000 Subject: [PATCH] - Fixed a couple bugs in find_filename_suffix(). - Upgraded the fuzzy algorithm to scan the entire parent dir for a file into a file_list object when the first file for a particular directory is handled. This avoids rescanning the directory for each missing file, and also avoids the problem of finding: a non- regular file, an active rsync temp-file, a file that is being updated by the receiver, or a server-excluded file. --- fuzzy.diff | 222 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 144 insertions(+), 78 deletions(-) diff --git a/fuzzy.diff b/fuzzy.diff index dce1225..b1bf0e4 100644 --- a/fuzzy.diff +++ b/fuzzy.diff @@ -1,11 +1,69 @@ -The changes to generator.c were greatly simplified, making the patch -easier to maintain and fixing the failing test in the testsuite. -Lightly tested. +This latest version has most of the TODO-list items solved. The one +remaining issue is that we really need to handle all the files in a dir +before we move on to the sub-directories, so this patch needs the sorting +algorithm to change to put all the subdirs at the end of the list of a +dir's contents. Be sure to run "make proto" before "make". +--- orig/flist.c 2005-02-12 19:54:27 ++++ flist.c 2005-02-13 09:49:22 +@@ -330,7 +330,7 @@ void send_file_entry(struct file_struct + char fname[MAXPATHLEN]; + int l1, l2; + +- if (f == -1) ++ if (f < 0) + return; + + if (!file) { +@@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l + struct file_struct *file; + char fbuf[MAXPATHLEN]; + +- if (!(file = make_file(fname, flist, ALL_FILTERS))) ++ file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS); ++ if (!file) + return; + + maybe_emit_filelist_progress(flist); +@@ -1311,7 +1312,7 @@ struct file_list *recv_file_list(int f) + + clean_flist(flist, relative_paths, 1); + +- if (f != -1) { ++ if (f >= 0) { + /* Now send the uid/gid list. This was introduced in + * protocol version 15 */ + recv_uid_list(f, flist); +@@ -1650,6 +1651,25 @@ static int is_backup_file(char *fn) + return k > 0 && strcmp(fn+k, backup_suffix) == 0; + } + ++struct file_list *get_dirlist(const char *dirname, int ignore_excludes) ++{ ++ struct file_list *dirlist; ++ char dirbuf[MAXPATHLEN]; ++ int dlen; ++ int save_recurse = recurse; ++ ++ dlen = strlcpy(dirbuf, dirname, MAXPATHLEN); ++ if (dlen >= MAXPATHLEN) ++ return NULL; ++ ++ dirlist = flist_new(WITHOUT_HLINK, "get_dirlist"); ++ recurse = 0; ++ send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen); ++ recurse = save_recurse; ++ ++ return dirlist; ++} ++ + + /* This function is used to implement per-directory deletion, and + * is used by all the --delete-WHEN options. Note that the fbuf --- orig/generator.c 2005-02-13 05:50:28 -+++ generator.c 2005-02-03 02:11:10 ++++ generator.c 2005-02-13 10:01:48 @@ -47,6 +47,7 @@ extern int size_only; extern OFF_T max_size; extern int io_timeout; @@ -14,102 +72,96 @@ Be sure to run "make proto" before "make". extern int always_checksum; extern char *partial_dir; extern char *basis_dir[]; -@@ -227,6 +228,88 @@ static void generate_and_send_sums(int f +@@ -227,6 +228,47 @@ static void generate_and_send_sums(int f unmap_file(mapbuf); } -+/* Try to find a filename in the same dir as "fname" with a similar name. -+ * -+ * TODO: -+ * - We should be using a cache of names for the current dir, not -+ * re-reading the destination directory for every file. -+ * - We must not return an rsync tempfile from the current transfer. -+ * - If the highest-rated name is not a normal file, we should fall- -+ * back to the next highest-rated file. -+ * - We must not return a destination file that is being updated -+ * during the current transfer, even if we already processed it -+ * (since the receiver may not be done with it yet). -+ * - We must weed out any names that a daemon's config has excluded. -+ */ -+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr) ++/* Try to find a filename in the same dir as "fname" with a similar name. */ ++static int find_fuzzy(struct file_struct *file, struct file_list *dirlist) +{ -+ DIR *d; -+ struct dirent *di; -+ char *basename, *dirname, *slash; -+ char bestname[MAXPATHLEN]; -+ int suf_len, basename_len; ++ int fname_len, fname_suf_len; ++ const char *fname_suf, *fname = file->basename; + uint32 lowest_dist = 0x7FFFFFFF; -+ const char *suf; -+ -+ strlcpy(buf, fname, MAXPATHLEN); -+ if ((slash = strrchr(buf, '/')) != NULL) { -+ dirname = buf; -+ *slash = '\0'; -+ basename = slash + 1; -+ } else { -+ basename = buf; -+ dirname = "."; -+ } -+ basename_len = strlen(basename); -+ -+ if (!(d = opendir(dirname))) { -+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname); -+ return -1; -+ } -+ if (slash) -+ *slash = '/'; ++ int j, lowest_j = -1; + -+ suf = find_filename_suffix(basename, basename_len, &suf_len); ++ fname_len = strlen(fname); ++ fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len); + -+ bestname[0] = '\0'; -+ while ((di = readdir(d)) != NULL) { -+ const char *dname_suf, *dname = d_name(di); ++ for (j = 0; j < dirlist->count; j++) { ++ struct file_struct *fp = dirlist->files[j]; ++ const char *suf, *name; ++ int len, suf_len; + uint32 dist; -+ int dname_len, dname_suf_len; + -+ if (dname[0] == '.' && (dname[1] == '\0' -+ || (dname[1] == '.' && dname[2] == '\0'))) ++ if (!S_ISREG(fp->mode) || !fp->length) + continue; + -+ dname_len = strlen(dname); -+ dname_suf = find_filename_suffix(dname, dname_len, &dname_suf_len); ++ name = fp->basename; ++ len = strlen(name); ++ suf = find_filename_suffix(name, len, &suf_len); + -+ dist = fuzzy_distance(dname, dname_len, basename, basename_len); ++ dist = fuzzy_distance(name, len, fname, fname_len); + /* Add some extra weight to how well the suffixes match. */ -+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10; ++ dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) ++ * 10; + if (verbose > 4) { + rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n", -+ dname, (int)(dist>>16), (int)(dist&0xFFFF)); ++ name, (int)(dist>>16), (int)(dist&0xFFFF)); + } + if (dist <= lowest_dist) { -+ strlcpy(bestname, dname, sizeof bestname); + lowest_dist = dist; ++ lowest_j = j; + } + } -+ closedir(d); + -+ /* Found a candidate. */ -+ if (bestname[0] != '\0') { -+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf)); -+ if (verbose > 2) { -+ rprintf(FINFO, "fuzzy match %s->%s\n", -+ safe_fname(fname), buf); -+ } -+ return link_stat(buf, st_ptr, 0); -+ } -+ return -1; ++ return lowest_j; +} + /* Acts on flist->file's ndx'th item, whose name is fname. If a directory, * make sure it exists, and has the right permissions/timestamp info. For -@@ -492,6 +575,15 @@ static void recv_generator(char *fname, +@@ -241,6 +283,8 @@ static void recv_generator(char *fname, + int f_out, int f_out_name) + { + static int missing_below = -1; ++ static char *fuzzy_dirname = NULL; ++ static struct file_list *fuzzy_dirlist = NULL; + int fd = -1, f_copy = -1; + STRUCT_STAT st, partial_st; + struct file_struct *back_file = NULL; +@@ -275,6 +319,16 @@ static void recv_generator(char *fname, + statret = -1; + stat_errno = ENOENT; + } else { ++ if (fuzzy_basis && S_ISREG(file->mode)) { ++ char *dn = file->dirname ? file->dirname : "."; ++ if (fuzzy_dirname != dn) { ++ if (fuzzy_dirlist) ++ flist_free(fuzzy_dirlist); ++ fuzzy_dirname = dn; ++ fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1); ++ } ++ } ++ + statret = link_stat(fname, &st, + keep_dirlinks && S_ISDIR(file->mode)); + stat_errno = errno; +@@ -492,6 +546,24 @@ static void recv_generator(char *fname, } else partialptr = NULL; -+ if (statret == -1 && fuzzy_basis) { -+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0 -+ && S_ISREG(st.st_mode)) { ++ if (statret == -1 && fuzzy_basis && dry_run <= 1) { ++ int j = find_fuzzy(file, fuzzy_dirlist); ++ if (j >= 0) { ++ struct file_struct *fp = fuzzy_dirlist->files[j]; ++ f_name_to(fp, fnamecmpbuf); ++ if (verbose > 2) { ++ rprintf(FINFO, "fuzzy match for %s: %s\n", ++ safe_fname(fname), safe_fname(fnamecmpbuf)); ++ } ++ st.st_mode = fp->mode; ++ st.st_size = fp->length; ++ st.st_mtime = fp->modtime; + statret = 0; + fnamecmp = fnamecmpbuf; + fnamecmp_type = FNAMECMP_FUZZY; @@ -119,7 +171,7 @@ Be sure to run "make proto" before "make". if (statret == -1) { if (preserve_hard_links && hard_link_check(file, HL_SKIP)) return; -@@ -520,6 +612,8 @@ static void recv_generator(char *fname, +@@ -520,6 +592,8 @@ static void recv_generator(char *fname, if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH) ; @@ -128,7 +180,19 @@ Be sure to run "make proto" before "make". else if (unchanged_file(fnamecmp, file, &st)) { if (fnamecmp_type == FNAMECMP_FNAME) set_perms(fname, file, &st, PERMS_REPORT); -@@ -594,8 +688,24 @@ notify_others: +@@ -540,6 +614,11 @@ prepare_to_open: + statret = -1; + goto notify_others; + } ++ if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) { ++ int j = flist_find(fuzzy_dirlist, file); ++ if (j >= 0) /* don't use an updating file as fuzzy basis */ ++ fuzzy_dirlist->files[j]->length = 0; ++ } + + /* open the file */ + fd = do_open(fnamecmp, O_RDONLY, 0); +@@ -594,8 +673,24 @@ notify_others: write_int(f_out, ndx); if (protocol_version >= 29 && inplace && !read_batch) write_byte(f_out, fnamecmp_type); @@ -287,8 +351,8 @@ Be sure to run "make proto" before "make". the destination machine as an additional hierarchy to compare destination files against doing transfers (if the files are missing in the destination --- orig/util.c 2005-02-11 10:53:15 -+++ util.c 2005-01-19 17:30:51 -@@ -1224,3 +1224,108 @@ void *_realloc_array(void *ptr, unsigned ++++ util.c 2005-02-13 09:44:25 +@@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned return malloc(size * num); return realloc(ptr, size * num); } @@ -316,8 +380,8 @@ Be sure to run "make proto" before "make". + *len_ptr = 0; + + /* Find the last significant suffix. */ -+ for (s = fn + fn_len - 1; fn_len > 1; ) { -+ while (*s != '.' && s != fn) s--; ++ for (s = fn + fn_len; fn_len > 1; ) { ++ while (*--s != '.' && s != fn) {} + if (s == fn) + break; + s_len = fn_len - (s - fn); @@ -334,13 +398,15 @@ Be sure to run "make proto" before "make". + continue; + *len_ptr = s_len; + suf = s; ++ if (s_len == 1) ++ break; + /* Determine if the suffix is all digits. */ + for (s++, s_len--; s_len > 0; s++, s_len--) { + if (!isdigit(*s)) + return suf; + } + /* An all-digit suffix may not be that signficant. */ -+ continue; ++ s = suf; + } + + return suf; -- 2.34.1