From 42e25077abb656c002841469c5badd4a4926ff33 Mon Sep 17 00:00:00 2001 From: Wayne Davison Date: Mon, 14 Feb 2005 02:45:00 +0000 Subject: [PATCH] Applied to trunk. --- fuzzy.diff | 484 ----------------------------------------------------- 1 file changed, 484 deletions(-) delete mode 100644 fuzzy.diff diff --git a/fuzzy.diff b/fuzzy.diff deleted file mode 100644 index b16206a..0000000 --- a/fuzzy.diff +++ /dev/null @@ -1,484 +0,0 @@ -This patch makes rsync look try to find a basis file for a file that -doesn't already have one. - -Be sure to run "make proto" before "make". - ---- orig/flist.c 2005-02-13 21:17:16 -+++ flist.c 2005-02-13 09:49:22 -@@ -330,7 +330,7 @@ void send_file_entry(struct file_struct - char fname[MAXPATHLEN]; - int l1, l2; - -- if (f == -1) -+ if (f < 0) - return; - - if (!file) { -@@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l - struct file_struct *file; - char fbuf[MAXPATHLEN]; - -- if (!(file = make_file(fname, flist, ALL_FILTERS))) -+ file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS); -+ if (!file) - return; - - maybe_emit_filelist_progress(flist); -@@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f) - - clean_flist(flist, relative_paths, 1); - -- if (f != -1) { -+ if (f >= 0) { - /* Now send the uid/gid list. This was introduced in - * protocol version 15 */ - recv_uid_list(f, flist); -@@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn) - return k > 0 && strcmp(fn+k, backup_suffix) == 0; - } - -+struct file_list *get_dirlist(const char *dirname, int ignore_excludes) -+{ -+ struct file_list *dirlist; -+ char dirbuf[MAXPATHLEN]; -+ int dlen; -+ int save_recurse = recurse; -+ -+ dlen = strlcpy(dirbuf, dirname, MAXPATHLEN); -+ if (dlen >= MAXPATHLEN) -+ return NULL; -+ -+ dirlist = flist_new(WITHOUT_HLINK, "get_dirlist"); -+ recurse = 0; -+ send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen); -+ recurse = save_recurse; -+ -+ return dirlist; -+} -+ - - /* This function is used to implement per-directory deletion, and - * is used by all the --delete-WHEN options. Note that the fbuf ---- orig/generator.c 2005-02-13 05:50:28 -+++ generator.c 2005-02-13 21:47:28 -@@ -47,6 +47,7 @@ extern int size_only; - extern OFF_T max_size; - extern int io_timeout; - extern int protocol_version; -+extern int fuzzy_basis; - extern int always_checksum; - extern char *partial_dir; - extern char *basis_dir[]; -@@ -227,6 +228,59 @@ static void generate_and_send_sums(int f - unmap_file(mapbuf); - } - -+/* Try to find a filename in the same dir as "fname" with a similar name. */ -+static int find_fuzzy(struct file_struct *file, struct file_list *dirlist) -+{ -+ int fname_len, fname_suf_len; -+ const char *fname_suf, *fname = file->basename; -+ uint32 lowest_dist = 0x7FFFFFFF; -+ int j, lowest_j = -1; -+ -+ fname_len = strlen(fname); -+ fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len); -+ -+ for (j = 0; j < dirlist->count; j++) { -+ struct file_struct *fp = dirlist->files[j]; -+ const char *suf, *name; -+ int len, suf_len; -+ uint32 dist; -+ -+ if (!S_ISREG(fp->mode) || !fp->length -+ || fp->flags & FLAG_NO_FUZZY) -+ continue; -+ -+ name = fp->basename; -+ -+ if (fp->length == file->length -+ && fp->modtime == file->modtime) { -+ if (verbose > 4) { -+ rprintf(FINFO, -+ "fuzzy size/modtime match for %s\n", -+ name); -+ } -+ return j; -+ } -+ -+ len = strlen(name); -+ suf = find_filename_suffix(name, len, &suf_len); -+ -+ dist = fuzzy_distance(name, len, fname, fname_len); -+ /* Add some extra weight to how well the suffixes match. */ -+ dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) -+ * 10; -+ if (verbose > 4) { -+ rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n", -+ name, (int)(dist>>16), (int)(dist&0xFFFF)); -+ } -+ if (dist <= lowest_dist) { -+ lowest_dist = dist; -+ lowest_j = j; -+ } -+ } -+ -+ return lowest_j; -+} -+ - - /* Acts on flist->file's ndx'th item, whose name is fname. If a directory, - * make sure it exists, and has the right permissions/timestamp info. For -@@ -241,6 +295,8 @@ static void recv_generator(char *fname, - int f_out, int f_out_name) - { - static int missing_below = -1; -+ static char *fuzzy_dirname = NULL; -+ static struct file_list *fuzzy_dirlist = NULL; - int fd = -1, f_copy = -1; - STRUCT_STAT st, partial_st; - struct file_struct *back_file = NULL; -@@ -275,6 +331,16 @@ static void recv_generator(char *fname, - statret = -1; - stat_errno = ENOENT; - } else { -+ if (fuzzy_basis && S_ISREG(file->mode)) { -+ char *dn = file->dirname ? file->dirname : "."; -+ if (fuzzy_dirname != dn) { -+ if (fuzzy_dirlist) -+ flist_free(fuzzy_dirlist); -+ fuzzy_dirname = dn; -+ fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1); -+ } -+ } -+ - statret = link_stat(fname, &st, - keep_dirlinks && S_ISDIR(file->mode)); - stat_errno = errno; -@@ -492,6 +558,24 @@ static void recv_generator(char *fname, - } else - partialptr = NULL; - -+ if (statret == -1 && fuzzy_basis && dry_run <= 1) { -+ int j = find_fuzzy(file, fuzzy_dirlist); -+ if (j >= 0) { -+ struct file_struct *fp = fuzzy_dirlist->files[j]; -+ f_name_to(fp, fnamecmpbuf); -+ if (verbose > 2) { -+ rprintf(FINFO, "fuzzy basis selected for %s: %s\n", -+ safe_fname(fname), safe_fname(fnamecmpbuf)); -+ } -+ st.st_mode = fp->mode; -+ st.st_size = fp->length; -+ st.st_mtime = fp->modtime; -+ statret = 0; -+ fnamecmp = fnamecmpbuf; -+ fnamecmp_type = FNAMECMP_FUZZY; -+ } -+ } -+ - if (statret == -1) { - if (preserve_hard_links && hard_link_check(file, HL_SKIP)) - return; -@@ -520,6 +604,8 @@ static void recv_generator(char *fname, - - if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH) - ; -+ else if (fnamecmp_type == FNAMECMP_FUZZY) -+ ; - else if (unchanged_file(fnamecmp, file, &st)) { - if (fnamecmp_type == FNAMECMP_FNAME) - set_perms(fname, file, &st, PERMS_REPORT); -@@ -540,6 +626,11 @@ prepare_to_open: - statret = -1; - goto notify_others; - } -+ if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) { -+ int j = flist_find(fuzzy_dirlist, file); -+ if (j >= 0) /* don't use updating file as future fuzzy basis */ -+ fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY; -+ } - - /* open the file */ - fd = do_open(fnamecmp, O_RDONLY, 0); -@@ -594,8 +685,24 @@ notify_others: - write_int(f_out, ndx); - if (protocol_version >= 29 && inplace && !read_batch) - write_byte(f_out, fnamecmp_type); -- if (f_out_name >= 0) -+ if (f_out_name >= 0) { - write_byte(f_out_name, fnamecmp_type); -+ if (fnamecmp_type == FNAMECMP_FUZZY) { -+ uchar lenbuf[3], *lb = lenbuf; -+ int len = strlen(fnamecmpbuf); -+ if (len > 0x7F) { -+#if MAXPATHLEN > 0x7FFF -+ *lb++ = len / 0x10000 + 0x80; -+ *lb++ = len / 0x100; -+#else -+ *lb++ = len / 0x100 + 0x80; -+#endif -+ } -+ *lb = len; -+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1); -+ write_buf(f_out_name, fnamecmpbuf, len); -+ } -+ } - - if (dry_run || read_batch) - return; ---- orig/main.c 2005-02-07 20:41:56 -+++ main.c 2005-01-14 18:33:15 -@@ -44,6 +44,7 @@ extern int keep_dirlinks; - extern int preserve_hard_links; - extern int protocol_version; - extern int recurse; -+extern int fuzzy_basis; - extern int relative_paths; - extern int rsync_port; - extern int whole_file; -@@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st - int pid; - int status = 0; - int error_pipe[2], name_pipe[2]; -- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run; -+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis) -+ && !dry_run; - - /* The receiving side mustn't obey this, or an existing symlink that - * points to an identical file won't be replaced by the referent. */ ---- orig/options.c 2005-02-13 05:50:28 -+++ options.c 2005-02-13 21:41:41 -@@ -89,6 +89,7 @@ int copy_unsafe_links = 0; - int size_only = 0; - int daemon_bwlimit = 0; - int bwlimit = 0; -+int fuzzy_basis = 0; - size_t bwlimit_writemax = 0; - int only_existing = 0; - int opt_ignore_existing = 0; -@@ -302,6 +303,7 @@ void usage(enum logcode F) - rprintf(F," --size-only skip files that match in size\n"); - rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n"); - rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); -+ rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n"); - rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); - rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); - rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); -@@ -411,6 +413,7 @@ static struct poptOption long_options[] - {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, - {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, - {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, -+ {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, - /* TODO: Should this take an optional int giving the compression level? */ - {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, - {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, -@@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg - if (!implied_dirs && !am_sender) - args[ac++] = "--no-implied-dirs"; - -+ if (fuzzy_basis && am_sender) -+ args[ac++] = "--fuzzy"; -+ - *argc = ac; - return; - ---- orig/receiver.c 2005-02-11 10:53:14 -+++ receiver.c 2005-01-15 21:21:02 -@@ -257,6 +257,27 @@ static int receive_data(int f_in, char * - } - - -+static void read_gen_name(int fd, char *buf) -+{ -+ int len = read_byte(fd); -+ if (len & 0x80) { -+#if MAXPATHLEN > 32767 -+ uchar lenbuf[2]; -+ read_buf(fd, (char *)lenbuf, 2); -+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; -+#else -+ len = (len & ~0x80) * 0x100 + read_byte(fd); -+#endif -+ } -+ if (len >= MAXPATHLEN) { -+ rprintf(FERROR, "bogus data on generator name pipe\n"); -+ exit_cleanup(RERR_PROTOCOL); -+ } -+ -+ read_sbuf(fd, buf, len); -+} -+ -+ - static void discard_receive_data(int f_in, OFF_T length) - { - receive_data(f_in, NULL, -1, 0, NULL, -1, length); -@@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis - case FNAMECMP_BACKUP: - fnamecmp = get_backup_name(fname); - break; -+ case FNAMECMP_FUZZY: -+ read_gen_name(f_in_name, fnamecmpbuf); -+ fnamecmp = fnamecmpbuf; -+ break; - default: - if (j >= basis_dir_cnt) { - rprintf(FERROR, ---- orig/rsync.h 2005-02-12 19:54:27 -+++ rsync.h 2005-02-13 21:19:16 -@@ -60,6 +60,7 @@ - #define FLAG_TOP_DIR (1<<0) - #define FLAG_HLINK_EOL (1<<1) /* generator only */ - #define FLAG_MOUNT_POINT (1<<2) /* sender only */ -+#define FLAG_NO_FUZZY (1<<2) /* generator only */ - #define FLAG_DEL_HERE (1<<3) /* receiver/generator */ - - /* update this if you make incompatible changes */ -@@ -127,6 +128,7 @@ - #define FNAMECMP_FNAME 0x80 - #define FNAMECMP_PARTIAL_DIR 0x81 - #define FNAMECMP_BACKUP 0x82 -+#define FNAMECMP_FUZZY 0x83 - - /* For calling delete_file() */ - #define DEL_DIR (1<<0) ---- orig/rsync.yo 2005-02-13 21:51:10 -+++ rsync.yo 2005-02-13 21:41:52 -@@ -351,6 +351,7 @@ to the detailed description below for a - --size-only skip files that match in size - --modify-window=NUM compare mod-times with reduced accuracy - -T, --temp-dir=DIR create temporary files in directory DIR -+ -y, --fuzzy find similar file for basis if no dest file - --compare-dest=DIR also compare received files relative to DIR - --copy-dest=DIR ... and include copies of unchanged files - --link-dest=DIR hardlink to files in DIR when unchanged -@@ -909,6 +910,16 @@ scratch directory when creating temporar - transferred on the receiving side. The default behavior is to create - the temporary files in the receiving directory. - -+dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a -+basis file for any destination file that is missing. The current algorithm -+looks in the same directory as the destination file for either a file that -+has an identical size and modified-time, or a similarly-named file. If -+found, rsync uses the fuzzy basis file to try to speed up the transfer. -+ -+Note that the use of the bf(--delete) option might get rid of any potential -+fuzzy-match files, so either use bf(--delete-after) or specify some -+filename exclusions if you need to prevent this. -+ - dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on - the destination machine as an additional hierarchy to compare destination - files against doing transfers (if the files are missing in the destination ---- orig/util.c 2005-02-11 10:53:15 -+++ util.c 2005-02-13 09:44:25 -@@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned - return malloc(size * num); - return realloc(ptr, size * num); - } -+ -+/* Take a filename and filename length and return the most significant -+ * filename suffix we can find. This ignores suffixes such as "~", -+ * ".bak", ".orig", ".~1~", etc. */ -+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr) -+{ -+ const char *suf, *s; -+ BOOL had_tilde; -+ int s_len; -+ -+ /* One or more dots at the start aren't a suffix. */ -+ while (fn_len && *fn == '.') fn++, fn_len--; -+ -+ /* Ignore the ~ in a "foo~" filename. */ -+ if (fn_len > 1 && fn[fn_len-1] == '~') -+ fn_len--, had_tilde = True; -+ else -+ had_tilde = False; -+ -+ /* Assume we don't find an suffix. */ -+ suf = ""; -+ *len_ptr = 0; -+ -+ /* Find the last significant suffix. */ -+ for (s = fn + fn_len; fn_len > 1; ) { -+ while (*--s != '.' && s != fn) {} -+ if (s == fn) -+ break; -+ s_len = fn_len - (s - fn); -+ fn_len = s - fn; -+ if (s_len == 3) { -+ if (strcmp(s+1, "bak") == 0 -+ || strcmp(s+1, "old") == 0) -+ continue; -+ } else if (s_len == 4) { -+ if (strcmp(s+1, "orig") == 0) -+ continue; -+ } else if (s_len > 2 && had_tilde -+ && s[1] == '~' && isdigit(s[2])) -+ continue; -+ *len_ptr = s_len; -+ suf = s; -+ if (s_len == 1) -+ break; -+ /* Determine if the suffix is all digits. */ -+ for (s++, s_len--; s_len > 0; s++, s_len--) { -+ if (!isdigit(*s)) -+ return suf; -+ } -+ /* An all-digit suffix may not be that signficant. */ -+ s = suf; -+ } -+ -+ return suf; -+} -+ -+/* This is an implementation of the Levenshtein distance algorithm. It -+ * was implemented to avoid needing a two-dimensional matrix (to save -+ * memory). It was also tweaked to try to factor in the ASCII distance -+ * between changed characters as a minor distance quantity. The normal -+ * Levenshtein units of distance (each signifying a single change between -+ * the two strings) are defined as a "UNIT". */ -+ -+#define UNIT (1 << 16) -+ -+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2) -+{ -+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc; -+ int32 cost; -+ int i1, i2; -+ -+ if (!len1 || !len2) { -+ if (!len1) { -+ s1 = s2; -+ len1 = len2; -+ } -+ for (i1 = 0, cost = 0; i1 < len1; i1++) -+ cost += s1[i1]; -+ return (int32)len1 * UNIT + cost; -+ } -+ -+ for (i2 = 0; i2 < len2; i2++) -+ a[i2] = (i2+1) * UNIT; -+ -+ for (i1 = 0; i1 < len1; i1++) { -+ diag = i1 * UNIT; -+ above = (i1+1) * UNIT; -+ for (i2 = 0; i2 < len2; i2++) { -+ left = a[i2]; -+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) { -+ if (cost < 0) -+ cost = UNIT - cost; -+ else -+ cost = UNIT + cost; -+ } -+ diag_inc = diag + cost; -+ left_inc = left + UNIT + *((uchar*)s1+i1); -+ above_inc = above + UNIT + *((uchar*)s2+i2); -+ a[i2] = above = left < above -+ ? (left_inc < diag_inc ? left_inc : diag_inc) -+ : (above_inc < diag_inc ? above_inc : diag_inc); -+ diag = left; -+ } -+ } -+ -+ return a[len2-1]; -+} -- 2.34.1