+++ /dev/null
-This patch makes rsync try to find a basis file for a file that
-doesn't already have one.
-
-Be sure to run "make proto" before "make".
-
---- orig/flist.c 2005-02-13 21:17:16
-+++ flist.c 2005-02-13 09:49:22
-@@ -330,7 +330,7 @@ void send_file_entry(struct file_struct
- char fname[MAXPATHLEN];
- int l1, l2;
-
-- if (f == -1)
-+ if (f < 0)
- return;
-
- if (!file) {
-@@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l
- struct file_struct *file;
- char fbuf[MAXPATHLEN];
-
-- if (!(file = make_file(fname, flist, ALL_FILTERS)))
-+ file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS);
-+ if (!file)
- return;
-
- maybe_emit_filelist_progress(flist);
-@@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f)
-
- clean_flist(flist, relative_paths, 1);
-
-- if (f != -1) {
-+ if (f >= 0) {
- /* Now send the uid/gid list. This was introduced in
- * protocol version 15 */
- recv_uid_list(f, flist);
-@@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn)
- return k > 0 && strcmp(fn+k, backup_suffix) == 0;
- }
-
-+/* List the entries of "dirname" without recursing.  Passes -2 instead of -1
-+ * as the (fake) fd when ignore_excludes is set.  NULL if dirname is too long. */
-+struct file_list *get_dirlist(const char *dirname, int ignore_excludes)
-+{
-+	char dirbuf[MAXPATHLEN];
-+	struct file_list *dirlist;
-+	int save_recurse = recurse;
-+	int dlen = strlcpy(dirbuf, dirname, MAXPATHLEN);
-+
-+	if (dlen >= MAXPATHLEN)
-+		return NULL;
-+
-+	dirlist = flist_new(WITHOUT_HLINK, "get_dirlist");
-+	recurse = 0; /* the global is restored right after the scan */
-+	send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen);
-+	recurse = save_recurse;
-+	return dirlist;
-+}
-+
-
- /* This function is used to implement per-directory deletion, and
- * is used by all the --delete-WHEN options. Note that the fbuf
---- orig/generator.c 2005-02-13 05:50:28
-+++ generator.c 2005-02-13 21:47:28
-@@ -47,6 +47,7 @@ extern int size_only;
- extern OFF_T max_size;
- extern int io_timeout;
- extern int protocol_version;
-+extern int fuzzy_basis;
- extern int always_checksum;
- extern char *partial_dir;
- extern char *basis_dir[];
-@@ -227,6 +228,59 @@ static void generate_and_send_sums(int f
- unmap_file(mapbuf);
- }
-
-+/* Pick a fuzzy basis for "file" from "dirlist": an identical size+modtime
-+ * entry wins at once, else the closest name.  Returns an index or -1. */
-+static int find_fuzzy(struct file_struct *file, struct file_list *dirlist)
-+{
-+	int fname_len, fname_suf_len;
-+	const char *fname_suf, *fname = file->basename;
-+	uint32 lowest_dist = 0x7FFFFFFF;
-+	int j, lowest_j = -1;
-+
-+	/* get_dirlist() returns NULL for an over-long directory name. */
-+	if (!dirlist)
-+		return -1;
-+
-+	fname_len = strlen(fname);
-+	fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len);
-+
-+	for (j = 0; j < dirlist->count; j++) {
-+		struct file_struct *fp = dirlist->files[j];
-+		const char *suf, *name;
-+		int len, suf_len;
-+		uint32 dist;
-+
-+		/* Skip non-regular, empty, and FLAG_NO_FUZZY entries. */
-+		if (!S_ISREG(fp->mode) || !fp->length
-+		    || fp->flags & FLAG_NO_FUZZY)
-+			continue;
-+
-+		name = fp->basename;
-+		if (fp->length == file->length
-+		    && fp->modtime == file->modtime) {
-+			if (verbose > 4)
-+				rprintf(FINFO, "fuzzy size/modtime match for %s\n", name);
-+			return j;
-+		}
-+
-+		len = strlen(name);
-+		suf = find_filename_suffix(name, len, &suf_len);
-+		dist = fuzzy_distance(name, len, fname, fname_len);
-+		/* Add some extra weight to how well the suffixes match. */
-+		dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) * 10;
-+		if (verbose > 4) {
-+			rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n",
-+				name, (int)(dist>>16), (int)(dist&0xFFFF));
-+		}
-+		if (dist <= lowest_dist) { /* ties go to the later entry */
-+			lowest_dist = dist;
-+			lowest_j = j;
-+		}
-+	}
-+
-+	return lowest_j;
-+}
-+
-
- /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
- * make sure it exists, and has the right permissions/timestamp info. For
-@@ -241,6 +295,8 @@ static void recv_generator(char *fname,
- int f_out, int f_out_name)
- {
- static int missing_below = -1;
-+ static char *fuzzy_dirname = NULL;
-+ static struct file_list *fuzzy_dirlist = NULL;
- int fd = -1, f_copy = -1;
- STRUCT_STAT st, partial_st;
- struct file_struct *back_file = NULL;
-@@ -275,6 +331,16 @@ static void recv_generator(char *fname,
- statret = -1;
- stat_errno = ENOENT;
- } else {
-+ if (fuzzy_basis && S_ISREG(file->mode)) {
-+ char *dn = file->dirname ? file->dirname : ".";
-+ if (fuzzy_dirname != dn) {
-+ if (fuzzy_dirlist)
-+ flist_free(fuzzy_dirlist);
-+ fuzzy_dirname = dn;
-+ fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1);
-+ }
-+ }
-+
- statret = link_stat(fname, &st,
- keep_dirlinks && S_ISDIR(file->mode));
- stat_errno = errno;
-@@ -492,6 +558,24 @@ static void recv_generator(char *fname,
- } else
- partialptr = NULL;
-
-+ if (statret == -1 && fuzzy_basis && dry_run <= 1) {
-+ int j = find_fuzzy(file, fuzzy_dirlist);
-+ if (j >= 0) {
-+ struct file_struct *fp = fuzzy_dirlist->files[j];
-+ f_name_to(fp, fnamecmpbuf);
-+ if (verbose > 2) {
-+ rprintf(FINFO, "fuzzy basis selected for %s: %s\n",
-+ safe_fname(fname), safe_fname(fnamecmpbuf));
-+ }
-+ st.st_mode = fp->mode;
-+ st.st_size = fp->length;
-+ st.st_mtime = fp->modtime;
-+ statret = 0;
-+ fnamecmp = fnamecmpbuf;
-+ fnamecmp_type = FNAMECMP_FUZZY;
-+ }
-+ }
-+
- if (statret == -1) {
- if (preserve_hard_links && hard_link_check(file, HL_SKIP))
- return;
-@@ -520,6 +604,8 @@ static void recv_generator(char *fname,
-
- if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
- ;
-+ else if (fnamecmp_type == FNAMECMP_FUZZY)
-+ ;
- else if (unchanged_file(fnamecmp, file, &st)) {
- if (fnamecmp_type == FNAMECMP_FNAME)
- set_perms(fname, file, &st, PERMS_REPORT);
-@@ -540,6 +626,11 @@ prepare_to_open:
- statret = -1;
- goto notify_others;
- }
-+ if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) {
-+ int j = flist_find(fuzzy_dirlist, file);
-+ if (j >= 0) /* don't use updating file as future fuzzy basis */
-+ fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY;
-+ }
-
- /* open the file */
- fd = do_open(fnamecmp, O_RDONLY, 0);
-@@ -594,8 +685,24 @@ notify_others:
- write_int(f_out, ndx);
- if (protocol_version >= 29 && inplace && !read_batch)
- write_byte(f_out, fnamecmp_type);
-- if (f_out_name >= 0)
-+ if (f_out_name >= 0) {
- write_byte(f_out_name, fnamecmp_type);
-+ if (fnamecmp_type == FNAMECMP_FUZZY) {
-+ uchar lenbuf[3], *lb = lenbuf;
-+ int len = strlen(fnamecmpbuf);
-+ if (len > 0x7F) {
-+#if MAXPATHLEN > 0x7FFF
-+ *lb++ = len / 0x10000 + 0x80;
-+ *lb++ = len / 0x100;
-+#else
-+ *lb++ = len / 0x100 + 0x80;
-+#endif
-+ }
-+ *lb = len;
-+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
-+ write_buf(f_out_name, fnamecmpbuf, len);
-+ }
-+ }
-
- if (dry_run || read_batch)
- return;
---- orig/main.c 2005-02-07 20:41:56
-+++ main.c 2005-01-14 18:33:15
-@@ -44,6 +44,7 @@ extern int keep_dirlinks;
- extern int preserve_hard_links;
- extern int protocol_version;
- extern int recurse;
-+extern int fuzzy_basis;
- extern int relative_paths;
- extern int rsync_port;
- extern int whole_file;
-@@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st
- int pid;
- int status = 0;
- int error_pipe[2], name_pipe[2];
-- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
-+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
-+ && !dry_run;
-
- /* The receiving side mustn't obey this, or an existing symlink that
- * points to an identical file won't be replaced by the referent. */
---- orig/options.c 2005-02-13 05:50:28
-+++ options.c 2005-02-13 21:41:41
-@@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
- int size_only = 0;
- int daemon_bwlimit = 0;
- int bwlimit = 0;
-+int fuzzy_basis = 0;
- size_t bwlimit_writemax = 0;
- int only_existing = 0;
- int opt_ignore_existing = 0;
-@@ -302,6 +303,7 @@ void usage(enum logcode F)
- rprintf(F," --size-only skip files that match in size\n");
- rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
- rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
-+ rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
- rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
- rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
- rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
-@@ -411,6 +413,7 @@ static struct poptOption long_options[]
- {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
- {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
- {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
-+ {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
- /* TODO: Should this take an optional int giving the compression level? */
- {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
- {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
-@@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg
- if (!implied_dirs && !am_sender)
- args[ac++] = "--no-implied-dirs";
-
-+ if (fuzzy_basis && am_sender)
-+ args[ac++] = "--fuzzy";
-+
- *argc = ac;
- return;
-
---- orig/receiver.c 2005-02-11 10:53:14
-+++ receiver.c 2005-01-15 21:21:02
-@@ -257,6 +257,27 @@ static int receive_data(int f_in, char *
- }
-
-
-+/* Read the generator's length-prefixed fuzzy basis name from fd into buf. */
-+static void read_gen_name(int fd, char *buf)
-+{
-+	int name_len = read_byte(fd);
-+	if (name_len & 0x80) { /* high bit flags a multi-byte length */
-+#if MAXPATHLEN > 32767
-+		uchar tail[2];
-+		read_buf(fd, (char *)tail, 2);
-+		name_len = (name_len & ~0x80) * 0x10000 + tail[0] * 0x100 + tail[1];
-+#else
-+		name_len = (name_len & ~0x80) * 0x100 + read_byte(fd);
-+#endif
-+	}
-+	if (name_len >= MAXPATHLEN) {
-+		rprintf(FERROR, "bogus data on generator name pipe\n");
-+		exit_cleanup(RERR_PROTOCOL);
-+	}
-+	read_sbuf(fd, buf, name_len);
-+}
-+
-+
- static void discard_receive_data(int f_in, OFF_T length)
- {
- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
-@@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis
- case FNAMECMP_BACKUP:
- fnamecmp = get_backup_name(fname);
- break;
-+ case FNAMECMP_FUZZY:
-+ read_gen_name(f_in_name, fnamecmpbuf);
-+ fnamecmp = fnamecmpbuf;
-+ break;
- default:
- if (j >= basis_dir_cnt) {
- rprintf(FERROR,
---- orig/rsync.h 2005-02-12 19:54:27
-+++ rsync.h 2005-02-13 21:19:16
-@@ -60,6 +60,7 @@
- #define FLAG_TOP_DIR (1<<0)
- #define FLAG_HLINK_EOL (1<<1) /* generator only */
- #define FLAG_MOUNT_POINT (1<<2) /* sender only */
-+#define FLAG_NO_FUZZY (1<<2) /* generator only */
- #define FLAG_DEL_HERE (1<<3) /* receiver/generator */
-
- /* update this if you make incompatible changes */
-@@ -127,6 +128,7 @@
- #define FNAMECMP_FNAME 0x80
- #define FNAMECMP_PARTIAL_DIR 0x81
- #define FNAMECMP_BACKUP 0x82
-+#define FNAMECMP_FUZZY 0x83
-
- /* For calling delete_file() */
- #define DEL_DIR (1<<0)
---- orig/rsync.yo 2005-02-13 21:51:10
-+++ rsync.yo 2005-02-13 21:41:52
-@@ -351,6 +351,7 @@ to the detailed description below for a
- --size-only skip files that match in size
- --modify-window=NUM compare mod-times with reduced accuracy
- -T, --temp-dir=DIR create temporary files in directory DIR
-+ -y, --fuzzy find similar file for basis if no dest file
- --compare-dest=DIR also compare received files relative to DIR
- --copy-dest=DIR ... and include copies of unchanged files
- --link-dest=DIR hardlink to files in DIR when unchanged
-@@ -909,6 +910,16 @@ scratch directory when creating temporar
- transferred on the receiving side. The default behavior is to create
- the temporary files in the receiving directory.
-
-+dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a
-+basis file for any destination file that is missing. The current algorithm
-+looks in the same directory as the destination file for either a file that
-+has an identical size and modified-time, or a similarly-named file. If
-+found, rsync uses the fuzzy basis file to try to speed up the transfer.
-+
-+Note that the use of the bf(--delete) option might get rid of any potential
-+fuzzy-match files, so either use bf(--delete-after) or specify some
-+filename exclusions if you need to prevent this.
-+
- dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
- the destination machine as an additional hierarchy to compare destination
- files against doing transfers (if the files are missing in the destination
---- orig/util.c 2005-02-11 10:53:15
-+++ util.c 2005-02-13 09:44:25
-@@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned
- return malloc(size * num);
- return realloc(ptr, size * num);
- }
-+
-+/* Return the most significant suffix of "fn" (fn_len bytes) and store its
-+ * length, dot included, in *len_ptr ("" and 0 if none).  Backup suffixes
-+ * ("~", ".bak", ".old", ".orig", ".~1~") are skipped over. */
-+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
-+{
-+	const char *suf, *s;
-+	BOOL had_tilde;
-+	int s_len;
-+
-+	/* One or more dots at the start aren't a suffix. */
-+	while (fn_len && *fn == '.') fn++, fn_len--;
-+
-+	/* Ignore the ~ in a "foo~" filename. */
-+	if (fn_len > 1 && fn[fn_len-1] == '~')
-+		fn_len--, had_tilde = True;
-+	else
-+		had_tilde = False;
-+
-+	/* Assume we don't find a suffix. */
-+	suf = "";
-+	*len_ptr = 0;
-+
-+	/* Find the last significant suffix (s_len counts the leading dot). */
-+	for (s = fn + fn_len; fn_len > 1; ) {
-+		while (*--s != '.' && s != fn) {}
-+		if (s == fn)
-+			break;
-+		s_len = fn_len - (s - fn);
-+		fn_len = s - fn;
-+		if (s_len == 4) {
-+			if (strncmp(s+1, "bak", 3) == 0
-+			 || strncmp(s+1, "old", 3) == 0)
-+				continue;
-+		} else if (s_len == 5) {
-+			if (strncmp(s+1, "orig", 4) == 0)
-+				continue;
-+		} else if (s_len > 2 && had_tilde
-+		    && s[1] == '~' && isdigit(*(const uchar*)(s+2)))
-+			continue;
-+		*len_ptr = s_len;
-+		suf = s;
-+		if (s_len == 1)
-+			break;
-+		/* Determine if the suffix is all digits. */
-+		for (s++, s_len--; s_len > 0; s++, s_len--) {
-+			if (!isdigit(*(const uchar*)s))
-+				return suf;
-+		}
-+		/* An all-digit suffix may not be that significant. */
-+		s = suf;
-+	}
-+
-+	return suf;
-+}
-+
-+/* Levenshtein-style distance between s1 (len1 bytes) and s2 (len2 bytes),
-+ * computed with a single row of state instead of a two-dimensional matrix
-+ * (to save memory).  The byte-value difference between changed characters
-+ * is factored in as a minor quantity.  The normal Levenshtein units of
-+ * distance (each signifying a single change between the two strings) are
-+ * defined as a "UNIT".  len2 must not exceed MAXPATHLEN (the size of a[]). */
-+
-+#define UNIT (1 << 16)
-+
-+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
-+{
-+	uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
-+	int32 cost;
-+	int i1, i2;
-+
-+	if (!len1 || !len2) {
-+		if (!len1) {
-+			s1 = s2;
-+			len1 = len2;
-+		}
-+		for (i1 = 0, cost = 0; i1 < len1; i1++)
-+			cost += *((uchar*)s1+i1); /* unsigned, as in the loop below */
-+		return (int32)len1 * UNIT + cost;
-+	}
-+
-+	for (i2 = 0; i2 < len2; i2++)
-+		a[i2] = (i2+1) * UNIT;
-+
-+	for (i1 = 0; i1 < len1; i1++) {
-+		diag = i1 * UNIT;
-+		above = (i1+1) * UNIT;
-+		for (i2 = 0; i2 < len2; i2++) {
-+			left = a[i2]; /* value stored by the previous i1 pass */
-+			if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
-+				if (cost < 0)
-+					cost = UNIT - cost;
-+				else
-+					cost = UNIT + cost;
-+			}
-+			diag_inc = diag + cost;
-+			left_inc = left + UNIT + *((uchar*)s1+i1);
-+			above_inc = above + UNIT + *((uchar*)s2+i2);
-+			a[i2] = above = left < above
-+			      ? (left_inc < diag_inc ? left_inc : diag_inc)
-+			      : (above_inc < diag_inc ? above_inc : diag_inc);
-+			diag = left;
-+		}
-+	}
-+
-+	return a[len2-1];
-+}