Fixed a failing hunk.
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
4535c2b6
WD
1This patch makes rsync look try to find a basis file for a file that
2doesn't already have one.
241013b4 3
824abc86
WD
4Be sure to run "make proto" before "make".
5
4535c2b6 6--- orig/flist.c 2005-02-13 21:17:16
36366d5c
WD
7+++ flist.c 2005-02-13 09:49:22
8@@ -330,7 +330,7 @@ void send_file_entry(struct file_struct
9 char fname[MAXPATHLEN];
10 int l1, l2;
11
12- if (f == -1)
13+ if (f < 0)
14 return;
15
16 if (!file) {
17@@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l
18 struct file_struct *file;
19 char fbuf[MAXPATHLEN];
20
21- if (!(file = make_file(fname, flist, ALL_FILTERS)))
22+ file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS);
23+ if (!file)
24 return;
25
26 maybe_emit_filelist_progress(flist);
4535c2b6 27@@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f)
36366d5c
WD
28
29 clean_flist(flist, relative_paths, 1);
30
31- if (f != -1) {
32+ if (f >= 0) {
33 /* Now send the uid/gid list. This was introduced in
34 * protocol version 15 */
35 recv_uid_list(f, flist);
4535c2b6 36@@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn)
36366d5c
WD
37 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
38 }
39
40+struct file_list *get_dirlist(const char *dirname, int ignore_excludes)
41+{
42+ struct file_list *dirlist;
43+ char dirbuf[MAXPATHLEN];
44+ int dlen;
45+ int save_recurse = recurse;
46+
47+ dlen = strlcpy(dirbuf, dirname, MAXPATHLEN);
48+ if (dlen >= MAXPATHLEN)
49+ return NULL;
50+
51+ dirlist = flist_new(WITHOUT_HLINK, "get_dirlist");
52+ recurse = 0;
53+ send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen);
54+ recurse = save_recurse;
55+
56+ return dirlist;
57+}
58+
59
60 /* This function is used to implement per-directory deletion, and
61 * is used by all the --delete-WHEN options. Note that the fbuf
b78a6aba 62--- orig/generator.c 2005-02-13 05:50:28
d7d8b822 63+++ generator.c 2005-02-13 21:47:28
79f132a1 64@@ -47,6 +47,7 @@ extern int size_only;
0808daa5 65 extern OFF_T max_size;
58118c25
WD
66 extern int io_timeout;
67 extern int protocol_version;
09fb2223 68+extern int fuzzy_basis;
58118c25 69 extern int always_checksum;
b952a177 70 extern char *partial_dir;
0808daa5 71 extern char *basis_dir[];
d7d8b822 72@@ -227,6 +228,59 @@ static void generate_and_send_sums(int f
4370504a 73 unmap_file(mapbuf);
58118c25
WD
74 }
75
36366d5c
WD
76+/* Try to find a filename in the same dir as "fname" with a similar name. */
77+static int find_fuzzy(struct file_struct *file, struct file_list *dirlist)
47dd7a31 78+{
36366d5c
WD
79+ int fname_len, fname_suf_len;
80+ const char *fname_suf, *fname = file->basename;
054f3f90 81+ uint32 lowest_dist = 0x7FFFFFFF;
36366d5c 82+ int j, lowest_j = -1;
47dd7a31 83+
36366d5c
WD
84+ fname_len = strlen(fname);
85+ fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len);
47dd7a31 86+
36366d5c
WD
87+ for (j = 0; j < dirlist->count; j++) {
88+ struct file_struct *fp = dirlist->files[j];
89+ const char *suf, *name;
90+ int len, suf_len;
054f3f90 91+ uint32 dist;
47dd7a31 92+
4535c2b6
WD
93+ if (!S_ISREG(fp->mode) || !fp->length
94+ || fp->flags & FLAG_NO_FUZZY)
47dd7a31
WD
95+ continue;
96+
36366d5c 97+ name = fp->basename;
d7d8b822
WD
98+
99+ if (fp->length == file->length
100+ && fp->modtime == file->modtime) {
101+ if (verbose > 4) {
102+ rprintf(FINFO,
103+ "fuzzy size/modtime match for %s\n",
104+ name);
105+ }
106+ return j;
107+ }
108+
36366d5c
WD
109+ len = strlen(name);
110+ suf = find_filename_suffix(name, len, &suf_len);
054f3f90 111+
36366d5c 112+ dist = fuzzy_distance(name, len, fname, fname_len);
fc82f579 113+ /* Add some extra weight to how well the suffixes match. */
36366d5c
WD
114+ dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len)
115+ * 10;
4cd1daea 116+ if (verbose > 4) {
4535c2b6 117+ rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n",
36366d5c 118+ name, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 119+ }
fc82f579 120+ if (dist <= lowest_dist) {
054f3f90 121+ lowest_dist = dist;
36366d5c 122+ lowest_j = j;
47dd7a31
WD
123+ }
124+ }
47dd7a31 125+
36366d5c 126+ return lowest_j;
47dd7a31 127+}
58118c25
WD
128+
129
a7219d20
WD
130 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
131 * make sure it exists, and has the right permissions/timestamp info. For
d7d8b822 132@@ -241,6 +295,8 @@ static void recv_generator(char *fname,
36366d5c
WD
133 int f_out, int f_out_name)
134 {
135 static int missing_below = -1;
136+ static char *fuzzy_dirname = NULL;
137+ static struct file_list *fuzzy_dirlist = NULL;
138 int fd = -1, f_copy = -1;
139 STRUCT_STAT st, partial_st;
140 struct file_struct *back_file = NULL;
d7d8b822 141@@ -275,6 +331,16 @@ static void recv_generator(char *fname,
36366d5c
WD
142 statret = -1;
143 stat_errno = ENOENT;
144 } else {
145+ if (fuzzy_basis && S_ISREG(file->mode)) {
146+ char *dn = file->dirname ? file->dirname : ".";
147+ if (fuzzy_dirname != dn) {
148+ if (fuzzy_dirlist)
149+ flist_free(fuzzy_dirlist);
150+ fuzzy_dirname = dn;
151+ fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1);
152+ }
153+ }
154+
155 statret = link_stat(fname, &st,
156 keep_dirlinks && S_ISDIR(file->mode));
157 stat_errno = errno;
d7d8b822 158@@ -492,6 +558,24 @@ static void recv_generator(char *fname,
f48a237e
WD
159 } else
160 partialptr = NULL;
824abc86 161
36366d5c
WD
162+ if (statret == -1 && fuzzy_basis && dry_run <= 1) {
163+ int j = find_fuzzy(file, fuzzy_dirlist);
164+ if (j >= 0) {
165+ struct file_struct *fp = fuzzy_dirlist->files[j];
166+ f_name_to(fp, fnamecmpbuf);
167+ if (verbose > 2) {
d7d8b822 168+ rprintf(FINFO, "fuzzy basis selected for %s: %s\n",
36366d5c
WD
169+ safe_fname(fname), safe_fname(fnamecmpbuf));
170+ }
171+ st.st_mode = fp->mode;
172+ st.st_size = fp->length;
173+ st.st_mtime = fp->modtime;
09fb2223
WD
174+ statret = 0;
175+ fnamecmp = fnamecmpbuf;
0edc7d7f 176+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 177+ }
824abc86
WD
178+ }
179+
09fb2223
WD
180 if (statret == -1) {
181 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 182 return;
d7d8b822 183@@ -520,6 +604,8 @@ static void recv_generator(char *fname,
241013b4 184
9cf86680 185 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
186 ;
187+ else if (fnamecmp_type == FNAMECMP_FUZZY)
188+ ;
189 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 190 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 191 set_perms(fname, file, &st, PERMS_REPORT);
d7d8b822 192@@ -540,6 +626,11 @@ prepare_to_open:
36366d5c
WD
193 statret = -1;
194 goto notify_others;
195 }
196+ if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) {
197+ int j = flist_find(fuzzy_dirlist, file);
4535c2b6
WD
198+ if (j >= 0) /* don't use updating file as future fuzzy basis */
199+ fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY;
36366d5c
WD
200+ }
201
202 /* open the file */
203 fd = do_open(fnamecmp, O_RDONLY, 0);
d7d8b822 204@@ -594,8 +685,24 @@ notify_others:
a7219d20 205 write_int(f_out, ndx);
0f626034
WD
206 if (protocol_version >= 29 && inplace && !read_batch)
207 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
208- if (f_out_name >= 0)
209+ if (f_out_name >= 0) {
09fb2223 210 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 211+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
212+ uchar lenbuf[3], *lb = lenbuf;
213+ int len = strlen(fnamecmpbuf);
214+ if (len > 0x7F) {
215+#if MAXPATHLEN > 0x7FFF
216+ *lb++ = len / 0x10000 + 0x80;
217+ *lb++ = len / 0x100;
218+#else
219+ *lb++ = len / 0x100 + 0x80;
220+#endif
221+ }
222+ *lb = len;
223+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
224+ write_buf(f_out_name, fnamecmpbuf, len);
225+ }
0edc7d7f 226+ }
09fb2223 227
0edc7d7f
WD
228 if (dry_run || read_batch)
229 return;
b78a6aba 230--- orig/main.c 2005-02-07 20:41:56
3eabe3a3 231+++ main.c 2005-01-14 18:33:15
b78a6aba 232@@ -44,6 +44,7 @@ extern int keep_dirlinks;
495f1899
WD
233 extern int preserve_hard_links;
234 extern int protocol_version;
235 extern int recurse;
09fb2223 236+extern int fuzzy_basis;
495f1899
WD
237 extern int relative_paths;
238 extern int rsync_port;
239 extern int whole_file;
b78a6aba 240@@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
241 int pid;
242 int status = 0;
243 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
244- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
245+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
246+ && !dry_run;
495f1899 247
d5753a22
WD
248 /* The receiving side mustn't obey this, or an existing symlink that
249 * points to an identical file won't be replaced by the referent. */
b78a6aba 250--- orig/options.c 2005-02-13 05:50:28
d7d8b822 251+++ options.c 2005-02-13 21:41:41
b78a6aba 252@@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
f6c3b300 253 int size_only = 0;
0808daa5 254 int daemon_bwlimit = 0;
f6c3b300 255 int bwlimit = 0;
09fb2223 256+int fuzzy_basis = 0;
f6c3b300 257 size_t bwlimit_writemax = 0;
f6c3b300 258 int only_existing = 0;
be73a66e 259 int opt_ignore_existing = 0;
b78a6aba
WD
260@@ -302,6 +303,7 @@ void usage(enum logcode F)
261 rprintf(F," --size-only skip files that match in size\n");
262 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
263 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
d7d8b822 264+ rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
f0533c4c 265 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
266 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
267 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
b78a6aba 268@@ -411,6 +413,7 @@ static struct poptOption long_options[]
0808daa5
WD
269 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
270 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
271 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
d7d8b822 272+ {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
273 /* TODO: Should this take an optional int giving the compression level? */
274 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 275 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
b78a6aba 276@@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg
4370504a
WD
277 if (!implied_dirs && !am_sender)
278 args[ac++] = "--no-implied-dirs";
7b675ff5 279
09fb2223 280+ if (fuzzy_basis && am_sender)
241013b4 281+ args[ac++] = "--fuzzy";
7b675ff5 282+
241013b4 283 *argc = ac;
f74d2272 284 return;
7b675ff5 285
b78a6aba 286--- orig/receiver.c 2005-02-11 10:53:14
9cf86680 287+++ receiver.c 2005-01-15 21:21:02
b78a6aba 288@@ -257,6 +257,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
289 }
290
291
292+static void read_gen_name(int fd, char *buf)
293+{
294+ int len = read_byte(fd);
295+ if (len & 0x80) {
296+#if MAXPATHLEN > 32767
297+ uchar lenbuf[2];
298+ read_buf(fd, (char *)lenbuf, 2);
299+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
300+#else
301+ len = (len & ~0x80) * 0x100 + read_byte(fd);
302+#endif
303+ }
304+ if (len >= MAXPATHLEN) {
305+ rprintf(FERROR, "bogus data on generator name pipe\n");
306+ exit_cleanup(RERR_PROTOCOL);
307+ }
308+
309+ read_sbuf(fd, buf, len);
310+}
311+
312+
313 static void discard_receive_data(int f_in, OFF_T length)
314 {
315 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
b78a6aba 316@@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 317 case FNAMECMP_BACKUP:
f48a237e 318 fnamecmp = get_backup_name(fname);
09fb2223 319 break;
0edc7d7f 320+ case FNAMECMP_FUZZY:
09fb2223
WD
321+ read_gen_name(f_in_name, fnamecmpbuf);
322+ fnamecmp = fnamecmpbuf;
323+ break;
324 default:
9cf86680
WD
325 if (j >= basis_dir_cnt) {
326 rprintf(FERROR,
b78a6aba 327--- orig/rsync.h 2005-02-12 19:54:27
4535c2b6
WD
328+++ rsync.h 2005-02-13 21:19:16
329@@ -60,6 +60,7 @@
330 #define FLAG_TOP_DIR (1<<0)
331 #define FLAG_HLINK_EOL (1<<1) /* generator only */
332 #define FLAG_MOUNT_POINT (1<<2) /* sender only */
333+#define FLAG_NO_FUZZY (1<<2) /* generator only */
334 #define FLAG_DEL_HERE (1<<3) /* receiver/generator */
335
336 /* update this if you make incompatible changes */
337@@ -127,6 +128,7 @@
0edc7d7f
WD
338 #define FNAMECMP_FNAME 0x80
339 #define FNAMECMP_PARTIAL_DIR 0x81
340 #define FNAMECMP_BACKUP 0x82
341+#define FNAMECMP_FUZZY 0x83
09fb2223 342
a27b0830 343 /* For calling delete_file() */
be73a66e 344 #define DEL_DIR (1<<0)
d7d8b822
WD
345--- orig/rsync.yo 2005-02-13 21:51:10
346+++ rsync.yo 2005-02-13 21:41:52
b78a6aba
WD
347@@ -351,6 +351,7 @@ to the detailed description below for a
348 --size-only skip files that match in size
349 --modify-window=NUM compare mod-times with reduced accuracy
d7d8b822
WD
350 -T, --temp-dir=DIR create temporary files in directory DIR
351+ -y, --fuzzy find similar file for basis if no dest file
f0533c4c 352 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
353 --copy-dest=DIR ... and include copies of unchanged files
354 --link-dest=DIR hardlink to files in DIR when unchanged
d7d8b822 355@@ -909,6 +910,16 @@ scratch directory when creating temporar
b78a6aba
WD
356 transferred on the receiving side. The default behavior is to create
357 the temporary files in the receiving directory.
637c560e 358
d7d8b822 359+dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a
637c560e 360+basis file for any destination file that is missing. The current algorithm
d7d8b822
WD
361+looks in the same directory as the destination file for either a file that
362+has an identical size and modified-time, or a similarly-named file. If
363+found, rsync uses the fuzzy basis file to try to speed up the transfer.
364+
365+Note that the use of the bf(--delete) option might get rid of any potential
366+fuzzy-match files, so either use bf(--delete-after) or specify some
367+filename exclusions if you need to prevent this.
637c560e 368+
b78a6aba
WD
369 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
370 the destination machine as an additional hierarchy to compare destination
371 files against doing transfers (if the files are missing in the destination
372--- orig/util.c 2005-02-11 10:53:15
36366d5c
WD
373+++ util.c 2005-02-13 09:44:25
374@@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
375 return malloc(size * num);
376 return realloc(ptr, size * num);
377 }
378+
4cd1daea
WD
379+/* Take a filename and filename length and return the most significant
380+ * filename suffix we can find. This ignores suffixes such as "~",
381+ * ".bak", ".orig", ".~1~", etc. */
a27b0830 382+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
4cd1daea
WD
383+{
384+ const char *suf, *s;
4cd1daea 385+ BOOL had_tilde;
a27b0830 386+ int s_len;
4cd1daea
WD
387+
388+ /* One or more dots at the start aren't a suffix. */
389+ while (fn_len && *fn == '.') fn++, fn_len--;
390+
391+ /* Ignore the ~ in a "foo~" filename. */
392+ if (fn_len > 1 && fn[fn_len-1] == '~')
393+ fn_len--, had_tilde = True;
394+ else
395+ had_tilde = False;
396+
397+ /* Assume we don't find an suffix. */
398+ suf = "";
399+ *len_ptr = 0;
400+
401+ /* Find the last significant suffix. */
36366d5c
WD
402+ for (s = fn + fn_len; fn_len > 1; ) {
403+ while (*--s != '.' && s != fn) {}
4cd1daea
WD
404+ if (s == fn)
405+ break;
406+ s_len = fn_len - (s - fn);
407+ fn_len = s - fn;
408+ if (s_len == 3) {
409+ if (strcmp(s+1, "bak") == 0
410+ || strcmp(s+1, "old") == 0)
411+ continue;
412+ } else if (s_len == 4) {
413+ if (strcmp(s+1, "orig") == 0)
414+ continue;
415+ } else if (s_len > 2 && had_tilde
416+ && s[1] == '~' && isdigit(s[2]))
417+ continue;
418+ *len_ptr = s_len;
419+ suf = s;
36366d5c
WD
420+ if (s_len == 1)
421+ break;
4cd1daea
WD
422+ /* Determine if the suffix is all digits. */
423+ for (s++, s_len--; s_len > 0; s++, s_len--) {
424+ if (!isdigit(*s))
425+ return suf;
426+ }
427+ /* An all-digit suffix may not be that signficant. */
36366d5c 428+ s = suf;
4cd1daea
WD
429+ }
430+
431+ return suf;
432+}
433+
054f3f90
WD
434+/* This is an implementation of the Levenshtein distance algorithm. It
435+ * was implemented to avoid needing a two-dimensional matrix (to save
436+ * memory). It was also tweaked to try to factor in the ASCII distance
437+ * between changed characters as a minor distance quantity. The normal
438+ * Levenshtein units of distance (each signifying a single change between
439+ * the two strings) are defined as a "UNIT". */
440+
441+#define UNIT (1 << 16)
442+
443+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
444+{
445+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
446+ int32 cost;
447+ int i1, i2;
448+
4cd1daea
WD
449+ if (!len1 || !len2) {
450+ if (!len1) {
451+ s1 = s2;
452+ len1 = len2;
453+ }
454+ for (i1 = 0, cost = 0; i1 < len1; i1++)
455+ cost += s1[i1];
456+ return (int32)len1 * UNIT + cost;
457+ }
054f3f90
WD
458+
459+ for (i2 = 0; i2 < len2; i2++)
460+ a[i2] = (i2+1) * UNIT;
461+
462+ for (i1 = 0; i1 < len1; i1++) {
463+ diag = i1 * UNIT;
464+ above = (i1+1) * UNIT;
465+ for (i2 = 0; i2 < len2; i2++) {
466+ left = a[i2];
467+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
468+ if (cost < 0)
469+ cost = UNIT - cost;
470+ else
471+ cost = UNIT + cost;
472+ }
473+ diag_inc = diag + cost;
474+ left_inc = left + UNIT + *((uchar*)s1+i1);
475+ above_inc = above + UNIT + *((uchar*)s2+i2);
476+ a[i2] = above = left < above
477+ ? (left_inc < diag_inc ? left_inc : diag_inc)
478+ : (above_inc < diag_inc ? above_inc : diag_inc);
479+ diag = left;
480+ }
481+ }
482+
483+ return a[len2-1];
484+}