- Fixed a couple bugs in find_filename_suffix().
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
36366d5c
WD
1This latest version has most of the TODO-list items solved. The one
2remaining issue is that we really need to handle all the files in a dir
3before we move on to the sub-directories, so this patch needs the sorting
4algorithm to change to put all the subdirs at the end of the list of a
5dir's contents.
241013b4 6
824abc86
WD
7Be sure to run "make proto" before "make".
8
36366d5c
WD
9--- orig/flist.c 2005-02-12 19:54:27
10+++ flist.c 2005-02-13 09:49:22
11@@ -330,7 +330,7 @@ void send_file_entry(struct file_struct
12 char fname[MAXPATHLEN];
13 int l1, l2;
14
15- if (f == -1)
16+ if (f < 0)
17 return;
18
19 if (!file) {
20@@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l
21 struct file_struct *file;
22 char fbuf[MAXPATHLEN];
23
24- if (!(file = make_file(fname, flist, ALL_FILTERS)))
25+ file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS);
26+ if (!file)
27 return;
28
29 maybe_emit_filelist_progress(flist);
30@@ -1311,7 +1312,7 @@ struct file_list *recv_file_list(int f)
31
32 clean_flist(flist, relative_paths, 1);
33
34- if (f != -1) {
35+ if (f >= 0) {
36 /* Now send the uid/gid list. This was introduced in
37 * protocol version 15 */
38 recv_uid_list(f, flist);
39@@ -1650,6 +1651,25 @@ static int is_backup_file(char *fn)
40 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
41 }
42
43+struct file_list *get_dirlist(const char *dirname, int ignore_excludes)
44+{
45+ struct file_list *dirlist;
46+ char dirbuf[MAXPATHLEN];
47+ int dlen;
48+ int save_recurse = recurse;
49+
50+ dlen = strlcpy(dirbuf, dirname, MAXPATHLEN);
51+ if (dlen >= MAXPATHLEN)
52+ return NULL;
53+
54+ dirlist = flist_new(WITHOUT_HLINK, "get_dirlist");
55+ recurse = 0;
56+ send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen);
57+ recurse = save_recurse;
58+
59+ return dirlist;
60+}
61+
62
63 /* This function is used to implement per-directory deletion, and
64 * is used by all the --delete-WHEN options. Note that the fbuf
b78a6aba 65--- orig/generator.c 2005-02-13 05:50:28
36366d5c 66+++ generator.c 2005-02-13 10:01:48
79f132a1 67@@ -47,6 +47,7 @@ extern int size_only;
0808daa5 68 extern OFF_T max_size;
58118c25
WD
69 extern int io_timeout;
70 extern int protocol_version;
09fb2223 71+extern int fuzzy_basis;
58118c25 72 extern int always_checksum;
b952a177 73 extern char *partial_dir;
0808daa5 74 extern char *basis_dir[];
36366d5c 75@@ -227,6 +228,47 @@ static void generate_and_send_sums(int f
4370504a 76 unmap_file(mapbuf);
58118c25
WD
77 }
78
36366d5c
WD
79+/* Try to find a filename in the same dir as "fname" with a similar name. */
80+static int find_fuzzy(struct file_struct *file, struct file_list *dirlist)
47dd7a31 81+{
36366d5c
WD
82+ int fname_len, fname_suf_len;
83+ const char *fname_suf, *fname = file->basename;
054f3f90 84+ uint32 lowest_dist = 0x7FFFFFFF;
36366d5c 85+ int j, lowest_j = -1;
47dd7a31 86+
36366d5c
WD
87+ fname_len = strlen(fname);
88+ fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len);
47dd7a31 89+
36366d5c
WD
90+ for (j = 0; j < dirlist->count; j++) {
91+ struct file_struct *fp = dirlist->files[j];
92+ const char *suf, *name;
93+ int len, suf_len;
054f3f90 94+ uint32 dist;
47dd7a31 95+
36366d5c 96+ if (!S_ISREG(fp->mode) || !fp->length)
47dd7a31
WD
97+ continue;
98+
36366d5c
WD
99+ name = fp->basename;
100+ len = strlen(name);
101+ suf = find_filename_suffix(name, len, &suf_len);
054f3f90 102+
36366d5c 103+ dist = fuzzy_distance(name, len, fname, fname_len);
fc82f579 104+ /* Add some extra weight to how well the suffixes match. */
36366d5c
WD
105+ dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len)
106+ * 10;
4cd1daea
WD
107+ if (verbose > 4) {
108+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
36366d5c 109+ name, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 110+ }
fc82f579 111+ if (dist <= lowest_dist) {
054f3f90 112+ lowest_dist = dist;
36366d5c 113+ lowest_j = j;
47dd7a31
WD
114+ }
115+ }
47dd7a31 116+
36366d5c 117+ return lowest_j;
47dd7a31 118+}
58118c25
WD
119+
120
a7219d20
WD
121 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
122 * make sure it exists, and has the right permissions/timestamp info. For
36366d5c
WD
123@@ -241,6 +283,8 @@ static void recv_generator(char *fname,
124 int f_out, int f_out_name)
125 {
126 static int missing_below = -1;
127+ static char *fuzzy_dirname = NULL;
128+ static struct file_list *fuzzy_dirlist = NULL;
129 int fd = -1, f_copy = -1;
130 STRUCT_STAT st, partial_st;
131 struct file_struct *back_file = NULL;
132@@ -275,6 +319,16 @@ static void recv_generator(char *fname,
133 statret = -1;
134 stat_errno = ENOENT;
135 } else {
136+ if (fuzzy_basis && S_ISREG(file->mode)) {
137+ char *dn = file->dirname ? file->dirname : ".";
138+ if (fuzzy_dirname != dn) {
139+ if (fuzzy_dirlist)
140+ flist_free(fuzzy_dirlist);
141+ fuzzy_dirname = dn;
142+ fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1);
143+ }
144+ }
145+
146 statret = link_stat(fname, &st,
147 keep_dirlinks && S_ISDIR(file->mode));
148 stat_errno = errno;
149@@ -492,6 +546,24 @@ static void recv_generator(char *fname,
f48a237e
WD
150 } else
151 partialptr = NULL;
824abc86 152
36366d5c
WD
153+ if (statret == -1 && fuzzy_basis && dry_run <= 1) {
154+ int j = find_fuzzy(file, fuzzy_dirlist);
155+ if (j >= 0) {
156+ struct file_struct *fp = fuzzy_dirlist->files[j];
157+ f_name_to(fp, fnamecmpbuf);
158+ if (verbose > 2) {
159+ rprintf(FINFO, "fuzzy match for %s: %s\n",
160+ safe_fname(fname), safe_fname(fnamecmpbuf));
161+ }
162+ st.st_mode = fp->mode;
163+ st.st_size = fp->length;
164+ st.st_mtime = fp->modtime;
09fb2223
WD
165+ statret = 0;
166+ fnamecmp = fnamecmpbuf;
0edc7d7f 167+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 168+ }
824abc86
WD
169+ }
170+
09fb2223
WD
171 if (statret == -1) {
172 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 173 return;
36366d5c 174@@ -520,6 +592,8 @@ static void recv_generator(char *fname,
241013b4 175
9cf86680 176 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
177 ;
178+ else if (fnamecmp_type == FNAMECMP_FUZZY)
179+ ;
180 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 181 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 182 set_perms(fname, file, &st, PERMS_REPORT);
36366d5c
WD
183@@ -540,6 +614,11 @@ prepare_to_open:
184 statret = -1;
185 goto notify_others;
186 }
187+ if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) {
188+ int j = flist_find(fuzzy_dirlist, file);
189+ if (j >= 0) /* don't use an updating file as fuzzy basis */
190+ fuzzy_dirlist->files[j]->length = 0;
191+ }
192
193 /* open the file */
194 fd = do_open(fnamecmp, O_RDONLY, 0);
195@@ -594,8 +673,24 @@ notify_others:
a7219d20 196 write_int(f_out, ndx);
0f626034
WD
197 if (protocol_version >= 29 && inplace && !read_batch)
198 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
199- if (f_out_name >= 0)
200+ if (f_out_name >= 0) {
09fb2223 201 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 202+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
203+ uchar lenbuf[3], *lb = lenbuf;
204+ int len = strlen(fnamecmpbuf);
205+ if (len > 0x7F) {
206+#if MAXPATHLEN > 0x7FFF
207+ *lb++ = len / 0x10000 + 0x80;
208+ *lb++ = len / 0x100;
209+#else
210+ *lb++ = len / 0x100 + 0x80;
211+#endif
212+ }
213+ *lb = len;
214+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
215+ write_buf(f_out_name, fnamecmpbuf, len);
216+ }
0edc7d7f 217+ }
09fb2223 218
0edc7d7f
WD
219 if (dry_run || read_batch)
220 return;
b78a6aba 221--- orig/main.c 2005-02-07 20:41:56
3eabe3a3 222+++ main.c 2005-01-14 18:33:15
b78a6aba 223@@ -44,6 +44,7 @@ extern int keep_dirlinks;
495f1899
WD
224 extern int preserve_hard_links;
225 extern int protocol_version;
226 extern int recurse;
09fb2223 227+extern int fuzzy_basis;
495f1899
WD
228 extern int relative_paths;
229 extern int rsync_port;
230 extern int whole_file;
b78a6aba 231@@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
232 int pid;
233 int status = 0;
234 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
235- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
236+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
237+ && !dry_run;
495f1899 238
d5753a22
WD
239 /* The receiving side mustn't obey this, or an existing symlink that
240 * points to an identical file won't be replaced by the referent. */
b78a6aba
WD
241--- orig/options.c 2005-02-13 05:50:28
242+++ options.c 2005-02-13 06:56:06
243@@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
f6c3b300 244 int size_only = 0;
0808daa5 245 int daemon_bwlimit = 0;
f6c3b300 246 int bwlimit = 0;
09fb2223 247+int fuzzy_basis = 0;
f6c3b300 248 size_t bwlimit_writemax = 0;
f6c3b300 249 int only_existing = 0;
be73a66e 250 int opt_ignore_existing = 0;
b78a6aba
WD
251@@ -302,6 +303,7 @@ void usage(enum logcode F)
252 rprintf(F," --size-only skip files that match in size\n");
253 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
254 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
255+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c 256 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
257 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
258 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
b78a6aba 259@@ -411,6 +413,7 @@ static struct poptOption long_options[]
0808daa5
WD
260 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
261 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
262 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 263+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
264 /* TODO: Should this take an optional int giving the compression level? */
265 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 266 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
b78a6aba 267@@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg
4370504a
WD
268 if (!implied_dirs && !am_sender)
269 args[ac++] = "--no-implied-dirs";
7b675ff5 270
09fb2223 271+ if (fuzzy_basis && am_sender)
241013b4 272+ args[ac++] = "--fuzzy";
7b675ff5 273+
241013b4 274 *argc = ac;
f74d2272 275 return;
7b675ff5 276
b78a6aba 277--- orig/receiver.c 2005-02-11 10:53:14
9cf86680 278+++ receiver.c 2005-01-15 21:21:02
b78a6aba 279@@ -257,6 +257,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
280 }
281
282
283+static void read_gen_name(int fd, char *buf)
284+{
285+ int len = read_byte(fd);
286+ if (len & 0x80) {
287+#if MAXPATHLEN > 32767
288+ uchar lenbuf[2];
289+ read_buf(fd, (char *)lenbuf, 2);
290+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
291+#else
292+ len = (len & ~0x80) * 0x100 + read_byte(fd);
293+#endif
294+ }
295+ if (len >= MAXPATHLEN) {
296+ rprintf(FERROR, "bogus data on generator name pipe\n");
297+ exit_cleanup(RERR_PROTOCOL);
298+ }
299+
300+ read_sbuf(fd, buf, len);
301+}
302+
303+
304 static void discard_receive_data(int f_in, OFF_T length)
305 {
306 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
b78a6aba 307@@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 308 case FNAMECMP_BACKUP:
f48a237e 309 fnamecmp = get_backup_name(fname);
09fb2223 310 break;
0edc7d7f 311+ case FNAMECMP_FUZZY:
09fb2223
WD
312+ read_gen_name(f_in_name, fnamecmpbuf);
313+ fnamecmp = fnamecmpbuf;
314+ break;
315 default:
9cf86680
WD
316 if (j >= basis_dir_cnt) {
317 rprintf(FERROR,
b78a6aba 318--- orig/rsync.h 2005-02-12 19:54:27
a27b0830 319+++ rsync.h 2005-01-19 18:36:47
b78a6aba 320@@ -127,6 +127,7 @@
0edc7d7f
WD
321 #define FNAMECMP_FNAME 0x80
322 #define FNAMECMP_PARTIAL_DIR 0x81
323 #define FNAMECMP_BACKUP 0x82
324+#define FNAMECMP_FUZZY 0x83
09fb2223 325
a27b0830 326 /* For calling delete_file() */
be73a66e 327 #define DEL_DIR (1<<0)
b78a6aba
WD
328--- orig/rsync.yo 2005-02-13 05:50:28
329+++ rsync.yo 2005-02-13 06:56:46
330@@ -351,6 +351,7 @@ to the detailed description below for a
331 --size-only skip files that match in size
332 --modify-window=NUM compare mod-times with reduced accuracy
333 -T --temp-dir=DIR create temporary files in directory DIR
334+ --fuzzy find similar file for basis when no dest
f0533c4c 335 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
336 --copy-dest=DIR ... and include copies of unchanged files
337 --link-dest=DIR hardlink to files in DIR when unchanged
b78a6aba
WD
338@@ -909,6 +910,14 @@ scratch directory when creating temporar
339 transferred on the receiving side. The default behavior is to create
340 the temporary files in the receiving directory.
637c560e
WD
341
342+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
343+basis file for any destination file that is missing. The current algorithm
344+looks for a similarly-named file in the same directory as the destination
fc82f579
WD
345+file, and, if found, uses that to try to speed up the transfer. Note that
346+the use of the --delete option might get rid of any potential fuzzy-match
347+files, so either use --delete-after or filename exclusions if you need to
348+prevent this.
637c560e 349+
b78a6aba
WD
350 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
351 the destination machine as an additional hierarchy to compare destination
352 files against doing transfers (if the files are missing in the destination
353--- orig/util.c 2005-02-11 10:53:15
36366d5c
WD
354+++ util.c 2005-02-13 09:44:25
355@@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
356 return malloc(size * num);
357 return realloc(ptr, size * num);
358 }
359+
4cd1daea
WD
360+/* Take a filename and filename length and return the most significant
361+ * filename suffix we can find. This ignores suffixes such as "~",
362+ * ".bak", ".orig", ".~1~", etc. */
a27b0830 363+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
4cd1daea
WD
364+{
365+ const char *suf, *s;
4cd1daea 366+ BOOL had_tilde;
a27b0830 367+ int s_len;
4cd1daea
WD
368+
369+ /* One or more dots at the start aren't a suffix. */
370+ while (fn_len && *fn == '.') fn++, fn_len--;
371+
372+ /* Ignore the ~ in a "foo~" filename. */
373+ if (fn_len > 1 && fn[fn_len-1] == '~')
374+ fn_len--, had_tilde = True;
375+ else
376+ had_tilde = False;
377+
378+ /* Assume we don't find an suffix. */
379+ suf = "";
380+ *len_ptr = 0;
381+
382+ /* Find the last significant suffix. */
36366d5c
WD
383+ for (s = fn + fn_len; fn_len > 1; ) {
384+ while (*--s != '.' && s != fn) {}
4cd1daea
WD
385+ if (s == fn)
386+ break;
387+ s_len = fn_len - (s - fn);
388+ fn_len = s - fn;
389+ if (s_len == 3) {
390+ if (strcmp(s+1, "bak") == 0
391+ || strcmp(s+1, "old") == 0)
392+ continue;
393+ } else if (s_len == 4) {
394+ if (strcmp(s+1, "orig") == 0)
395+ continue;
396+ } else if (s_len > 2 && had_tilde
397+ && s[1] == '~' && isdigit(s[2]))
398+ continue;
399+ *len_ptr = s_len;
400+ suf = s;
36366d5c
WD
401+ if (s_len == 1)
402+ break;
4cd1daea
WD
403+ /* Determine if the suffix is all digits. */
404+ for (s++, s_len--; s_len > 0; s++, s_len--) {
405+ if (!isdigit(*s))
406+ return suf;
407+ }
408+ /* An all-digit suffix may not be that signficant. */
36366d5c 409+ s = suf;
4cd1daea
WD
410+ }
411+
412+ return suf;
413+}
414+
054f3f90
WD
415+/* This is an implementation of the Levenshtein distance algorithm. It
416+ * was implemented to avoid needing a two-dimensional matrix (to save
417+ * memory). It was also tweaked to try to factor in the ASCII distance
418+ * between changed characters as a minor distance quantity. The normal
419+ * Levenshtein units of distance (each signifying a single change between
420+ * the two strings) are defined as a "UNIT". */
421+
422+#define UNIT (1 << 16)
423+
424+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
425+{
426+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
427+ int32 cost;
428+ int i1, i2;
429+
4cd1daea
WD
430+ if (!len1 || !len2) {
431+ if (!len1) {
432+ s1 = s2;
433+ len1 = len2;
434+ }
435+ for (i1 = 0, cost = 0; i1 < len1; i1++)
436+ cost += s1[i1];
437+ return (int32)len1 * UNIT + cost;
438+ }
054f3f90
WD
439+
440+ for (i2 = 0; i2 < len2; i2++)
441+ a[i2] = (i2+1) * UNIT;
442+
443+ for (i1 = 0; i1 < len1; i1++) {
444+ diag = i1 * UNIT;
445+ above = (i1+1) * UNIT;
446+ for (i2 = 0; i2 < len2; i2++) {
447+ left = a[i2];
448+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
449+ if (cost < 0)
450+ cost = UNIT - cost;
451+ else
452+ cost = UNIT + cost;
453+ }
454+ diag_inc = diag + cost;
455+ left_inc = left + UNIT + *((uchar*)s1+i1);
456+ above_inc = above + UNIT + *((uchar*)s2+i2);
457+ a[i2] = above = left < above
458+ ? (left_inc < diag_inc ? left_inc : diag_inc)
459+ : (above_inc < diag_inc ? above_inc : diag_inc);
460+ diag = left;
461+ }
462+ }
463+
464+ return a[len2-1];
465+}