Updated to apply cleanly.
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
a7219d20
WD
7--- orig/generator.c 2005-02-03 02:04:20
8+++ generator.c 2005-02-03 02:11:10
79f132a1 9@@ -47,6 +47,7 @@ extern int size_only;
0808daa5 10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
79f132a1 17@@ -227,6 +228,88 @@ static void generate_and_send_sums(int f
4370504a 18 unmap_file(mapbuf);
58118c25
WD
19 }
20
fc82f579
WD
21+/* Try to find a filename in the same dir as "fname" with a similar name.
22+ *
23+ * TODO:
4370504a 24+ * - We should be using a cache of names for the current dir, not
fc82f579 25+ * re-reading the destination directory for every file.
4370504a
WD
26+ * - We must not return an rsync tempfile from the current transfer.
27+ * - If the highest-rated name is not a normal file, we should fall
fc82f579 28+ * back to the next highest-rated file.
4370504a 29+ * - We must not return a destination file that is being updated
fc82f579
WD
30+ * during the current transfer, even if we already processed it
31+ * (since the receiver may not be done with it yet).
4370504a 32+ * - We must weed out any names that a daemon's config has excluded.
fc82f579 33+ */
09fb2223 34+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
35+{
36+ DIR *d;
37+ struct dirent *di;
0f626034 38+ char *basename, *dirname, *slash;
47dd7a31 39+ char bestname[MAXPATHLEN];
4cd1daea 40+ int suf_len, basename_len;
054f3f90 41+ uint32 lowest_dist = 0x7FFFFFFF;
4cd1daea 42+ const char *suf;
47dd7a31 43+
0f626034
WD
44+ strlcpy(buf, fname, MAXPATHLEN);
45+ if ((slash = strrchr(buf, '/')) != NULL) {
46+ dirname = buf;
47+ *slash = '\0';
48+ basename = slash + 1;
49+ } else {
50+ basename = buf;
51+ dirname = ".";
52+ }
054f3f90 53+ basename_len = strlen(basename);
47dd7a31 54+
47dd7a31
WD
55+ if (!(d = opendir(dirname))) {
56+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
57+ return -1;
58+ }
e55625fb
WD
59+ if (slash)
60+ *slash = '/';
47dd7a31 61+
a27b0830 62+ suf = find_filename_suffix(basename, basename_len, &suf_len);
47dd7a31 63+
054f3f90 64+ bestname[0] = '\0';
47dd7a31 65+ while ((di = readdir(d)) != NULL) {
4cd1daea 66+ const char *dname_suf, *dname = d_name(di);
054f3f90 67+ uint32 dist;
4cd1daea 68+ int dname_len, dname_suf_len;
47dd7a31
WD
69+
70+ if (dname[0] == '.' && (dname[1] == '\0'
71+ || (dname[1] == '.' && dname[2] == '\0')))
72+ continue;
73+
a27b0830
WD
74+ dname_len = strlen(dname);
75+ dname_suf = find_filename_suffix(dname, dname_len, &dname_suf_len);
054f3f90
WD
76+
77+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
fc82f579 78+ /* Add some extra weight to how well the suffixes match. */
4cd1daea
WD
79+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
80+ if (verbose > 4) {
81+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
82+ dname, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 83+ }
fc82f579 84+ if (dist <= lowest_dist) {
8c5b8235 85+ strlcpy(bestname, dname, sizeof bestname);
054f3f90 86+ lowest_dist = dist;
47dd7a31
WD
87+ }
88+ }
89+ closedir(d);
90+
91+ /* Found a candidate. */
054f3f90 92+ if (bestname[0] != '\0') {
e55625fb 93+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
054f3f90
WD
94+ if (verbose > 2) {
95+ rprintf(FINFO, "fuzzy match %s->%s\n",
96+ safe_fname(fname), buf);
97+ }
58118c25 98+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
99+ }
100+ return -1;
101+}
58118c25
WD
102+
103
a7219d20
WD
104 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
105 * make sure it exists, and has the right permissions/timestamp info. For
106@@ -477,6 +560,15 @@ static void recv_generator(char *fname,
f48a237e
WD
107 } else
108 partialptr = NULL;
824abc86 109
09fb2223
WD
110+ if (statret == -1 && fuzzy_basis) {
111+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
112+ && S_ISREG(st.st_mode)) {
113+ statret = 0;
114+ fnamecmp = fnamecmpbuf;
0edc7d7f 115+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 116+ }
824abc86
WD
117+ }
118+
09fb2223
WD
119 if (statret == -1) {
120 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 121 return;
a7219d20 122@@ -505,6 +597,8 @@ static void recv_generator(char *fname,
241013b4 123
9cf86680 124 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
125 ;
126+ else if (fnamecmp_type == FNAMECMP_FUZZY)
127+ ;
128 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 129 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 130 set_perms(fname, file, &st, PERMS_REPORT);
a7219d20
WD
131@@ -579,8 +673,24 @@ notify_others:
132 write_int(f_out, ndx);
0f626034
WD
133 if (protocol_version >= 29 && inplace && !read_batch)
134 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
135- if (f_out_name >= 0)
136+ if (f_out_name >= 0) {
09fb2223 137 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 138+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
139+ uchar lenbuf[3], *lb = lenbuf;
140+ int len = strlen(fnamecmpbuf);
141+ if (len > 0x7F) {
142+#if MAXPATHLEN > 0x7FFF
143+ *lb++ = len / 0x10000 + 0x80;
144+ *lb++ = len / 0x100;
145+#else
146+ *lb++ = len / 0x100 + 0x80;
147+#endif
148+ }
149+ *lb = len;
150+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
151+ write_buf(f_out_name, fnamecmpbuf, len);
152+ }
0edc7d7f 153+ }
09fb2223 154
0edc7d7f
WD
155 if (dry_run || read_batch)
156 return;
a7219d20 157--- orig/main.c 2005-01-30 10:07:21
3eabe3a3 158+++ main.c 2005-01-14 18:33:15
a7219d20 159@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
160 extern int preserve_hard_links;
161 extern int protocol_version;
162 extern int recurse;
09fb2223 163+extern int fuzzy_basis;
495f1899
WD
164 extern int relative_paths;
165 extern int rsync_port;
166 extern int whole_file;
a7219d20 167@@ -491,7 +492,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
168 int pid;
169 int status = 0;
170 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
171- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
172+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
173+ && !dry_run;
495f1899 174
d5753a22
WD
175 /* The receiving side mustn't obey this, or an existing symlink that
176 * points to an identical file won't be replaced by the referent. */
a7219d20 177--- orig/options.c 2005-02-01 10:39:22
79f132a1 178+++ options.c 2005-01-28 19:31:20
be73a66e 179@@ -90,6 +90,7 @@ int copy_unsafe_links = 0;
f6c3b300 180 int size_only = 0;
0808daa5 181 int daemon_bwlimit = 0;
f6c3b300 182 int bwlimit = 0;
09fb2223 183+int fuzzy_basis = 0;
f6c3b300 184 size_t bwlimit_writemax = 0;
f6c3b300 185 int only_existing = 0;
be73a66e 186 int opt_ignore_existing = 0;
79f132a1 187@@ -303,6 +304,7 @@ void usage(enum logcode F)
f0533c4c 188 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
189 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
190 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 191+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c 192 rprintf(F," -z, --compress compress file data\n");
79f132a1
WD
193 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
194 rprintf(F," -f, --filter=RULE add a file-filtering RULE\n");
195@@ -408,6 +410,7 @@ static struct poptOption long_options[]
0808daa5
WD
196 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
197 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
198 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 199+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
200 /* TODO: Should this take an optional int giving the compression level? */
201 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 202 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
a7219d20 203@@ -1321,6 +1324,9 @@ void server_options(char **args,int *arg
4370504a
WD
204 if (!implied_dirs && !am_sender)
205 args[ac++] = "--no-implied-dirs";
7b675ff5 206
09fb2223 207+ if (fuzzy_basis && am_sender)
241013b4 208+ args[ac++] = "--fuzzy";
7b675ff5 209+
241013b4 210 *argc = ac;
f74d2272 211 return;
7b675ff5 212
a7219d20 213--- orig/receiver.c 2005-02-03 02:04:20
9cf86680 214+++ receiver.c 2005-01-15 21:21:02
a7219d20 215@@ -256,6 +256,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
216 }
217
218
219+static void read_gen_name(int fd, char *buf)
220+{
221+ int len = read_byte(fd);
222+ if (len & 0x80) {
223+#if MAXPATHLEN > 32767
224+ uchar lenbuf[2];
225+ read_buf(fd, (char *)lenbuf, 2);
226+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
227+#else
228+ len = (len & ~0x80) * 0x100 + read_byte(fd);
229+#endif
230+ }
231+ if (len >= MAXPATHLEN) {
232+ rprintf(FERROR, "bogus data on generator name pipe\n");
233+ exit_cleanup(RERR_PROTOCOL);
234+ }
235+
236+ read_sbuf(fd, buf, len);
237+}
238+
239+
240 static void discard_receive_data(int f_in, OFF_T length)
241 {
242 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
a7219d20 243@@ -395,6 +416,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 244 case FNAMECMP_BACKUP:
f48a237e 245 fnamecmp = get_backup_name(fname);
09fb2223 246 break;
0edc7d7f 247+ case FNAMECMP_FUZZY:
09fb2223
WD
248+ read_gen_name(f_in_name, fnamecmpbuf);
249+ fnamecmp = fnamecmpbuf;
250+ break;
251 default:
9cf86680
WD
252 if (j >= basis_dir_cnt) {
253 rprintf(FERROR,
a7219d20 254--- orig/rsync.h 2005-02-03 02:04:20
a27b0830 255+++ rsync.h 2005-01-19 18:36:47
a7219d20 256@@ -131,6 +131,7 @@
0edc7d7f
WD
257 #define FNAMECMP_FNAME 0x80
258 #define FNAMECMP_PARTIAL_DIR 0x81
259 #define FNAMECMP_BACKUP 0x82
260+#define FNAMECMP_FUZZY 0x83
09fb2223 261
a27b0830 262 /* For calling delete_file() */
be73a66e 263 #define DEL_DIR (1<<0)
a7219d20 264--- orig/rsync.yo 2005-02-01 10:39:23
79f132a1 265+++ rsync.yo 2005-01-28 19:31:36
a7219d20 266@@ -354,6 +354,7 @@ to the detailed description below for a
f0533c4c 267 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
268 --copy-dest=DIR ... and include copies of unchanged files
269 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 270+ --fuzzy find similar file for basis when no dest
f0533c4c 271 -z, --compress compress file data
79f132a1
WD
272 -C, --cvs-exclude auto-ignore files in the same way CVS does
273 -f, --filter=RULE add a file-filtering RULE
a7219d20
WD
274@@ -937,6 +938,14 @@ bf(--link-dest) from working properly fo
275 (or implied by bf(-a)). You can work-around this bug by avoiding the bf(-o) option
0f626034 276 when sending to an old rsync.
637c560e
WD
277
278+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
279+basis file for any destination file that is missing. The current algorithm
280+looks for a similarly-named file in the same directory as the destination
fc82f579
WD
281+file, and, if found, uses that to try to speed up the transfer. Note that
282+the use of the bf(--delete) option might get rid of any potential fuzzy-match
283+files, so either use bf(--delete-after) or filename exclusions if you need to
284+prevent this.
637c560e
WD
285+
286 dit(bf(-z, --compress)) With this option, rsync compresses any data from
287 the files that it sends to the destination machine. This
288 option is useful on slow connections. The compression method used is the
79f132a1 289--- orig/util.c 2005-01-28 19:08:20
a27b0830
WD
290+++ util.c 2005-01-19 17:30:51
291@@ -1213,3 +1213,108 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
292 return malloc(size * num);
293 return realloc(ptr, size * num);
294 }
295+
4cd1daea
WD
296+/* Take a filename and filename length and return the most significant
297+ * filename suffix we can find. This ignores suffixes such as "~",
298+ * ".bak", ".orig", ".~1~", etc. */
a27b0830 299+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
4cd1daea
WD
300+{
301+ const char *suf, *s;
4cd1daea 302+ BOOL had_tilde;
a27b0830 303+ int s_len;
4cd1daea
WD
304+
305+ /* One or more dots at the start aren't a suffix. */
306+ while (fn_len && *fn == '.') fn++, fn_len--;
307+
308+ /* Ignore the ~ in a "foo~" filename. */
309+ if (fn_len > 1 && fn[fn_len-1] == '~')
310+ fn_len--, had_tilde = True;
311+ else
312+ had_tilde = False;
313+
314+ /* Assume we don't find a suffix. */
315+ suf = "";
316+ *len_ptr = 0;
317+
318+ /* Find the last significant suffix. */
319+ for (s = fn + fn_len - 1; fn_len > 1; ) {
320+ while (*s != '.' && s != fn) s--;
321+ if (s == fn)
322+ break;
323+ s_len = fn_len - (s - fn);
324+ fn_len = s - fn;
325+ if (s_len == 3) {
326+ if (strcmp(s+1, "bak") == 0
327+ || strcmp(s+1, "old") == 0)
328+ continue;
329+ } else if (s_len == 4) {
330+ if (strcmp(s+1, "orig") == 0)
331+ continue;
332+ } else if (s_len > 2 && had_tilde
333+ && s[1] == '~' && isdigit(s[2]))
334+ continue;
335+ *len_ptr = s_len;
336+ suf = s;
337+ /* Determine if the suffix is all digits. */
338+ for (s++, s_len--; s_len > 0; s++, s_len--) {
339+ if (!isdigit(*s))
340+ return suf;
341+ }
342+ /* An all-digit suffix may not be that significant. */
343+ continue;
344+ }
345+
346+ return suf;
347+}
348+
054f3f90
WD
349+/* This is an implementation of the Levenshtein distance algorithm. It
350+ * was implemented to avoid needing a two-dimensional matrix (to save
351+ * memory). It was also tweaked to try to factor in the ASCII distance
352+ * between changed characters as a minor distance quantity. The normal
353+ * Levenshtein units of distance (each signifying a single change between
354+ * the two strings) are defined as a "UNIT". */
355+
356+#define UNIT (1 << 16)
357+
358+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
359+{
360+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
361+ int32 cost;
362+ int i1, i2;
363+
4cd1daea
WD
364+ if (!len1 || !len2) {
365+ if (!len1) {
366+ s1 = s2;
367+ len1 = len2;
368+ }
369+ for (i1 = 0, cost = 0; i1 < len1; i1++)
370+ cost += s1[i1];
371+ return (int32)len1 * UNIT + cost;
372+ }
054f3f90
WD
373+
374+ for (i2 = 0; i2 < len2; i2++)
375+ a[i2] = (i2+1) * UNIT;
376+
377+ for (i1 = 0; i1 < len1; i1++) {
378+ diag = i1 * UNIT;
379+ above = (i1+1) * UNIT;
380+ for (i2 = 0; i2 < len2; i2++) {
381+ left = a[i2];
382+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
383+ if (cost < 0)
384+ cost = UNIT - cost;
385+ else
386+ cost = UNIT + cost;
387+ }
388+ diag_inc = diag + cost;
389+ left_inc = left + UNIT + *((uchar*)s1+i1);
390+ above_inc = above + UNIT + *((uchar*)s2+i2);
391+ a[i2] = above = left < above
392+ ? (left_inc < diag_inc ? left_inc : diag_inc)
393+ : (above_inc < diag_inc ? above_inc : diag_inc);
394+ diag = left;
395+ }
396+ }
397+
398+ return a[len2-1];
399+}