- Improved the comment in front of send_file_list().
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
4370504a
WD
1Depends-On-Patch: delete-during.diff
2
824abc86 3The changes to generator.c were greatly simplified, making the patch
8c5b8235 4easier to maintain and fixing the failing test in the testsuite.
0808daa5 5Lightly tested.
241013b4 6
824abc86
WD
7Be sure to run "make proto" before "make".
8
4370504a 9--- orig/generator.c 2005-01-18 23:14:23
a27b0830 10+++ generator.c 2005-01-19 18:39:15
4370504a 11@@ -46,6 +46,7 @@ extern int size_only;
0808daa5 12 extern OFF_T max_size;
58118c25
WD
13 extern int io_timeout;
14 extern int protocol_version;
09fb2223 15+extern int fuzzy_basis;
58118c25 16 extern int always_checksum;
b952a177 17 extern char *partial_dir;
0808daa5 18 extern char *basis_dir[];
a27b0830 19@@ -226,6 +227,88 @@ static void generate_and_send_sums(int f
4370504a 20 unmap_file(mapbuf);
58118c25
WD
21 }
22
fc82f579
WD
23+/* Try to find a filename in the same dir as "fname" with a similar name.
24+ *
25+ * TODO:
4370504a 26+ * - We should be using a cache of names for the current dir, not
fc82f579 27+ * re-reading the destination directory for every file.
4370504a
WD
28+ * - We must not return an rsync tempfile from the current transfer.
29+ * - If the highest-rated name is not a normal file, we should fall-
fc82f579 30+ * back to the next highest-rated file.
4370504a 31+ * - We must not return a destination file that is being updated
fc82f579
WD
32+ * during the current transfer, even if we already processed it
33+ * (since the receiver may not be done with it yet).
4370504a 34+ * - We must weed out any names that a daemon's config has excluded.
fc82f579 35+ */
09fb2223 36+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
37+{
38+ DIR *d;
39+ struct dirent *di;
0f626034 40+ char *basename, *dirname, *slash;
47dd7a31 41+ char bestname[MAXPATHLEN];
4cd1daea 42+ int suf_len, basename_len;
054f3f90 43+ uint32 lowest_dist = 0x7FFFFFFF;
4cd1daea 44+ const char *suf;
47dd7a31 45+
0f626034
WD
46+ strlcpy(buf, fname, MAXPATHLEN);
47+ if ((slash = strrchr(buf, '/')) != NULL) {
48+ dirname = buf;
49+ *slash = '\0';
50+ basename = slash + 1;
51+ } else {
52+ basename = buf;
53+ dirname = ".";
54+ }
054f3f90 55+ basename_len = strlen(basename);
47dd7a31 56+
47dd7a31
WD
57+ if (!(d = opendir(dirname))) {
58+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
59+ return -1;
60+ }
e55625fb
WD
61+ if (slash)
62+ *slash = '/';
47dd7a31 63+
a27b0830 64+ suf = find_filename_suffix(basename, basename_len, &suf_len);
47dd7a31 65+
054f3f90 66+ bestname[0] = '\0';
47dd7a31 67+ while ((di = readdir(d)) != NULL) {
4cd1daea 68+ const char *dname_suf, *dname = d_name(di);
054f3f90 69+ uint32 dist;
4cd1daea 70+ int dname_len, dname_suf_len;
47dd7a31
WD
71+
72+ if (dname[0] == '.' && (dname[1] == '\0'
73+ || (dname[1] == '.' && dname[2] == '\0')))
74+ continue;
75+
a27b0830
WD
76+ dname_len = strlen(dname);
77+ dname_suf = find_filename_suffix(dname, dname_len, &dname_suf_len);
054f3f90
WD
78+
79+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
fc82f579 80+ /* Add some extra weight to how well the suffixes match. */
4cd1daea
WD
81+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
82+ if (verbose > 4) {
83+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
84+ dname, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 85+ }
fc82f579 86+ if (dist <= lowest_dist) {
8c5b8235 87+ strlcpy(bestname, dname, sizeof bestname);
054f3f90 88+ lowest_dist = dist;
47dd7a31
WD
89+ }
90+ }
91+ closedir(d);
92+
93+ /* Found a candidate. */
054f3f90 94+ if (bestname[0] != '\0') {
e55625fb 95+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
054f3f90
WD
96+ if (verbose > 2) {
97+ rprintf(FINFO, "fuzzy match %s->%s\n",
98+ safe_fname(fname), buf);
99+ }
58118c25 100+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
101+ }
102+ return -1;
103+}
58118c25
WD
104+
105
106 /*
107 * Acts on file number @p i from @p flist, whose name is @p fname.
a27b0830 108@@ -478,6 +561,15 @@ static void recv_generator(char *fname,
f48a237e
WD
109 } else
110 partialptr = NULL;
824abc86 111
09fb2223
WD
112+ if (statret == -1 && fuzzy_basis) {
113+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
114+ && S_ISREG(st.st_mode)) {
115+ statret = 0;
116+ fnamecmp = fnamecmpbuf;
0edc7d7f 117+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 118+ }
824abc86
WD
119+ }
120+
09fb2223
WD
121 if (statret == -1) {
122 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 123 return;
a27b0830 124@@ -506,6 +598,8 @@ static void recv_generator(char *fname,
241013b4 125
9cf86680 126 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
127 ;
128+ else if (fnamecmp_type == FNAMECMP_FUZZY)
129+ ;
130 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 131 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 132 set_perms(fname, file, &st, PERMS_REPORT);
a27b0830 133@@ -580,8 +674,24 @@ notify_others:
0edc7d7f 134 write_int(f_out, i);
0f626034
WD
135 if (protocol_version >= 29 && inplace && !read_batch)
136 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
137- if (f_out_name >= 0)
138+ if (f_out_name >= 0) {
09fb2223 139 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 140+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
141+ uchar lenbuf[3], *lb = lenbuf;
142+ int len = strlen(fnamecmpbuf);
143+ if (len > 0x7F) {
144+#if MAXPATHLEN > 0x7FFF
145+ *lb++ = len / 0x10000 + 0x80;
146+ *lb++ = len / 0x100;
147+#else
148+ *lb++ = len / 0x100 + 0x80;
149+#endif
150+ }
151+ *lb = len;
152+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
153+ write_buf(f_out_name, fnamecmpbuf, len);
154+ }
0edc7d7f 155+ }
09fb2223 156
0edc7d7f
WD
157 if (dry_run || read_batch)
158 return;
4370504a 159--- orig/main.c 2005-01-18 21:56:05
3eabe3a3 160+++ main.c 2005-01-14 18:33:15
4370504a 161@@ -49,6 +49,7 @@ extern int keep_dirlinks;
495f1899
WD
162 extern int preserve_hard_links;
163 extern int protocol_version;
164 extern int recurse;
09fb2223 165+extern int fuzzy_basis;
495f1899
WD
166 extern int relative_paths;
167 extern int rsync_port;
168 extern int whole_file;
4370504a 169@@ -465,7 +466,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
170 int pid;
171 int status = 0;
172 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
173- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
174+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
175+ && !dry_run;
495f1899 176
d5753a22
WD
177 /* The receiving side mustn't obey this, or an existing symlink that
178 * points to an identical file won't be replaced by the referent. */
4370504a 179--- orig/options.c 2005-01-19 01:07:34
9cf86680 180+++ options.c 2005-01-15 21:08:13
4370504a 181@@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
f6c3b300 182 int size_only = 0;
0808daa5 183 int daemon_bwlimit = 0;
f6c3b300 184 int bwlimit = 0;
09fb2223 185+int fuzzy_basis = 0;
f6c3b300
WD
186 size_t bwlimit_writemax = 0;
187 int delete_after = 0;
188 int only_existing = 0;
4370504a 189@@ -292,6 +293,7 @@ void usage(enum logcode F)
f0533c4c 190 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
191 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
192 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 193+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
194 rprintf(F," -P equivalent to --partial --progress\n");
195 rprintf(F," -z, --compress compress file data\n");
196 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
4370504a 197@@ -390,6 +392,7 @@ static struct poptOption long_options[]
0808daa5
WD
198 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
199 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
200 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 201+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
202 /* TODO: Should this take an optional int giving the compression level? */
203 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 204 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
4370504a
WD
205@@ -1247,6 +1250,9 @@ void server_options(char **args,int *arg
206 if (!implied_dirs && !am_sender)
207 args[ac++] = "--no-implied-dirs";
7b675ff5 208
09fb2223 209+ if (fuzzy_basis && am_sender)
241013b4 210+ args[ac++] = "--fuzzy";
7b675ff5 211+
241013b4 212 *argc = ac;
f74d2272 213 return;
7b675ff5 214
4370504a 215--- orig/receiver.c 2005-01-18 22:47:38
9cf86680 216+++ receiver.c 2005-01-15 21:21:02
4370504a 217@@ -234,6 +234,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
218 }
219
220
221+static void read_gen_name(int fd, char *buf)
222+{
223+ int len = read_byte(fd);
224+ if (len & 0x80) {
225+#if MAXPATHLEN > 32767
226+ uchar lenbuf[2];
227+ read_buf(fd, (char *)lenbuf, 2);
228+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
229+#else
230+ len = (len & ~0x80) * 0x100 + read_byte(fd);
231+#endif
232+ }
233+ if (len >= MAXPATHLEN) {
234+ rprintf(FERROR, "bogus data on generator name pipe\n");
235+ exit_cleanup(RERR_PROTOCOL);
236+ }
237+
238+ read_sbuf(fd, buf, len);
239+}
240+
241+
242 static void discard_receive_data(int f_in, OFF_T length)
243 {
244 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
4370504a 245@@ -364,6 +385,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 246 case FNAMECMP_BACKUP:
f48a237e 247 fnamecmp = get_backup_name(fname);
09fb2223 248 break;
0edc7d7f 249+ case FNAMECMP_FUZZY:
09fb2223
WD
250+ read_gen_name(f_in_name, fnamecmpbuf);
251+ fnamecmp = fnamecmpbuf;
252+ break;
253 default:
9cf86680
WD
254 if (j >= basis_dir_cnt) {
255 rprintf(FERROR,
a27b0830
WD
256--- orig/rsync.h 2005-01-19 20:11:10
257+++ rsync.h 2005-01-19 18:36:47
0f626034 258@@ -128,6 +128,7 @@
0edc7d7f
WD
259 #define FNAMECMP_FNAME 0x80
260 #define FNAMECMP_PARTIAL_DIR 0x81
261 #define FNAMECMP_BACKUP 0x82
262+#define FNAMECMP_FUZZY 0x83
09fb2223 263
a27b0830
WD
264 /* For calling delete_file() */
265 #define DEL_DIR (1<<0)
4370504a 266--- orig/rsync.yo 2005-01-19 01:05:05
637c560e 267+++ rsync.yo 2005-01-15 21:48:52
4370504a 268@@ -359,6 +359,7 @@ verb(
f0533c4c 269 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
270 --copy-dest=DIR ... and include copies of unchanged files
271 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 272+ --fuzzy find similar file for basis when no dest
f0533c4c
WD
273 -P equivalent to --partial --progress
274 -z, --compress compress file data
275 -C, --cvs-exclude auto ignore files in the same way CVS does
4370504a 276@@ -888,6 +889,14 @@ Note that rsync versions prior to 2.6.1
0f626034
WD
277 (or implied by -a). You can work-around this bug by avoiding the -o option
278 when sending to an old rsync.
637c560e
WD
279
280+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
281+basis file for any destination file that is missing. The current algorithm
282+looks for a similarly-named file in the same directory as the destination
fc82f579
WD
283+file, and, if found, uses that to try to speed up the transfer. Note that
284+the use of the --delete option might get rid of any potential fuzzy-match
285+files, so either use --delete-after or filename exclusions if you need to
286+prevent this.
637c560e
WD
287+
288 dit(bf(-z, --compress)) With this option, rsync compresses any data from
289 the files that it sends to the destination machine. This
290 option is useful on slow connections. The compression method used is the
a27b0830
WD
291--- orig/util.c 2005-01-19 20:11:10
292+++ util.c 2005-01-19 17:30:51
293@@ -1213,3 +1213,108 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
294 return malloc(size * num);
295 return realloc(ptr, size * num);
296 }
297+
4cd1daea
WD
298+/* Take a filename and filename length and return the most significant
299+ * filename suffix we can find. This ignores suffixes such as "~",
300+ * ".bak", ".orig", ".~1~", etc. */
a27b0830 301+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
4cd1daea
WD
302+{
303+ const char *suf, *s;
4cd1daea 304+ BOOL had_tilde;
a27b0830 305+ int s_len;
4cd1daea
WD
306+
307+ /* One or more dots at the start aren't a suffix. */
308+ while (fn_len && *fn == '.') fn++, fn_len--;
309+
310+ /* Ignore the ~ in a "foo~" filename. */
311+ if (fn_len > 1 && fn[fn_len-1] == '~')
312+ fn_len--, had_tilde = True;
313+ else
314+ had_tilde = False;
315+
316+ /* Assume we don't find an suffix. */
317+ suf = "";
318+ *len_ptr = 0;
319+
320+ /* Find the last significant suffix. */
321+ for (s = fn + fn_len - 1; fn_len > 1; ) {
322+ while (*s != '.' && s != fn) s--;
323+ if (s == fn)
324+ break;
325+ s_len = fn_len - (s - fn);
326+ fn_len = s - fn;
327+ if (s_len == 3) {
328+ if (strcmp(s+1, "bak") == 0
329+ || strcmp(s+1, "old") == 0)
330+ continue;
331+ } else if (s_len == 4) {
332+ if (strcmp(s+1, "orig") == 0)
333+ continue;
334+ } else if (s_len > 2 && had_tilde
335+ && s[1] == '~' && isdigit(s[2]))
336+ continue;
337+ *len_ptr = s_len;
338+ suf = s;
339+ /* Determine if the suffix is all digits. */
340+ for (s++, s_len--; s_len > 0; s++, s_len--) {
341+ if (!isdigit(*s))
342+ return suf;
343+ }
344+ /* An all-digit suffix may not be that signficant. */
345+ continue;
346+ }
347+
348+ return suf;
349+}
350+
054f3f90
WD
351+/* This is an implementation of the Levenshtein distance algorithm. It
352+ * was implemented to avoid needing a two-dimensional matrix (to save
353+ * memory). It was also tweaked to try to factor in the ASCII distance
354+ * between changed characters as a minor distance quantity. The normal
355+ * Levenshtein units of distance (each signifying a single change between
356+ * the two strings) are defined as a "UNIT". */
357+
358+#define UNIT (1 << 16)
359+
360+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
361+{
362+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
363+ int32 cost;
364+ int i1, i2;
365+
4cd1daea
WD
366+ if (!len1 || !len2) {
367+ if (!len1) {
368+ s1 = s2;
369+ len1 = len2;
370+ }
371+ for (i1 = 0, cost = 0; i1 < len1; i1++)
372+ cost += s1[i1];
373+ return (int32)len1 * UNIT + cost;
374+ }
054f3f90
WD
375+
376+ for (i2 = 0; i2 < len2; i2++)
377+ a[i2] = (i2+1) * UNIT;
378+
379+ for (i1 = 0; i1 < len1; i1++) {
380+ diag = i1 * UNIT;
381+ above = (i1+1) * UNIT;
382+ for (i2 = 0; i2 < len2; i2++) {
383+ left = a[i2];
384+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
385+ if (cost < 0)
386+ cost = UNIT - cost;
387+ else
388+ cost = UNIT + cost;
389+ }
390+ diag_inc = diag + cost;
391+ left_inc = left + UNIT + *((uchar*)s1+i1);
392+ above_inc = above + UNIT + *((uchar*)s2+i2);
393+ a[i2] = above = left < above
394+ ? (left_inc < diag_inc ? left_inc : diag_inc)
395+ : (above_inc < diag_inc ? above_inc : diag_inc);
396+ diag = left;
397+ }
398+ }
399+
400+ return a[len2-1];
401+}