Fixing failed hunks caused by recent removal of delete_one().
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
4370504a
WD
1Depends-On-Patch: delete-during.diff
2
824abc86 3The changes to generator.c were greatly simplified, making the patch
8c5b8235 4easier to maintain and fixing the failing test in the testsuite.
0808daa5 5Lightly tested.
241013b4 6
824abc86
WD
7Be sure to run "make proto" before "make".
8
4370504a 9--- orig/generator.c 2005-01-18 23:14:23
4cd1daea 10+++ generator.c 2005-01-18 19:25:55
4370504a 11@@ -46,6 +46,7 @@ extern int size_only;
0808daa5 12 extern OFF_T max_size;
58118c25
WD
13 extern int io_timeout;
14 extern int protocol_version;
09fb2223 15+extern int fuzzy_basis;
58118c25 16 extern int always_checksum;
b952a177 17 extern char *partial_dir;
0808daa5 18 extern char *basis_dir[];
4370504a
WD
19@@ -243,6 +244,89 @@ static void generate_and_send_sums(int f
20 unmap_file(mapbuf);
58118c25
WD
21 }
22
fc82f579
WD
23+/* Try to find a filename in the same dir as "fname" with a similar name.
24+ *
25+ * TODO:
4370504a 26+ * - We should be using a cache of names for the current dir, not
fc82f579 27+ * re-reading the destination directory for every file.
4370504a
WD
28+ * - We must not return an rsync tempfile from the current transfer.
29+ * - If the highest-rated name is not a normal file, we should fall
fc82f579 30+ * back to the next highest-rated file.
4370504a 31+ * - We must not return a destination file that is being updated
fc82f579
WD
32+ * during the current transfer, even if we already processed it
33+ * (since the receiver may not be done with it yet).
4370504a 34+ * - We must weed out any names that a daemon's config has excluded.
fc82f579 35+ */
09fb2223 36+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
37+{
38+ DIR *d;
39+ struct dirent *di;
0f626034 40+ char *basename, *dirname, *slash;
47dd7a31 41+ char bestname[MAXPATHLEN];
4cd1daea 42+ int suf_len, basename_len;
054f3f90 43+ uint32 lowest_dist = 0x7FFFFFFF;
4cd1daea 44+ const char *suf;
47dd7a31 45+
0f626034
WD
46+ strlcpy(buf, fname, MAXPATHLEN);
47+ if ((slash = strrchr(buf, '/')) != NULL) {
48+ dirname = buf;
49+ *slash = '\0';
50+ basename = slash + 1;
51+ } else {
52+ basename = buf;
53+ dirname = ".";
54+ }
054f3f90 55+ basename_len = strlen(basename);
47dd7a31 56+
47dd7a31
WD
57+ if (!(d = opendir(dirname))) {
58+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
59+ return -1;
60+ }
e55625fb
WD
61+ if (slash)
62+ *slash = '/';
47dd7a31 63+
4cd1daea
WD
64+ suf_len = basename_len;
65+ suf = find_filename_suffix(basename, &suf_len);
47dd7a31 66+
054f3f90 67+ bestname[0] = '\0';
47dd7a31 68+ while ((di = readdir(d)) != NULL) {
4cd1daea 69+ const char *dname_suf, *dname = d_name(di);
054f3f90 70+ uint32 dist;
4cd1daea 71+ int dname_len, dname_suf_len;
47dd7a31
WD
72+
73+ if (dname[0] == '.' && (dname[1] == '\0'
74+ || (dname[1] == '.' && dname[2] == '\0')))
75+ continue;
76+
4cd1daea
WD
77+ dname_len = dname_suf_len = strlen(dname);
78+ dname_suf = find_filename_suffix(dname, &dname_suf_len);
054f3f90
WD
79+
80+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
fc82f579 81+ /* Add some extra weight to how well the suffixes match. */
4cd1daea
WD
82+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
83+ if (verbose > 4) {
84+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
85+ dname, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 86+ }
fc82f579 87+ if (dist <= lowest_dist) {
8c5b8235 88+ strlcpy(bestname, dname, sizeof bestname);
054f3f90 89+ lowest_dist = dist;
47dd7a31
WD
90+ }
91+ }
92+ closedir(d);
93+
94+ /* Found a candidate. */
054f3f90 95+ if (bestname[0] != '\0') {
e55625fb 96+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
054f3f90
WD
97+ if (verbose > 2) {
98+ rprintf(FINFO, "fuzzy match %s->%s\n",
99+ safe_fname(fname), buf);
100+ }
58118c25 101+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
102+ }
103+ return -1;
104+}
58118c25
WD
105+
106
107 /*
108 * Acts on file number @p i from @p flist, whose name is @p fname.
4370504a 109@@ -498,6 +582,15 @@ static void recv_generator(char *fname,
f48a237e
WD
110 } else
111 partialptr = NULL;
824abc86 112
09fb2223
WD
113+ if (statret == -1 && fuzzy_basis) {
114+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
115+ && S_ISREG(st.st_mode)) {
116+ statret = 0;
117+ fnamecmp = fnamecmpbuf;
0edc7d7f 118+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 119+ }
824abc86
WD
120+ }
121+
09fb2223
WD
122 if (statret == -1) {
123 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 124 return;
4370504a 125@@ -526,6 +619,8 @@ static void recv_generator(char *fname,
241013b4 126
9cf86680 127 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
128 ;
129+ else if (fnamecmp_type == FNAMECMP_FUZZY)
130+ ;
131 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 132 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 133 set_perms(fname, file, &st, PERMS_REPORT);
4370504a 134@@ -600,8 +695,24 @@ notify_others:
0edc7d7f 135 write_int(f_out, i);
0f626034
WD
136 if (protocol_version >= 29 && inplace && !read_batch)
137 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
138- if (f_out_name >= 0)
139+ if (f_out_name >= 0) {
09fb2223 140 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 141+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
142+ uchar lenbuf[3], *lb = lenbuf;
143+ int len = strlen(fnamecmpbuf);
144+ if (len > 0x7F) {
145+#if MAXPATHLEN > 0x7FFF
146+ *lb++ = len / 0x10000 + 0x80;
147+ *lb++ = len / 0x100;
148+#else
149+ *lb++ = len / 0x100 + 0x80;
150+#endif
151+ }
152+ *lb = len;
153+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
154+ write_buf(f_out_name, fnamecmpbuf, len);
155+ }
0edc7d7f 156+ }
09fb2223 157
0edc7d7f
WD
158 if (dry_run || read_batch)
159 return;
4370504a 160--- orig/main.c 2005-01-18 21:56:05
3eabe3a3 161+++ main.c 2005-01-14 18:33:15
4370504a 162@@ -49,6 +49,7 @@ extern int keep_dirlinks;
495f1899
WD
163 extern int preserve_hard_links;
164 extern int protocol_version;
165 extern int recurse;
09fb2223 166+extern int fuzzy_basis;
495f1899
WD
167 extern int relative_paths;
168 extern int rsync_port;
169 extern int whole_file;
4370504a 170@@ -465,7 +466,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
171 int pid;
172 int status = 0;
173 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
174- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
175+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
176+ && !dry_run;
495f1899 177
d5753a22
WD
178 /* The receiving side mustn't obey this, or an existing symlink that
179 * points to an identical file won't be replaced by the referent. */
4370504a 180--- orig/options.c 2005-01-19 01:07:34
9cf86680 181+++ options.c 2005-01-15 21:08:13
4370504a 182@@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
f6c3b300 183 int size_only = 0;
0808daa5 184 int daemon_bwlimit = 0;
f6c3b300 185 int bwlimit = 0;
09fb2223 186+int fuzzy_basis = 0;
f6c3b300
WD
187 size_t bwlimit_writemax = 0;
188 int delete_after = 0;
189 int only_existing = 0;
4370504a 190@@ -292,6 +293,7 @@ void usage(enum logcode F)
f0533c4c 191 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
192 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
193 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 194+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
195 rprintf(F," -P equivalent to --partial --progress\n");
196 rprintf(F," -z, --compress compress file data\n");
197 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
4370504a 198@@ -390,6 +392,7 @@ static struct poptOption long_options[]
0808daa5
WD
199 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
200 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
201 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 202+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
203 /* TODO: Should this take an optional int giving the compression level? */
204 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 205 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
4370504a
WD
206@@ -1247,6 +1250,9 @@ void server_options(char **args,int *arg
207 if (!implied_dirs && !am_sender)
208 args[ac++] = "--no-implied-dirs";
7b675ff5 209
09fb2223 210+ if (fuzzy_basis && am_sender)
241013b4 211+ args[ac++] = "--fuzzy";
7b675ff5 212+
241013b4 213 *argc = ac;
f74d2272 214 return;
7b675ff5 215
4370504a 216--- orig/receiver.c 2005-01-18 22:47:38
9cf86680 217+++ receiver.c 2005-01-15 21:21:02
4370504a 218@@ -234,6 +234,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
219 }
220
221
222+static void read_gen_name(int fd, char *buf)
223+{
224+ int len = read_byte(fd);
225+ if (len & 0x80) {
226+#if MAXPATHLEN > 32767
227+ uchar lenbuf[2];
228+ read_buf(fd, (char *)lenbuf, 2);
229+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
230+#else
231+ len = (len & ~0x80) * 0x100 + read_byte(fd);
232+#endif
233+ }
234+ if (len >= MAXPATHLEN) {
235+ rprintf(FERROR, "bogus data on generator name pipe\n");
236+ exit_cleanup(RERR_PROTOCOL);
237+ }
238+
239+ read_sbuf(fd, buf, len);
240+}
241+
242+
243 static void discard_receive_data(int f_in, OFF_T length)
244 {
245 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
4370504a 246@@ -364,6 +385,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 247 case FNAMECMP_BACKUP:
f48a237e 248 fnamecmp = get_backup_name(fname);
09fb2223 249 break;
0edc7d7f 250+ case FNAMECMP_FUZZY:
09fb2223
WD
251+ read_gen_name(f_in_name, fnamecmpbuf);
252+ fnamecmp = fnamecmpbuf;
253+ break;
254 default:
9cf86680
WD
255 if (j >= basis_dir_cnt) {
256 rprintf(FERROR,
0f626034 257--- orig/rsync.h 2005-01-17 23:11:45
9cf86680 258+++ rsync.h 2005-01-15 21:24:09
0f626034 259@@ -128,6 +128,7 @@
0edc7d7f
WD
260 #define FNAMECMP_FNAME 0x80
261 #define FNAMECMP_PARTIAL_DIR 0x81
262 #define FNAMECMP_BACKUP 0x82
263+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
264
265
266 /* Log-message categories. FLOG is only used on the daemon side to
4370504a 267--- orig/rsync.yo 2005-01-19 01:05:05
637c560e 268+++ rsync.yo 2005-01-15 21:48:52
4370504a 269@@ -359,6 +359,7 @@ verb(
f0533c4c 270 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
271 --copy-dest=DIR ... and include copies of unchanged files
272 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 273+ --fuzzy find similar file for basis when no dest
f0533c4c
WD
274 -P equivalent to --partial --progress
275 -z, --compress compress file data
276 -C, --cvs-exclude auto ignore files in the same way CVS does
4370504a 277@@ -888,6 +889,14 @@ Note that rsync versions prior to 2.6.1
0f626034
WD
278 (or implied by -a). You can work-around this bug by avoiding the -o option
279 when sending to an old rsync.
637c560e
WD
280
281+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
282+basis file for any destination file that is missing. The current algorithm
283+looks for a similarly-named file in the same directory as the destination
fc82f579
WD
284+file, and, if found, uses that to try to speed up the transfer. Note that
285+the use of the --delete option might get rid of any potential fuzzy-match
286+files, so either use --delete-after or filename exclusions if you need to
287+prevent this.
637c560e
WD
288+
289 dit(bf(-z, --compress)) With this option, rsync compresses any data from
290 the files that it sends to the destination machine. This
291 option is useful on slow connections. The compression method used is the
054f3f90 292--- orig/util.c 2004-09-07 21:45:30
4cd1daea
WD
293+++ util.c 2005-01-18 19:25:47
294@@ -1217,3 +1217,108 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
295 return malloc(size * num);
296 return realloc(ptr, size * num);
297 }
298+
4cd1daea
WD
299+/* Take a filename and filename length and return the most significant
300+ * filename suffix we can find. This ignores suffixes such as "~",
301+ * ".bak", ".orig", ".~1~", etc. */
302+const char *find_filename_suffix(const char *fn, int *len_ptr)
303+{
304+ const char *suf, *s;
305+ int s_len, fn_len = *len_ptr;
306+ BOOL had_tilde;
307+
308+ /* One or more dots at the start aren't a suffix. */
309+ while (fn_len && *fn == '.') fn++, fn_len--;
310+
311+ /* Ignore the ~ in a "foo~" filename. */
312+ if (fn_len > 1 && fn[fn_len-1] == '~')
313+ fn_len--, had_tilde = True;
314+ else
315+ had_tilde = False;
316+
317+ /* Assume we don't find a suffix. */
318+ suf = "";
319+ *len_ptr = 0;
320+
321+ /* Find the last significant suffix. */
322+ for (s = fn + fn_len - 1; fn_len > 1; ) {
323+ while (*s != '.' && s != fn) s--;
324+ if (s == fn)
325+ break;
326+ s_len = fn_len - (s - fn);
327+ fn_len = s - fn;
328+ if (s_len == 3) {
329+ if (strcmp(s+1, "bak") == 0
330+ || strcmp(s+1, "old") == 0)
331+ continue;
332+ } else if (s_len == 4) {
333+ if (strcmp(s+1, "orig") == 0)
334+ continue;
335+ } else if (s_len > 2 && had_tilde
336+ && s[1] == '~' && isdigit(s[2]))
337+ continue;
338+ *len_ptr = s_len;
339+ suf = s;
340+ /* Determine if the suffix is all digits. */
341+ for (s++, s_len--; s_len > 0; s++, s_len--) {
342+ if (!isdigit(*s))
343+ return suf;
344+ }
345+ /* An all-digit suffix may not be that significant. */
346+ continue;
347+ }
348+
349+ return suf;
350+}
351+
054f3f90
WD
352+/* This is an implementation of the Levenshtein distance algorithm. It
353+ * was implemented to avoid needing a two-dimensional matrix (to save
354+ * memory). It was also tweaked to try to factor in the ASCII distance
355+ * between changed characters as a minor distance quantity. The normal
356+ * Levenshtein units of distance (each signifying a single change between
357+ * the two strings) are defined as a "UNIT". */
358+
359+#define UNIT (1 << 16)
360+
361+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
362+{
363+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
364+ int32 cost;
365+ int i1, i2;
366+
4cd1daea
WD
367+ if (!len1 || !len2) {
368+ if (!len1) {
369+ s1 = s2;
370+ len1 = len2;
371+ }
372+ for (i1 = 0, cost = 0; i1 < len1; i1++)
373+ cost += s1[i1];
374+ return (int32)len1 * UNIT + cost;
375+ }
054f3f90
WD
376+
377+ for (i2 = 0; i2 < len2; i2++)
378+ a[i2] = (i2+1) * UNIT;
379+
380+ for (i1 = 0; i1 < len1; i1++) {
381+ diag = i1 * UNIT;
382+ above = (i1+1) * UNIT;
383+ for (i2 = 0; i2 < len2; i2++) {
384+ left = a[i2];
385+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
386+ if (cost < 0)
387+ cost = UNIT - cost;
388+ else
389+ cost = UNIT + cost;
390+ }
391+ diag_inc = diag + cost;
392+ left_inc = left + UNIT + *((uchar*)s1+i1);
393+ above_inc = above + UNIT + *((uchar*)s2+i2);
394+ a[i2] = above = left < above
395+ ? (left_inc < diag_inc ? left_inc : diag_inc)
396+ : (above_inc < diag_inc ? above_inc : diag_inc);
397+ diag = left;
398+ }
399+ }
400+
401+ return a[len2-1];
402+}