- Added a better filename-suffix-finding heuristic, including
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
0f626034 7--- orig/generator.c 2005-01-17 23:11:45
4cd1daea 8+++ generator.c 2005-01-18 19:25:55
0808daa5
WD
9@@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
4cd1daea 17@@ -242,6 +243,76 @@ static void generate_and_send_sums(int f
58118c25
WD
18 }
19
20
09fb2223 21+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
22+{
23+ DIR *d;
24+ struct dirent *di;
0f626034 25+ char *basename, *dirname, *slash;
47dd7a31 26+ char bestname[MAXPATHLEN];
4cd1daea 27+ int suf_len, basename_len;
054f3f90 28+ uint32 lowest_dist = 0x7FFFFFFF;
4cd1daea 29+ const char *suf;
47dd7a31 30+
0f626034
WD
31+ strlcpy(buf, fname, MAXPATHLEN);
32+ if ((slash = strrchr(buf, '/')) != NULL) {
33+ dirname = buf;
34+ *slash = '\0';
35+ basename = slash + 1;
36+ } else {
37+ basename = buf;
38+ dirname = ".";
39+ }
054f3f90 40+ basename_len = strlen(basename);
47dd7a31 41+
47dd7a31
WD
42+ if (!(d = opendir(dirname))) {
43+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
44+ return -1;
45+ }
e55625fb
WD
46+ if (slash)
47+ *slash = '/';
47dd7a31 48+
4cd1daea
WD
49+ suf_len = basename_len;
50+ suf = find_filename_suffix(basename, &suf_len);
47dd7a31 51+
054f3f90 52+ bestname[0] = '\0';
47dd7a31 53+ while ((di = readdir(d)) != NULL) {
4cd1daea 54+ const char *dname_suf, *dname = d_name(di);
054f3f90 55+ uint32 dist;
4cd1daea 56+ int dname_len, dname_suf_len;
47dd7a31
WD
57+
58+ if (dname[0] == '.' && (dname[1] == '\0'
59+ || (dname[1] == '.' && dname[2] == '\0')))
60+ continue;
61+
4cd1daea
WD
62+ dname_len = dname_suf_len = strlen(dname);
63+ dname_suf = find_filename_suffix(dname, &dname_suf_len);
054f3f90
WD
64+
65+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
4cd1daea
WD
66+ /* Add some extra weight to how well the suffixes matched. */
67+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
68+ if (verbose > 4) {
69+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
70+ dname, (int)(dist>>16), (int)(dist&0xFFFF));
47dd7a31 71+ }
054f3f90 72+ if (dist < lowest_dist) {
8c5b8235 73+ strlcpy(bestname, dname, sizeof bestname);
054f3f90 74+ lowest_dist = dist;
47dd7a31
WD
75+ }
76+ }
77+ closedir(d);
78+
79+ /* Found a candidate. */
054f3f90 80+ if (bestname[0] != '\0') {
e55625fb 81+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
054f3f90
WD
82+ if (verbose > 2) {
83+ rprintf(FINFO, "fuzzy match %s->%s\n",
84+ safe_fname(fname), buf);
85+ }
58118c25 86+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
87+ }
88+ return -1;
89+}
58118c25
WD
90+
91
92 /*
93 * Acts on file number @p i from @p flist, whose name is @p fname.
4cd1daea 94@@ -496,6 +567,15 @@ static void recv_generator(char *fname,
f48a237e
WD
95 } else
96 partialptr = NULL;
824abc86 97
09fb2223
WD
98+ if (statret == -1 && fuzzy_basis) {
99+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
100+ && S_ISREG(st.st_mode)) {
101+ statret = 0;
102+ fnamecmp = fnamecmpbuf;
0edc7d7f 103+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 104+ }
824abc86
WD
105+ }
106+
09fb2223
WD
107 if (statret == -1) {
108 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 109 return;
4cd1daea 110@@ -524,6 +604,8 @@ static void recv_generator(char *fname,
241013b4 111
9cf86680 112 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
113 ;
114+ else if (fnamecmp_type == FNAMECMP_FUZZY)
115+ ;
116 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 117 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 118 set_perms(fname, file, &st, PERMS_REPORT);
4cd1daea 119@@ -598,8 +680,24 @@ notify_others:
0edc7d7f 120 write_int(f_out, i);
0f626034
WD
121 if (protocol_version >= 29 && inplace && !read_batch)
122 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
123- if (f_out_name >= 0)
124+ if (f_out_name >= 0) {
09fb2223 125 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 126+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
127+ uchar lenbuf[3], *lb = lenbuf;
128+ int len = strlen(fnamecmpbuf);
129+ if (len > 0x7F) {
130+#if MAXPATHLEN > 0x7FFF
131+ *lb++ = len / 0x10000 + 0x80;
132+ *lb++ = len / 0x100;
133+#else
134+ *lb++ = len / 0x100 + 0x80;
135+#endif
136+ }
137+ *lb = len;
138+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
139+ write_buf(f_out_name, fnamecmpbuf, len);
140+ }
0edc7d7f 141+ }
09fb2223 142
0edc7d7f
WD
143 if (dry_run || read_batch)
144 return;
0f626034 145--- orig/main.c 2005-01-17 23:11:45
3eabe3a3 146+++ main.c 2005-01-14 18:33:15
d5753a22 147@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
148 extern int preserve_hard_links;
149 extern int protocol_version;
150 extern int recurse;
09fb2223 151+extern int fuzzy_basis;
495f1899
WD
152 extern int relative_paths;
153 extern int rsync_port;
154 extern int whole_file;
3eabe3a3 155@@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
156 int pid;
157 int status = 0;
158 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
159- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
160+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
161+ && !dry_run;
495f1899 162
d5753a22
WD
163 /* The receiving side mustn't obey this, or an existing symlink that
164 * points to an identical file won't be replaced by the referent. */
0f626034 165--- orig/options.c 2005-01-17 23:11:45
9cf86680 166+++ options.c 2005-01-15 21:08:13
0808daa5 167@@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
f6c3b300 168 int size_only = 0;
0808daa5 169 int daemon_bwlimit = 0;
f6c3b300 170 int bwlimit = 0;
09fb2223 171+int fuzzy_basis = 0;
f6c3b300
WD
172 size_t bwlimit_writemax = 0;
173 int delete_after = 0;
174 int only_existing = 0;
0808daa5 175@@ -288,6 +289,7 @@ void usage(enum logcode F)
f0533c4c 176 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
177 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
178 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 179+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
180 rprintf(F," -P equivalent to --partial --progress\n");
181 rprintf(F," -z, --compress compress file data\n");
182 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
37da98ae 183@@ -384,6 +386,7 @@ static struct poptOption long_options[]
0808daa5
WD
184 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
185 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
186 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 187+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
188 /* TODO: Should this take an optional int giving the compression level? */
189 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 190 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
0f626034 191@@ -1234,6 +1237,9 @@ void server_options(char **args,int *arg
37da98ae 192 args[ac++] = "--no-relative";
241013b4 193 }
7b675ff5 194
09fb2223 195+ if (fuzzy_basis && am_sender)
241013b4 196+ args[ac++] = "--fuzzy";
7b675ff5 197+
241013b4 198 *argc = ac;
f74d2272 199 return;
7b675ff5 200
0f626034 201--- orig/receiver.c 2005-01-17 23:11:45
9cf86680
WD
202+++ receiver.c 2005-01-15 21:21:02
203@@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
204 }
205
206
207+static void read_gen_name(int fd, char *buf)
208+{
209+ int len = read_byte(fd);
210+ if (len & 0x80) {
211+#if MAXPATHLEN > 32767
212+ uchar lenbuf[2];
213+ read_buf(fd, (char *)lenbuf, 2);
214+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
215+#else
216+ len = (len & ~0x80) * 0x100 + read_byte(fd);
217+#endif
218+ }
219+ if (len >= MAXPATHLEN) {
220+ rprintf(FERROR, "bogus data on generator name pipe\n");
221+ exit_cleanup(RERR_PROTOCOL);
222+ }
223+
224+ read_sbuf(fd, buf, len);
225+}
226+
227+
228 static void discard_receive_data(int f_in, OFF_T length)
229 {
230 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
9cf86680 231@@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 232 case FNAMECMP_BACKUP:
f48a237e 233 fnamecmp = get_backup_name(fname);
09fb2223 234 break;
0edc7d7f 235+ case FNAMECMP_FUZZY:
09fb2223
WD
236+ read_gen_name(f_in_name, fnamecmpbuf);
237+ fnamecmp = fnamecmpbuf;
238+ break;
239 default:
9cf86680
WD
240 if (j >= basis_dir_cnt) {
241 rprintf(FERROR,
0f626034 242--- orig/rsync.h 2005-01-17 23:11:45
9cf86680 243+++ rsync.h 2005-01-15 21:24:09
0f626034 244@@ -128,6 +128,7 @@
0edc7d7f
WD
245 #define FNAMECMP_FNAME 0x80
246 #define FNAMECMP_PARTIAL_DIR 0x81
247 #define FNAMECMP_BACKUP 0x82
248+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
249
250
251 /* Log-message categories. FLOG is only used on the daemon side to
0f626034 252--- orig/rsync.yo 2005-01-17 23:11:46
637c560e 253+++ rsync.yo 2005-01-15 21:48:52
0808daa5 254@@ -358,6 +358,7 @@ verb(
f0533c4c 255 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
256 --copy-dest=DIR ... and include copies of unchanged files
257 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 258+ --fuzzy find similar file for basis when no dest
f0533c4c
WD
259 -P equivalent to --partial --progress
260 -z, --compress compress file data
261 -C, --cvs-exclude auto ignore files in the same way CVS does
0f626034
WD
262@@ -878,6 +879,11 @@ Note that rsync versions prior to 2.6.1
263 (or implied by -a). You can work-around this bug by avoiding the -o option
264 when sending to an old rsync.
637c560e
WD
265
266+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
267+basis file for any destination file that is missing. The current algorithm
268+looks for a similarly-named file in the same directory as the destination
269+file, and, if found, uses that to try to speed up the transfer.
270+
271 dit(bf(-z, --compress)) With this option, rsync compresses any data from
272 the files that it sends to the destination machine. This
273 option is useful on slow connections. The compression method used is the
054f3f90 274--- orig/util.c 2004-09-07 21:45:30
4cd1daea
WD
275+++ util.c 2005-01-18 19:25:47
276@@ -1217,3 +1217,108 @@ void *_realloc_array(void *ptr, unsigned
054f3f90
WD
277 return malloc(size * num);
278 return realloc(ptr, size * num);
279 }
280+
4cd1daea
WD
281+/* Take a filename and filename length and return the most significant
282+ * filename suffix we can find. This ignores suffixes such as "~",
283+ * ".bak", ".orig", ".~1~", etc. */
284+const char *find_filename_suffix(const char *fn, int *len_ptr)
285+{
286+ const char *suf, *s;
287+ int s_len, fn_len = *len_ptr;
288+ BOOL had_tilde;
289+
290+ /* One or more dots at the start aren't a suffix. */
291+ while (fn_len && *fn == '.') fn++, fn_len--;
292+
293+ /* Ignore the ~ in a "foo~" filename. */
294+ if (fn_len > 1 && fn[fn_len-1] == '~')
295+ fn_len--, had_tilde = True;
296+ else
297+ had_tilde = False;
298+
299+ /* Assume we don't find an suffix. */
300+ suf = "";
301+ *len_ptr = 0;
302+
303+ /* Find the last significant suffix. */
304+ for (s = fn + fn_len - 1; fn_len > 1; ) {
305+ while (*s != '.' && s != fn) s--;
306+ if (s == fn)
307+ break;
308+ s_len = fn_len - (s - fn);
309+ fn_len = s - fn;
310+ if (s_len == 3) {
311+ if (strcmp(s+1, "bak") == 0
312+ || strcmp(s+1, "old") == 0)
313+ continue;
314+ } else if (s_len == 4) {
315+ if (strcmp(s+1, "orig") == 0)
316+ continue;
317+ } else if (s_len > 2 && had_tilde
318+ && s[1] == '~' && isdigit(s[2]))
319+ continue;
320+ *len_ptr = s_len;
321+ suf = s;
322+ /* Determine if the suffix is all digits. */
323+ for (s++, s_len--; s_len > 0; s++, s_len--) {
324+ if (!isdigit(*s))
325+ return suf;
326+ }
327+ /* An all-digit suffix may not be that significant. */
328+ continue;
329+ }
330+
331+ return suf;
332+}
333+
054f3f90
WD
334+/* This is an implementation of the Levenshtein distance algorithm. It
335+ * was implemented to avoid needing a two-dimensional matrix (to save
336+ * memory). It was also tweaked to try to factor in the ASCII distance
337+ * between changed characters as a minor distance quantity. The normal
338+ * Levenshtein units of distance (each signifying a single change between
339+ * the two strings) are defined as a "UNIT". */
340+
341+#define UNIT (1 << 16)
342+
343+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
344+{
345+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
346+ int32 cost;
347+ int i1, i2;
348+
4cd1daea
WD
349+ if (!len1 || !len2) {
350+ if (!len1) {
351+ s1 = s2;
352+ len1 = len2;
353+ }
354+ for (i1 = 0, cost = 0; i1 < len1; i1++)
355+ cost += s1[i1];
356+ return (int32)len1 * UNIT + cost;
357+ }
054f3f90
WD
358+
359+ for (i2 = 0; i2 < len2; i2++)
360+ a[i2] = (i2+1) * UNIT;
361+
362+ for (i1 = 0; i1 < len1; i1++) {
363+ diag = i1 * UNIT;
364+ above = (i1+1) * UNIT;
365+ for (i2 = 0; i2 < len2; i2++) {
366+ left = a[i2];
367+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
368+ if (cost < 0)
369+ cost = UNIT - cost;
370+ else
371+ cost = UNIT + cost;
372+ }
373+ diag_inc = diag + cost;
374+ left_inc = left + UNIT + *((uchar*)s1+i1);
375+ above_inc = above + UNIT + *((uchar*)s2+i2);
376+ a[i2] = above = left < above
377+ ? (left_inc < diag_inc ? left_inc : diag_inc)
378+ : (above_inc < diag_inc ? above_inc : diag_inc);
379+ diag = left;
380+ }
381+ }
382+
383+ return a[len2-1];
384+}