Swapped out the simplistic measure_name() run-length count for a Levenshtein-distance calculation (fuzzy_distance()).
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
0f626034 7--- orig/generator.c 2005-01-17 23:11:45
054f3f90 8+++ generator.c 2005-01-18 10:55:29
0808daa5
WD
9@@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
054f3f90 17@@ -242,6 +243,81 @@ static void generate_and_send_sums(int f
58118c25
WD
18 }
19
20
09fb2223 21+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
22+{
23+ DIR *d;
24+ struct dirent *di;
0f626034 25+ char *basename, *dirname, *slash;
47dd7a31 26+ char bestname[MAXPATHLEN];
054f3f90
WD
27+ int extlen, basename_len;
28+ uint32 lowest_dist = 0x7FFFFFFF;
47dd7a31
WD
29+ const char *ext;
30+
0f626034
WD
31+ strlcpy(buf, fname, MAXPATHLEN);
32+ if ((slash = strrchr(buf, '/')) != NULL) {
33+ dirname = buf;
34+ *slash = '\0';
35+ basename = slash + 1;
36+ } else {
37+ basename = buf;
38+ dirname = ".";
39+ }
054f3f90 40+ basename_len = strlen(basename);
47dd7a31 41+
47dd7a31
WD
42+ if (!(d = opendir(dirname))) {
43+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
44+ return -1;
45+ }
e55625fb
WD
46+ if (slash)
47+ *slash = '/';
47dd7a31
WD
48+
49+ /* Get final extension, eg. .gz; never full basename though. */
0f626034
WD
50+ for (ext = basename; *ext == '.'; ext++) {}
51+ if (!(ext = strrchr(ext, '.')))
054f3f90
WD
52+ ext = basename + basename_len; /* ext = "" */
53+ extlen = strlen(ext);
47dd7a31 54+
054f3f90 55+ bestname[0] = '\0';
47dd7a31
WD
56+ while ((di = readdir(d)) != NULL) {
57+ const char *dname = d_name(di);
054f3f90
WD
58+ uint32 dist;
59+ int dname_len;
47dd7a31
WD
60+
61+ if (dname[0] == '.' && (dname[1] == '\0'
62+ || (dname[1] == '.' && dname[2] == '\0')))
63+ continue;
64+
054f3f90
WD
65+ dname_len = strlen(dname);
66+
67+ /* Extensions must match */
68+ if (dname_len <= extlen
69+ || strcmp(dname + dname_len - extlen, ext) != 0)
70+ continue;
71+
72+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
73+ if (verbose > 1) {
74+ rprintf(FINFO, "fuzzy distance for %s = %lx\n",
75+ dname, (unsigned long)dist);
47dd7a31 76+ }
054f3f90 77+ if (dist < lowest_dist) {
8c5b8235 78+ strlcpy(bestname, dname, sizeof bestname);
054f3f90 79+ lowest_dist = dist;
47dd7a31
WD
80+ }
81+ }
82+ closedir(d);
83+
84+ /* Found a candidate. */
054f3f90 85+ if (bestname[0] != '\0') {
e55625fb 86+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
054f3f90
WD
87+ if (verbose > 2) {
88+ rprintf(FINFO, "fuzzy match %s->%s\n",
89+ safe_fname(fname), buf);
90+ }
58118c25 91+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
92+ }
93+ return -1;
94+}
58118c25
WD
95+
96
97 /*
98 * Acts on file number @p i from @p flist, whose name is @p fname.
054f3f90 99@@ -496,6 +572,15 @@ static void recv_generator(char *fname,
f48a237e
WD
100 } else
101 partialptr = NULL;
824abc86 102
09fb2223
WD
103+ if (statret == -1 && fuzzy_basis) {
104+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
105+ && S_ISREG(st.st_mode)) {
106+ statret = 0;
107+ fnamecmp = fnamecmpbuf;
0edc7d7f 108+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 109+ }
824abc86
WD
110+ }
111+
09fb2223
WD
112 if (statret == -1) {
113 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 114 return;
054f3f90 115@@ -524,6 +609,8 @@ static void recv_generator(char *fname,
241013b4 116
9cf86680 117 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
118 ;
119+ else if (fnamecmp_type == FNAMECMP_FUZZY)
120+ ;
121 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 122 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 123 set_perms(fname, file, &st, PERMS_REPORT);
054f3f90 124@@ -598,8 +685,24 @@ notify_others:
0edc7d7f 125 write_int(f_out, i);
0f626034
WD
126 if (protocol_version >= 29 && inplace && !read_batch)
127 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
128- if (f_out_name >= 0)
129+ if (f_out_name >= 0) {
09fb2223 130 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 131+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
132+ uchar lenbuf[3], *lb = lenbuf;
133+ int len = strlen(fnamecmpbuf);
134+ if (len > 0x7F) {
135+#if MAXPATHLEN > 0x7FFF
136+ *lb++ = len / 0x10000 + 0x80;
137+ *lb++ = len / 0x100;
138+#else
139+ *lb++ = len / 0x100 + 0x80;
140+#endif
141+ }
142+ *lb = len;
143+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
144+ write_buf(f_out_name, fnamecmpbuf, len);
145+ }
0edc7d7f 146+ }
09fb2223 147
0edc7d7f
WD
148 if (dry_run || read_batch)
149 return;
0f626034 150--- orig/main.c 2005-01-17 23:11:45
3eabe3a3 151+++ main.c 2005-01-14 18:33:15
d5753a22 152@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
153 extern int preserve_hard_links;
154 extern int protocol_version;
155 extern int recurse;
09fb2223 156+extern int fuzzy_basis;
495f1899
WD
157 extern int relative_paths;
158 extern int rsync_port;
159 extern int whole_file;
3eabe3a3 160@@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
161 int pid;
162 int status = 0;
163 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
164- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
165+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
166+ && !dry_run;
495f1899 167
d5753a22
WD
168 /* The receiving side mustn't obey this, or an existing symlink that
169 * points to an identical file won't be replaced by the referent. */
0f626034 170--- orig/options.c 2005-01-17 23:11:45
9cf86680 171+++ options.c 2005-01-15 21:08:13
0808daa5 172@@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
f6c3b300 173 int size_only = 0;
0808daa5 174 int daemon_bwlimit = 0;
f6c3b300 175 int bwlimit = 0;
09fb2223 176+int fuzzy_basis = 0;
f6c3b300
WD
177 size_t bwlimit_writemax = 0;
178 int delete_after = 0;
179 int only_existing = 0;
0808daa5 180@@ -288,6 +289,7 @@ void usage(enum logcode F)
f0533c4c 181 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
182 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
183 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 184+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
185 rprintf(F," -P equivalent to --partial --progress\n");
186 rprintf(F," -z, --compress compress file data\n");
187 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
37da98ae 188@@ -384,6 +386,7 @@ static struct poptOption long_options[]
0808daa5
WD
189 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
190 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
191 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 192+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
193 /* TODO: Should this take an optional int giving the compression level? */
194 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 195 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
0f626034 196@@ -1234,6 +1237,9 @@ void server_options(char **args,int *arg
37da98ae 197 args[ac++] = "--no-relative";
241013b4 198 }
7b675ff5 199
09fb2223 200+ if (fuzzy_basis && am_sender)
241013b4 201+ args[ac++] = "--fuzzy";
7b675ff5 202+
241013b4 203 *argc = ac;
f74d2272 204 return;
7b675ff5 205
0f626034 206--- orig/receiver.c 2005-01-17 23:11:45
9cf86680
WD
207+++ receiver.c 2005-01-15 21:21:02
208@@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
209 }
210
211
212+static void read_gen_name(int fd, char *buf)
213+{
214+ int len = read_byte(fd);
215+ if (len & 0x80) {
216+#if MAXPATHLEN > 32767
217+ uchar lenbuf[2];
218+ read_buf(fd, (char *)lenbuf, 2);
219+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
220+#else
221+ len = (len & ~0x80) * 0x100 + read_byte(fd);
222+#endif
223+ }
224+ if (len >= MAXPATHLEN) {
225+ rprintf(FERROR, "bogus data on generator name pipe\n");
226+ exit_cleanup(RERR_PROTOCOL);
227+ }
228+
229+ read_sbuf(fd, buf, len);
230+}
231+
232+
233 static void discard_receive_data(int f_in, OFF_T length)
234 {
235 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
9cf86680 236@@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 237 case FNAMECMP_BACKUP:
f48a237e 238 fnamecmp = get_backup_name(fname);
09fb2223 239 break;
0edc7d7f 240+ case FNAMECMP_FUZZY:
09fb2223
WD
241+ read_gen_name(f_in_name, fnamecmpbuf);
242+ fnamecmp = fnamecmpbuf;
243+ break;
244 default:
9cf86680
WD
245 if (j >= basis_dir_cnt) {
246 rprintf(FERROR,
0f626034 247--- orig/rsync.h 2005-01-17 23:11:45
9cf86680 248+++ rsync.h 2005-01-15 21:24:09
0f626034 249@@ -128,6 +128,7 @@
0edc7d7f
WD
250 #define FNAMECMP_FNAME 0x80
251 #define FNAMECMP_PARTIAL_DIR 0x81
252 #define FNAMECMP_BACKUP 0x82
253+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
254
255
256 /* Log-message categories. FLOG is only used on the daemon side to
0f626034 257--- orig/rsync.yo 2005-01-17 23:11:46
637c560e 258+++ rsync.yo 2005-01-15 21:48:52
0808daa5 259@@ -358,6 +358,7 @@ verb(
f0533c4c 260 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
261 --copy-dest=DIR ... and include copies of unchanged files
262 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 263+ --fuzzy find similar file for basis when no dest
f0533c4c
WD
264 -P equivalent to --partial --progress
265 -z, --compress compress file data
266 -C, --cvs-exclude auto ignore files in the same way CVS does
0f626034
WD
267@@ -878,6 +879,11 @@ Note that rsync versions prior to 2.6.1
268 (or implied by -a). You can work-around this bug by avoiding the -o option
269 when sending to an old rsync.
637c560e
WD
270
271+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
272+basis file for any destination file that is missing. The current algorithm
273+looks for a similarly-named file in the same directory as the destination
274+file, and, if found, uses that to try to speed up the transfer.
275+
276 dit(bf(-z, --compress)) With this option, rsync compresses any data from
277 the files that it sends to the destination machine. This
278 option is useful on slow connections. The compression method used is the
054f3f90
WD
279--- orig/util.c 2004-09-07 21:45:30
280+++ util.c 2005-01-18 11:18:46
281@@ -1217,3 +1217,50 @@ void *_realloc_array(void *ptr, unsigned
282 return malloc(size * num);
283 return realloc(ptr, size * num);
284 }
285+
286+/* This is an implementation of the Levenshtein distance algorithm. It
287+ * was implemented to avoid needing a two-dimensional matrix (to save
288+ * memory). It was also tweaked to try to factor in the ASCII distance
289+ * between changed characters as a minor distance quantity. The normal
290+ * Levenshtein units of distance (each signifying a single change between
291+ * the two strings) are defined as a "UNIT". */
292+
293+#define UNIT (1 << 16)
294+
295+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
296+{
297+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
298+ int32 cost;
299+ int i1, i2;
300+
301+ if (!len1)
302+ return (int32)len2 * UNIT;
303+ if (!len2)
304+ return (int32)len1 * UNIT;
305+
306+ for (i2 = 0; i2 < len2; i2++)
307+ a[i2] = (i2+1) * UNIT;
308+
309+ for (i1 = 0; i1 < len1; i1++) {
310+ diag = i1 * UNIT;
311+ above = (i1+1) * UNIT;
312+ for (i2 = 0; i2 < len2; i2++) {
313+ left = a[i2];
314+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
315+ if (cost < 0)
316+ cost = UNIT - cost;
317+ else
318+ cost = UNIT + cost;
319+ }
320+ diag_inc = diag + cost;
321+ left_inc = left + UNIT + *((uchar*)s1+i1);
322+ above_inc = above + UNIT + *((uchar*)s2+i2);
323+ a[i2] = above = left < above
324+ ? (left_inc < diag_inc ? left_inc : diag_inc)
325+ : (above_inc < diag_inc ? above_inc : diag_inc);
326+ diag = left;
327+ }
328+ }
329+
330+ return a[len2-1];
331+}