Updated to apply cleanly.
[rsync/rsync-patches.git] / fuzzy.diff
... / ...
CommitLineData
1The changes to generator.c were greatly simplified, making the patch
2easier to maintain and fixing the failing test in the testsuite.
3Lightly tested.
4
5Be sure to run "make proto" before "make".
6
7--- orig/generator.c 2005-02-03 02:04:20
8+++ generator.c 2005-02-03 02:11:10
9@@ -47,6 +47,7 @@ extern int size_only;
10 extern OFF_T max_size;
11 extern int io_timeout;
12 extern int protocol_version;
13+extern int fuzzy_basis;
14 extern int always_checksum;
15 extern char *partial_dir;
16 extern char *basis_dir[];
17@@ -227,6 +228,88 @@ static void generate_and_send_sums(int f
18 unmap_file(mapbuf);
19 }
20
21+/* Try to find a filename in the same dir as "fname" with a similar name.
22+ * A match leaves its path in "buf" and returns link_stat()'s result; else -1.
23+ * TODO:
24+ * - We should be using a cache of names for the current dir, not
25+ * re-reading the destination directory for every file.
26+ * - We must not return an rsync tempfile from the current transfer.
27+ * - If the highest-rated name is not a normal file, we should fall
28+ * back to the next highest-rated file.
29+ * - We must not return a destination file that is being updated
30+ * during the current transfer, even if we already processed it
31+ * (since the receiver may not be done with it yet).
32+ * - We must weed out any names that a daemon's config has excluded.
33+ */
34+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
35+{
36+ DIR *d;
37+ struct dirent *di;
38+ char *basename, *dirname, *slash;
39+ char bestname[MAXPATHLEN]; /* lowest-distance entry name seen so far */
40+ int suf_len, basename_len;
41+ uint32 lowest_dist = 0x7FFFFFFF; /* best (smallest) distance so far */
42+ const char *suf;
43+
44+ strlcpy(buf, fname, MAXPATHLEN); /* work on a copy that we can split */
45+ if ((slash = strrchr(buf, '/')) != NULL) {
46+ dirname = buf;
47+ *slash = '\0'; /* temporarily cut buf down to its dir part */
48+ basename = slash + 1;
49+ } else {
50+ basename = buf;
51+ dirname = ".";
52+ }
53+ basename_len = strlen(basename);
54+
55+ if (!(d = opendir(dirname))) {
56+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
57+ return -1;
58+ }
59+ if (slash)
60+ *slash = '/'; /* put the full path back together in buf */
61+
62+ suf = find_filename_suffix(basename, basename_len, &suf_len);
63+
64+ bestname[0] = '\0'; /* empty means "no entry scored yet" */
65+ while ((di = readdir(d)) != NULL) {
66+ const char *dname_suf, *dname = d_name(di);
67+ uint32 dist;
68+ int dname_len, dname_suf_len;
69+
70+ if (dname[0] == '.' && (dname[1] == '\0' /* skip "." and ".." */
71+ || (dname[1] == '.' && dname[2] == '\0')))
72+ continue;
73+
74+ dname_len = strlen(dname);
75+ dname_suf = find_filename_suffix(dname, dname_len, &dname_suf_len);
76+
77+ dist = fuzzy_distance(dname, dname_len, basename, basename_len);
78+ /* Add some extra weight to how well the suffixes match. */
79+ dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
80+ if (verbose > 4) { /* log the bits above bit 16, then the low 16 bits */
81+ rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
82+ dname, (int)(dist>>16), (int)(dist&0xFFFF));
83+ }
84+ if (dist <= lowest_dist) { /* "<=": a later tie replaces an earlier one */
85+ strlcpy(bestname, dname, sizeof bestname);
86+ lowest_dist = dist;
87+ }
88+ }
89+ closedir(d);
90+
91+ /* If any entry was scored, the best one is our candidate. */
92+ if (bestname[0] != '\0') {
93+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf)); /* basename points into buf */
94+ if (verbose > 2) {
95+ rprintf(FINFO, "fuzzy match %s->%s\n",
96+ safe_fname(fname), buf);
97+ }
98+ return link_stat(buf, st_ptr, 0);
99+ }
100+ return -1;
101+}
102+
103
104 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
105 * make sure it exists, and has the right permissions/timestamp info. For
106@@ -477,6 +560,15 @@ static void recv_generator(char *fname,
107 } else
108 partialptr = NULL;
109
110+ if (statret == -1 && fuzzy_basis) {
111+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
112+ && S_ISREG(st.st_mode)) {
113+ statret = 0;
114+ fnamecmp = fnamecmpbuf;
115+ fnamecmp_type = FNAMECMP_FUZZY;
116+ }
117+ }
118+
119 if (statret == -1) {
120 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
121 return;
122@@ -505,6 +597,8 @@ static void recv_generator(char *fname,
123
124 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
125 ;
126+ else if (fnamecmp_type == FNAMECMP_FUZZY)
127+ ;
128 else if (unchanged_file(fnamecmp, file, &st)) {
129 if (fnamecmp_type == FNAMECMP_FNAME)
130 set_perms(fname, file, &st, PERMS_REPORT);
131@@ -579,8 +673,24 @@ notify_others:
132 write_int(f_out, ndx);
133 if (protocol_version >= 29 && inplace && !read_batch)
134 write_byte(f_out, fnamecmp_type);
135- if (f_out_name >= 0)
136+ if (f_out_name >= 0) {
137 write_byte(f_out_name, fnamecmp_type);
138+ if (fnamecmp_type == FNAMECMP_FUZZY) {
139+ uchar lenbuf[3], *lb = lenbuf;
140+ int len = strlen(fnamecmpbuf);
141+ if (len > 0x7F) {
142+#if MAXPATHLEN > 0x7FFF
143+ *lb++ = len / 0x10000 + 0x80;
144+ *lb++ = len / 0x100;
145+#else
146+ *lb++ = len / 0x100 + 0x80;
147+#endif
148+ }
149+ *lb = len;
150+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
151+ write_buf(f_out_name, fnamecmpbuf, len);
152+ }
153+ }
154
155 if (dry_run || read_batch)
156 return;
157--- orig/main.c 2005-01-30 10:07:21
158+++ main.c 2005-01-14 18:33:15
159@@ -48,6 +48,7 @@ extern int keep_dirlinks;
160 extern int preserve_hard_links;
161 extern int protocol_version;
162 extern int recurse;
163+extern int fuzzy_basis;
164 extern int relative_paths;
165 extern int rsync_port;
166 extern int whole_file;
167@@ -491,7 +492,8 @@ static int do_recv(int f_in,int f_out,st
168 int pid;
169 int status = 0;
170 int error_pipe[2], name_pipe[2];
171- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
172+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
173+ && !dry_run;
174
175 /* The receiving side mustn't obey this, or an existing symlink that
176 * points to an identical file won't be replaced by the referent. */
177--- orig/options.c 2005-02-01 10:39:22
178+++ options.c 2005-01-28 19:31:20
179@@ -90,6 +90,7 @@ int copy_unsafe_links = 0;
180 int size_only = 0;
181 int daemon_bwlimit = 0;
182 int bwlimit = 0;
183+int fuzzy_basis = 0;
184 size_t bwlimit_writemax = 0;
185 int only_existing = 0;
186 int opt_ignore_existing = 0;
187@@ -303,6 +304,7 @@ void usage(enum logcode F)
188 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
189 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
190 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
191+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
192 rprintf(F," -z, --compress compress file data\n");
193 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
194 rprintf(F," -f, --filter=RULE add a file-filtering RULE\n");
195@@ -408,6 +410,7 @@ static struct poptOption long_options[]
196 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
197 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
198 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
199+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
200 /* TODO: Should this take an optional int giving the compression level? */
201 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
202 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
203@@ -1321,6 +1324,9 @@ void server_options(char **args,int *arg
204 if (!implied_dirs && !am_sender)
205 args[ac++] = "--no-implied-dirs";
206
207+ if (fuzzy_basis && am_sender)
208+ args[ac++] = "--fuzzy";
209+
210 *argc = ac;
211 return;
212
213--- orig/receiver.c 2005-02-03 02:04:20
214+++ receiver.c 2005-01-15 21:21:02
215@@ -256,6 +256,27 @@ static int receive_data(int f_in, char *
216 }
217
218
219+static void read_gen_name(int fd, char *buf) /* Read a length-prefixed name from fd into buf. */
220+{
221+ int len = read_byte(fd); /* first byte: the whole length, or its flagged high part */
222+ if (len & 0x80) { /* high bit set: more length byte(s) follow */
223+#if MAXPATHLEN > 32767
224+ uchar lenbuf[2];
225+ read_buf(fd, (char *)lenbuf, 2);
226+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; /* 3-byte length */
227+#else
228+ len = (len & ~0x80) * 0x100 + read_byte(fd); /* 2-byte length */
229+#endif
230+ }
231+ if (len >= MAXPATHLEN) { /* refuse any length of MAXPATHLEN or more */
232+ rprintf(FERROR, "bogus data on generator name pipe\n");
233+ exit_cleanup(RERR_PROTOCOL);
234+ }
235+
236+ read_sbuf(fd, buf, len); /* presumably NUL-terminates buf -- verify against read_sbuf() */
237+}
238+
239+
240 static void discard_receive_data(int f_in, OFF_T length)
241 {
242 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
243@@ -395,6 +416,10 @@ int recv_files(int f_in, struct file_lis
244 case FNAMECMP_BACKUP:
245 fnamecmp = get_backup_name(fname);
246 break;
247+ case FNAMECMP_FUZZY:
248+ read_gen_name(f_in_name, fnamecmpbuf);
249+ fnamecmp = fnamecmpbuf;
250+ break;
251 default:
252 if (j >= basis_dir_cnt) {
253 rprintf(FERROR,
254--- orig/rsync.h 2005-02-03 02:04:20
255+++ rsync.h 2005-01-19 18:36:47
256@@ -131,6 +131,7 @@
257 #define FNAMECMP_FNAME 0x80
258 #define FNAMECMP_PARTIAL_DIR 0x81
259 #define FNAMECMP_BACKUP 0x82
260+#define FNAMECMP_FUZZY 0x83
261
262 /* For calling delete_file() */
263 #define DEL_DIR (1<<0)
264--- orig/rsync.yo 2005-02-01 10:39:23
265+++ rsync.yo 2005-01-28 19:31:36
266@@ -354,6 +354,7 @@ to the detailed description below for a
267 --compare-dest=DIR also compare received files relative to DIR
268 --copy-dest=DIR ... and include copies of unchanged files
269 --link-dest=DIR hardlink to files in DIR when unchanged
270+ --fuzzy find similar file for basis when no dest
271 -z, --compress compress file data
272 -C, --cvs-exclude auto-ignore files in the same way CVS does
273 -f, --filter=RULE add a file-filtering RULE
274@@ -937,6 +938,14 @@ bf(--link-dest) from working properly fo
275 (or implied by bf(-a)). You can work-around this bug by avoiding the bf(-o) option
276 when sending to an old rsync.
277
278+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
279+basis file for any destination file that is missing. The current algorithm
280+looks for a similarly-named file in the same directory as the destination
281+file, and, if found, uses that to try to speed up the transfer. Note that
282+the use of the --delete option might get rid of any potential fuzzy-match
283+files, so either use --delete-after or filename exclusions if you need to
284+prevent this.
285+
286 dit(bf(-z, --compress)) With this option, rsync compresses any data from
287 the files that it sends to the destination machine. This
288 option is useful on slow connections. The compression method used is the
289--- orig/util.c 2005-01-28 19:08:20
290+++ util.c 2005-01-19 17:30:51
291@@ -1213,3 +1213,108 @@ void *_realloc_array(void *ptr, unsigned
292 return malloc(size * num);
293 return realloc(ptr, size * num);
294 }
295+
296+/* Take a filename and filename length and return the most significant
297+ * filename suffix we can find, storing its length (dot included) in
298+ * *len_ptr. Ignores suffixes such as "~", ".bak", ".orig", ".~1~", etc. */
299+const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
300+{
301+ const char *suf, *s;
302+ BOOL had_tilde;
303+ int s_len;
304+
305+ /* One or more dots at the start aren't a suffix. */
306+ while (fn_len && *fn == '.') fn++, fn_len--;
307+
308+ /* Ignore the ~ in a "foo~" filename. */
309+ had_tilde = fn_len > 1 && fn[fn_len-1] == '~';
310+ if (had_tilde)
311+ fn_len--;
312+
313+ /* Assume we don't find a suffix. */
314+ suf = "";
315+ *len_ptr = 0;
316+
317+ /* Find the last significant suffix; s starts one past the name's end. */
318+ for (s = fn + fn_len; fn_len > 1; ) {
319+ while (*--s != '.' && s != fn) {} /* always step off the previous dot */
320+ if (s == fn)
321+ break;
322+ s_len = fn_len - (s - fn); /* this count includes the dot itself */
323+ fn_len = s - fn;
324+ if (s_len == 4) { /* bounded compares: text may follow the suffix */
325+ if (strncmp(s+1, "bak", 3) == 0
326+ || strncmp(s+1, "old", 3) == 0)
327+ continue;
328+ } else if (s_len == 5) {
329+ if (strncmp(s+1, "orig", 4) == 0)
330+ continue;
331+ } else if (s_len > 2 && had_tilde
332+ && s[1] == '~' && isdigit(*(uchar*)(s+2)))
333+ continue;
334+ *len_ptr = s_len;
335+ suf = s;
336+ if (s_len == 1) /* a lone "." leaves nothing to inspect */
337+ break;
338+ /* An all-digit suffix may not be that significant, so check for one. */
339+ for (s++, s_len--; s_len > 0; s++, s_len--) {
340+ if (!isdigit(*(uchar*)s))
341+ return suf;
342+ }
343+ s = suf; /* all digits: resume the backward scan from this dot */
344+ }
345+
346+ return suf;
347+}
348+
349+/* This is an implementation of the Levenshtein distance algorithm that
350+ * avoids needing a two-dimensional matrix (to save memory). It was also
351+ * tweaked to factor in the ASCII distance between changed characters as a
352+ * minor distance quantity. The normal Levenshtein units of distance (each
353+ * a single change between the two strings) are defined as a "UNIT". The
354+ * one-row work array limits len2 to MAXPATHLEN; bytes count as unsigned. */
355+
356+#define UNIT (1 << 16)
357+
358+uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
359+{
360+ uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
361+ int32 cost;
362+ int i1, i2;
363+
364+ if (!len1 || !len2) { /* one side is empty: charge for all of the other */
365+ if (!len1) {
366+ s1 = s2;
367+ len1 = len2;
368+ }
369+ for (i1 = 0, cost = 0; i1 < len1; i1++)
370+ cost += *((uchar*)s1+i1); /* unsigned, to agree with the main loop */
371+ return (int32)len1 * UNIT + cost;
372+ }
373+
374+ for (i2 = 0; i2 < len2; i2++) /* seed the row with i2+1 UNITs per column */
375+ a[i2] = (i2+1) * UNIT;
376+
377+ for (i1 = 0; i1 < len1; i1++) {
378+ diag = i1 * UNIT;
379+ above = (i1+1) * UNIT;
380+ for (i2 = 0; i2 < len2; i2++) {
381+ left = a[i2]; /* value stored for this column by the previous i1 pass */
382+ if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
383+ if (cost < 0) /* differing bytes: one UNIT plus |difference| */
384+ cost = UNIT - cost;
385+ else
386+ cost = UNIT + cost;
387+ }
388+ diag_inc = diag + cost;
389+ left_inc = left + UNIT + *((uchar*)s1+i1);
390+ above_inc = above + UNIT + *((uchar*)s2+i2);
391+ a[i2] = above = left < above
392+ ? (left_inc < diag_inc ? left_inc : diag_inc)
393+ : (above_inc < diag_inc ? above_inc : diag_inc);
394+ diag = left; /* the old a[i2] is the next column's diagonal */
395+ }
396+ }
397+
398+ return a[len2-1];
399+}