1 The changes to generator.c were greatly simplified, making the patch
2 easier to maintain and fixing the failing test in the testsuite.
5 Be sure to run "make proto" before "make".
7 --- orig/generator.c 2005-01-17 23:11:45
8 +++ generator.c 2005-01-18 19:25:55
9 @@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
11 extern int io_timeout;
12 extern int protocol_version;
13 +extern int fuzzy_basis;
14 extern int always_checksum;
15 extern char *partial_dir;
16 extern char *basis_dir[];
17 @@ -242,6 +243,89 @@ static void generate_and_send_sums(int f
21 +/* Try to find a filename in the same dir as "fname" with a similar name.
25 + * 1. We should be using a cache of names for the current dir, not
26 + * re-reading the destination directory for every file.
27 + * 2. We must not return an rsync tempfile from the current transfer.
28 + * 3. If the highest-rated name is not a normal file, we should fall
29 + * back to the next highest-rated file.
30 + * 4. We must not return a destination file that is being updated
31 + * during the current transfer, even if we already processed it
32 + * (since the receiver may not be done with it yet).
34 +static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
38 + char *basename, *dirname, *slash;
39 + char bestname[MAXPATHLEN];
40 + int suf_len, basename_len;
41 + uint32 lowest_dist = 0x7FFFFFFF;
44 + strlcpy(buf, fname, MAXPATHLEN);
45 + if ((slash = strrchr(buf, '/')) != NULL) {
48 + basename = slash + 1;
53 + basename_len = strlen(basename);
55 + if (!(d = opendir(dirname))) {
56 + rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
62 + suf_len = basename_len;
63 + suf = find_filename_suffix(basename, &suf_len);
66 + while ((di = readdir(d)) != NULL) {
67 + const char *dname_suf, *dname = d_name(di);
69 + int dname_len, dname_suf_len;
71 + if (dname[0] == '.' && (dname[1] == '\0'
72 + || (dname[1] == '.' && dname[2] == '\0')))
75 + dname_len = dname_suf_len = strlen(dname);
76 + dname_suf = find_filename_suffix(dname, &dname_suf_len);
78 + dist = fuzzy_distance(dname, dname_len, basename, basename_len);
79 + /* Add some extra weight to how well the suffixes match. */
80 + dist += fuzzy_distance(dname_suf, dname_suf_len, suf, suf_len) * 10;
82 + rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
83 + dname, (int)(dist>>16), (int)(dist&0xFFFF));
85 + if (dist <= lowest_dist) {
86 + strlcpy(bestname, dname, sizeof bestname);
92 + /* Found a candidate. */
93 + if (bestname[0] != '\0') {
94 + strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
96 + rprintf(FINFO, "fuzzy match %s->%s\n",
97 + safe_fname(fname), buf);
99 + return link_stat(buf, st_ptr, 0);
106 * Acts on file number @p i from @p flist, whose name is @p fname.
107 @@ -496,6 +580,15 @@ static void recv_generator(char *fname,
111 + if (statret == -1 && fuzzy_basis) {
112 + if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
113 + && S_ISREG(st.st_mode)) {
115 + fnamecmp = fnamecmpbuf;
116 + fnamecmp_type = FNAMECMP_FUZZY;
121 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
123 @@ -524,6 +617,8 @@ static void recv_generator(char *fname,
125 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
127 + else if (fnamecmp_type == FNAMECMP_FUZZY)
129 else if (unchanged_file(fnamecmp, file, &st)) {
130 if (fnamecmp_type == FNAMECMP_FNAME)
131 set_perms(fname, file, &st, PERMS_REPORT);
132 @@ -598,8 +693,24 @@ notify_others:
134 if (protocol_version >= 29 && inplace && !read_batch)
135 write_byte(f_out, fnamecmp_type);
136 - if (f_out_name >= 0)
137 + if (f_out_name >= 0) {
138 write_byte(f_out_name, fnamecmp_type);
139 + if (fnamecmp_type == FNAMECMP_FUZZY) {
140 + uchar lenbuf[3], *lb = lenbuf;
141 + int len = strlen(fnamecmpbuf);
143 +#if MAXPATHLEN > 0x7FFF
144 + *lb++ = len / 0x10000 + 0x80;
145 + *lb++ = len / 0x100;
147 + *lb++ = len / 0x100 + 0x80;
151 + write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
152 + write_buf(f_out_name, fnamecmpbuf, len);
156 if (dry_run || read_batch)
158 --- orig/main.c 2005-01-17 23:11:45
159 +++ main.c 2005-01-14 18:33:15
160 @@ -48,6 +48,7 @@ extern int keep_dirlinks;
161 extern int preserve_hard_links;
162 extern int protocol_version;
164 +extern int fuzzy_basis;
165 extern int relative_paths;
166 extern int rsync_port;
167 extern int whole_file;
168 @@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
171 int error_pipe[2], name_pipe[2];
172 - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
173 + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
176 /* The receiving side mustn't obey this, or an existing symlink that
177 * points to an identical file won't be replaced by the referent. */
178 --- orig/options.c 2005-01-17 23:11:45
179 +++ options.c 2005-01-15 21:08:13
180 @@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
182 int daemon_bwlimit = 0;
184 +int fuzzy_basis = 0;
185 size_t bwlimit_writemax = 0;
186 int delete_after = 0;
187 int only_existing = 0;
188 @@ -288,6 +289,7 @@ void usage(enum logcode F)
189 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
190 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
191 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
192 + rprintf(F," --fuzzy find similar file for basis when no dest file\n");
193 rprintf(F," -P equivalent to --partial --progress\n");
194 rprintf(F," -z, --compress compress file data\n");
195 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
196 @@ -384,6 +386,7 @@ static struct poptOption long_options[]
197 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
198 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
199 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
200 + {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
201 /* TODO: Should this take an optional int giving the compression level? */
202 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
203 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
204 @@ -1234,6 +1237,9 @@ void server_options(char **args,int *arg
205 args[ac++] = "--no-relative";
208 + if (fuzzy_basis && am_sender)
209 + args[ac++] = "--fuzzy";
214 --- orig/receiver.c 2005-01-17 23:11:45
215 +++ receiver.c 2005-01-15 21:21:02
216 @@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
220 +static void read_gen_name(int fd, char *buf)
222 + int len = read_byte(fd);
224 +#if MAXPATHLEN > 32767
226 + read_buf(fd, (char *)lenbuf, 2);
227 + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
229 + len = (len & ~0x80) * 0x100 + read_byte(fd);
232 + if (len >= MAXPATHLEN) {
233 + rprintf(FERROR, "bogus data on generator name pipe\n");
234 + exit_cleanup(RERR_PROTOCOL);
237 + read_sbuf(fd, buf, len);
241 static void discard_receive_data(int f_in, OFF_T length)
243 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
244 @@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
245 case FNAMECMP_BACKUP:
246 fnamecmp = get_backup_name(fname);
248 + case FNAMECMP_FUZZY:
249 + read_gen_name(f_in_name, fnamecmpbuf);
250 + fnamecmp = fnamecmpbuf;
253 if (j >= basis_dir_cnt) {
255 --- orig/rsync.h 2005-01-17 23:11:45
256 +++ rsync.h 2005-01-15 21:24:09
258 #define FNAMECMP_FNAME 0x80
259 #define FNAMECMP_PARTIAL_DIR 0x81
260 #define FNAMECMP_BACKUP 0x82
261 +#define FNAMECMP_FUZZY 0x83
264 /* Log-message categories. FLOG is only used on the daemon side to
265 --- orig/rsync.yo 2005-01-17 23:11:46
266 +++ rsync.yo 2005-01-15 21:48:52
267 @@ -358,6 +358,7 @@ verb(
268 --compare-dest=DIR also compare received files relative to DIR
269 --copy-dest=DIR ... and include copies of unchanged files
270 --link-dest=DIR hardlink to files in DIR when unchanged
271 + --fuzzy find similar file for basis when no dest
272 -P equivalent to --partial --progress
273 -z, --compress compress file data
274 -C, --cvs-exclude auto ignore files in the same way CVS does
275 @@ -878,6 +879,14 @@ Note that rsync versions prior to 2.6.1
276 (or implied by -a). You can work-around this bug by avoiding the -o option
277 when sending to an old rsync.
279 +dit(bf(--fuzzy)) This option tells rsync that it should look around for a
280 +basis file for any destination file that is missing. The current algorithm
281 +looks for a similarly-named file in the same directory as the destination
282 +file, and, if found, uses that to try to speed up the transfer. Note that
283 +the use of the --delete option might get rid of any potential fuzzy-match
284 +files, so either use --delete-after or filename exclusions if you need to
287 dit(bf(-z, --compress)) With this option, rsync compresses any data from
288 the files that it sends to the destination machine. This
289 option is useful on slow connections. The compression method used is the
290 --- orig/util.c 2004-09-07 21:45:30
291 +++ util.c 2005-01-18 19:25:47
292 @@ -1217,3 +1217,108 @@ void *_realloc_array(void *ptr, unsigned
293 return malloc(size * num);
294 return realloc(ptr, size * num);
297 +/* Take a filename and filename length and return the most significant
298 + * filename suffix we can find. This ignores suffixes such as "~",
299 + * ".bak", ".orig", ".~1~", etc. */
300 +const char *find_filename_suffix(const char *fn, int *len_ptr)
302 + const char *suf, *s;
303 + int s_len, fn_len = *len_ptr;
306 + /* One or more dots at the start aren't a suffix. */
307 + while (fn_len && *fn == '.') fn++, fn_len--;
309 + /* Ignore the ~ in a "foo~" filename. */
310 + if (fn_len > 1 && fn[fn_len-1] == '~')
311 + fn_len--, had_tilde = True;
315 + /* Assume we don't find a suffix. */
319 + /* Find the last significant suffix. */
320 + for (s = fn + fn_len - 1; fn_len > 1; ) {
321 + while (*s != '.' && s != fn) s--;
324 + s_len = fn_len - (s - fn);
327 + if (strcmp(s+1, "bak") == 0
328 + || strcmp(s+1, "old") == 0)
330 + } else if (s_len == 4) {
331 + if (strcmp(s+1, "orig") == 0)
333 + } else if (s_len > 2 && had_tilde
334 + && s[1] == '~' && isdigit(s[2]))
338 + /* Determine if the suffix is all digits. */
339 + for (s++, s_len--; s_len > 0; s++, s_len--) {
343 + /* An all-digit suffix may not be that significant. */
350 +/* This is an implementation of the Levenshtein distance algorithm. It
351 + * was implemented to avoid needing a two-dimensional matrix (to save
352 + * memory). It was also tweaked to try to factor in the ASCII distance
353 + * between changed characters as a minor distance quantity. The normal
354 + * Levenshtein units of distance (each signifying a single change between
355 + * the two strings) are defined as a "UNIT". */
357 +#define UNIT (1 << 16)
359 +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
361 + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
365 + if (!len1 || !len2) {
370 + for (i1 = 0, cost = 0; i1 < len1; i1++)
372 + return (int32)len1 * UNIT + cost;
375 + for (i2 = 0; i2 < len2; i2++)
376 + a[i2] = (i2+1) * UNIT;
378 + for (i1 = 0; i1 < len1; i1++) {
380 + above = (i1+1) * UNIT;
381 + for (i2 = 0; i2 < len2; i2++) {
383 + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
385 + cost = UNIT - cost;
387 + cost = UNIT + cost;
389 + diag_inc = diag + cost;
390 + left_inc = left + UNIT + *((uchar*)s1+i1);
391 + above_inc = above + UNIT + *((uchar*)s2+i2);
392 + a[i2] = above = left < above
393 + ? (left_inc < diag_inc ? left_inc : diag_inc)
394 + : (above_inc < diag_inc ? above_inc : diag_inc);