1 This patch adds the --detect-renamed option which makes rsync notice files
2 that either (1) match in size & modify-time (plus the basename, if possible)
3 or (2) match in size & checksum (when --checksum was also specified) and use
4 each match as an alternate basis file to speed up the transfer.
6 The algorithm attempts to scan the receiving-side's files in an efficient
7 manner. If --delete[-before] is enabled, we'll take advantage of the
8 pre-transfer delete pass to prepare any alternate-basis-file matches we
9 might find. If --delete-before is not enabled, rsync does the rename scan
10 during the regular file-sending scan (scanning each directory right before
11 the generator starts updating files from that dir). In this latter mode,
12 rsync might delay the updating of a file (if no alternate-basis match was
13 yet found) until the full scan of the receiving side is complete, at which
14 point any delayed files are processed.
16 I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that
17 takes advantage of rsync's pre-existing partial-dir logic. This uses less
18 memory than trying to keep track of the matches internally, and also allows
19 any deletions or file-updates to occur normally without interfering with
20 these alternate-basis discoveries.
22 To use this patch, run these commands for a successful build:
24 patch -p1 <patches/detect-renamed.diff
25 ./configure (optional if already run)
30 fattr_find() must never return a match that already has a basis
31 file. This will ensure that we don't try to give a renamed file to
32 a file that can't use it, while missing out on giving it to a file that could.
37 @@ -41,6 +41,7 @@ extern int checksum_seed;
38 extern int basis_dir_cnt;
39 extern int prune_empty_dirs;
40 extern int protocol_version;
41 +extern int detect_renamed;
42 extern int protect_args;
43 extern int preserve_uid;
44 extern int preserve_gid;
45 @@ -218,7 +219,7 @@ void setup_protocol(int f_out,int f_in)
46 } else if (protocol_version >= 30) {
47 if (recurse && allow_inc_recurse
48 && !delete_before && !delete_after && !delay_updates
49 - && !use_qsort && !prune_empty_dirs)
50 + && !use_qsort && !prune_empty_dirs && !detect_renamed)
52 need_messages_from_generator = 1;
56 @@ -61,6 +61,7 @@ extern int non_perishable_cnt;
57 extern int prune_empty_dirs;
58 extern int copy_links;
59 extern int copy_unsafe_links;
60 +extern int detect_renamed;
61 extern int protocol_version;
62 extern int sanitize_paths;
63 extern struct stats stats;
64 @@ -113,6 +114,8 @@ static int64 tmp_dev, tmp_ino;
66 static char tmp_sum[MAX_DIGEST_LEN];
68 +struct file_list the_fattr_list;
70 static char empty_sum[MAX_DIGEST_LEN];
71 static int flist_count_offset; /* for --delete --progress */
72 static int dir_count = 0;
73 @@ -252,6 +255,45 @@ static int is_excluded(char *fname, int
77 +static int fattr_compare(struct file_struct **file1, struct file_struct **file2)
79 + struct file_struct *f1 = *file1;
80 + struct file_struct *f2 = *file2;
81 + int64 len1 = F_LENGTH(f1), len2 = F_LENGTH(f2);
84 + if (!f1->basename || !S_ISREG(f1->mode) || !len1) {
85 + if (!f2->basename || !S_ISREG(f2->mode) || !len2)
89 + if (!f2->basename || !S_ISREG(f2->mode) || !len2)
92 + /* Don't use diff for values that are longer than an int. */
94 + return len1 < len2 ? -1 : 1;
96 + if (always_checksum) {
97 + diff = u_memcmp(F_SUM(f1), F_SUM(f2), checksum_len);
100 + } else if (f1->modtime != f2->modtime)
101 + return f1->modtime < f2->modtime ? -1 : 1;
103 + diff = u_strcmp(f1->basename, f2->basename);
107 + if (f1->dirname == f2->dirname)
113 + return u_strcmp(f1->dirname, f2->dirname);
116 static void send_directory(int f, struct file_list *flist,
117 char *fbuf, int len, int flags);
119 @@ -2154,6 +2196,25 @@ struct file_list *recv_file_list(int f)
121 clean_flist(flist, relative_paths);
123 + if (detect_renamed) {
124 + int j = flist->used;
125 + the_fattr_list.used = j;
126 + the_fattr_list.files = new_array(struct file_struct *, j);
127 + if (!the_fattr_list.files)
128 + out_of_memory("recv_file_list");
129 + memcpy(the_fattr_list.files, flist->files,
130 + j * sizeof (struct file_struct *));
131 + qsort(the_fattr_list.files, j,
132 + sizeof the_fattr_list.files[0], (int (*)())fattr_compare);
133 + the_fattr_list.low = 0;
135 + struct file_struct *fp = the_fattr_list.files[j];
136 + if (fp->basename && S_ISREG(fp->mode) && F_LENGTH(fp))
139 + the_fattr_list.high = j;
142 if (protocol_version < 30) {
143 /* Recv the io_error flag */
147 @@ -79,6 +79,7 @@ extern char *basis_dir[];
148 extern int compare_dest;
149 extern int copy_dest;
150 extern int link_dest;
151 +extern int detect_renamed;
152 extern int whole_file;
153 extern int list_only;
154 extern int read_batch;
155 @@ -95,6 +96,7 @@ extern char *backup_suffix;
156 extern int backup_suffix_len;
157 extern struct file_list *cur_flist, *first_flist, *dir_flist;
158 extern struct filter_list_struct server_filter_list;
159 +extern struct file_list the_fattr_list;
163 @@ -105,6 +107,7 @@ int maybe_ATTRS_REPORT = 0;
165 static dev_t dev_zero;
166 static int deletion_count = 0; /* used to implement --max-delete */
167 +static int unexplored_dirs = 1;
168 static int deldelay_size = 0, deldelay_cnt = 0;
169 static char *deldelay_buf = NULL;
170 static int deldelay_fd = -1;
171 @@ -114,7 +117,8 @@ static int need_retouch_dir_times;
172 static int need_retouch_dir_perms;
173 static const char *solo_file = NULL;
175 -/* For calling delete_item() and delete_dir_contents(). */
176 +/* For calling delete_item(), delete_dir_contents(), and delete_in_dir(). */
177 +#define DEL_NO_DELETIONS (1<<0)
178 #define DEL_RECURSE (1<<1) /* recurse */
179 #define DEL_DIR_IS_EMPTY (1<<2) /* internal delete_FUNCTIONS use only */
181 @@ -136,11 +140,121 @@ static int is_backup_file(char *fn)
182 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
185 +/* Search for a regular file that matches either (1) the size & modified
186 + * time (plus the basename, if possible) or (2) the size & checksum. If
187 + * we find an exact match down to the dirname, return -1 because we found
188 + * an up-to-date file in the transfer, not a renamed file. */
189 +static int fattr_find(struct file_struct *f, char *fname)
191 + int low = the_fattr_list.low, high = the_fattr_list.high;
192 + int mid, ok_match = -1, good_match = -1;
193 + struct file_struct *fmid;
196 + while (low <= high) {
197 + mid = (low + high) / 2;
198 + fmid = the_fattr_list.files[mid];
199 + if (F_LENGTH(fmid) != F_LENGTH(f)) {
200 + if (F_LENGTH(fmid) < F_LENGTH(f))
206 + if (always_checksum) {
207 + /* We use the FLAG_FILE_SENT flag to indicate when we
208 + * have computed the checksum for an entry. */
209 + if (!(f->flags & FLAG_FILE_SENT)) {
210 + if (fmid->modtime == f->modtime
211 + && f_name_cmp(fmid, f) == 0)
212 + return -1; /* assume we can't help */
213 + file_checksum(fname, F_SUM(f), F_LENGTH(f));
214 + f->flags |= FLAG_FILE_SENT;
216 + diff = u_memcmp(F_SUM(fmid), F_SUM(f), checksum_len);
225 + if (fmid->modtime != f->modtime) {
226 + if (fmid->modtime < f->modtime)
234 + diff = u_strcmp(fmid->basename, f->basename);
237 + if (fmid->dirname == f->dirname)
238 + return -1; /* file is up-to-date */
239 + if (!fmid->dirname) {
247 + diff = u_strcmp(fmid->dirname, f->dirname);
249 + return -1; /* file is up-to-date */
257 + return good_match >= 0 ? good_match : ok_match;
260 +static void look_for_rename(struct file_struct *file, char *fname)
262 + struct file_struct *fp;
263 + char *partialptr, *fn;
267 + if (!partial_dir || (ndx = fattr_find(file, fname)) < 0)
270 + fp = the_fattr_list.files[ndx];
271 + fn = f_name(fp, NULL);
272 + /* We don't provide an alternate-basis file if there is a basis file. */
273 + if (link_stat(fn, &st, 0) == 0)
277 + if ((partialptr = partial_dir_fname(fn)) == NULL
278 + || !handle_partial_dir(partialptr, PDIR_CREATE))
280 + /* We only use the file if we can hard-link it into our tmp dir. */
281 + if (link(fname, partialptr) != 0) {
282 + if (errno != EEXIST)
283 + handle_partial_dir(partialptr, PDIR_DELETE);
288 + /* I think this falls into the -vv category with "%s is uptodate", etc. */
290 + rprintf(FINFO, "found renamed: %s => %s\n", fname, fn);
293 /* Delete a file or directory. If DEL_RECURSE is set in the flags, this will
294 * delete recursively.
296 * Note that fbuf must point to a MAXPATHLEN buffer if the mode indicates it's
297 * a directory! (The buffer is used for recursion, but returned unchanged.)
299 + * Also note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
301 static enum delret delete_item(char *fbuf, int mode, char *replace, int flags)
303 @@ -162,6 +276,8 @@ static enum delret delete_item(char *fbu
305 /* OK: try to delete the directory. */
307 + if (flags & DEL_NO_DELETIONS)
310 if (!replace && max_delete >= 0 && ++deletion_count > max_delete)
312 @@ -208,6 +324,8 @@ static enum delret delete_item(char *fbu
313 * its contents, otherwise just checks for content. Returns DR_SUCCESS or
314 * DR_NOT_EMPTY. Note that fname must point to a MAXPATHLEN buffer! (The
315 * buffer is used for recursion, but returned unchanged.)
317 + * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
319 static enum delret delete_dir_contents(char *fname, int flags)
321 @@ -227,7 +345,9 @@ static enum delret delete_dir_contents(c
322 save_filters = push_local_filters(fname, dlen);
324 non_perishable_cnt = 0;
325 + file_extra_cnt += SUM_EXTRA_CNT;
326 dirlist = get_dirlist(fname, dlen, 0);
327 + file_extra_cnt -= SUM_EXTRA_CNT;
328 ret = non_perishable_cnt ? DR_NOT_EMPTY : DR_SUCCESS;
331 @@ -264,6 +384,8 @@ static enum delret delete_dir_contents(c
332 if (S_ISDIR(fp->mode)
333 && delete_dir_contents(fname, flags | DEL_RECURSE) != DR_SUCCESS)
335 + if (detect_renamed && S_ISREG(fp->mode))
336 + look_for_rename(fp, fname);
337 if (delete_item(fname, fp->mode, NULL, flags) != DR_SUCCESS)
340 @@ -416,13 +538,18 @@ static void do_delayed_deletions(char *d
341 * all the --delete-WHEN options. Note that the fbuf pointer must point to a
342 * MAXPATHLEN buffer with the name of the directory in it (the functions we
343 * call will append names onto the end, but the old dir value will be restored
345 -static void delete_in_dir(char *fbuf, struct file_struct *file, dev_t *fs_dev)
348 + * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
350 +static void delete_in_dir(char *fbuf, struct file_struct *file, dev_t *fs_dev,
353 static int already_warned = 0;
354 struct file_list *dirlist;
355 - char delbuf[MAXPATHLEN];
357 + char *p, delbuf[MAXPATHLEN];
358 + unsigned remainder;
359 + int dlen, i, restore_dot = 0;
362 change_local_filter_dir(NULL, 0, 0);
363 @@ -432,21 +559,28 @@ static void delete_in_dir(char *fbuf, st
365 rprintf(FINFO, "delete_in_dir(%s)\n", fbuf);
367 + flags |= DEL_RECURSE;
370 maybe_send_keepalive();
372 if (io_error && !ignore_errors) {
373 - if (already_warned)
374 + if (!already_warned) {
376 + "IO error encountered -- skipping file deletion\n");
377 + already_warned = 1;
379 + if (!detect_renamed)
382 - "IO error encountered -- skipping file deletion\n");
383 - already_warned = 1;
385 + flags |= DEL_NO_DELETIONS;
389 change_local_filter_dir(fbuf, dlen, F_DEPTH(file));
391 + if (detect_renamed)
394 if (one_file_system) {
395 if (file->flags & FLAG_TOP_DIR)
396 filesystem_dev = *fs_dev;
397 @@ -456,6 +590,14 @@ static void delete_in_dir(char *fbuf, st
399 dirlist = get_dirlist(fbuf, dlen, 0);
402 + if (dlen == 1 && *fbuf == '.') {
405 + } else if (dlen != 1 || *fbuf != '/')
407 + remainder = MAXPATHLEN - (p - fbuf);
409 /* If an item in dirlist is not found in flist, delete it
410 * from the filesystem. */
411 for (i = dirlist->used; i--; ) {
412 @@ -468,16 +610,25 @@ static void delete_in_dir(char *fbuf, st
416 + if (detect_renamed && S_ISREG(fp->mode)) {
417 + strlcpy(p, fp->basename, remainder);
418 + look_for_rename(fp, fbuf);
420 if (flist_find(cur_flist, fp) < 0) {
422 - if (delete_during == 2) {
423 + if (delete_during == 2 && !(flags & DEL_NO_DELETIONS)) {
424 if (!remember_delete(fp, delbuf))
427 - delete_item(delbuf, fp->mode, NULL, DEL_RECURSE);
429 + delete_item(delbuf, fp->mode, NULL, flags);
430 + } else if (detect_renamed && S_ISDIR(fp->mode))
441 @@ -507,9 +658,9 @@ static void do_delete_pass(void)
442 || !S_ISDIR(st.st_mode))
445 - delete_in_dir(fbuf, file, &st.st_dev);
446 + delete_in_dir(fbuf, file, &st.st_dev, 0);
448 - delete_in_dir(NULL, NULL, &dev_zero);
449 + delete_in_dir(NULL, NULL, &dev_zero, 0);
451 if (do_progress && !am_server)
452 rprintf(FINFO, " \r");
453 @@ -1101,6 +1252,7 @@ static void list_file_entry(struct file_
457 +static struct bitbag *delayed_bits = NULL;
458 static int phase = 0;
459 static int dflt_perms;
461 @@ -1345,8 +1497,12 @@ static void recv_generator(char *fname,
464 else if (delete_during && f_out != -1 && !phase && dry_run < 2
465 - && (file->flags & FLAG_CONTENT_DIR))
466 - delete_in_dir(fname, file, &real_sx.st.st_dev);
467 + && (file->flags & FLAG_CONTENT_DIR)) {
468 + if (detect_renamed && real_ret != 0)
470 + delete_in_dir(fname, file, &real_sx.st.st_dev,
471 + delete_during < 0 ? DEL_NO_DELETIONS : 0);
476 @@ -1624,8 +1780,14 @@ static void recv_generator(char *fname,
480 - if (stat_errno == ENOENT)
481 + if (stat_errno == ENOENT) {
482 + if (detect_renamed && unexplored_dirs > 0
483 + && F_LENGTH(file)) {
484 + bitbag_set_bit(delayed_bits, ndx);
489 rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s",
492 @@ -1961,6 +2123,12 @@ void generate_files(int f_out, const cha
494 rprintf(FINFO, "generator starting pid=%ld\n", (long)getpid());
496 + if (detect_renamed) {
497 + delayed_bits = bitbag_create(cur_flist->used);
498 + if (!delete_before && !delete_during)
499 + delete_during = -1;
502 if (delete_before && !solo_file && cur_flist->used > 0)
504 if (delete_during == 2) {
505 @@ -1971,7 +2139,7 @@ void generate_files(int f_out, const cha
509 - if (append_mode > 0 || whole_file < 0)
510 + if (append_mode > 0 || detect_renamed || whole_file < 0)
513 rprintf(FINFO, "delta-transmission %s\n",
514 @@ -2009,7 +2177,7 @@ void generate_files(int f_out, const cha
515 dirdev = MAKEDEV(DEV_MAJOR(devp), DEV_MINOR(devp));
517 dirdev = MAKEDEV(0, 0);
518 - delete_in_dir(f_name(fp, fbuf), fp, &dirdev);
519 + delete_in_dir(f_name(fp, fbuf), fp, &dirdev, 0);
523 @@ -2054,7 +2222,21 @@ void generate_files(int f_out, const cha
524 } while ((cur_flist = cur_flist->next) != NULL);
527 - delete_in_dir(NULL, NULL, &dev_zero);
528 + delete_in_dir(NULL, NULL, &dev_zero, 0);
529 + if (detect_renamed) {
530 + if (delete_during < 0)
532 + detect_renamed = 0;
534 + for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) {
535 + struct file_struct *file = cur_flist->files[i];
537 + strlcpy(fbuf, local_name, sizeof fbuf);
539 + f_name(file, fbuf);
540 + recv_generator(fbuf, file, i, itemizing, code, f_out);
545 rprintf(FINFO, "generate_files phase=%d\n", phase);
548 @@ -80,6 +80,7 @@ int am_generator = 0;
549 int am_starting_up = 1;
550 int relative_paths = -1;
551 int implied_dirs = 1;
552 +int detect_renamed = 0;
554 int allow_8bit_chars = 0;
555 int force_delete = 0;
556 @@ -383,6 +384,7 @@ void usage(enum logcode F)
557 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
558 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
559 rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
560 + rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n");
561 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
562 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
563 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
564 @@ -561,6 +563,7 @@ static struct poptOption long_options[]
565 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
566 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
567 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
568 + {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 },
569 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
570 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
571 {"no-compress", 0, POPT_ARG_VAL, &do_compression, 0, 0, 0 },
572 @@ -1532,7 +1535,7 @@ int parse_arguments(int *argc_p, const c
576 - if (delay_updates && !partial_dir)
577 + if ((delay_updates || detect_renamed) && !partial_dir)
578 partial_dir = tmp_partialdir;
581 @@ -1541,6 +1544,7 @@ int parse_arguments(int *argc_p, const c
582 snprintf(err_buf, sizeof err_buf,
583 "--%s cannot be used with --%s\n",
584 append_mode ? "append" : "inplace",
585 + detect_renamed ? "detect-renamed" :
586 delay_updates ? "delay-updates" : "partial-dir");
589 @@ -1890,6 +1894,9 @@ void server_options(char **args, int *ar
593 + /* Both sides need to know in case this disables incremental recursion. */
594 + if (detect_renamed)
595 + args[ac++] = "--detect-renamed";
597 if (modify_window_set) {
598 if (asprintf(&arg, "--modify-window=%d", modify_window) < 0)
601 @@ -384,6 +384,7 @@ to the detailed description below for a
602 --modify-window=NUM compare mod-times with reduced accuracy
603 -T, --temp-dir=DIR create temporary files in directory DIR
604 -y, --fuzzy find similar file for basis if no dest file
605 + --detect-renamed try to find renamed files to speed the xfer
606 --compare-dest=DIR also compare received files relative to DIR
607 --copy-dest=DIR ... and include copies of unchanged files
608 --link-dest=DIR hardlink to files in DIR when unchanged
609 @@ -1424,6 +1425,21 @@ Note that the use of the bf(--delete) op
610 fuzzy-match files, so either use bf(--delete-after) or specify some
611 filename exclusions if you need to prevent this.
613 +dit(bf(--detect-renamed)) With this option, for each new source file
614 +(call it em(src/S)), rsync looks for a file em(dest/D) anywhere in the
615 +destination that passes the quick check with em(src/S). If such a em(dest/D)
616 +is found, rsync uses it as an alternate basis for transferring em(S). The
617 +idea is that if em(src/S) was renamed from em(src/D) (as opposed to em(src/S)
618 +passing the quick check with em(dest/D) by coincidence), the delta-transfer
619 +algorithm will find that all the data matches between em(src/S) and em(dest/D),
620 +and the transfer will be really fast.
622 +By default, alternate-basis files are hard-linked into a directory named
623 +".~tmp~" in each file's destination directory, but if you've specified
624 +the bf(--partial-dir) option, that directory will be used instead. These
625 +potential alternate-basis files will be removed as the transfer progresses.
626 +This option conflicts with bf(--inplace) and bf(--append).
628 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
629 the destination machine as an additional hierarchy to compare destination
630 files against doing transfers (if the files are missing in the destination
633 @@ -1030,6 +1030,32 @@ int handle_partial_dir(const char *fname
637 +/* We need to supply our own strcmp function for file list comparisons
638 + * to ensure that signed/unsigned usage is consistent between machines. */
639 +int u_strcmp(const char *p1, const char *p2)
641 + for ( ; *p1; p1++, p2++) {
646 + return (int)*(uchar*)p1 - (int)*(uchar*)p2;
649 +/* We need a memcmp function that compares unsigned-byte values. */
650 +int u_memcmp(const void *p1, const void *p2, size_t len)
652 + const uchar *u1 = p1;
653 + const uchar *u2 = p2;
657 + return (int)*u1 - (int)*u2;
664 * Determine if a symlink points outside the current directory tree.
665 * This is considered "unsafe" because e.g. when mirroring somebody