1 This patch adds the --detect-renamed option which makes rsync notice files
2 that either (1) match in size & modify-time (plus the basename, if possible)
3 or (2) match in size & checksum (when --checksum was also specified) and use
4 each match as an alternate basis file to speed up the transfer.
6 The algorithm attempts to scan the receiving-side's files in an efficient
7 manner. If --delete[-before] is enabled, we'll take advantage of the
8 pre-transfer delete pass to prepare any alternate-basis-file matches we
9 might find. If --delete-before is not enabled, rsync does the rename scan
10 during the regular file-sending scan (scanning each directory right before
11 the generator starts updating files from that dir). In this latter mode,
12 rsync might delay the updating of a file (if no alternate-basis match was
13 yet found) until the full scan of the receiving side is complete, at which
14 point any delayed files are processed.
16 I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that
17 takes advantage of rsync's pre-existing partial-dir logic. This uses less
18 memory than trying to keep track of the matches internally, and also allows
19 any deletions or file-updates to occur normally without interfering with
20 these alternate-basis discoveries.
22 To use this patch, run these commands for a successful build:
24 patch -p1 <patches/detect-renamed.diff
25 ./configure (optional if already run)
30 We need to never return a match from fattr_find() that has a basis
31 file. This will ensure that we don't try to give a renamed file to
32 a file that can't use it, while missing out on giving it to a file that could use it.
37 @@ -54,6 +54,7 @@ extern int non_perishable_cnt;
38 extern int prune_empty_dirs;
39 extern int copy_links;
40 extern int copy_unsafe_links;
41 +extern int detect_renamed;
42 extern int protocol_version;
43 extern int sanitize_paths;
44 extern struct stats stats;
45 @@ -80,6 +81,8 @@ static struct idev tmp_idev;
47 static char tmp_sum[MD4_SUM_LENGTH];
49 +struct file_list the_fattr_list;
51 static char empty_sum[MD4_SUM_LENGTH];
52 static int flist_count_offset;
54 @@ -262,6 +265,44 @@ static mode_t from_wire_mode(int mode)
58 +static int fattr_compare(struct file_struct **file1, struct file_struct **file2)
60 + struct file_struct *f1 = *file1;
61 + struct file_struct *f2 = *file2;
64 + if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) {
65 + if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
69 + if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
72 + /* Don't use diff for values that are longer than an int. */
73 + if (f1->length != f2->length)
74 + return f1->length < f2->length ? -1 : 1;
76 + if (always_checksum) {
77 + diff = u_memcmp(F_SUM(f1), F_SUM(f2), checksum_len);
80 + } else if (f1->modtime != f2->modtime)
81 + return f1->modtime < f2->modtime ? -1 : 1;
83 + diff = u_strcmp(f1->basename, f2->basename);
87 + if (f1->dirname == f2->dirname)
93 + return u_strcmp(f1->dirname, f2->dirname);
96 static void send_directory(int f, struct file_list *flist,
99 @@ -1414,6 +1455,25 @@ struct file_list *recv_file_list(int f)
101 clean_flist(flist, relative_paths, 1);
103 + if (detect_renamed) {
104 + int j = flist->count;
105 + the_fattr_list.count = j;
106 + the_fattr_list.files = new_array(struct file_struct *, j);
107 + if (!the_fattr_list.files)
108 + out_of_memory("recv_file_list");
109 + memcpy(the_fattr_list.files, flist->files,
110 + j * sizeof (struct file_struct *));
111 + qsort(the_fattr_list.files, j,
112 + sizeof the_fattr_list.files[0], (int (*)())fattr_compare);
113 + the_fattr_list.low = 0;
115 + struct file_struct *fp = the_fattr_list.files[j];
116 + if (fp->basename && S_ISREG(fp->mode) && fp->length)
119 + the_fattr_list.high = j;
123 recv_uid_list(f, flist);
127 @@ -76,6 +76,7 @@ extern char *basis_dir[];
128 extern int compare_dest;
129 extern int copy_dest;
130 extern int link_dest;
131 +extern int detect_renamed;
132 extern int whole_file;
133 extern int list_only;
134 extern int new_root_dir;
135 @@ -91,6 +92,7 @@ extern char *backup_dir;
136 extern char *backup_suffix;
137 extern int backup_suffix_len;
138 extern struct file_list *the_file_list;
139 +extern struct file_list the_fattr_list;
140 extern struct filter_list_struct server_filter_list;
142 int ignore_perishable = 0;
143 @@ -98,12 +100,14 @@ int non_perishable_cnt = 0;
144 int maybe_ATTRS_REPORT = 0;
146 static int deletion_count = 0; /* used to implement --max-delete */
147 +static int unexplored_dirs = 1;
148 static int deldelay_size = 0, deldelay_cnt = 0;
149 static char *deldelay_buf = NULL;
150 static int deldelay_fd = -1;
151 static BOOL solo_file = 0;
153 -/* For calling delete_item() and delete_dir_contents(). */
154 +/* For calling delete_item(), delete_dir_contents(), and delete_in_dir(). */
155 +#define DEL_NO_DELETIONS (1<<0)
156 #define DEL_RECURSE (1<<1) /* recurse */
157 #define DEL_DIR_IS_EMPTY (1<<2) /* internal delete_FUNCTIONS use only */
159 @@ -125,11 +129,121 @@ static int is_backup_file(char *fn)
160 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
163 +/* Search for a regular file that matches either (1) the size & modified
164 + * time (plus the basename, if possible) or (2) the size & checksum. If
165 + * we find an exact match down to the dirname, return -1 because we found
166 + * an up-to-date file in the transfer, not a renamed file. */
167 +static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool)
169 + int low = the_fattr_list.low, high = the_fattr_list.high;
170 + int mid, ok_match = -1, good_match = -1;
171 + struct file_struct *fmid;
174 + while (low <= high) {
175 + mid = (low + high) / 2;
176 + fmid = the_fattr_list.files[mid];
177 + if (fmid->length != f->length) {
178 + if (fmid->length < f->length)
184 + if (always_checksum) {
186 + if (fmid->modtime == f->modtime
187 + && f_name_cmp(fmid, f) == 0)
188 + return -1; /* assume we can't help */
189 + /* XXX update this to new checksum var idiom! */
190 + F_SUM(f) = pool_alloc(pool, MD4_SUM_LENGTH,
192 + file_checksum(fname, F_SUM(f), f->length);
194 + diff = u_memcmp(F_SUM(fmid), F_SUM(f), checksum_len);
203 + if (fmid->modtime != f->modtime) {
204 + if (fmid->modtime < f->modtime)
212 + diff = u_strcmp(fmid->basename, f->basename);
215 + if (fmid->dirname == f->dirname)
216 + return -1; /* file is up-to-date */
217 + if (!fmid->dirname) {
225 + diff = u_strcmp(fmid->dirname, f->dirname);
227 + return -1; /* file is up-to-date */
235 + return good_match >= 0 ? good_match : ok_match;
238 +static void look_for_rename(struct file_struct *file, char *fname,
241 + struct file_struct *fp;
242 + char *partialptr, *fn;
246 + if ((ndx = fattr_find(file, fname, pool)) < 0)
249 + fp = the_fattr_list.files[ndx];
250 + fn = f_name(fp, NULL);
251 + /* We don't provide an alternate-basis file if there is a basis file. */
252 + if (link_stat(fn, &st, 0) == 0)
254 + if ((partialptr = partial_dir_fname(fn)) == NULL
255 + || !handle_partial_dir(partialptr, PDIR_CREATE))
258 + /* We only use the file if we can hard-link it into our tmp dir. */
259 + if (link(fname, partialptr) == 0) {
261 + rprintf(FINFO, "found renamed: %s => %s\n",
262 + fname, partialptr);
267 + if (errno != EEXIST)
268 + handle_partial_dir(partialptr, PDIR_DELETE);
271 /* Delete a file or directory. If DEL_RECURSE is set in the flags, this will
272 * delete recursively.
274 * Note that fbuf must point to a MAXPATHLEN buffer if the mode indicates it's
275 * a directory! (The buffer is used for recursion, but returned unchanged.)
277 + * Also note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
279 static enum delret delete_item(char *fbuf, int mode, char *replace, int flags)
281 @@ -151,6 +265,8 @@ static enum delret delete_item(char *fbu
283 /* OK: try to delete the directory. */
285 + if (flags & DEL_NO_DELETIONS)
288 if (!replace && max_delete >= 0 && ++deletion_count > max_delete)
290 @@ -197,6 +313,8 @@ static enum delret delete_item(char *fbu
291 * its contents, otherwise just checks for content. Returns DR_SUCCESS or
292 * DR_NOT_EMPTY. Note that fname must point to a MAXPATHLEN buffer! (The
293 * buffer is used for recursion, but returned unchanged.)
295 + * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
297 static enum delret delete_dir_contents(char *fname, int flags)
299 @@ -253,6 +371,8 @@ static enum delret delete_dir_contents(c
300 if (S_ISDIR(fp->mode)
301 && delete_dir_contents(fname, flags | DEL_RECURSE) != DR_SUCCESS)
303 + if (detect_renamed && S_ISREG(fp->mode))
304 + look_for_rename(fp, fname, dirlist->file_pool);
305 if (delete_item(fname, fp->mode, NULL, flags) != DR_SUCCESS)
308 @@ -405,15 +525,19 @@ static void do_delayed_deletions(char *d
309 * all the --delete-WHEN options. Note that the fbuf pointer must point to a
310 * MAXPATHLEN buffer with the name of the directory in it (the functions we
311 * call will append names onto the end, but the old dir value will be restored
315 + * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
317 static void delete_in_dir(struct file_list *flist, char *fbuf,
318 - struct file_struct *file, STRUCT_STAT *stp)
319 + struct file_struct *file, STRUCT_STAT *stp, int flags)
321 static int min_depth = MAXPATHLEN, cur_depth = -1;
322 static void *filt_array[MAXPATHLEN/2+1];
323 static int already_warned = 0;
324 struct file_list *dirlist;
325 - char delbuf[MAXPATHLEN];
326 + char *p, delbuf[MAXPATHLEN];
327 + unsigned remainder;
331 @@ -427,6 +551,8 @@ static void delete_in_dir(struct file_li
333 rprintf(FINFO, "delete_in_dir(%s)\n", fbuf);
335 + flags |= DEL_RECURSE;
338 maybe_send_keepalive();
340 @@ -434,12 +560,14 @@ static void delete_in_dir(struct file_li
341 return; /* Impossible... */
343 if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) {
344 - if (already_warned)
345 + if (!already_warned) {
347 + "IO error encountered -- skipping file deletion\n");
348 + already_warned = 1;
350 + if (!detect_renamed)
353 - "IO error encountered -- skipping file deletion\n");
354 - already_warned = 1;
356 + flags |= DEL_NO_DELETIONS;
359 while (cur_depth >= file->dir.depth && cur_depth >= min_depth)
360 @@ -450,6 +578,9 @@ static void delete_in_dir(struct file_li
362 filt_array[cur_depth] = push_local_filters(fbuf, dlen);
364 + if (detect_renamed)
367 if (one_file_system) {
368 if (file->flags & FLAG_TOP_DIR)
369 filesystem_dev = stp->st_dev;
370 @@ -459,6 +590,11 @@ static void delete_in_dir(struct file_li
372 dirlist = get_dirlist(fbuf, dlen, 0);
375 + if (dlen != 1 || *fbuf != '/')
377 + remainder = MAXPATHLEN - (p - fbuf);
379 /* If an item in dirlist is not found in flist, delete it
380 * from the filesystem. */
381 for (i = dirlist->count; i--; ) {
382 @@ -471,16 +607,23 @@ static void delete_in_dir(struct file_li
386 + if (detect_renamed && S_ISREG(fp->mode)) {
387 + strlcpy(p, fp->basename, remainder);
388 + look_for_rename(fp, fbuf, dirlist->file_pool);
390 if (flist_find(flist, fp) < 0) {
392 - if (delete_during == 2) {
393 ++ if (delete_during == 2 && !(flags & DEL_NO_DELETIONS)) {
394 if (!remember_delete(fp, delbuf))
397 - delete_item(delbuf, fp->mode, NULL, DEL_RECURSE);
399 + delete_item(delbuf, fp->mode, NULL, flags);
400 + } else if (detect_renamed && S_ISDIR(fp->mode))
409 @@ -510,9 +653,9 @@ static void do_delete_pass(struct file_l
410 || !S_ISDIR(st.st_mode))
413 - delete_in_dir(flist, fbuf, file, &st);
414 + delete_in_dir(flist, fbuf, file, &st, 0);
416 - delete_in_dir(NULL, NULL, NULL, NULL);
417 + delete_in_dir(NULL, NULL, NULL, NULL, 0);
419 if (do_progress && !am_server)
420 rprintf(FINFO, " \r");
421 @@ -1048,6 +1191,7 @@ static int try_dests_non(struct file_str
425 +static struct bitbag *delayed_bits = NULL;
426 static int phase = 0;
428 /* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir,
429 @@ -1233,8 +1377,12 @@ static void recv_generator(char *fname,
430 if (real_ret != 0 && one_file_system)
431 real_st.st_dev = filesystem_dev;
432 if (delete_during && f_out != -1 && !phase && dry_run < 2
433 - && (file->flags & FLAG_XFER_DIR))
434 - delete_in_dir(the_file_list, fname, file, &real_st);
435 + && (file->flags & FLAG_XFER_DIR)) {
436 + if (detect_renamed && real_ret != 0)
438 + delete_in_dir(the_file_list, fname, file, &real_st,
439 + delete_during < 0 ? DEL_NO_DELETIONS : 0);
444 @@ -1498,8 +1646,14 @@ static void recv_generator(char *fname,
445 if (preserve_hard_links && F_NOT_HLINK_LAST(file))
448 - if (stat_errno == ENOENT)
449 + if (stat_errno == ENOENT) {
450 + if (detect_renamed && unexplored_dirs > 0
452 + bitbag_set_bit(delayed_bits, ndx);
457 rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s",
460 @@ -1691,6 +1845,12 @@ void generate_files(int f_out, struct fi
461 (long)getpid(), flist->count);
464 + if (detect_renamed) {
465 + delayed_bits = bitbag_create(flist->count);
466 + if (!delete_before && !delete_during)
467 + delete_during = -1;
470 if (delete_before && !local_name && flist->count > 0)
471 do_delete_pass(flist);
472 if (delete_during == 2) {
473 @@ -1701,7 +1861,7 @@ void generate_files(int f_out, struct fi
477 - if (append_mode || whole_file < 0)
478 + if (append_mode || detect_renamed || whole_file < 0)
481 rprintf(FINFO, "delta-transmission %s\n",
482 @@ -1757,7 +1917,23 @@ void generate_files(int f_out, struct fi
484 recv_generator(NULL, NULL, 0, 0, code, -1);
486 - delete_in_dir(NULL, NULL, NULL, NULL);
487 + delete_in_dir(NULL, NULL, NULL, NULL, 0);
489 + if (detect_renamed) {
490 + if (delete_during < 0)
492 + detect_renamed = 0;
494 + for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) {
495 + struct file_struct *file = flist->files[i];
497 + strlcpy(fbuf, local_name, sizeof fbuf);
499 + f_name(file, fbuf);
500 + recv_generator(fbuf, file, i, itemizing,
501 + maybe_ATTRS_REPORT, code, f_out);
506 csum_length = SUM_LENGTH;
509 @@ -78,6 +78,7 @@ int am_generator = 0;
510 int am_starting_up = 1;
511 int relative_paths = -1;
512 int implied_dirs = 1;
513 +int detect_renamed = 0;
515 int allow_8bit_chars = 0;
516 int force_delete = 0;
517 @@ -343,6 +344,7 @@ void usage(enum logcode F)
518 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
519 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
520 rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
521 + rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n");
522 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
523 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
524 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
525 @@ -497,6 +499,7 @@ static struct poptOption long_options[]
526 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
527 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
528 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
529 + {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 },
530 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
531 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
532 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
533 @@ -1361,7 +1364,7 @@ int parse_arguments(int *argc, const cha
537 - if (delay_updates && !partial_dir)
538 + if ((delay_updates || detect_renamed) && !partial_dir)
539 partial_dir = tmp_partialdir;
542 @@ -1370,6 +1373,7 @@ int parse_arguments(int *argc, const cha
543 snprintf(err_buf, sizeof err_buf,
544 "--%s cannot be used with --%s\n",
545 append_mode ? "append" : "inplace",
546 + detect_renamed ? "detect-renamed" :
547 delay_updates ? "delay-updates" : "partial-dir");
550 @@ -1680,6 +1684,8 @@ void server_options(char **args,int *arg
551 args[ac++] = "--super";
553 args[ac++] = "--size-only";
554 + if (detect_renamed)
555 + args[ac++] = "--detect-renamed";
558 if (modify_window_set) {
561 @@ -364,6 +364,7 @@ to the detailed description below for a
562 --modify-window=NUM compare mod-times with reduced accuracy
563 -T, --temp-dir=DIR create temporary files in directory DIR
564 -y, --fuzzy find similar file for basis if no dest file
565 + --detect-renamed try to find renamed files to speed the xfer
566 --compare-dest=DIR also compare received files relative to DIR
567 --copy-dest=DIR ... and include copies of unchanged files
568 --link-dest=DIR hardlink to files in DIR when unchanged
569 @@ -1272,6 +1273,15 @@ Note that the use of the bf(--delete) op
570 fuzzy-match files, so either use bf(--delete-after) or specify some
571 filename exclusions if you need to prevent this.
573 +dit(bf(--detect-renamed)) This option tells rsync to scan the receiving
574 +side for files that have been renamed, and to use any that are found as
575 +alternate basis files to help speed up the transfer.
576 +By default, alternate-basis files are hard-linked into a directory named
577 +".~tmp~" in each file's destination directory, but if you've specified
578 +the bf(--partial-dir) option, that directory will be used instead. These
579 +potential alternate-basis files will be removed as the transfer progresses.
580 +This option conflicts with bf(--inplace) and bf(--append).
582 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
583 the destination machine as an additional hierarchy to compare destination
584 files against doing transfers (if the files are missing in the destination
587 @@ -1027,6 +1027,32 @@ int handle_partial_dir(const char *fname
591 +/* We need to supply our own strcmp function for file list comparisons
592 + * to ensure that signed/unsigned usage is consistent between machines. */
593 +int u_strcmp(const char *p1, const char *p2)
595 + for ( ; *p1; p1++, p2++) {
600 + return (int)*(uchar*)p1 - (int)*(uchar*)p2;
603 +/* We need a memcmp function that compares unsigned-byte values. */
604 +int u_memcmp(const void *p1, const void *p2, size_t len)
606 + const uchar *u1 = p1;
607 + const uchar *u2 = p2;
611 + return (int)*u1 - (int)*u2;
618 * Determine if a symlink points outside the current directory tree.
619 * This is considered "unsafe" because e.g. when mirroring somebody