Tweaked a directory copy command in default-acls.test.
[rsync/rsync-patches.git] / detect-renamed.diff
CommitLineData
1fffd582
WD
1This patch adds the --detect-renamed option which makes rsync notice files
2that either (1) match in size & modify-time (plus the basename, if possible)
3or (2) match in size & checksum (when --checksum was also specified) and use
4each match as an alternate basis file to speed up the transfer.
5
6The algorithm attempts to scan the receiving-side's files in an efficient
7manner. If --delete[-before] is enabled, we'll take advantage of the
8pre-transfer delete pass to prepare any alternate-basis-file matches we
9might find. If --delete-before is not enabled, rsync does the rename scan
10during the regular file-sending scan (scanning each directory right before
11the generator starts updating files from that dir). In this latter mode,
12rsync might delay the updating of a file (if no alternate-basis match was
13yet found) until the full scan of the receiving side is complete, at which
14point any delayed files are processed.
15
16I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that
17takes advantage of rsync's pre-existing partial-dir logic. This uses less
18memory than trying to keep track of the matches internally, and also allows
19any deletions or file-updates to occur normally without interfering with
20these alternate-basis discoveries.
21
22After applying this patch, run these commands for a successful build:
23
24 ./prepare-source
25 ./configure (optional if already run)
26 make
27
28TODO:
29
30 We need to never return a match from fattr_find() that has a basis
31 file. This will ensure that we don't try to give a renamed file to
32 a file that can't use it, while missing out on giving it to a file
33 that could use it.
34
35--- old/flist.c
36+++ new/flist.c
37@@ -56,6 +56,7 @@ extern int implied_dirs;
38 extern int prune_empty_dirs;
39 extern int copy_links;
40 extern int copy_unsafe_links;
41+extern int detect_renamed;
42 extern int protocol_version;
43 extern int sanitize_paths;
44 extern const char *io_write_phase;
45@@ -74,6 +75,8 @@ int checksum_len;
46 dev_t filesystem_dev; /* used to implement -x */
47 unsigned int file_struct_len;
48
49+struct file_list the_fattr_list;
50+
51 static char empty_sum[MD4_SUM_LENGTH];
52 static int flist_count_offset;
53
54@@ -260,6 +263,44 @@ static mode_t from_wire_mode(int mode)
55 return (mode_t)mode;
56 }
57
58+static int fattr_compare(struct file_struct **file1, struct file_struct **file2)
59+{
60+ struct file_struct *f1 = *file1;
61+ struct file_struct *f2 = *file2;
62+ int diff;
63+
64+ if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) {
65+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
66+ return 0;
67+ return 1;
68+ }
69+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
70+ return -1;
71+
72+ /* Don't use diff for values that are longer than an int. */
73+ if (f1->length != f2->length)
74+ return f1->length < f2->length ? -1 : 1;
75+
76+ if (always_checksum) {
77+ diff = u_memcmp(f1->u.sum, f2->u.sum, checksum_len);
78+ if (diff)
79+ return diff;
80+ } else if (f1->modtime != f2->modtime)
81+ return f1->modtime < f2->modtime ? -1 : 1;
82+
83+ diff = u_strcmp(f1->basename, f2->basename);
84+ if (diff)
85+ return diff;
86+
87+ if (f1->dirname == f2->dirname)
88+ return 0;
89+ if (!f1->dirname)
90+ return -1;
91+ if (!f2->dirname)
92+ return 1;
93+ return u_strcmp(f1->dirname, f2->dirname);
94+}
95+
96 static void send_directory(int f, struct file_list *flist,
97 char *fbuf, int len);
98
99@@ -1388,6 +1429,25 @@ struct file_list *recv_file_list(int f)
100
101 clean_flist(flist, relative_paths, 1);
102
103+ if (detect_renamed) {
104+ int j = flist->count;
105+ the_fattr_list.count = j;
106+ the_fattr_list.files = new_array(struct file_struct *, j);
107+ if (!the_fattr_list.files)
108+ goto oom;
109+ memcpy(the_fattr_list.files, flist->files,
110+ j * sizeof (struct file_struct *));
111+ qsort(the_fattr_list.files, j,
112+ sizeof the_fattr_list.files[0], (int (*)())fattr_compare);
113+ the_fattr_list.low = 0;
114+ while (j-- > 0) {
115+ struct file_struct *fp = the_fattr_list.files[j];
116+ if (fp->basename && S_ISREG(fp->mode) && fp->length)
117+ break;
118+ }
119+ the_fattr_list.high = j;
120+ }
121+
122 if (f >= 0) {
123 recv_uid_list(f, flist);
124
125--- old/generator.c
126+++ new/generator.c
127@@ -77,6 +77,7 @@ extern char *basis_dir[];
128 extern int compare_dest;
129 extern int copy_dest;
130 extern int link_dest;
131+extern int detect_renamed;
132 extern int whole_file;
133 extern int list_only;
134 extern int read_batch;
135@@ -92,14 +93,17 @@ extern char *backup_dir;
136 extern char *backup_suffix;
137 extern int backup_suffix_len;
138 extern struct file_list *the_file_list;
139+extern struct file_list the_fattr_list;
140 extern struct filter_list_struct server_filter_list;
141
142 static int deletion_count = 0; /* used to implement --max-delete */
143+static int unexplored_dirs = 1;
144 static int can_link_symlinks = 1; /* start out optimistic */
145 static int can_link_devices = 1;
146
147-/* For calling delete_file() */
148+/* For calling delete_item() and delete_in_dir() */
149 #define DEL_FORCE_RECURSE (1<<1) /* recurse even w/o --force */
150+#define DEL_NO_DELETIONS (1<<2)
151 #define DEL_TERSE (1<<3)
152
153
154@@ -109,12 +113,120 @@ static int is_backup_file(char *fn)
155 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
156 }
157
158+/* Search for a regular file that matches either (1) the size & modified
159+ * time (plus the basename, if possible) or (2) the size & checksum. If
160+ * we find an exact match down to the dirname, return -1 because we found
161+ * an up-to-date file in the transfer, not a renamed file. */
162+static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool)
163+{
164+ int low = the_fattr_list.low, high = the_fattr_list.high;
165+ int mid, ok_match = -1, good_match = -1;
166+ struct file_struct *fmid;
167+ int diff;
168+
169+ while (low <= high) {
170+ mid = (low + high) / 2;
171+ fmid = the_fattr_list.files[mid];
172+ if (fmid->length != f->length) {
173+ if (fmid->length < f->length)
174+ low = mid + 1;
175+ else
176+ high = mid - 1;
177+ continue;
178+ }
179+ if (always_checksum) {
180+ if (!f->u.sum) {
181+ if (fmid->modtime == f->modtime
182+ && f_name_cmp(fmid, f) == 0)
183+ return -1; /* assume we can't help */
184+ f->u.sum = pool_alloc(pool, MD4_SUM_LENGTH,
185+ "fattr_find");
186+ file_checksum(fname, f->u.sum, f->length);
187+ }
188+ diff = u_memcmp(fmid->u.sum, f->u.sum, checksum_len);
189+ if (diff) {
190+ if (diff < 0)
191+ low = mid + 1;
192+ else
193+ high = mid - 1;
194+ continue;
195+ }
196+ } else {
197+ if (fmid->modtime != f->modtime) {
198+ if (fmid->modtime < f->modtime)
199+ low = mid + 1;
200+ else
201+ high = mid - 1;
202+ continue;
203+ }
204+ }
205+ ok_match = mid;
206+ diff = u_strcmp(fmid->basename, f->basename);
207+ if (diff == 0) {
208+ good_match = mid;
209+ if (fmid->dirname == f->dirname)
210+ return -1; /* file is up-to-date */
211+ if (!fmid->dirname) {
212+ low = mid + 1;
213+ continue;
214+ }
215+ if (!f->dirname) {
216+ high = mid - 1;
217+ continue;
218+ }
219+ diff = u_strcmp(fmid->dirname, f->dirname);
220+ if (diff == 0)
221+ return -1; /* file is up-to-date */
222+ }
223+ if (diff < 0)
224+ low = mid + 1;
225+ else
226+ high = mid - 1;
227+ }
228+
229+ return good_match >= 0 ? good_match : ok_match;
230+}
231+
232+static void look_for_rename(struct file_struct *file, char *fname,
233+ alloc_pool_t pool)
234+{
235+ struct file_struct *fp;
236+ char *partialptr, *fn;
237+ STRUCT_STAT st;
238+ int ndx;
239+
240+ if ((ndx = fattr_find(file, fname, pool)) < 0)
241+ return;
242+
243+ fp = the_fattr_list.files[ndx];
244+ fn = f_name(fp, NULL);
245+ /* We don't provide an alternate-basis file if there is a basis file. */
246+ if (link_stat(fn, &st, 0) == 0)
247+ return;
248+ if ((partialptr = partial_dir_fname(fn)) == NULL
249+ || !handle_partial_dir(partialptr, PDIR_CREATE))
250+ return;
251+
252+ /* We only use the file if we can hard-link it into our tmp dir. */
253+ if (link(fname, partialptr) == 0) {
254+ if (verbose > 2) {
255+ rprintf(FINFO, "found renamed: %s => %s\n",
256+ fname, partialptr);
257+ }
258+ return;
259+ }
260+
261+ if (errno != EEXIST)
262+ handle_partial_dir(partialptr, PDIR_DELETE);
263+}
264
265 /* Delete a file or directory. If DEL_FORCE_RECURSE is set in the flags, or if
266 * force_delete is set, this will delete recursively.
267 *
268 * Note that fname must point to a MAXPATHLEN buffer if the mode indicates it's
269 * a directory! (The buffer is used for recursion, but returned unchanged.)
270+ *
271+ * Also note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
272 */
273 static int delete_item(char *fname, int mode, int flags)
274 {
275@@ -125,6 +237,8 @@ static int delete_item(char *fname, int
276 char *p;
277
278 if (!S_ISDIR(mode)) {
279+ if (flags & DEL_NO_DELETIONS)
280+ return 0;
281 if (max_delete && ++deletion_count > max_delete)
282 return 0;
283 if (make_backups && (backup_dir || !is_backup_file(fname)))
284@@ -147,6 +261,7 @@ static int delete_item(char *fname, int
285
286 zap_dir = flags & DEL_FORCE_RECURSE || force_delete;
287 if ((max_delete && ++deletion_count > max_delete)
288+ || flags & DEL_NO_DELETIONS
289 || (dry_run && zap_dir)) {
290 ok = 0;
291 errno = ENOTEMPTY;
292@@ -189,6 +304,8 @@ static int delete_item(char *fname, int
293 continue;
294
295 strlcpy(p, fp->basename, remainder);
296+ if (detect_renamed && S_ISREG(fp->mode))
297+ look_for_rename(fp, fname, dirlist->file_pool);
298 delete_item(fname, fp->mode, flags & ~DEL_TERSE);
299 }
300 flist_free(dirlist);
301@@ -197,7 +314,8 @@ static int delete_item(char *fname, int
302
303 pop_local_filters(save_filters);
304
305- if (max_delete && ++deletion_count > max_delete)
306+ if (flags & DEL_NO_DELETIONS
307+ || (max_delete && ++deletion_count > max_delete))
308 return 0;
309
310 if (do_rmdir(fname) == 0) {
311@@ -217,15 +335,19 @@ static int delete_item(char *fname, int
312 * all the --delete-WHEN options. Note that the fbuf pointer must point to a
313 * MAXPATHLEN buffer with the name of the directory in it (the functions we
314 * call will append names onto the end, but the old dir value will be restored
315- * on exit). */
316+ * on exit).
317+ *
318+ * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
319+ */
320 static void delete_in_dir(struct file_list *flist, char *fbuf,
321- struct file_struct *file, STRUCT_STAT *stp)
322+ struct file_struct *file, STRUCT_STAT *stp, int flags)
323 {
324 static int min_depth = MAXPATHLEN, cur_depth = -1;
325 static void *filt_array[MAXPATHLEN/2+1];
326 static int already_warned = 0;
327 struct file_list *dirlist;
328- char delbuf[MAXPATHLEN];
329+ char *p, delbuf[MAXPATHLEN];
330+ unsigned remainder;
331 int dlen, i;
332
333 if (!flist) {
334@@ -239,6 +361,8 @@ static void delete_in_dir(struct file_li
335 if (verbose > 2)
336 rprintf(FINFO, "delete_in_dir(%s)\n", fbuf);
337
338+ flags |= DEL_FORCE_RECURSE;
339+
340 if (allowed_lull)
341 maybe_send_keepalive();
342
343@@ -246,12 +370,14 @@ static void delete_in_dir(struct file_li
344 return; /* Impossible... */
345
346 if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) {
347- if (already_warned)
348+ if (!already_warned) {
349+ rprintf(FINFO,
350+ "IO error encountered -- skipping file deletion\n");
351+ already_warned = 1;
352+ }
353+ if (!detect_renamed)
354 return;
355- rprintf(FINFO,
356- "IO error encountered -- skipping file deletion\n");
357- already_warned = 1;
358- return;
359+ flags |= DEL_NO_DELETIONS;
360 }
361
362 while (cur_depth >= file->dir.depth && cur_depth >= min_depth)
363@@ -262,6 +388,9 @@ static void delete_in_dir(struct file_li
364 dlen = strlen(fbuf);
365 filt_array[cur_depth] = push_local_filters(fbuf, dlen);
366
367+ if (detect_renamed)
368+ unexplored_dirs--;
369+
370 if (one_file_system) {
371 if (file->flags & FLAG_TOP_DIR)
372 filesystem_dev = stp->st_dev;
373@@ -271,18 +400,30 @@ static void delete_in_dir(struct file_li
374
375 dirlist = get_dirlist(fbuf, dlen, 0);
376
377+ p = fbuf + dlen;
378+ if (dlen != 1 || *fbuf != '/')
379+ *p++ = '/';
380+ remainder = MAXPATHLEN - (p - fbuf);
381+
382 /* If an item in dirlist is not found in flist, delete it
383 * from the filesystem. */
384 for (i = dirlist->count; i--; ) {
385 struct file_struct *fp = dirlist->files[i];
386 if (!fp->basename || fp->flags & FLAG_MOUNT_POINT)
387 continue;
388+ if (detect_renamed && S_ISREG(fp->mode)) {
389+ strlcpy(p, fp->basename, remainder);
390+ look_for_rename(fp, fbuf, dirlist->file_pool);
391+ }
392 if (flist_find(flist, fp) < 0) {
393 f_name(fp, delbuf);
394- delete_item(delbuf, fp->mode, DEL_FORCE_RECURSE);
395- }
396+ delete_item(delbuf, fp->mode, flags);
397+ } else if (detect_renamed && S_ISDIR(fp->mode))
398+ unexplored_dirs++;
399 }
400
401+ fbuf[dlen] = '\0';
402+
403 flist_free(dirlist);
404 }
405
406@@ -312,9 +453,9 @@ static void do_delete_pass(struct file_l
407 || !S_ISDIR(st.st_mode))
408 continue;
409
410- delete_in_dir(flist, fbuf, file, &st);
411+ delete_in_dir(flist, fbuf, file, &st, 0);
412 }
413- delete_in_dir(NULL, NULL, NULL, NULL);
414+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
415
416 if (do_progress && !am_server)
417 rprintf(FINFO, " \r");
418@@ -753,6 +894,7 @@ static int try_dests_non(struct file_str
419 return -1;
420 }
421
422+static struct bitbag *delayed_bits = NULL;
423 static int phase = 0;
424
425 /* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir,
426@@ -894,8 +1036,12 @@ static void recv_generator(char *fname,
427 && verbose && code && f_out != -1)
428 rprintf(code, "%s/\n", fname);
429 if (delete_during && f_out != -1 && !phase && dry_run < 2
430- && (file->flags & FLAG_DEL_HERE))
431- delete_in_dir(the_file_list, fname, file, &st);
432+ && (file->flags & FLAG_DEL_HERE)) {
433+ if (detect_renamed && statret != 0)
434+ unexplored_dirs++;
435+ delete_in_dir(the_file_list, fname, file, &st,
436+ delete_during < 0 ? DEL_NO_DELETIONS : 0);
437+ }
438 return;
439 }
440
441@@ -1133,8 +1279,14 @@ static void recv_generator(char *fname,
442 && hard_link_check(file, ndx, fname, statret, &st,
443 itemizing, code, HL_SKIP))
444 return;
445- if (stat_errno == ENOENT)
446+ if (stat_errno == ENOENT) {
447+ if (detect_renamed && unexplored_dirs > 0
448+ && file->length) {
449+ bitbag_set_bit(delayed_bits, ndx);
450+ return;
451+ }
452 goto notify_others;
453+ }
454 rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s",
455 full_fname(fname));
456 return;
457@@ -1309,11 +1461,17 @@ void generate_files(int f_out, struct fi
458 (long)getpid(), flist->count);
459 }
460
461+ if (detect_renamed) {
462+ delayed_bits = bitbag_create(flist->count);
463+ if (!delete_before && !delete_during)
464+ delete_during = -1;
465+ }
466+
467 if (delete_before && !local_name && flist->count > 0)
468 do_delete_pass(flist);
469 do_progress = 0;
470
471- if (append_mode || whole_file < 0)
472+ if (append_mode || detect_renamed || whole_file < 0)
473 whole_file = 0;
474 if (verbose >= 2) {
475 rprintf(FINFO, "delta-transmission %s\n",
476@@ -1368,7 +1526,23 @@ void generate_files(int f_out, struct fi
477 }
478 recv_generator(NULL, NULL, 0, 0, 0, code, -1);
479 if (delete_during)
480- delete_in_dir(NULL, NULL, NULL, NULL);
481+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
482+
483+ if (detect_renamed) {
484+ if (delete_during < 0)
485+ delete_during = 0;
486+ detect_renamed = 0;
487+
488+ for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) {
489+ struct file_struct *file = flist->files[i];
490+ if (local_name)
491+ strlcpy(fbuf, local_name, sizeof fbuf);
492+ else
493+ f_name(file, fbuf);
494+ recv_generator(fbuf, file, i, itemizing,
495+ maybe_ATTRS_REPORT, code, f_out);
496+ }
497+ }
498
499 phase++;
500 csum_length = SUM_LENGTH;
501--- old/options.c
502+++ new/options.c
503@@ -78,6 +78,7 @@ int am_starting_up = 1;
504 int orig_umask = 0;
505 int relative_paths = -1;
506 int implied_dirs = 1;
507+int detect_renamed = 0;
508 int numeric_ids = 0;
509 int allow_8bit_chars = 0;
510 int force_delete = 0;
511@@ -335,6 +336,7 @@ void usage(enum logcode F)
512 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
513 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
514 rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
515+ rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n");
516 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
517 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
518 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
519@@ -480,6 +482,7 @@ static struct poptOption long_options[]
520 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
521 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
522 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
523+ {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 },
524 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
525 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
526 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
afcb578c 527@@ -1334,7 +1337,7 @@ int parse_arguments(int *argc, const cha
1fffd582
WD
528 inplace = 1;
529 }
530
531- if (delay_updates && !partial_dir)
532+ if ((delay_updates || detect_renamed) && !partial_dir)
533 partial_dir = tmp_partialdir;
534
535 if (inplace) {
afcb578c 536@@ -1343,6 +1346,7 @@ int parse_arguments(int *argc, const cha
1fffd582
WD
537 snprintf(err_buf, sizeof err_buf,
538 "--%s cannot be used with --%s\n",
539 append_mode ? "append" : "inplace",
540+ detect_renamed ? "detect-renamed" :
541 delay_updates ? "delay-updates" : "partial-dir");
542 return 0;
543 }
544--- old/rsync.yo
545+++ new/rsync.yo
546@@ -358,6 +358,7 @@ to the detailed description below for a
547 --modify-window=NUM compare mod-times with reduced accuracy
548 -T, --temp-dir=DIR create temporary files in directory DIR
549 -y, --fuzzy find similar file for basis if no dest file
550+ --detect-renamed try to find renamed files to speed the xfer
551 --compare-dest=DIR also compare received files relative to DIR
552 --copy-dest=DIR ... and include copies of unchanged files
553 --link-dest=DIR hardlink to files in DIR when unchanged
554@@ -1183,6 +1184,15 @@ Note that the use of the bf(--delete) op
555 fuzzy-match files, so either use bf(--delete-after) or specify some
556 filename exclusions if you need to prevent this.
557
558+dit(bf(--detect-renamed)) This option tells rsync to scan the receiving
559+side for files that have been renamed, and to use any that are found as
560+alternate basis files to help speed up the transfer.
561+By default, alternate-basis files are hard-linked into a directory named
562+".~tmp~" in each file's destination directory, but if you've specified
563+the bf(--partial-dir) option, that directory will be used instead. These
564+potential alternate-basis files will be removed as the transfer progresses.
565+This option conflicts with bf(--inplace) and bf(--append).
566+
567 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
568 the destination machine as an additional hierarchy to compare destination
569 files against doing transfers (if the files are missing in the destination
570--- old/util.c
571+++ new/util.c
572@@ -997,6 +997,32 @@ int handle_partial_dir(const char *fname
573 return 1;
574 }
575
576+/* We need to supply our own strcmp function for file list comparisons
577+ * to ensure that signed/unsigned usage is consistent between machines. */
578+int u_strcmp(const char *p1, const char *p2)
579+{
580+ for ( ; *p1; p1++, p2++) {
581+ if (*p1 != *p2)
582+ break;
583+ }
584+
585+ return (int)*(uchar*)p1 - (int)*(uchar*)p2;
586+}
587+
588+/* Like memcmp(), but always compares bytes as unsigned values: returns <0, 0, or >0 for the first len bytes of p1 vs p2. */
589+int u_memcmp(const void *p1, const void *p2, size_t len)
590+{
591+ const uchar *u1 = p1;
592+ const uchar *u2 = p2;
593+
594+ for ( ; len--; u1++, u2++) { /* step both pointers so every byte is examined, not just the first */
595+ if (*u1 != *u2)
596+ return (int)*u1 - (int)*u2;
597+ }
598+
599+ return 0;
600+}
601+
602 /**
603 * Determine if a symlink points outside the current directory tree.
604 * This is considered "unsafe" because e.g. when mirroring somebody