Fixed failing hunks.
[rsync/rsync-patches.git] / detect-renamed.diff
... / ...
CommitLineData
1This patch adds the --detect-renamed option which makes rsync notice files
2that either (1) match in size & modify-time (plus the basename, if possible)
3or (2) match in size & checksum (when --checksum was also specified) and use
4each match as an alternate basis file to speed up the transfer.
5
6The algorithm attempts to scan the receiving-side's files in an efficient
7manner. If --delete[-before] is enabled, we'll take advantage of the
8pre-transfer delete pass to prepare any alternate-basis-file matches we
9might find. If --delete-before is not enabled, rsync does the rename scan
10during the regular file-sending scan (scanning each directory right before
11the generator starts updating files from that dir). In this latter mode,
12rsync might delay the updating of a file (if no alternate-basis match was
13yet found) until the full scan of the receiving side is complete, at which
14point any delayed files are processed.
15
16I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that
17takes advantage of rsync's pre-existing partial-dir logic. This uses less
18memory than trying to keep track of the matches internally, and also allows
19any deletions or file-updates to occur normally without interfering with
20these alternate-basis discoveries.
21
22To use this patch, run these commands for a successful build:
23
24 patch -p1 <patches/detect-renamed.diff
25 ./configure (optional if already run)
26 make
27
28TODO:
29
30 We need to never return a match from fattr_find() that has a basis
31 file. This will ensure that we don't try to give a renamed file to
32 a file that can't use it, while missing out on giving it to a file
33 that could use it.
34
35--- old/flist.c
36+++ new/flist.c
37@@ -53,6 +53,7 @@ extern int non_perishable_cnt;
38 extern int prune_empty_dirs;
39 extern int copy_links;
40 extern int copy_unsafe_links;
41+extern int detect_renamed;
42 extern int protocol_version;
43 extern int sanitize_paths;
44 extern struct stats stats;
45@@ -70,6 +71,8 @@ int checksum_len;
46 dev_t filesystem_dev; /* used to implement -x */
47 unsigned int file_struct_len;
48
49+struct file_list the_fattr_list;
50+
51 static char empty_sum[MD4_SUM_LENGTH];
52 static int flist_count_offset;
53
54@@ -252,6 +255,44 @@ static mode_t from_wire_mode(int mode)
55 return mode;
56 }
57
58+static int fattr_compare(struct file_struct **file1, struct file_struct **file2)
59+{ /* qsort() comparator for the_fattr_list.files: length, then checksum or modtime, then basename, then dirname */
60+ struct file_struct *f1 = *file1;
61+ struct file_struct *f2 = *file2;
62+ int diff;
63+
64+ if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) { /* f1 is unusable: no basename, not a regular file, or zero length */
65+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
66+ return 0; /* two unusable entries compare equal */
67+ return 1; /* unusable entries sort after usable ones */
68+ }
69+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
70+ return -1;
71+
72+ /* Don't use diff for values that are longer than an int. */
73+ if (f1->length != f2->length)
74+ return f1->length < f2->length ? -1 : 1;
75+
76+ if (always_checksum) { /* second key is the checksum when always_checksum is set, otherwise the modtime */
77+ diff = u_memcmp(f1->u.sum, f2->u.sum, checksum_len);
78+ if (diff)
79+ return diff;
80+ } else if (f1->modtime != f2->modtime)
81+ return f1->modtime < f2->modtime ? -1 : 1;
82+
83+ diff = u_strcmp(f1->basename, f2->basename);
84+ if (diff)
85+ return diff;
86+
87+ if (f1->dirname == f2->dirname) /* same pointer (including both NULL): equal */
88+ return 0;
89+ if (!f1->dirname) /* a NULL dirname sorts before any non-NULL dirname */
90+ return -1;
91+ if (!f2->dirname)
92+ return 1;
93+ return u_strcmp(f1->dirname, f2->dirname);
94+}
95+
96 static void send_directory(int f, struct file_list *flist,
97 char *fbuf, int len);
98
99@@ -1388,6 +1429,25 @@ struct file_list *recv_file_list(int f)
100
101 clean_flist(flist, relative_paths, 1);
102
103+ if (detect_renamed) {
104+ int j = flist->count;
105+ the_fattr_list.count = j;
106+ the_fattr_list.files = new_array(struct file_struct *, j);
107+ if (!the_fattr_list.files)
108+ goto oom;
109+ memcpy(the_fattr_list.files, flist->files,
110+ j * sizeof (struct file_struct *));
111+ qsort(the_fattr_list.files, j,
112+ sizeof the_fattr_list.files[0], (int (*)())fattr_compare);
113+ the_fattr_list.low = 0;
114+ while (j-- > 0) {
115+ struct file_struct *fp = the_fattr_list.files[j];
116+ if (fp->basename && S_ISREG(fp->mode) && fp->length)
117+ break;
118+ }
119+ the_fattr_list.high = j;
120+ }
121+
122 if (f >= 0) {
123 recv_uid_list(f, flist);
124
125--- old/generator.c
126+++ new/generator.c
127@@ -76,6 +76,7 @@ extern char *basis_dir[];
128 extern int compare_dest;
129 extern int copy_dest;
130 extern int link_dest;
131+extern int detect_renamed;
132 extern int whole_file;
133 extern int list_only;
134 extern int new_root_dir;
135@@ -91,15 +92,18 @@ extern char *backup_dir;
136 extern char *backup_suffix;
137 extern int backup_suffix_len;
138 extern struct file_list *the_file_list;
139+extern struct file_list the_fattr_list;
140 extern struct filter_list_struct server_filter_list;
141
142 int ignore_perishable = 0;
143 int non_perishable_cnt = 0;
144
145 static int deletion_count = 0; /* used to implement --max-delete */
146+static int unexplored_dirs = 1;
147 static FILE *delete_delay_fp = NULL;
148
149-/* For calling delete_item() and delete_dir_contents(). */
150+/* For calling delete_item(), delete_dir_contents(), and delete_in_dir(). */
151+#define DEL_NO_DELETIONS (1<<0)
152 #define DEL_RECURSE (1<<1) /* recurse */
153 #define DEL_DIR_IS_EMPTY (1<<2) /* internal delete_FUNCTIONS use only */
154
155@@ -121,11 +125,120 @@ static int is_backup_file(char *fn)
156 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
157 }
158
159+/* Search for a regular file that matches either (1) the size & modified
160+ * time (plus the basename, if possible) or (2) the size & checksum. If
161+ * we find an exact match down to the dirname, return -1 because we found
162+ * an up-to-date file in the transfer, not a renamed file. Otherwise return the index in the_fattr_list.files of the best match seen (a basename match is preferred), or -1 if nothing matched. */
163+static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool)
164+{
165+ int low = the_fattr_list.low, high = the_fattr_list.high;
166+ int mid, ok_match = -1, good_match = -1;
167+ struct file_struct *fmid;
168+ int diff;
169+
170+ while (low <= high) { /* binary search; each branch below mirrors one sort key of fattr_compare() */
171+ mid = (low + high) / 2;
172+ fmid = the_fattr_list.files[mid];
173+ if (fmid->length != f->length) {
174+ if (fmid->length < f->length)
175+ low = mid + 1;
176+ else
177+ high = mid - 1;
178+ continue;
179+ }
180+ if (always_checksum) {
181+ if (!f->u.sum) { /* f has no checksum stored yet */
182+ if (fmid->modtime == f->modtime
183+ && f_name_cmp(fmid, f) == 0)
184+ return -1; /* assume we can't help */
185+ f->u.sum = pool_alloc(pool, MD4_SUM_LENGTH,
186+ "fattr_find");
187+ file_checksum(fname, f->u.sum, f->length); /* checksum the file named by fname into the new buffer */
188+ }
189+ diff = u_memcmp(fmid->u.sum, f->u.sum, checksum_len);
190+ if (diff) {
191+ if (diff < 0)
192+ low = mid + 1;
193+ else
194+ high = mid - 1;
195+ continue;
196+ }
197+ } else {
198+ if (fmid->modtime != f->modtime) {
199+ if (fmid->modtime < f->modtime)
200+ low = mid + 1;
201+ else
202+ high = mid - 1;
203+ continue;
204+ }
205+ }
206+ ok_match = mid; /* length and checksum/modtime agree */
207+ diff = u_strcmp(fmid->basename, f->basename);
208+ if (diff == 0) {
209+ good_match = mid; /* the basename agrees as well */
210+ if (fmid->dirname == f->dirname)
211+ return -1; /* file is up-to-date */
212+ if (!fmid->dirname) { /* NULL dirname sorts first, so fmid is below f */
213+ low = mid + 1;
214+ continue;
215+ }
216+ if (!f->dirname) { /* f's NULL dirname sorts first, so fmid is above f */
217+ high = mid - 1;
218+ continue;
219+ }
220+ diff = u_strcmp(fmid->dirname, f->dirname);
221+ if (diff == 0)
222+ return -1; /* file is up-to-date */
223+ }
224+ if (diff < 0)
225+ low = mid + 1;
226+ else
227+ high = mid - 1;
228+ }
229+
230+ return good_match >= 0 ? good_match : ok_match; /* both are still -1 when nothing matched */
231+}
232+
233+static void look_for_rename(struct file_struct *file, char *fname,
234+ alloc_pool_t pool)
235+{ /* If fattr_find() pairs the existing file fname with an entry in the_fattr_list, hard-link fname into that entry's partial dir as an alternate basis. */
236+ struct file_struct *fp;
237+ char *partialptr, *fn;
238+ STRUCT_STAT st;
239+ int ndx;
240+
241+ if ((ndx = fattr_find(file, fname, pool)) < 0) /* -1: no candidate, or the file is already up-to-date */
242+ return;
243+
244+ fp = the_fattr_list.files[ndx];
245+ fn = f_name(fp, NULL); /* NOTE(review): presumably the destination path of the matched entry -- confirm f_name() semantics */
246+ /* We don't provide an alternate-basis file if there is a basis file. */
247+ if (link_stat(fn, &st, 0) == 0)
248+ return;
249+ if ((partialptr = partial_dir_fname(fn)) == NULL
250+ || !handle_partial_dir(partialptr, PDIR_CREATE))
251+ return;
252+
253+ /* We only use the file if we can hard-link it into our tmp dir. */
254+ if (link(fname, partialptr) == 0) {
255+ if (verbose > 2) {
256+ rprintf(FINFO, "found renamed: %s => %s\n",
257+ fname, partialptr);
258+ }
259+ return;
260+ }
261+
262+ if (errno != EEXIST) /* link() failed for a reason other than the target already existing */
263+ handle_partial_dir(partialptr, PDIR_DELETE);
264+}
265+
266 /* Delete a file or directory. If DEL_RECURSE is set in the flags, this will
267 * delete recursively.
268 *
269 * Note that fbuf must point to a MAXPATHLEN buffer if the mode indicates it's
270 * a directory! (The buffer is used for recursion, but returned unchanged.)
271+ *
272+ * Also note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
273 */
274 static enum delret delete_item(char *fbuf, int mode, char *replace, int flags)
275 {
276@@ -147,6 +260,8 @@ static enum delret delete_item(char *fbu
277 goto check_ret;
278 /* OK: try to delete the directory. */
279 }
280+ if (flags & DEL_NO_DELETIONS)
281+ return DR_SUCCESS;
282
283 if (!replace && max_delete >= 0 && ++deletion_count > max_delete)
284 return DR_AT_LIMIT;
285@@ -193,6 +308,8 @@ static enum delret delete_item(char *fbu
286 * its contents, otherwise just checks for content. Returns DR_SUCCESS or
287 * DR_NOT_EMPTY. Note that fname must point to a MAXPATHLEN buffer! (The
288 * buffer is used for recursion, but returned unchanged.)
289+ *
290+ * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
291 */
292 static enum delret delete_dir_contents(char *fname, int flags)
293 {
294@@ -249,6 +366,8 @@ static enum delret delete_dir_contents(c
295 if (S_ISDIR(fp->mode)
296 && delete_dir_contents(fname, flags | DEL_RECURSE) != DR_SUCCESS)
297 ret = DR_NOT_EMPTY;
298+ if (detect_renamed && S_ISREG(fp->mode))
299+ look_for_rename(fp, fname, dirlist->file_pool);
300 if (delete_item(fname, fp->mode, NULL, flags) != DR_SUCCESS)
301 ret = DR_NOT_EMPTY;
302 }
303@@ -333,15 +452,19 @@ static void delayed_deletions(char *delb
304 * all the --delete-WHEN options. Note that the fbuf pointer must point to a
305 * MAXPATHLEN buffer with the name of the directory in it (the functions we
306 * call will append names onto the end, but the old dir value will be restored
307- * on exit). */
308+ * on exit).
309+ *
310+ * Note: --detect-renamed may use this routine with DEL_NO_DELETIONS set!
311+ */
312 static void delete_in_dir(struct file_list *flist, char *fbuf,
313- struct file_struct *file, STRUCT_STAT *stp)
314+ struct file_struct *file, STRUCT_STAT *stp, int flags)
315 {
316 static int min_depth = MAXPATHLEN, cur_depth = -1;
317 static void *filt_array[MAXPATHLEN/2+1];
318 static int already_warned = 0;
319 struct file_list *dirlist;
320- char delbuf[MAXPATHLEN];
321+ char *p, delbuf[MAXPATHLEN];
322+ unsigned remainder;
323 int dlen, i;
324
325 if (!flist) {
326@@ -355,6 +478,8 @@ static void delete_in_dir(struct file_li
327 if (verbose > 2)
328 rprintf(FINFO, "delete_in_dir(%s)\n", fbuf);
329
330+ flags |= DEL_RECURSE;
331+
332 if (allowed_lull)
333 maybe_send_keepalive();
334
335@@ -362,12 +487,14 @@ static void delete_in_dir(struct file_li
336 return; /* Impossible... */
337
338 if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) {
339- if (already_warned)
340+ if (!already_warned) {
341+ rprintf(FINFO,
342+ "IO error encountered -- skipping file deletion\n");
343+ already_warned = 1;
344+ }
345+ if (!detect_renamed)
346 return;
347- rprintf(FINFO,
348- "IO error encountered -- skipping file deletion\n");
349- already_warned = 1;
350- return;
351+ flags |= DEL_NO_DELETIONS;
352 }
353
354 while (cur_depth >= file->dir.depth && cur_depth >= min_depth)
355@@ -378,6 +505,9 @@ static void delete_in_dir(struct file_li
356 dlen = strlen(fbuf);
357 filt_array[cur_depth] = push_local_filters(fbuf, dlen);
358
359+ if (detect_renamed)
360+ unexplored_dirs--;
361+
362 if (one_file_system) {
363 if (file->flags & FLAG_TOP_DIR)
364 filesystem_dev = stp->st_dev;
365@@ -387,6 +517,11 @@ static void delete_in_dir(struct file_li
366
367 dirlist = get_dirlist(fbuf, dlen, 0);
368
369+ p = fbuf + dlen;
370+ if (dlen != 1 || *fbuf != '/')
371+ *p++ = '/';
372+ remainder = MAXPATHLEN - (p - fbuf);
373+
374 /* If an item in dirlist is not found in flist, delete it
375 * from the filesystem. */
376 for (i = dirlist->count; i--; ) {
377@@ -399,15 +534,22 @@ static void delete_in_dir(struct file_li
378 f_name(fp, NULL));
379 continue;
380 }
381+ if (detect_renamed && S_ISREG(fp->mode)) {
382+ strlcpy(p, fp->basename, remainder);
383+ look_for_rename(fp, fbuf, dirlist->file_pool);
384+ }
385 if (flist_find(flist, fp) < 0) {
386 f_name(fp, delbuf);
387- if (delete_delay_fp)
388+ if (delete_delay_fp && !(flags & DEL_NO_DELETIONS))
389 fprintf(delete_delay_fp, "%o %s%c", (short)fp->mode, delbuf, '\0');
390 else
391- delete_item(delbuf, fp->mode, NULL, DEL_RECURSE);
392- }
393+ delete_item(delbuf, fp->mode, NULL, flags);
394+ } else if (detect_renamed && S_ISDIR(fp->mode))
395+ unexplored_dirs++;
396 }
397
398+ fbuf[dlen] = '\0';
399+
400 flist_free(dirlist);
401 }
402
403@@ -437,9 +579,9 @@ static void do_delete_pass(struct file_l
404 || !S_ISDIR(st.st_mode))
405 continue;
406
407- delete_in_dir(flist, fbuf, file, &st);
408+ delete_in_dir(flist, fbuf, file, &st, 0);
409 }
410- delete_in_dir(NULL, NULL, NULL, NULL);
411+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
412
413 if (do_progress && !am_server)
414 rprintf(FINFO, " \r");
415@@ -968,6 +1110,7 @@ static int try_dests_non(struct file_str
416 return j;
417 }
418
419+static struct bitbag *delayed_bits = NULL;
420 static int phase = 0;
421
422 /* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir,
423@@ -1154,8 +1297,12 @@ static void recv_generator(char *fname,
424 if (real_ret != 0 && one_file_system)
425 real_st.st_dev = filesystem_dev;
426 if (delete_during && f_out != -1 && !phase && dry_run < 2
427- && (file->flags & FLAG_DEL_HERE))
428- delete_in_dir(the_file_list, fname, file, &real_st);
429+ && (file->flags & FLAG_DEL_HERE)) {
430+ if (detect_renamed && real_ret != 0)
431+ unexplored_dirs++;
432+ delete_in_dir(the_file_list, fname, file, &real_st,
433+ delete_during < 0 ? DEL_NO_DELETIONS : 0);
434+ }
435 return;
436 }
437
438@@ -1407,8 +1554,14 @@ static void recv_generator(char *fname,
439 && hard_link_check(file, ndx, fname, statret, &st,
440 itemizing, code, HL_SKIP))
441 return;
442- if (stat_errno == ENOENT)
443+ if (stat_errno == ENOENT) {
444+ if (detect_renamed && unexplored_dirs > 0
445+ && file->length) {
446+ bitbag_set_bit(delayed_bits, ndx);
447+ return;
448+ }
449 goto notify_others;
450+ }
451 rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s",
452 full_fname(fname));
453 return;
454@@ -1594,13 +1747,19 @@ void generate_files(int f_out, struct fi
455 (long)getpid(), flist->count);
456 }
457
458+ if (detect_renamed) {
459+ delayed_bits = bitbag_create(flist->count);
460+ if (!delete_before && !delete_during)
461+ delete_during = -1;
462+ }
463+
464 if (delete_before && !local_name && flist->count > 0)
465 do_delete_pass(flist);
466 if (delete_during == 2)
467 start_delete_temp();
468 do_progress = 0;
469
470- if (append_mode || whole_file < 0)
471+ if (append_mode || detect_renamed || whole_file < 0)
472 whole_file = 0;
473 if (verbose >= 2) {
474 rprintf(FINFO, "delta-transmission %s\n",
475@@ -1655,7 +1814,23 @@ void generate_files(int f_out, struct fi
476 }
477 recv_generator(NULL, NULL, 0, 0, 0, code, -1);
478 if (delete_during)
479- delete_in_dir(NULL, NULL, NULL, NULL);
480+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
481+
482+ if (detect_renamed) {
483+ if (delete_during < 0)
484+ delete_during = 0;
485+ detect_renamed = 0;
486+
487+ for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) {
488+ struct file_struct *file = flist->files[i];
489+ if (local_name)
490+ strlcpy(fbuf, local_name, sizeof fbuf);
491+ else
492+ f_name(file, fbuf);
493+ recv_generator(fbuf, file, i, itemizing,
494+ maybe_ATTRS_REPORT, code, f_out);
495+ }
496+ }
497
498 phase++;
499 csum_length = SUM_LENGTH;
500--- old/options.c
501+++ new/options.c
502@@ -78,6 +78,7 @@ int am_generator = 0;
503 int am_starting_up = 1;
504 int relative_paths = -1;
505 int implied_dirs = 1;
506+int detect_renamed = 0;
507 int numeric_ids = 0;
508 int allow_8bit_chars = 0;
509 int force_delete = 0;
510@@ -347,6 +348,7 @@ void usage(enum logcode F)
511 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
512 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
513 rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
514+ rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n");
515 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
516 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
517 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
518@@ -501,6 +503,7 @@ static struct poptOption long_options[]
519 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
520 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
521 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
522+ {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 },
523 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
524 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
525 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
526@@ -1360,7 +1363,7 @@ int parse_arguments(int *argc, const cha
527 inplace = 1;
528 }
529
530- if (delay_updates && !partial_dir)
531+ if ((delay_updates || detect_renamed) && !partial_dir)
532 partial_dir = tmp_partialdir;
533
534 if (inplace) {
535@@ -1369,6 +1372,7 @@ int parse_arguments(int *argc, const cha
536 snprintf(err_buf, sizeof err_buf,
537 "--%s cannot be used with --%s\n",
538 append_mode ? "append" : "inplace",
539+ detect_renamed ? "detect-renamed" :
540 delay_updates ? "delay-updates" : "partial-dir");
541 return 0;
542 }
543@@ -1679,6 +1683,8 @@ void server_options(char **args,int *arg
544 args[ac++] = "--super";
545 if (size_only)
546 args[ac++] = "--size-only";
547+ if (detect_renamed)
548+ args[ac++] = "--detect-renamed";
549 }
550
551 if (modify_window_set) {
552--- old/rsync.yo
553+++ new/rsync.yo
554@@ -364,6 +364,7 @@ to the detailed description below for a
555 --modify-window=NUM compare mod-times with reduced accuracy
556 -T, --temp-dir=DIR create temporary files in directory DIR
557 -y, --fuzzy find similar file for basis if no dest file
558+ --detect-renamed try to find renamed files to speed the xfer
559 --compare-dest=DIR also compare received files relative to DIR
560 --copy-dest=DIR ... and include copies of unchanged files
561 --link-dest=DIR hardlink to files in DIR when unchanged
562@@ -1272,6 +1273,15 @@ Note that the use of the bf(--delete) op
563 fuzzy-match files, so either use bf(--delete-after) or specify some
564 filename exclusions if you need to prevent this.
565
566+dit(bf(--detect-renamed)) This option tells rsync to scan the receiving
567+side for files that have been renamed, and to use any that are found as
568+alternate basis files to help speed up the transfer.
569+By default, alternate-basis files are hard-linked into a directory named
570+".~tmp~" in each file's destination directory, but if you've specified
571+the bf(--partial-dir) option, that directory will be used instead. These
572+potential alternate-basis files will be removed as the transfer progresses.
573+This option conflicts with bf(--inplace) and bf(--append).
574+
575 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
576 the destination machine as an additional hierarchy to compare destination
577 files against doing transfers (if the files are missing in the destination
578--- old/util.c
579+++ new/util.c
580@@ -1027,6 +1027,32 @@ int handle_partial_dir(const char *fname
581 return 1;
582 }
583
584+/* We need to supply our own strcmp function for file list comparisons
585+ * to ensure that signed/unsigned usage is consistent between machines. */
586+int u_strcmp(const char *p1, const char *p2)
587+{
588+ for ( ; *p1; p1++, p2++) { /* stop at the end of p1 or at the first differing byte */
589+ if (*p1 != *p2)
590+ break;
591+ }
592+
593+ return (int)*(uchar*)p1 - (int)*(uchar*)p2; /* difference of the bytes at the stop position, read as unsigned */
594+}
595+
596+/* We need a memcmp function that compares unsigned-byte values. Returns the difference of the first pair of differing bytes among the first len bytes of p1 and p2, or 0 if they all match. */
597+int u_memcmp(const void *p1, const void *p2, size_t len)
598+{
599+ const uchar *u1 = p1;
600+ const uchar *u2 = p2;
601+
602+ for ( ; len--; u1++, u2++) { /* both pointers must advance so that every byte is compared, not just the first */
603+ if (*u1 != *u2)
604+ return (int)*u1 - (int)*u2;
605+ }
606+
607+ return 0;
608+}
609+
610 /**
611 * Determine if a symlink points outside the current directory tree.
612 * This is considered "unsafe" because e.g. when mirroring somebody