From: Wayne Davison Date: Tue, 7 Feb 2006 18:32:44 +0000 (+0000) Subject: The --detect-renamed option for finding files in the destination X-Git-Url: https://mattmccutchen.net/rsync/rsync-patches.git/commitdiff_plain/1fffd582d3bfd4b47c106dd9d25e23293ccfa42d The --detect-renamed option for finding files in the destination that were renamed since the last run. --- diff --git a/detect-renamed.diff b/detect-renamed.diff new file mode 100644 index 0000000..6096180 --- /dev/null +++ b/detect-renamed.diff @@ -0,0 +1,604 @@ +This patch adds the --detect-renamed option which makes rsync notice files +that either (1) match in size & modify-time (plus the basename, if possible) +or (2) match in size & checksum (when --checksum was also specified) and use +each match as an alternate basis file to speed up the transfer. + +The algorithm attempts to scan the receiving-side's files in an efficient +manner. If --delete[-before] is enabled, we'll take advantage of the +pre-transfer delete pass to prepare any alternate-basis-file matches we +might find. If --delete-before is not enabled, rsync does the rename scan +during the regular file-sending scan (scanning each directory right before +the generator starts updating files from that dir). In this latter mode, +rsync might delay the updating of a file (if no alternate-basis match was +yet found) until the full scan of the receiving side is complete, at which +point any delayed files are processed. + +I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that +takes advantage of rsync's pre-existing partial-dir logic. This uses less +memory than trying to keep track of the matches internally, and also allows +any deletions or file-updates to occur normally without interfering with +these alternate-basis discoveries. + +After applying this patch, run these commands for a successful build: + + ./prepare-source + ./configure (optional if already run) + make + +TODO: + + We need to never return a match from fattr_find() that has a basis + file. This will ensure that we don't try to give a renamed file to + a file that can't use it, while missing out on giving it to a file + that could use it. + +--- old/flist.c ++++ new/flist.c +@@ -56,6 +56,7 @@ extern int implied_dirs; + extern int prune_empty_dirs; + extern int copy_links; + extern int copy_unsafe_links; ++extern int detect_renamed; + extern int protocol_version; + extern int sanitize_paths; + extern const char *io_write_phase; +@@ -74,6 +75,8 @@ int checksum_len; + dev_t filesystem_dev; /* used to implement -x */ + unsigned int file_struct_len; + ++struct file_list the_fattr_list; ++ + static char empty_sum[MD4_SUM_LENGTH]; + static int flist_count_offset; + +@@ -260,6 +263,44 @@ static mode_t from_wire_mode(int mode) + return (mode_t)mode; + } + ++static int fattr_compare(struct file_struct **file1, struct file_struct **file2) ++{ ++ struct file_struct *f1 = *file1; ++ struct file_struct *f2 = *file2; ++ int diff; ++ ++ if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) { ++ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length) ++ return 0; ++ return 1; ++ } ++ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length) ++ return -1; ++ ++ /* Don't use diff for values that are longer than an int. */ ++ if (f1->length != f2->length) ++ return f1->length < f2->length ? -1 : 1; ++ ++ if (always_checksum) { ++ diff = u_memcmp(f1->u.sum, f2->u.sum, checksum_len); ++ if (diff) ++ return diff; ++ } else if (f1->modtime != f2->modtime) ++ return f1->modtime < f2->modtime ? -1 : 1; ++ ++ diff = u_strcmp(f1->basename, f2->basename); ++ if (diff) ++ return diff; ++ ++ if (f1->dirname == f2->dirname) ++ return 0; ++ if (!f1->dirname) ++ return -1; ++ if (!f2->dirname) ++ return 1; ++ return u_strcmp(f1->dirname, f2->dirname); ++} ++ + static void send_directory(int f, struct file_list *flist, + char *fbuf, int len); + +@@ -1388,6 +1429,25 @@ struct file_list *recv_file_list(int f) + + clean_flist(flist, relative_paths, 1); + ++ if (detect_renamed) { ++ int j = flist->count; ++ the_fattr_list.count = j; ++ the_fattr_list.files = new_array(struct file_struct *, j); ++ if (!the_fattr_list.files) ++ goto oom; ++ memcpy(the_fattr_list.files, flist->files, ++ j * sizeof (struct file_struct *)); ++ qsort(the_fattr_list.files, j, ++ sizeof the_fattr_list.files[0], (int (*)())fattr_compare); ++ the_fattr_list.low = 0; ++ while (j-- > 0) { ++ struct file_struct *fp = the_fattr_list.files[j]; ++ if (fp->basename && S_ISREG(fp->mode) && fp->length) ++ break; ++ } ++ the_fattr_list.high = j; ++ } ++ + if (f >= 0) { + recv_uid_list(f, flist); + +--- old/generator.c ++++ new/generator.c +@@ -77,6 +77,7 @@ extern char *basis_dir[]; + extern int compare_dest; + extern int copy_dest; + extern int link_dest; ++extern int detect_renamed; + extern int whole_file; + extern int list_only; + extern int read_batch; +@@ -92,14 +93,17 @@ extern char *backup_dir; + extern char *backup_suffix; + extern int backup_suffix_len; + extern struct file_list *the_file_list; ++extern struct file_list the_fattr_list; + extern struct filter_list_struct server_filter_list; + + static int deletion_count = 0; /* used to implement --max-delete */ ++static int unexplored_dirs = 1; + static int can_link_symlinks = 1; /* start out optimistic */ + static int can_link_devices = 1; + +-/* For calling delete_file() */ ++/* For calling delete_item() and delete_in_dir() */ + #define DEL_FORCE_RECURSE (1<<1) /* recurse even w/o --force */ ++#define DEL_NO_DELETIONS (1<<2) + #define DEL_TERSE (1<<3) + + +@@ -109,12 +113,120 @@ static int is_backup_file(char *fn) + return k > 0 && strcmp(fn+k, backup_suffix) == 0; + } + ++/* Search for a regular file that matches either (1) the size & modified ++ * time (plus the basename, if possible) or (2) the size & checksum. If ++ * we find an exact match down to the dirname, return -1 because we found ++ * an up-to-date file in the transfer, not a renamed file. */ ++static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool) ++{ ++ int low = the_fattr_list.low, high = the_fattr_list.high; ++ int mid, ok_match = -1, good_match = -1; ++ struct file_struct *fmid; ++ int diff; ++ ++ while (low <= high) { ++ mid = (low + high) / 2; ++ fmid = the_fattr_list.files[mid]; ++ if (fmid->length != f->length) { ++ if (fmid->length < f->length) ++ low = mid + 1; ++ else ++ high = mid - 1; ++ continue; ++ } ++ if (always_checksum) { ++ if (!f->u.sum) { ++ if (fmid->modtime == f->modtime ++ && f_name_cmp(fmid, f) == 0) ++ return -1; /* assume we can't help */ ++ f->u.sum = pool_alloc(pool, MD4_SUM_LENGTH, ++ "fattr_find"); ++ file_checksum(fname, f->u.sum, f->length); ++ } ++ diff = u_memcmp(fmid->u.sum, f->u.sum, checksum_len); ++ if (diff) { ++ if (diff < 0) ++ low = mid + 1; ++ else ++ high = mid - 1; ++ continue; ++ } ++ } else { ++ if (fmid->modtime != f->modtime) { ++ if (fmid->modtime < f->modtime) ++ low = mid + 1; ++ else ++ high = mid - 1; ++ continue; ++ } ++ } ++ ok_match = mid; ++ diff = u_strcmp(fmid->basename, f->basename); ++ if (diff == 0) { ++ good_match = mid; ++ if (fmid->dirname == f->dirname) ++ return -1; /* file is up-to-date */ ++ if (!fmid->dirname) { ++ low = mid + 1; ++ continue; ++ } ++ if (!f->dirname) { ++ high = mid - 1; ++ continue; ++ } ++ diff = u_strcmp(fmid->dirname, f->dirname); ++ if (diff == 0) ++ return -1; /* file is up-to-date */ ++ } ++ if (diff < 0) ++ low = mid + 1; ++ else ++ high = mid - 1; ++ } ++ ++ return good_match >= 0 ? good_match : ok_match; ++} ++ ++static void look_for_rename(struct file_struct *file, char *fname, ++ alloc_pool_t pool) ++{ ++ struct file_struct *fp; ++ char *partialptr, *fn; ++ STRUCT_STAT st; ++ int ndx; ++ ++ if ((ndx = fattr_find(file, fname, pool)) < 0) ++ return; ++ ++ fp = the_fattr_list.files[ndx]; ++ fn = f_name(fp, NULL); ++ /* We don't provide an alternate-basis file if there is a basis file. */ ++ if (link_stat(fn, &st, 0) == 0) ++ return; ++ if ((partialptr = partial_dir_fname(fn)) == NULL ++ || !handle_partial_dir(partialptr, PDIR_CREATE)) ++ return; ++ ++ /* We only use the file if we can hard-link it into our tmp dir. */ ++ if (link(fname, partialptr) == 0) { ++ if (verbose > 2) { ++ rprintf(FINFO, "found renamed: %s => %s\n", ++ fname, partialptr); ++ } ++ return; ++ } ++ ++ if (errno != EEXIST) ++ handle_partial_dir(partialptr, PDIR_DELETE); ++} + + /* Delete a file or directory. If DEL_FORCE_RECURSE is set in the flags, or if + * force_delete is set, this will delete recursively. + * + * Note that fname must point to a MAXPATHLEN buffer if the mode indicates it's + * a directory! (The buffer is used for recursion, but returned unchanged.) ++ * ++ * Also Note: --detect-rename may use this routine with DEL_NO_DELETIONS set! + */ + static int delete_item(char *fname, int mode, int flags) + { +@@ -125,6 +237,8 @@ static int delete_item(char *fname, int + char *p; + + if (!S_ISDIR(mode)) { ++ if (flags & DEL_NO_DELETIONS) ++ return 0; + if (max_delete && ++deletion_count > max_delete) + return 0; + if (make_backups && (backup_dir || !is_backup_file(fname))) +@@ -147,6 +261,7 @@ static int delete_item(char *fname, int + + zap_dir = flags & DEL_FORCE_RECURSE || force_delete; + if ((max_delete && ++deletion_count > max_delete) ++ || flags & DEL_NO_DELETIONS + || (dry_run && zap_dir)) { + ok = 0; + errno = ENOTEMPTY; +@@ -189,6 +304,8 @@ static int delete_item(char *fname, int + continue; + + strlcpy(p, fp->basename, remainder); ++ if (detect_renamed && S_ISREG(fp->mode)) ++ look_for_rename(fp, fname, dirlist->file_pool); + delete_item(fname, fp->mode, flags & ~DEL_TERSE); + } + flist_free(dirlist); +@@ -197,7 +314,8 @@ static int delete_item(char *fname, int + + pop_local_filters(save_filters); + +- if (max_delete && ++deletion_count > max_delete) ++ if (flags & DEL_NO_DELETIONS ++ || (max_delete && ++deletion_count > max_delete)) + return 0; + + if (do_rmdir(fname) == 0) { +@@ -217,15 +335,19 @@ static int delete_item(char *fname, int + * all the --delete-WHEN options. Note that the fbuf pointer must point to a + * MAXPATHLEN buffer with the name of the directory in it (the functions we + * call will append names onto the end, but the old dir value will be restored +- * on exit). */ ++ * on exit). ++ * ++ * Note: --detect-rename may use this routine with DEL_NO_DELETIONS set! ++ */ + static void delete_in_dir(struct file_list *flist, char *fbuf, +- struct file_struct *file, STRUCT_STAT *stp) ++ struct file_struct *file, STRUCT_STAT *stp, int flags) + { + static int min_depth = MAXPATHLEN, cur_depth = -1; + static void *filt_array[MAXPATHLEN/2+1]; + static int already_warned = 0; + struct file_list *dirlist; +- char delbuf[MAXPATHLEN]; ++ char *p, delbuf[MAXPATHLEN]; ++ unsigned remainder; + int dlen, i; + + if (!flist) { +@@ -239,6 +361,8 @@ static void delete_in_dir(struct file_li + if (verbose > 2) + rprintf(FINFO, "delete_in_dir(%s)\n", fbuf); + ++ flags |= DEL_FORCE_RECURSE; ++ + if (allowed_lull) + maybe_send_keepalive(); + +@@ -246,12 +370,14 @@ static void delete_in_dir(struct file_li + return; /* Impossible... */ + + if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) { +- if (already_warned) ++ if (!already_warned) { ++ rprintf(FINFO, ++ "IO error encountered -- skipping file deletion\n"); ++ already_warned = 1; ++ } ++ if (!detect_renamed) + return; +- rprintf(FINFO, +- "IO error encountered -- skipping file deletion\n"); +- already_warned = 1; +- return; ++ flags |= DEL_NO_DELETIONS; + } + + while (cur_depth >= file->dir.depth && cur_depth >= min_depth) +@@ -262,6 +388,9 @@ static void delete_in_dir(struct file_li + dlen = strlen(fbuf); + filt_array[cur_depth] = push_local_filters(fbuf, dlen); + ++ if (detect_renamed) ++ unexplored_dirs--; ++ + if (one_file_system) { + if (file->flags & FLAG_TOP_DIR) + filesystem_dev = stp->st_dev; +@@ -271,18 +400,30 @@ static void delete_in_dir(struct file_li + + dirlist = get_dirlist(fbuf, dlen, 0); + ++ p = fbuf + dlen; ++ if (dlen != 1 || *fbuf != '/') ++ *p++ = '/'; ++ remainder = MAXPATHLEN - (p - fbuf); ++ + /* If an item in dirlist is not found in flist, delete it + * from the filesystem. */ + for (i = dirlist->count; i--; ) { + struct file_struct *fp = dirlist->files[i]; + if (!fp->basename || fp->flags & FLAG_MOUNT_POINT) + continue; ++ if (detect_renamed && S_ISREG(fp->mode)) { ++ strlcpy(p, fp->basename, remainder); ++ look_for_rename(fp, fbuf, dirlist->file_pool); ++ } + if (flist_find(flist, fp) < 0) { + f_name(fp, delbuf); +- delete_item(delbuf, fp->mode, DEL_FORCE_RECURSE); +- } ++ delete_item(delbuf, fp->mode, flags); ++ } else if (detect_renamed && S_ISDIR(fp->mode)) ++ unexplored_dirs++; + } + ++ fbuf[dlen] = '\0'; ++ + flist_free(dirlist); + } + +@@ -312,9 +453,9 @@ static void do_delete_pass(struct file_l + || !S_ISDIR(st.st_mode)) + continue; + +- delete_in_dir(flist, fbuf, file, &st); ++ delete_in_dir(flist, fbuf, file, &st, 0); + } +- delete_in_dir(NULL, NULL, NULL, NULL); ++ delete_in_dir(NULL, NULL, NULL, NULL, 0); + + if (do_progress && !am_server) + rprintf(FINFO, " \r"); +@@ -753,6 +894,7 @@ static int try_dests_non(struct file_str + return -1; + } + ++static struct bitbag *delayed_bits = NULL; + static int phase = 0; + + /* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir, +@@ -894,8 +1036,12 @@ static void recv_generator(char *fname, + && verbose && code && f_out != -1) + rprintf(code, "%s/\n", fname); + if (delete_during && f_out != -1 && !phase && dry_run < 2 +- && (file->flags & FLAG_DEL_HERE)) +- delete_in_dir(the_file_list, fname, file, &st); ++ && (file->flags & FLAG_DEL_HERE)) { ++ if (detect_renamed && statret != 0) ++ unexplored_dirs++; ++ delete_in_dir(the_file_list, fname, file, &st, ++ delete_during < 0 ? DEL_NO_DELETIONS : 0); ++ } + return; + } + +@@ -1133,8 +1279,14 @@ static void recv_generator(char *fname, + && hard_link_check(file, ndx, fname, statret, &st, + itemizing, code, HL_SKIP)) + return; +- if (stat_errno == ENOENT) ++ if (stat_errno == ENOENT) { ++ if (detect_renamed && unexplored_dirs > 0 ++ && file->length) { ++ bitbag_set_bit(delayed_bits, ndx); ++ return; ++ } + goto notify_others; ++ } + rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s", + full_fname(fname)); + return; +@@ -1309,11 +1461,17 @@ void generate_files(int f_out, struct fi + (long)getpid(), flist->count); + } + ++ if (detect_renamed) { ++ delayed_bits = bitbag_create(flist->count); ++ if (!delete_before && !delete_during) ++ delete_during = -1; ++ } ++ + if (delete_before && !local_name && flist->count > 0) + do_delete_pass(flist); + do_progress = 0; + +- if (append_mode || whole_file < 0) ++ if (append_mode || detect_renamed || whole_file < 0) + whole_file = 0; + if (verbose >= 2) { + rprintf(FINFO, "delta-transmission %s\n", +@@ -1368,7 +1526,23 @@ void generate_files(int f_out, struct fi + } + recv_generator(NULL, NULL, 0, 0, 0, code, -1); + if (delete_during) +- delete_in_dir(NULL, NULL, NULL, NULL); ++ delete_in_dir(NULL, NULL, NULL, NULL, 0); ++ ++ if (detect_renamed) { ++ if (delete_during < 0) ++ delete_during = 0; ++ detect_renamed = 0; ++ ++ for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) { ++ struct file_struct *file = flist->files[i]; ++ if (local_name) ++ strlcpy(fbuf, local_name, sizeof fbuf); ++ else ++ f_name(file, fbuf); ++ recv_generator(fbuf, file, i, itemizing, ++ maybe_ATTRS_REPORT, code, f_out); ++ } ++ } + + phase++; + csum_length = SUM_LENGTH; +--- old/options.c ++++ new/options.c +@@ -78,6 +78,7 @@ int am_starting_up = 1; + int orig_umask = 0; + int relative_paths = -1; + int implied_dirs = 1; ++int detect_renamed = 0; + int numeric_ids = 0; + int allow_8bit_chars = 0; + int force_delete = 0; +@@ -335,6 +336,7 @@ void usage(enum logcode F) + rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n"); + rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); + rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n"); ++ rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n"); + rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); + rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); + rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); +@@ -480,6 +482,7 @@ static struct poptOption long_options[] + {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, + {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, + {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, ++ {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 }, + {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, + {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 }, + {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 }, +@@ -1335,7 +1338,7 @@ int parse_arguments(int *argc, const cha + inplace = 1; + } + +- if (delay_updates && !partial_dir) ++ if ((delay_updates || detect_renamed) && !partial_dir) + partial_dir = tmp_partialdir; + + if (inplace) { +@@ -1344,6 +1347,7 @@ int parse_arguments(int *argc, const cha + snprintf(err_buf, sizeof err_buf, + "--%s cannot be used with --%s\n", + append_mode ? "append" : "inplace", ++ detect_renamed ? "detect-renamed" : + delay_updates ? "delay-updates" : "partial-dir"); + return 0; + } +--- old/rsync.yo ++++ new/rsync.yo +@@ -358,6 +358,7 @@ to the detailed description below for a + --modify-window=NUM compare mod-times with reduced accuracy + -T, --temp-dir=DIR create temporary files in directory DIR + -y, --fuzzy find similar file for basis if no dest file ++ --detect-renamed try to find renamed files to speed the xfer + --compare-dest=DIR also compare received files relative to DIR + --copy-dest=DIR ... and include copies of unchanged files + --link-dest=DIR hardlink to files in DIR when unchanged +@@ -1183,6 +1184,15 @@ Note that the use of the bf(--delete) op + fuzzy-match files, so either use bf(--delete-after) or specify some + filename exclusions if you need to prevent this. + ++dit(bf(--detect-renamed)) This option tells rsync to scan the receiving ++side for files that have been renamed, and to use any that are found as ++alternate basis files to help speed up the transfer. ++By default, alternate-basis files are hard-linked into a directory named ++".~tmp~" in each file's destination directory, but if you've specified ++the bf(--partial-dir) option, that directory will be used instead. These ++potential alternate-basis files will be removed as the transfer progresses. ++This option conflicts with bf(--inplace) and bf(--append). ++ + dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on + the destination machine as an additional hierarchy to compare destination + files against doing transfers (if the files are missing in the destination +--- old/util.c ++++ new/util.c +@@ -997,6 +997,32 @@ int handle_partial_dir(const char *fname + return 1; + } + ++/* We need to supply our own strcmp function for file list comparisons ++ * to ensure that signed/unsigned usage is consistent between machines. */ ++int u_strcmp(const char *p1, const char *p2) ++{ ++ for ( ; *p1; p1++, p2++) { ++ if (*p1 != *p2) ++ break; ++ } ++ ++ return (int)*(uchar*)p1 - (int)*(uchar*)p2; ++} ++ ++/* We need a memcmp function compares unsigned-byte values. */ ++int u_memcmp(const void *p1, const void *p2, size_t len) ++{ ++ const uchar *u1 = p1; ++ const uchar *u2 = p2; ++ ++ while (len--) { ++ if (*u1 != *u2) ++ return (int)*u1 - (int)*u2; ++ } ++ ++ return 0; ++} ++ + /** + * Determine if a symlink points outside the current directory tree. + * This is considered "unsafe" because e.g. when mirroring somebody