This patch adds the --detect-renamed option which makes rsync notice files that either (1) match in size & modify-time (plus the basename, if possible) or (2) match in size & checksum (when --checksum was also specified) and use each match as an alternate basis file to speed up the transfer. The algorithm attempts to scan the receiving-side's files in an efficient manner. If --delete[-before] is enabled, we'll take advantage of the pre-transfer delete pass to prepare any alternate-basis-file matches we might find. If --delete-before is not enabled, rsync does the rename scan during the regular file-sending scan (scanning each directory right before the generator starts updating files from that dir). In this latter mode, rsync might delay the updating of a file (if no alternate-basis match was yet found) until the full scan of the receiving side is complete, at which point any delayed files are processed. I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that takes advantage of rsync's pre-existing partial-dir logic. This uses less memory than trying to keep track of the matches internally, and also allows any deletions or file-updates to occur normally without interfering with these alternate-basis discoveries. After applying this patch, run these commands for a successful build: ./prepare-source ./configure (optional if already run) make TODO: We need to never return a match from fattr_find() that has a basis file. This will ensure that we don't try to give a renamed file to a file that can't use it, while missing out on giving it to a file that could use it. --- old/flist.c +++ new/flist.c @@ -56,6 +56,7 @@ extern int implied_dirs; extern int prune_empty_dirs; extern int copy_links; extern int copy_unsafe_links; +extern int detect_renamed; extern int protocol_version; extern int sanitize_paths; extern const char *io_write_phase; @@ -74,6 +75,8 @@ int checksum_len; dev_t filesystem_dev; /* used to implement -x */ unsigned int file_struct_len; +struct file_list the_fattr_list; + static char empty_sum[MD4_SUM_LENGTH]; static int flist_count_offset; @@ -260,6 +263,44 @@ static mode_t from_wire_mode(int mode) return (mode_t)mode; } +static int fattr_compare(struct file_struct **file1, struct file_struct **file2) +{ + struct file_struct *f1 = *file1; + struct file_struct *f2 = *file2; + int diff; + + if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) { + if (!f2->basename || !S_ISREG(f2->mode) || !f2->length) + return 0; + return 1; + } + if (!f2->basename || !S_ISREG(f2->mode) || !f2->length) + return -1; + + /* Don't use diff for values that are longer than an int. */ + if (f1->length != f2->length) + return f1->length < f2->length ? -1 : 1; + + if (always_checksum) { + diff = u_memcmp(f1->u.sum, f2->u.sum, checksum_len); + if (diff) + return diff; + } else if (f1->modtime != f2->modtime) + return f1->modtime < f2->modtime ? -1 : 1; + + diff = u_strcmp(f1->basename, f2->basename); + if (diff) + return diff; + + if (f1->dirname == f2->dirname) + return 0; + if (!f1->dirname) + return -1; + if (!f2->dirname) + return 1; + return u_strcmp(f1->dirname, f2->dirname); +} + static void send_directory(int f, struct file_list *flist, char *fbuf, int len); @@ -1388,6 +1429,25 @@ struct file_list *recv_file_list(int f) clean_flist(flist, relative_paths, 1); + if (detect_renamed) { + int j = flist->count; + the_fattr_list.count = j; + the_fattr_list.files = new_array(struct file_struct *, j); + if (!the_fattr_list.files) + goto oom; + memcpy(the_fattr_list.files, flist->files, + j * sizeof (struct file_struct *)); + qsort(the_fattr_list.files, j, + sizeof the_fattr_list.files[0], (int (*)())fattr_compare); + the_fattr_list.low = 0; + while (j-- > 0) { + struct file_struct *fp = the_fattr_list.files[j]; + if (fp->basename && S_ISREG(fp->mode) && fp->length) + break; + } + the_fattr_list.high = j; + } + if (f >= 0) { recv_uid_list(f, flist); --- old/generator.c +++ new/generator.c @@ -77,6 +77,7 @@ extern char *basis_dir[]; extern int compare_dest; extern int copy_dest; extern int link_dest; +extern int detect_renamed; extern int whole_file; extern int list_only; extern int read_batch; @@ -92,14 +93,17 @@ extern char *backup_dir; extern char *backup_suffix; extern int backup_suffix_len; extern struct file_list *the_file_list; +extern struct file_list the_fattr_list; extern struct filter_list_struct server_filter_list; static int deletion_count = 0; /* used to implement --max-delete */ +static int unexplored_dirs = 1; static int can_link_symlinks = 1; /* start out optimistic */ static int can_link_devices = 1; -/* For calling delete_file() */ +/* For calling delete_item() and delete_in_dir() */ #define DEL_FORCE_RECURSE (1<<1) /* recurse even w/o --force */ +#define DEL_NO_DELETIONS (1<<2) #define DEL_TERSE (1<<3) @@ -109,12 +113,120 @@ static int is_backup_file(char *fn) return k > 0 && strcmp(fn+k, backup_suffix) == 0; } +/* Search for a regular file that matches either (1) the size & modified + * time (plus the basename, if possible) or (2) the size & checksum. If + * we find an exact match down to the dirname, return -1 because we found + * an up-to-date file in the transfer, not a renamed file. */ +static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool) +{ + int low = the_fattr_list.low, high = the_fattr_list.high; + int mid, ok_match = -1, good_match = -1; + struct file_struct *fmid; + int diff; + + while (low <= high) { + mid = (low + high) / 2; + fmid = the_fattr_list.files[mid]; + if (fmid->length != f->length) { + if (fmid->length < f->length) + low = mid + 1; + else + high = mid - 1; + continue; + } + if (always_checksum) { + if (!f->u.sum) { + if (fmid->modtime == f->modtime + && f_name_cmp(fmid, f) == 0) + return -1; /* assume we can't help */ + f->u.sum = pool_alloc(pool, MD4_SUM_LENGTH, + "fattr_find"); + file_checksum(fname, f->u.sum, f->length); + } + diff = u_memcmp(fmid->u.sum, f->u.sum, checksum_len); + if (diff) { + if (diff < 0) + low = mid + 1; + else + high = mid - 1; + continue; + } + } else { + if (fmid->modtime != f->modtime) { + if (fmid->modtime < f->modtime) + low = mid + 1; + else + high = mid - 1; + continue; + } + } + ok_match = mid; + diff = u_strcmp(fmid->basename, f->basename); + if (diff == 0) { + good_match = mid; + if (fmid->dirname == f->dirname) + return -1; /* file is up-to-date */ + if (!fmid->dirname) { + low = mid + 1; + continue; + } + if (!f->dirname) { + high = mid - 1; + continue; + } + diff = u_strcmp(fmid->dirname, f->dirname); + if (diff == 0) + return -1; /* file is up-to-date */ + } + if (diff < 0) + low = mid + 1; + else + high = mid - 1; + } + + return good_match >= 0 ? good_match : ok_match; +} + +static void look_for_rename(struct file_struct *file, char *fname, + alloc_pool_t pool) +{ + struct file_struct *fp; + char *partialptr, *fn; + STRUCT_STAT st; + int ndx; + + if ((ndx = fattr_find(file, fname, pool)) < 0) + return; + + fp = the_fattr_list.files[ndx]; + fn = f_name(fp, NULL); + /* We don't provide an alternate-basis file if there is a basis file. */ + if (link_stat(fn, &st, 0) == 0) + return; + if ((partialptr = partial_dir_fname(fn)) == NULL + || !handle_partial_dir(partialptr, PDIR_CREATE)) + return; + + /* We only use the file if we can hard-link it into our tmp dir. */ + if (link(fname, partialptr) == 0) { + if (verbose > 2) { + rprintf(FINFO, "found renamed: %s => %s\n", + fname, partialptr); + } + return; + } + + if (errno != EEXIST) + handle_partial_dir(partialptr, PDIR_DELETE); +} /* Delete a file or directory. If DEL_FORCE_RECURSE is set in the flags, or if * force_delete is set, this will delete recursively. * * Note that fname must point to a MAXPATHLEN buffer if the mode indicates it's * a directory! (The buffer is used for recursion, but returned unchanged.) + * + * Also Note: --detect-rename may use this routine with DEL_NO_DELETIONS set! */ static int delete_item(char *fname, int mode, int flags) { @@ -125,6 +237,8 @@ static int delete_item(char *fname, int char *p; if (!S_ISDIR(mode)) { + if (flags & DEL_NO_DELETIONS) + return 0; if (max_delete && ++deletion_count > max_delete) return 0; if (make_backups && (backup_dir || !is_backup_file(fname))) @@ -147,6 +261,7 @@ static int delete_item(char *fname, int zap_dir = flags & DEL_FORCE_RECURSE || force_delete; if ((max_delete && ++deletion_count > max_delete) + || flags & DEL_NO_DELETIONS || (dry_run && zap_dir)) { ok = 0; errno = ENOTEMPTY; @@ -189,6 +304,8 @@ static int delete_item(char *fname, int continue; strlcpy(p, fp->basename, remainder); + if (detect_renamed && S_ISREG(fp->mode)) + look_for_rename(fp, fname, dirlist->file_pool); delete_item(fname, fp->mode, flags & ~DEL_TERSE); } flist_free(dirlist); @@ -197,7 +314,8 @@ static int delete_item(char *fname, int pop_local_filters(save_filters); - if (max_delete && ++deletion_count > max_delete) + if (flags & DEL_NO_DELETIONS + || (max_delete && ++deletion_count > max_delete)) return 0; if (do_rmdir(fname) == 0) { @@ -217,15 +335,19 @@ static int delete_item(char *fname, int * all the --delete-WHEN options. Note that the fbuf pointer must point to a * MAXPATHLEN buffer with the name of the directory in it (the functions we * call will append names onto the end, but the old dir value will be restored - * on exit). */ + * on exit). + * + * Note: --detect-rename may use this routine with DEL_NO_DELETIONS set! + */ static void delete_in_dir(struct file_list *flist, char *fbuf, - struct file_struct *file, STRUCT_STAT *stp) + struct file_struct *file, STRUCT_STAT *stp, int flags) { static int min_depth = MAXPATHLEN, cur_depth = -1; static void *filt_array[MAXPATHLEN/2+1]; static int already_warned = 0; struct file_list *dirlist; - char delbuf[MAXPATHLEN]; + char *p, delbuf[MAXPATHLEN]; + unsigned remainder; int dlen, i; if (!flist) { @@ -239,6 +361,8 @@ static void delete_in_dir(struct file_li if (verbose > 2) rprintf(FINFO, "delete_in_dir(%s)\n", fbuf); + flags |= DEL_FORCE_RECURSE; + if (allowed_lull) maybe_send_keepalive(); @@ -246,12 +370,14 @@ static void delete_in_dir(struct file_li return; /* Impossible... */ if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) { - if (already_warned) + if (!already_warned) { + rprintf(FINFO, + "IO error encountered -- skipping file deletion\n"); + already_warned = 1; + } + if (!detect_renamed) return; - rprintf(FINFO, - "IO error encountered -- skipping file deletion\n"); - already_warned = 1; - return; + flags |= DEL_NO_DELETIONS; } while (cur_depth >= file->dir.depth && cur_depth >= min_depth) @@ -262,6 +388,9 @@ static void delete_in_dir(struct file_li dlen = strlen(fbuf); filt_array[cur_depth] = push_local_filters(fbuf, dlen); + if (detect_renamed) + unexplored_dirs--; + if (one_file_system) { if (file->flags & FLAG_TOP_DIR) filesystem_dev = stp->st_dev; @@ -271,18 +400,30 @@ static void delete_in_dir(struct file_li dirlist = get_dirlist(fbuf, dlen, 0); + p = fbuf + dlen; + if (dlen != 1 || *fbuf != '/') + *p++ = '/'; + remainder = MAXPATHLEN - (p - fbuf); + /* If an item in dirlist is not found in flist, delete it * from the filesystem. */ for (i = dirlist->count; i--; ) { struct file_struct *fp = dirlist->files[i]; if (!fp->basename || fp->flags & FLAG_MOUNT_POINT) continue; + if (detect_renamed && S_ISREG(fp->mode)) { + strlcpy(p, fp->basename, remainder); + look_for_rename(fp, fbuf, dirlist->file_pool); + } if (flist_find(flist, fp) < 0) { f_name(fp, delbuf); - delete_item(delbuf, fp->mode, DEL_FORCE_RECURSE); - } + delete_item(delbuf, fp->mode, flags); + } else if (detect_renamed && S_ISDIR(fp->mode)) + unexplored_dirs++; } + fbuf[dlen] = '\0'; + flist_free(dirlist); } @@ -312,9 +453,9 @@ static void do_delete_pass(struct file_l || !S_ISDIR(st.st_mode)) continue; - delete_in_dir(flist, fbuf, file, &st); + delete_in_dir(flist, fbuf, file, &st, 0); } - delete_in_dir(NULL, NULL, NULL, NULL); + delete_in_dir(NULL, NULL, NULL, NULL, 0); if (do_progress && !am_server) rprintf(FINFO, " \r"); @@ -753,6 +894,7 @@ static int try_dests_non(struct file_str return -1; } +static struct bitbag *delayed_bits = NULL; static int phase = 0; /* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir, @@ -894,8 +1036,12 @@ static void recv_generator(char *fname, && verbose && code && f_out != -1) rprintf(code, "%s/\n", fname); if (delete_during && f_out != -1 && !phase && dry_run < 2 - && (file->flags & FLAG_DEL_HERE)) - delete_in_dir(the_file_list, fname, file, &st); + && (file->flags & FLAG_DEL_HERE)) { + if (detect_renamed && statret != 0) + unexplored_dirs++; + delete_in_dir(the_file_list, fname, file, &st, + delete_during < 0 ? DEL_NO_DELETIONS : 0); + } return; } @@ -1133,8 +1279,14 @@ static void recv_generator(char *fname, && hard_link_check(file, ndx, fname, statret, &st, itemizing, code, HL_SKIP)) return; - if (stat_errno == ENOENT) + if (stat_errno == ENOENT) { + if (detect_renamed && unexplored_dirs > 0 + && file->length) { + bitbag_set_bit(delayed_bits, ndx); + return; + } goto notify_others; + } rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s", full_fname(fname)); return; @@ -1309,11 +1461,17 @@ void generate_files(int f_out, struct fi (long)getpid(), flist->count); } + if (detect_renamed) { + delayed_bits = bitbag_create(flist->count); + if (!delete_before && !delete_during) + delete_during = -1; + } + if (delete_before && !local_name && flist->count > 0) do_delete_pass(flist); do_progress = 0; - if (append_mode || whole_file < 0) + if (append_mode || detect_renamed || whole_file < 0) whole_file = 0; if (verbose >= 2) { rprintf(FINFO, "delta-transmission %s\n", @@ -1368,7 +1526,23 @@ void generate_files(int f_out, struct fi } recv_generator(NULL, NULL, 0, 0, 0, code, -1); if (delete_during) - delete_in_dir(NULL, NULL, NULL, NULL); + delete_in_dir(NULL, NULL, NULL, NULL, 0); + + if (detect_renamed) { + if (delete_during < 0) + delete_during = 0; + detect_renamed = 0; + + for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) { + struct file_struct *file = flist->files[i]; + if (local_name) + strlcpy(fbuf, local_name, sizeof fbuf); + else + f_name(file, fbuf); + recv_generator(fbuf, file, i, itemizing, + maybe_ATTRS_REPORT, code, f_out); + } + } phase++; csum_length = SUM_LENGTH; --- old/options.c +++ new/options.c @@ -78,6 +78,7 @@ int am_starting_up = 1; int orig_umask = 0; int relative_paths = -1; int implied_dirs = 1; +int detect_renamed = 0; int numeric_ids = 0; int allow_8bit_chars = 0; int force_delete = 0; @@ -335,6 +336,7 @@ void usage(enum logcode F) rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n"); rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n"); + rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n"); rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); @@ -480,6 +482,7 @@ static struct poptOption long_options[] {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 }, {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 }, {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 }, @@ -1334,7 +1337,7 @@ int parse_arguments(int *argc, const cha inplace = 1; } - if (delay_updates && !partial_dir) + if ((delay_updates || detect_renamed) && !partial_dir) partial_dir = tmp_partialdir; if (inplace) { @@ -1343,6 +1346,7 @@ int parse_arguments(int *argc, const cha snprintf(err_buf, sizeof err_buf, "--%s cannot be used with --%s\n", append_mode ? "append" : "inplace", + detect_renamed ? "detect-renamed" : delay_updates ? "delay-updates" : "partial-dir"); return 0; } --- old/rsync.yo +++ new/rsync.yo @@ -358,6 +358,7 @@ to the detailed description below for a --modify-window=NUM compare mod-times with reduced accuracy -T, --temp-dir=DIR create temporary files in directory DIR -y, --fuzzy find similar file for basis if no dest file + --detect-renamed try to find renamed files to speed the xfer --compare-dest=DIR also compare received files relative to DIR --copy-dest=DIR ... and include copies of unchanged files --link-dest=DIR hardlink to files in DIR when unchanged @@ -1183,6 +1184,15 @@ Note that the use of the bf(--delete) op fuzzy-match files, so either use bf(--delete-after) or specify some filename exclusions if you need to prevent this. +dit(bf(--detect-renamed)) This option tells rsync to scan the receiving +side for files that have been renamed, and to use any that are found as +alternate basis files to help speed up the transfer. +By default, alternate-basis files are hard-linked into a directory named +".~tmp~" in each file's destination directory, but if you've specified +the bf(--partial-dir) option, that directory will be used instead. These +potential alternate-basis files will be removed as the transfer progresses. +This option conflicts with bf(--inplace) and bf(--append). + dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on the destination machine as an additional hierarchy to compare destination files against doing transfers (if the files are missing in the destination --- old/util.c +++ new/util.c @@ -997,6 +997,32 @@ int handle_partial_dir(const char *fname return 1; } +/* We need to supply our own strcmp function for file list comparisons + * to ensure that signed/unsigned usage is consistent between machines. */ +int u_strcmp(const char *p1, const char *p2) +{ + for ( ; *p1; p1++, p2++) { + if (*p1 != *p2) + break; + } + + return (int)*(uchar*)p1 - (int)*(uchar*)p2; +} + +/* We need a memcmp function compares unsigned-byte values. */ +int u_memcmp(const void *p1, const void *p2, size_t len) +{ + const uchar *u1 = p1; + const uchar *u2 = p2; + + while (len--) { + if (*u1 != *u2) + return (int)*u1 - (int)*u2; + } + + return 0; +} + /** * Determine if a symlink points outside the current directory tree. * This is considered "unsafe" because e.g. when mirroring somebody