X-Git-Url: https://mattmccutchen.net/rsync/rsync-patches.git/blobdiff_plain/5388f8591da9340053760831d79662d4a1bb4959..refs/heads/master:/link-by-hash.diff diff --git a/link-by-hash.diff b/link-by-hash.diff index c8dbc04..53fff8f 100644 --- a/link-by-hash.diff +++ b/link-by-hash.diff @@ -1,30 +1,55 @@ -After applying this patch and running configure, you MUST run this -command before "make": - - make proto - -Jason M. Felice writes: +Jason M. Felice wrote: This patch adds the --link-by-hash=DIR option, which hard links received files in a link farm arranged by MD4 file hash. The result is that the system will only store one copy of the unique contents of each file, regardless of the file's name. +To use this patch, run these commands for a successful build: + + patch -p1 = 8 +diff --git a/hashlink.c b/hashlink.c +new file mode 100644 +--- /dev/null ++++ b/hashlink.c +@@ -0,0 +1,339 @@ +/* + Copyright (C) Cronosys, LLC 2004 + @@ -51,14 +76,12 @@ the file's name. + +#ifdef HAVE_LINK + -+char* make_hash_name(struct file_struct *file) ++char *make_hash_name(struct file_struct *file) +{ + char hash[33], *dst; -+ unsigned char *src; -+ unsigned char c; ++ uchar c, *src = (uchar*)F_SUM(file); + int i; + -+ src = (unsigned char*)file->u.sum; + for (dst = hash, i = 0; i < 4; i++, src++) { + c = *src >> 4; + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0'); @@ -74,7 +97,8 @@ the file's name. + } + *dst = 0; + -+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash); ++ if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0) ++ out_of_memory("make_hash_name"); + return dst; +} + @@ -130,9 +154,9 @@ the file's name. + if (this_fnbr > *fnbr) + *fnbr = this_fnbr; + -+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct)); -+ asprintf(&hashfile->name,"%s/%s",hashname, -+ di->d_name); ++ hashfile = new_array(struct hashfile_struct, 1); ++ if (asprintf(&hashfile->name,"%s/%s",hashname, di->d_name) < 0) ++ out_of_memory("find_hashfiles"); + if (do_stat(hashfile->name,&st) == -1) { + rsyserr(FERROR, errno, "stat failed: %s", hashfile->name); + kill_hashfile(hashfile); @@ -246,7 +270,7 @@ the file's name. +} + + -+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file) ++int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file) +{ + STRUCT_STAT st; + char *hashname = make_hash_name(file); @@ -254,9 +278,8 @@ the file's name. + char *linkname; + long last_fnbr; + -+ if (file->length == 0) { -+ return robust_rename(fnametmp,fname,0644); -+ } ++ if (F_LENGTH(file) == 0) ++ return robust_rename(fnametmp, fname, NULL, 0644); + + if (do_stat(hashname, &st) == -1) { + char *dirname; @@ -268,18 +291,19 @@ the file's name. + rsyserr(FERROR, errno, "mkdir failed: %s", dirname); + free(hashname); + free(dirname); -+ return robust_rename(fnametmp,fname,0644); ++ return robust_rename(fnametmp, fname, NULL, 0644); + } + free(dirname); + + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) { + rsyserr(FERROR, errno, "mkdir failed: %s", hashname); + free(hashname); -+ return robust_rename(fnametmp,fname,0644); ++ return robust_rename(fnametmp, fname, NULL, 0644); + } + + first = 1; -+ asprintf(&linkname,"%s/0",hashname); ++ if (asprintf(&linkname,"%s/0",hashname) < 0) ++ out_of_memory("link_by_hash"); + rprintf(FINFO, "(1) linkname = %s\n", linkname); + } else { + struct hashfile_struct *hashfiles, *hashfile; @@ -292,7 +316,8 @@ the file's name. + + if (hashfiles == NULL) { + first = 1; -+ asprintf(&linkname,"%s/0",hashname); ++ if (asprintf(&linkname,"%s/0",hashname) < 0) ++ out_of_memory("link_by_hash"); + rprintf(FINFO, "(2) linkname = %s\n", linkname); + } else { + int fd; @@ -313,8 +338,8 @@ the file's name. + kill_hashfile(hashfile); + } else { + first = 1; -+ asprintf(&linkname, "%s/%ld", hashname, -+ last_fnbr + 1); ++ if (asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1) < 0) ++ out_of_memory("link_by_hash"); + rprintf(FINFO, "(4) linkname = %s\n", linkname); + } + } @@ -329,14 +354,14 @@ the file's name. + if (errno == EMLINK) { + first = 1; + free(linkname); -+ asprintf(&linkname,"%s/%ld",hashname, -+ last_fnbr + 1); ++ if (asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1) < 0) ++ out_of_memory("link_by_hash"); + rprintf(FINFO, "(5) linkname = %s\n", linkname); + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname); + } else { + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", + linkname, full_fname(fname)); -+ rc = robust_rename(fnametmp,fname,0644); ++ rc = robust_rename(fnametmp, fname, NULL, 0644); + } + } else { + do_unlink(fnametmp); @@ -347,7 +372,7 @@ the file's name. + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", + full_fname(fname),linkname); + -+ rc = robust_rename(fnametmp,fname,0644); ++ rc = robust_rename(fnametmp, fname, NULL, 0644); + if (rc != 0) { + rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"", + full_fname(fnametmp), full_fname(fname)); @@ -363,52 +388,52 @@ the file's name. + free(hashname); + return rc; +} -+ +#endif ---- orig/options.c 2004-10-14 17:11:40 -+++ options.c 2004-10-14 17:24:21 -@@ -126,6 +126,7 @@ char *log_format = NULL; - char *password_file = NULL; - char *rsync_path = RSYNC_PATH; - char *backup_dir = NULL; +diff --git a/options.c b/options.c +--- a/options.c ++++ b/options.c +@@ -158,6 +158,7 @@ char *backup_suffix = NULL; + char *tmpdir = NULL; + char *partial_dir = NULL; + char *basis_dir[MAX_BASIS_DIRS+1]; +char *link_by_hash_dir = NULL; - char backup_dir_buf[MAXPATHLEN]; - int rsync_port = RSYNC_PORT; - int link_dest = 0; -@@ -279,6 +280,7 @@ void usage(enum logcode F) - rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); + char *config_file = NULL; + char *shell_cmd = NULL; + char *logfile_name = NULL; +@@ -746,6 +747,7 @@ void usage(enum logcode F) rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); - rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n"); -+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n"); - rprintf(F," -P equivalent to --partial --progress\n"); - rprintf(F," -z, --compress compress file data\n"); - rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n"); -@@ -312,7 +314,7 @@ void usage(enum logcode F) - enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM, - OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST, - OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, -- OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, -+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_LINK_BY_HASH, - OPT_REFUSED_BASE = 9000}; + rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); + rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); ++ rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n"); + rprintf(F," -z, --compress compress file data during the transfer\n"); + rprintf(F," --compress-level=NUM explicitly set compression level\n"); + rprintf(F," --skip-compress=LIST skip compressing files with a suffix in LIST\n"); +@@ -798,7 +800,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM, + OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP, + OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD, + OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE, +- OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, ++ OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_LINK_BY_HASH, + OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, + OPT_SERVER, OPT_REFUSED_BASE = 9000}; - static struct poptOption long_options[] = { -@@ -371,6 +373,7 @@ static struct poptOption long_options[] - {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 }, - {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 }, - {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 }, +@@ -938,6 +940,7 @@ static struct poptOption long_options[] = { + {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, + {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, + {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0}, - /* TODO: Should this take an optional int giving the compression level? */ - {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, - {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, -@@ -683,6 +686,21 @@ int parse_arguments(int *argc, const cha + {"fuzzy", 'y', POPT_ARG_VAL, &fuzzy_basis, 1, 0, 0 }, + {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 }, + {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 }, +@@ -1764,6 +1767,21 @@ int parse_arguments(int *argc_p, const char ***argv_p) return 0; #endif + case OPT_LINK_BY_HASH: -+#if HAVE_LINK ++#ifdef HAVE_LINK + arg = poptGetOptArg(pc); + if (sanitize_paths) -+ arg = sanitize_path(NULL, arg, NULL, 0); ++ arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT); + link_by_hash_dir = (char *)arg; + break; +#else @@ -421,45 +446,38 @@ the file's name. + default: /* A large opt value means that set_refuse_options() - * turned this option off (opt-BASE is its index). */ -@@ -1144,6 +1162,11 @@ void server_options(char **args,int *arg - args[ac++] = compare_dest; - } + * turned this option off. */ +@@ -2636,6 +2654,11 @@ void server_options(char **args, int *argc_p) + } else if (inplace) + args[ac++] = "--inplace"; + if (link_by_hash_dir && am_sender) { + args[ac++] = "--link-by-hash"; + args[ac++] = link_by_hash_dir; + } + - if (files_from && (!am_sender || remote_filesfrom_file)) { - if (remote_filesfrom_file) { + if (files_from && (!am_sender || filesfrom_host)) { + if (filesfrom_host) { args[ac++] = "--files-from"; ---- orig/receiver.c 2004-09-21 09:40:27 -+++ receiver.c 2004-07-20 21:44:05 -@@ -39,6 +39,7 @@ extern int io_error; - extern char *tmpdir; - extern char *partial_dir; - extern char *compare_dest; -+extern char *link_by_hash_dir; - extern int make_backups; - extern int do_progress; - extern char *backup_dir; -@@ -202,12 +203,13 @@ static int get_tmpname(char *fnametmp, c - +diff --git a/receiver.c b/receiver.c +--- a/receiver.c ++++ b/receiver.c +@@ -196,11 +196,13 @@ int open_tmpfile(char *fnametmp, const char *fname, struct file_struct *file) + } static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, -- char *fname, int fd, OFF_T total_size) -+ char *fname, int fd, OFF_T total_size, char *md4) +- const char *fname, int fd, OFF_T total_size) ++ const char *fname, int fd, OFF_T total_size, ++ const char *md4) { - static char file_sum1[MD4_SUM_LENGTH]; - static char file_sum2[MD4_SUM_LENGTH]; + static char file_sum1[MAX_DIGEST_LEN]; struct map_struct *mapbuf; struct sum_struct sum; -+ struct mdfour mdfour_data; - unsigned int len; ++ md_context mdfour_data; + int32 len; OFF_T offset = 0; OFF_T offset2; -@@ -227,6 +229,9 @@ static int receive_data(int f_in, char * +@@ -220,6 +222,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, } else mapbuf = NULL; @@ -468,35 +486,36 @@ the file's name. + sum_init(checksum_seed); - while ((i = recv_token(f_in, &data)) != 0) { -@@ -243,6 +248,8 @@ static int receive_data(int f_in, char * + if (append_mode > 0) { +@@ -264,6 +269,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, cleanup_got_literal = 1; - sum_update(data,i); + sum_update(data, i); + if (md4) -+ mdfour_update(&mdfour_data,data,i); ++ mdfour_update(&mdfour_data, (uchar*)data, i); if (fd != -1 && write_file(fd,data,i) != i) goto report_write_error; -@@ -267,6 +274,8 @@ static int receive_data(int f_in, char * +@@ -290,6 +297,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, see_token(map, len); - sum_update(map,len); + sum_update(map, len); + if (md4) -+ mdfour_update(&mdfour_data,map,len); ++ mdfour_update(&mdfour_data, (uchar*)map, len); } - if (inplace) { -@@ -306,6 +315,8 @@ static int receive_data(int f_in, char * - } + if (updating_basis_or_equiv) { +@@ -337,6 +346,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, + if (sum_end(file_sum1) != checksum_len) + overflow_exit("checksum_len"); /* Impossible... */ - sum_end(file_sum1); + if (md4) -+ mdfour_result(&mdfour_data, (unsigned char*)md4); - ++ mdfour_result(&mdfour_data, (uchar*)md4); ++ if (mapbuf) unmap_file(mapbuf); -@@ -321,7 +332,7 @@ static int receive_data(int f_in, char * + +@@ -351,7 +363,7 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, static void discard_receive_data(int f_in, OFF_T length) { @@ -504,49 +523,50 @@ the file's name. + receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL); } - -@@ -542,8 +553,12 @@ int recv_files(int f_in, struct file_lis - rprintf(FINFO, "%s\n", safe_fname(fname)); + static void handle_delayed_updates(char *local_name) +@@ -779,7 +791,7 @@ int recv_files(int f_in, int f_out, char *local_name) /* recv file data */ -+#ifdef HAVE_LINK -+ if (link_by_hash_dir) -+ file->u.sum = (char*)malloc(MD4_SUM_LENGTH); -+#endif recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size, -- fname, fd2, file->length); -+ fname, fd2, file->length, file->u.sum); +- fname, fd2, F_LENGTH(file)); ++ fname, fd2, F_LENGTH(file), F_SUM(file)); - log_recv(file, &initial_stats); + log_item(log_code, file, iflags, NULL); ---- orig/rsync.c 2004-09-07 21:45:30 -+++ rsync.c 2004-08-13 18:14:34 -@@ -34,6 +34,7 @@ extern int force_delete; - extern int recurse; +diff --git a/rsync.c b/rsync.c +--- a/rsync.c ++++ b/rsync.c +@@ -49,6 +49,7 @@ extern int flist_eof; + extern int file_old_total; extern int keep_dirlinks; extern int make_backups; +extern char *link_by_hash_dir; - extern char *backup_dir; - extern int inplace; - -@@ -254,7 +255,12 @@ void finish_transfer(char *fname, char * + extern struct file_list *cur_flist, *first_flist, *dir_flist; + extern struct chmod_mode_struct *daemon_chmod_modes; + #ifdef ICONV_OPTION +@@ -644,8 +645,15 @@ int finish_transfer(const char *fname, const char *fnametmp, /* move tmp file over real file */ - if (verbose > 2) + if (DEBUG_GTE(RECV, 1)) rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname); -- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS); +- ret = robust_rename(fnametmp, fname, temp_copy_name, +- file->mode & INITACCESSPERMS); +#ifdef HAVE_LINK + if (link_by_hash_dir) + ret = link_by_hash(fnametmp, fname, file); + else +#endif -+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS); ++ { ++ ret = robust_rename(fnametmp, fname, temp_copy_name, ++ file->mode & INITACCESSPERMS); ++ } if (ret < 0) { - rsyserr(FERROR, errno, "%s %s -> \"%s\"", - ret == -2 ? "copy" : "rename", ---- orig/rsync.h 2004-10-09 03:21:56 -+++ rsync.h 2004-07-03 20:20:15 -@@ -529,6 +529,14 @@ struct stats { - int current_file_index; + rsyserr(FERROR_XFER, errno, "%s %s -> \"%s\"", + ret == -2 ? "copy" : "rename", +diff --git a/rsync.h b/rsync.h +--- a/rsync.h ++++ b/rsync.h +@@ -865,6 +865,14 @@ struct stats { + int xferred_files; }; +struct hashfile_struct { @@ -557,6 +577,17 @@ the file's name. + uint32 nlink; +}; + + struct chmod_mode_struct; - /* we need this function because of the silly way in which duplicate - entries are handled in the file lists - we can't change this + struct flist_ndx_item { +diff --git a/rsync.yo b/rsync.yo +--- a/rsync.yo ++++ b/rsync.yo +@@ -400,6 +400,7 @@ to the detailed description below for a complete description. verb( + --compare-dest=DIR also compare received files relative to DIR + --copy-dest=DIR ... and include copies of unchanged files + --link-dest=DIR hardlink to files in DIR when unchanged ++ --link-by-hash=DIR create hardlinks by hash into DIR + -z, --compress compress file data during the transfer + --compress-level=NUM explicitly set compression level + --skip-compress=LIST skip compressing files with suffix in LIST