Jason M. Felice wrote: This patch adds the --link-by-hash=DIR option, which hard links received files in a link farm arranged by MD4 file hash. The result is that the system will only store one copy of the unique contents of each file, regardless of the file's name. To use this patch, run these commands for a successful build: patch -p1 = 8 diff --git a/hashlink.c b/hashlink.c new file mode 100644 --- /dev/null +++ b/hashlink.c @@ -0,0 +1,339 @@ +/* + Copyright (C) Cronosys, LLC 2004 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +/* This file contains code used by the --link-by-hash option. */ + +#include "rsync.h" + +extern char *link_by_hash_dir; + +#ifdef HAVE_LINK + +char *make_hash_name(struct file_struct *file) +{ + char hash[33], *dst; + uchar c, *src = (uchar*)F_SUM(file); + int i; + + for (dst = hash, i = 0; i < 4; i++, src++) { + c = *src >> 4; + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0'); + c = *src & 0x0f; + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0'); + } + *dst++ = '/'; + for (i = 0; i < 12; i++, src++) { + c = *src >> 4; + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0'); + c = *src & 0x0f; + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0'); + } + *dst = 0; + + if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0) + out_of_memory("make_hash_name"); + return dst; +} + + +void kill_hashfile(struct hashfile_struct *hashfile) +{ + if (!hashfile) + return; + free(hashfile->name); + close(hashfile->fd); + free(hashfile); +} + + +void kill_hashfiles(struct hashfile_struct *hashfiles) +{ + struct hashfile_struct *iter, *next; + if ((iter = hashfiles) != NULL) { + do { + next = iter->next; + kill_hashfile(iter); + iter = next; + } while (iter != hashfiles); + } +} + + +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr) +{ + DIR *d; + struct dirent *di; + struct hashfile_struct *hashfiles = NULL, *hashfile; + STRUCT_STAT st; + long this_fnbr; + + *fnbr = 0; + + /* Build a list of potential candidates and open + * them. */ + if ((d = opendir(hashname)) == NULL) { + rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname); + free(hashname); + return NULL; + } + while ((di = readdir(d)) != NULL) { + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) { + continue; + } + + /* We need to have the largest fnbr in case we need to store + * a new file. */ + this_fnbr = atol(di->d_name); + if (this_fnbr > *fnbr) + *fnbr = this_fnbr; + + hashfile = new_array(struct hashfile_struct, 1); + if (asprintf(&hashfile->name,"%s/%s",hashname, di->d_name) < 0) + out_of_memory("find_hashfiles"); + if (do_stat(hashfile->name,&st) == -1) { + rsyserr(FERROR, errno, "stat failed: %s", hashfile->name); + kill_hashfile(hashfile); + continue; + } + if (st.st_size != size) { + kill_hashfile(hashfile); + continue; + } + hashfile->nlink = st.st_nlink; + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY); + if (hashfile->fd == -1) { + rsyserr(FERROR, errno, "open failed: %s", hashfile->name); + kill_hashfile(hashfile); + continue; + } + if (hashfiles == NULL) + hashfiles = hashfile->next = hashfile->prev = hashfile; + else { + hashfile->next = hashfiles; + hashfile->prev = hashfiles->prev; + hashfile->next->prev = hashfile; + hashfile->prev->next = hashfile; + } + } + closedir(d); + + return hashfiles; +} + + +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files) +{ + int amt, hamt; + char buffer[BUFSIZ], cmpbuffer[BUFSIZ]; + struct hashfile_struct *iter, *next, *best; + uint32 nlink; + + if (!files) + return NULL; + + iter = files; /* in case files are 0 bytes */ + while ((amt = read(fd, buffer, BUFSIZ)) > 0) { + iter = files; + do { + /* Icky bit to resync when we steal the first node. */ + if (!files) + files = iter; + + next = iter->next; + + hamt = read(iter->fd, cmpbuffer, BUFSIZ); + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) { + if (iter == files) { + files = files->prev; + } + if (iter->next == iter) { + files = next = NULL; + } else { + next = iter->next; + if (iter == files) { + /* So we know to resync */ + files = NULL; + } + } + iter->next->prev = iter->prev; + iter->prev->next = iter->next; + kill_hashfile(iter); + } + + iter = next; + } while (iter != files); + + if (iter == NULL && files == NULL) { + /* There are no matches. */ + return NULL; + } + } + + if (amt == -1) { + rsyserr(FERROR, errno, "read failed in compare_hashfiles()"); + kill_hashfiles(files); + return NULL; + } + + /* If we only have one file left, use it. */ + if (files == files->next) { + return files; + } + + /* All files which remain in the list are identical and should have + * the same size. We pick the one with the lowest link count (we + * may have rolled over because we hit the maximum link count for + * the filesystem). */ + best = iter = files; + nlink = iter->nlink; + do { + if (iter->nlink < nlink) { + nlink = iter->nlink; + best = iter; + } + iter = iter->next; + } while (iter != files); + + best->next->prev = best->prev; + best->prev->next = best->next; + if (files == best) + files = files->next; + kill_hashfiles(files); + return best; +} + + +int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file) +{ + STRUCT_STAT st; + char *hashname = make_hash_name(file); + int first = 0, rc; + char *linkname; + long last_fnbr; + + if (F_LENGTH(file) == 0) + return robust_rename(fnametmp, fname, NULL, 0644); + + if (do_stat(hashname, &st) == -1) { + char *dirname; + + /* Directory does not exist. */ + dirname = strdup(hashname); + *strrchr(dirname,'/') = 0; + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) { + rsyserr(FERROR, errno, "mkdir failed: %s", dirname); + free(hashname); + free(dirname); + return robust_rename(fnametmp, fname, NULL, 0644); + } + free(dirname); + + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) { + rsyserr(FERROR, errno, "mkdir failed: %s", hashname); + free(hashname); + return robust_rename(fnametmp, fname, NULL, 0644); + } + + first = 1; + if (asprintf(&linkname,"%s/0",hashname) < 0) + out_of_memory("link_by_hash"); + rprintf(FINFO, "(1) linkname = %s\n", linkname); + } else { + struct hashfile_struct *hashfiles, *hashfile; + + if (do_stat(fnametmp,&st) == -1) { + rsyserr(FERROR, errno, "stat failed: %s", fname); + return -1; + } + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr); + + if (hashfiles == NULL) { + first = 1; + if (asprintf(&linkname,"%s/0",hashname) < 0) + out_of_memory("link_by_hash"); + rprintf(FINFO, "(2) linkname = %s\n", linkname); + } else { + int fd; + /* Search for one identical to us. */ + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) { + rsyserr(FERROR, errno, "open failed: %s", fnametmp); + kill_hashfiles(hashfiles); + return -1; + } + hashfile = compare_hashfiles(fd, hashfiles); + hashfiles = NULL; + close(fd); + + if (hashfile) { + first = 0; + linkname = strdup(hashfile->name); + rprintf(FINFO, "(3) linkname = %s\n", linkname); + kill_hashfile(hashfile); + } else { + first = 1; + if (asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1) < 0) + out_of_memory("link_by_hash"); + rprintf(FINFO, "(4) linkname = %s\n", linkname); + } + } + } + + if (!first) { + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", + linkname, full_fname(fname)); + robust_unlink(fname); + rc = do_link(linkname, fname); + if (rc == -1) { + if (errno == EMLINK) { + first = 1; + free(linkname); + if (asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1) < 0) + out_of_memory("link_by_hash"); + rprintf(FINFO, "(5) linkname = %s\n", linkname); + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname); + } else { + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", + linkname, full_fname(fname)); + rc = robust_rename(fnametmp, fname, NULL, 0644); + } + } else { + do_unlink(fnametmp); + } + } + + if (first) { + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", + full_fname(fname),linkname); + + rc = robust_rename(fnametmp, fname, NULL, 0644); + if (rc != 0) { + rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"", + full_fname(fnametmp), full_fname(fname)); + } + rc = do_link(fname,linkname); + if (rc != 0) { + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", + full_fname(fname), linkname); + } + } + + free(linkname); + free(hashname); + return rc; +} +#endif diff --git a/options.c b/options.c --- a/options.c +++ b/options.c @@ -158,6 +158,7 @@ char *backup_suffix = NULL; char *tmpdir = NULL; char *partial_dir = NULL; char *basis_dir[MAX_BASIS_DIRS+1]; +char *link_by_hash_dir = NULL; char *config_file = NULL; char *shell_cmd = NULL; char *logfile_name = NULL; @@ -746,6 +747,7 @@ void usage(enum logcode F) rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); + rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n"); rprintf(F," -z, --compress compress file data during the transfer\n"); rprintf(F," --compress-level=NUM explicitly set compression level\n"); rprintf(F," --skip-compress=LIST skip compressing files with a suffix in LIST\n"); @@ -798,7 +800,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM, OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP, OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD, OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE, - OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, + OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_LINK_BY_HASH, OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_SERVER, OPT_REFUSED_BASE = 9000}; @@ -938,6 +940,7 @@ static struct poptOption long_options[] = { {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0}, {"fuzzy", 'y', POPT_ARG_VAL, &fuzzy_basis, 1, 0, 0 }, {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 }, {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 }, @@ -1764,6 +1767,21 @@ int parse_arguments(int *argc_p, const char ***argv_p) return 0; #endif + case OPT_LINK_BY_HASH: +#ifdef HAVE_LINK + arg = poptGetOptArg(pc); + if (sanitize_paths) + arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT); + link_by_hash_dir = (char *)arg; + break; +#else + snprintf(err_buf, sizeof err_buf, + "hard links are not supported on this %s\n", + am_server ? "server" : "client"); + rprintf(FERROR, "ERROR: %s", err_buf); + return 0; +#endif + default: /* A large opt value means that set_refuse_options() * turned this option off. */ @@ -2636,6 +2654,11 @@ void server_options(char **args, int *argc_p) } else if (inplace) args[ac++] = "--inplace"; + if (link_by_hash_dir && am_sender) { + args[ac++] = "--link-by-hash"; + args[ac++] = link_by_hash_dir; + } + if (files_from && (!am_sender || filesfrom_host)) { if (filesfrom_host) { args[ac++] = "--files-from"; diff --git a/receiver.c b/receiver.c --- a/receiver.c +++ b/receiver.c @@ -196,11 +196,13 @@ int open_tmpfile(char *fnametmp, const char *fname, struct file_struct *file) } static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, - const char *fname, int fd, OFF_T total_size) + const char *fname, int fd, OFF_T total_size, + const char *md4) { static char file_sum1[MAX_DIGEST_LEN]; struct map_struct *mapbuf; struct sum_struct sum; + md_context mdfour_data; int32 len; OFF_T offset = 0; OFF_T offset2; @@ -220,6 +222,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, } else mapbuf = NULL; + if (md4) + mdfour_begin(&mdfour_data); + sum_init(checksum_seed); if (append_mode > 0) { @@ -264,6 +269,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, cleanup_got_literal = 1; sum_update(data, i); + if (md4) + mdfour_update(&mdfour_data, (uchar*)data, i); if (fd != -1 && write_file(fd,data,i) != i) goto report_write_error; @@ -290,6 +297,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, see_token(map, len); sum_update(map, len); + if (md4) + mdfour_update(&mdfour_data, (uchar*)map, len); } if (updating_basis_or_equiv) { @@ -337,6 +346,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, if (sum_end(file_sum1) != checksum_len) overflow_exit("checksum_len"); /* Impossible... */ + if (md4) + mdfour_result(&mdfour_data, (uchar*)md4); + if (mapbuf) unmap_file(mapbuf); @@ -351,7 +363,7 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, static void discard_receive_data(int f_in, OFF_T length) { - receive_data(f_in, NULL, -1, 0, NULL, -1, length); + receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL); } static void handle_delayed_updates(char *local_name) @@ -779,7 +791,7 @@ int recv_files(int f_in, int f_out, char *local_name) /* recv file data */ recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size, - fname, fd2, F_LENGTH(file)); + fname, fd2, F_LENGTH(file), F_SUM(file)); log_item(log_code, file, iflags, NULL); diff --git a/rsync.c b/rsync.c --- a/rsync.c +++ b/rsync.c @@ -49,6 +49,7 @@ extern int flist_eof; extern int file_old_total; extern int keep_dirlinks; extern int make_backups; +extern char *link_by_hash_dir; extern struct file_list *cur_flist, *first_flist, *dir_flist; extern struct chmod_mode_struct *daemon_chmod_modes; #ifdef ICONV_OPTION @@ -644,8 +645,15 @@ int finish_transfer(const char *fname, const char *fnametmp, /* move tmp file over real file */ if (DEBUG_GTE(RECV, 1)) rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname); - ret = robust_rename(fnametmp, fname, temp_copy_name, - file->mode & INITACCESSPERMS); +#ifdef HAVE_LINK + if (link_by_hash_dir) + ret = link_by_hash(fnametmp, fname, file); + else +#endif + { + ret = robust_rename(fnametmp, fname, temp_copy_name, + file->mode & INITACCESSPERMS); + } if (ret < 0) { rsyserr(FERROR_XFER, errno, "%s %s -> \"%s\"", ret == -2 ? "copy" : "rename", diff --git a/rsync.h b/rsync.h --- a/rsync.h +++ b/rsync.h @@ -865,6 +865,14 @@ struct stats { int xferred_files; }; +struct hashfile_struct { + struct hashfile_struct *next; + struct hashfile_struct *prev; + char *name; + int fd; + uint32 nlink; +}; + struct chmod_mode_struct; struct flist_ndx_item { diff --git a/rsync.yo b/rsync.yo --- a/rsync.yo +++ b/rsync.yo @@ -400,6 +400,7 @@ to the detailed description below for a complete description. verb( --compare-dest=DIR also compare received files relative to DIR --copy-dest=DIR ... and include copies of unchanged files --link-dest=DIR hardlink to files in DIR when unchanged + --link-by-hash=DIR create hardlinks by hash into DIR -z, --compress compress file data during the transfer --compress-level=NUM explicitly set compression level --skip-compress=LIST skip compressing files with suffix in LIST