3 This patch adds the --link-by-hash=DIR option, which hard-links each received
4 file into a link farm arranged by MD4 file hash. As a result, the system
5 stores only one copy of the unique contents of each file, regardless of
8 To use this patch, run these commands for a successful build:
10 patch -p1 <patches/link-by-hash.diff
15 diff --git a/Makefile.in b/Makefile.in
16 index feacb90..b27b1e7 100644
19 @@ -37,7 +37,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
20 util.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
21 OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
22 fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
23 -OBJS3=progress.o pipe.o
24 +OBJS3=progress.o pipe.o hashlink.o
25 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
26 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
27 popt/popthelp.o popt/poptparse.o
28 diff --git a/flist.c b/flist.c
29 index 09b4fc5..570bcee 100644
32 @@ -73,6 +73,7 @@ extern int sender_keeps_checksum;
33 extern int unsort_ndx;
34 extern struct stats stats;
35 extern char *filesfrom_host;
36 +extern char *link_by_hash_dir;
37 extern char *usermap, *groupmap;
39 extern char curr_dir[MAXPATHLEN];
40 @@ -881,7 +882,7 @@ static struct file_struct *recv_file_entry(struct file_list *flist,
41 extra_len += EXTRA_LEN;
44 - if (always_checksum && S_ISREG(mode))
45 + if ((always_checksum || link_by_hash_dir) && S_ISREG(mode))
46 extra_len += SUM_EXTRA_CNT * EXTRA_LEN;
49 diff --git a/hashlink.c b/hashlink.c
51 index 0000000..15e2a73
56 + Copyright (C) Cronosys, LLC 2004
58 + This program is free software; you can redistribute it and/or modify
59 + it under the terms of the GNU General Public License as published by
60 + the Free Software Foundation; either version 2 of the License, or
61 + (at your option) any later version.
63 + This program is distributed in the hope that it will be useful,
64 + but WITHOUT ANY WARRANTY; without even the implied warranty of
65 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
66 + GNU General Public License for more details.
68 + You should have received a copy of the GNU General Public License
69 + along with this program; if not, write to the Free Software
70 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
73 +/* This file contains code used by the --link-by-hash option. */
77 +extern char *link_by_hash_dir;
81 +char *make_hash_name(struct file_struct *file)
83 + char hash[33], *dst;
84 + uchar c, *src = (uchar*)F_SUM(file);
87 + for (dst = hash, i = 0; i < 4; i++, src++) {
89 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
91 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
94 + for (i = 0; i < 12; i++, src++) {
96 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
98 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
102 + if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
103 + out_of_memory("make_hash_name");
108 +void kill_hashfile(struct hashfile_struct *hashfile)
112 + free(hashfile->name);
113 + close(hashfile->fd);
118 +void kill_hashfiles(struct hashfile_struct *hashfiles)
120 + struct hashfile_struct *iter, *next;
121 + if ((iter = hashfiles) != NULL) {
124 + kill_hashfile(iter);
126 + } while (iter != hashfiles);
131 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
135 + struct hashfile_struct *hashfiles = NULL, *hashfile;
141 + /* Build a list of potential candidates and open
143 + if ((d = opendir(hashname)) == NULL) {
144 + rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
148 + while ((di = readdir(d)) != NULL) {
149 + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
153 + /* We need to have the largest fnbr in case we need to store
155 + this_fnbr = atol(di->d_name);
156 + if (this_fnbr > *fnbr)
159 + hashfile = new_array(struct hashfile_struct, 1);
160 + if (asprintf(&hashfile->name,"%s/%s",hashname, di->d_name) < 0)
161 + out_of_memory("find_hashfiles");
162 + if (do_stat(hashfile->name,&st) == -1) {
163 + rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
164 + kill_hashfile(hashfile);
167 + if (st.st_size != size) {
168 + kill_hashfile(hashfile);
171 + hashfile->nlink = st.st_nlink;
172 + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
173 + if (hashfile->fd == -1) {
174 + rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
175 + kill_hashfile(hashfile);
178 + if (hashfiles == NULL)
179 + hashfiles = hashfile->next = hashfile->prev = hashfile;
181 + hashfile->next = hashfiles;
182 + hashfile->prev = hashfiles->prev;
183 + hashfile->next->prev = hashfile;
184 + hashfile->prev->next = hashfile;
193 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
196 + char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
197 + struct hashfile_struct *iter, *next, *best;
203 + iter = files; /* in case files are 0 bytes */
204 + while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
207 + /* Icky bit to resync when we steal the first node. */
213 + hamt = read(iter->fd, cmpbuffer, BUFSIZ);
214 + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
215 + if (iter == files) {
216 + files = files->prev;
218 + if (iter->next == iter) {
219 + files = next = NULL;
222 + if (iter == files) {
223 + /* So we know to resync */
227 + iter->next->prev = iter->prev;
228 + iter->prev->next = iter->next;
229 + kill_hashfile(iter);
233 + } while (iter != files);
235 + if (iter == NULL && files == NULL) {
236 + /* There are no matches. */
242 + rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
243 + kill_hashfiles(files);
247 + /* If we only have one file left, use it. */
248 + if (files == files->next) {
252 + /* All files which remain in the list are identical and should have
253 + * the same size. We pick the one with the lowest link count (we
254 + * may have rolled over because we hit the maximum link count for
255 + * the filesystem). */
256 + best = iter = files;
257 + nlink = iter->nlink;
259 + if (iter->nlink < nlink) {
260 + nlink = iter->nlink;
264 + } while (iter != files);
266 + best->next->prev = best->prev;
267 + best->prev->next = best->next;
269 + files = files->next;
270 + kill_hashfiles(files);
275 +int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file)
278 + char *hashname = make_hash_name(file);
283 + if (F_LENGTH(file) == 0)
284 + return robust_rename(fnametmp, fname, NULL, 0644);
286 + if (do_stat(hashname, &st) == -1) {
289 + /* Directory does not exist. */
290 + dirname = strdup(hashname);
291 + *strrchr(dirname,'/') = 0;
292 + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
293 + rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
296 + return robust_rename(fnametmp, fname, NULL, 0644);
300 + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
301 + rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
303 + return robust_rename(fnametmp, fname, NULL, 0644);
307 + if (asprintf(&linkname,"%s/0",hashname) < 0)
308 + out_of_memory("link_by_hash");
309 + rprintf(FINFO, "(1) linkname = %s\n", linkname);
311 + struct hashfile_struct *hashfiles, *hashfile;
313 + if (do_stat(fnametmp,&st) == -1) {
314 + rsyserr(FERROR, errno, "stat failed: %s", fname);
317 + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
319 + if (hashfiles == NULL) {
321 + if (asprintf(&linkname,"%s/0",hashname) < 0)
322 + out_of_memory("link_by_hash");
323 + rprintf(FINFO, "(2) linkname = %s\n", linkname);
326 + /* Search for one identical to us. */
327 + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
328 + rsyserr(FERROR, errno, "open failed: %s", fnametmp);
329 + kill_hashfiles(hashfiles);
332 + hashfile = compare_hashfiles(fd, hashfiles);
338 + linkname = strdup(hashfile->name);
339 + rprintf(FINFO, "(3) linkname = %s\n", linkname);
340 + kill_hashfile(hashfile);
343 + if (asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1) < 0)
344 + out_of_memory("link_by_hash");
345 + rprintf(FINFO, "(4) linkname = %s\n", linkname);
351 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
352 + linkname, full_fname(fname));
353 + robust_unlink(fname);
354 + rc = do_link(linkname, fname);
356 + if (errno == EMLINK) {
359 + if (asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1) < 0)
360 + out_of_memory("link_by_hash");
361 + rprintf(FINFO, "(5) linkname = %s\n", linkname);
362 + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
364 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
365 + linkname, full_fname(fname));
366 + rc = robust_rename(fnametmp, fname, NULL, 0644);
369 + do_unlink(fnametmp);
374 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
375 + full_fname(fname),linkname);
377 + rc = robust_rename(fnametmp, fname, NULL, 0644);
379 + rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
380 + full_fname(fnametmp), full_fname(fname));
382 + rc = do_link(fname,linkname);
384 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
385 + full_fname(fname), linkname);
394 diff --git a/options.c b/options.c
395 index e7c6c61..73b1aa4 100644
398 @@ -158,6 +158,7 @@ char *backup_suffix = NULL;
400 char *partial_dir = NULL;
401 char *basis_dir[MAX_BASIS_DIRS+1];
402 +char *link_by_hash_dir = NULL;
403 char *config_file = NULL;
404 char *shell_cmd = NULL;
405 char *logfile_name = NULL;
406 @@ -745,6 +746,7 @@ void usage(enum logcode F)
407 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
408 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
409 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
410 + rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
411 rprintf(F," -z, --compress compress file data during the transfer\n");
412 rprintf(F," --compress-level=NUM explicitly set compression level\n");
413 rprintf(F," --skip-compress=LIST skip compressing files with a suffix in LIST\n");
414 @@ -798,7 +800,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
415 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
416 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
417 OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG,
418 - OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN,
419 + OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_LINK_BY_HASH,
420 OPT_SERVER, OPT_REFUSED_BASE = 9000};
422 static struct poptOption long_options[] = {
423 @@ -937,6 +939,7 @@ static struct poptOption long_options[] = {
424 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
425 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
426 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
427 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
428 {"fuzzy", 'y', POPT_ARG_VAL, &fuzzy_basis, 1, 0, 0 },
429 {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
430 {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
431 @@ -1742,6 +1745,21 @@ int parse_arguments(int *argc_p, const char ***argv_p)
435 + case OPT_LINK_BY_HASH:
437 + arg = poptGetOptArg(pc);
438 + if (sanitize_paths)
439 + arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
440 + link_by_hash_dir = (char *)arg;
443 + snprintf(err_buf, sizeof err_buf,
444 + "hard links are not supported on this %s\n",
445 + am_server ? "server" : "client");
446 + rprintf(FERROR, "ERROR: %s", err_buf);
451 /* A large opt value means that set_refuse_options()
452 * turned this option off. */
453 @@ -2584,6 +2602,11 @@ void server_options(char **args, int *argc_p)
455 args[ac++] = "--inplace";
457 + if (link_by_hash_dir && am_sender) {
458 + args[ac++] = "--link-by-hash";
459 + args[ac++] = link_by_hash_dir;
462 if (files_from && (!am_sender || filesfrom_host)) {
463 if (filesfrom_host) {
464 args[ac++] = "--files-from";
465 diff --git a/receiver.c b/receiver.c
466 index 4325e30..2709d5e 100644
469 @@ -164,11 +164,13 @@ int open_tmpfile(char *fnametmp, const char *fname, struct file_struct *file)
472 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
473 - const char *fname, int fd, OFF_T total_size)
474 + const char *fname, int fd, OFF_T total_size,
477 static char file_sum1[MAX_DIGEST_LEN];
478 struct map_struct *mapbuf;
479 struct sum_struct sum;
480 + md_context mdfour_data;
484 @@ -188,6 +190,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
489 + mdfour_begin(&mdfour_data);
491 sum_init(checksum_seed);
493 if (append_mode > 0) {
494 @@ -232,6 +237,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
495 cleanup_got_literal = 1;
499 + mdfour_update(&mdfour_data, (uchar*)data, i);
501 if (fd != -1 && write_file(fd,data,i) != i)
502 goto report_write_error;
503 @@ -258,6 +265,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
506 sum_update(map, len);
508 + mdfour_update(&mdfour_data, (uchar*)map, len);
511 if (updating_basis_or_equiv) {
512 @@ -305,6 +314,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
513 if (sum_end(file_sum1) != checksum_len)
514 overflow_exit("checksum_len"); /* Impossible... */
517 + mdfour_result(&mdfour_data, (uchar*)md4);
522 @@ -319,7 +331,7 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
524 static void discard_receive_data(int f_in, OFF_T length)
526 - receive_data(f_in, NULL, -1, 0, NULL, -1, length);
527 + receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
530 static void handle_delayed_updates(char *local_name)
531 @@ -744,7 +756,7 @@ int recv_files(int f_in, char *local_name)
534 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
535 - fname, fd2, F_LENGTH(file));
536 + fname, fd2, F_LENGTH(file), F_SUM(file));
538 log_item(log_code, file, &initial_stats, iflags, NULL);
540 diff --git a/rsync.c b/rsync.c
541 index 2c026a2..87f6d54 100644
544 @@ -48,6 +48,7 @@ extern int flist_eof;
545 extern int msgs2stderr;
546 extern int keep_dirlinks;
547 extern int make_backups;
548 +extern char *link_by_hash_dir;
549 extern struct file_list *cur_flist, *first_flist, *dir_flist;
550 extern struct chmod_mode_struct *daemon_chmod_modes;
552 @@ -575,8 +576,15 @@ int finish_transfer(const char *fname, const char *fnametmp,
553 /* move tmp file over real file */
554 if (DEBUG_GTE(RECV, 1))
555 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
556 - ret = robust_rename(fnametmp, fname, temp_copy_name,
557 - file->mode & INITACCESSPERMS);
559 + if (link_by_hash_dir)
560 + ret = link_by_hash(fnametmp, fname, file);
564 + ret = robust_rename(fnametmp, fname, temp_copy_name,
565 + file->mode & INITACCESSPERMS);
568 rsyserr(FERROR_XFER, errno, "%s %s -> \"%s\"",
569 ret == -2 ? "copy" : "rename",
570 diff --git a/rsync.h b/rsync.h
571 index be7cf8a..d4e2aca 100644
574 @@ -853,6 +853,14 @@ struct stats {
578 +struct hashfile_struct {
579 + struct hashfile_struct *next;
580 + struct hashfile_struct *prev;
586 struct chmod_mode_struct;
588 struct flist_ndx_item {
589 diff --git a/rsync.yo b/rsync.yo
590 index 941f7a5..568b481 100644
593 @@ -400,6 +400,7 @@ to the detailed description below for a complete description. verb(
594 --compare-dest=DIR also compare received files relative to DIR
595 --copy-dest=DIR ... and include copies of unchanged files
596 --link-dest=DIR hardlink to files in DIR when unchanged
597 + --link-by-hash=DIR create hardlinks by hash into DIR
598 -z, --compress compress file data during the transfer
599 --compress-level=NUM explicitly set compression level
600 --skip-compress=LIST skip compressing files with suffix in LIST