3 This patch adds the --link-by-hash=DIR option, which hard links received
4 files in a link farm arranged by MD4 file hash. The result is that the system
5 will only store one copy of the unique contents of each file, regardless of the file's name.
8 To use this patch, run these commands for a successful build:
10 patch -p1 <patches/link-by-hash.diff
17 @@ -35,7 +35,7 @@ OBJS1=flist.o rsync.o generator.o receiv
18 util.o main.o checksum.o match.o syscall.o log.o backup.o
19 OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20 fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
21 -OBJS3=progress.o pipe.o
22 +OBJS3=progress.o pipe.o hashlink.o
23 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
25 popt/popthelp.o popt/poptparse.o
28 @@ -61,6 +61,7 @@ extern int copy_unsafe_links;
29 extern int protocol_version;
30 extern int sanitize_paths;
31 extern struct stats stats;
32 +extern char *link_by_hash_dir;
34 extern char curr_dir[MAXPATHLEN];
36 @@ -748,7 +749,7 @@ static struct file_struct *recv_file_ent
37 extra_len += (S_ISDIR(mode) ? 2 : 1) * EXTRA_LEN;
40 - if (always_checksum && S_ISREG(mode))
41 + if ((always_checksum || link_by_hash_dir) && S_ISREG(mode))
42 extra_len += SUM_EXTRA_CNT * EXTRA_LEN;
44 if (file_length > 0xFFFFFFFFu && S_ISREG(mode))
49 + Copyright (C) Cronosys, LLC 2004
51 + This program is free software; you can redistribute it and/or modify
52 + it under the terms of the GNU General Public License as published by
53 + the Free Software Foundation; either version 2 of the License, or
54 + (at your option) any later version.
56 + This program is distributed in the hope that it will be useful,
57 + but WITHOUT ANY WARRANTY; without even the implied warranty of
58 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
59 + GNU General Public License for more details.
61 + You should have received a copy of the GNU General Public License
62 + along with this program; if not, write to the Free Software
63 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
66 +/* This file contains code used by the --link-by-hash option. */
70 +extern char *link_by_hash_dir;
74 +char *make_hash_name(struct file_struct *file)
76 + char hash[33], *dst;
77 + uchar c, *src = (uchar*)F_SUM(file);
80 + for (dst = hash, i = 0; i < 4; i++, src++) {
82 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
84 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
87 + for (i = 0; i < 12; i++, src++) {
89 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
91 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
95 + asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
100 +void kill_hashfile(struct hashfile_struct *hashfile)
104 + free(hashfile->name);
105 + close(hashfile->fd);
110 +void kill_hashfiles(struct hashfile_struct *hashfiles)
112 + struct hashfile_struct *iter, *next;
113 + if ((iter = hashfiles) != NULL) {
116 + kill_hashfile(iter);
118 + } while (iter != hashfiles);
123 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
127 + struct hashfile_struct *hashfiles = NULL, *hashfile;
133 + /* Build a list of potential candidates and open
135 + if ((d = opendir(hashname)) == NULL) {
136 + rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
140 + while ((di = readdir(d)) != NULL) {
141 + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
145 + /* We need to have the largest fnbr in case we need to store
147 + this_fnbr = atol(di->d_name);
148 + if (this_fnbr > *fnbr)
151 + hashfile = new_array(struct hashfile_struct, 1);
152 + asprintf(&hashfile->name,"%s/%s",hashname,
154 + if (do_stat(hashfile->name,&st) == -1) {
155 + rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
156 + kill_hashfile(hashfile);
159 + if (st.st_size != size) {
160 + kill_hashfile(hashfile);
163 + hashfile->nlink = st.st_nlink;
164 + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
165 + if (hashfile->fd == -1) {
166 + rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
167 + kill_hashfile(hashfile);
170 + if (hashfiles == NULL)
171 + hashfiles = hashfile->next = hashfile->prev = hashfile;
173 + hashfile->next = hashfiles;
174 + hashfile->prev = hashfiles->prev;
175 + hashfile->next->prev = hashfile;
176 + hashfile->prev->next = hashfile;
185 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
188 + char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
189 + struct hashfile_struct *iter, *next, *best;
195 + iter = files; /* in case files are 0 bytes */
196 + while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
199 + /* Icky bit to resync when we steal the first node. */
205 + hamt = read(iter->fd, cmpbuffer, BUFSIZ);
206 + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
207 + if (iter == files) {
208 + files = files->prev;
210 + if (iter->next == iter) {
211 + files = next = NULL;
214 + if (iter == files) {
215 + /* So we know to resync */
219 + iter->next->prev = iter->prev;
220 + iter->prev->next = iter->next;
221 + kill_hashfile(iter);
225 + } while (iter != files);
227 + if (iter == NULL && files == NULL) {
228 + /* There are no matches. */
234 + rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
235 + kill_hashfiles(files);
239 + /* If we only have one file left, use it. */
240 + if (files == files->next) {
244 + /* All files which remain in the list are identical and should have
245 + * the same size. We pick the one with the lowest link count (we
246 + * may have rolled over because we hit the maximum link count for
247 + * the filesystem). */
248 + best = iter = files;
249 + nlink = iter->nlink;
251 + if (iter->nlink < nlink) {
252 + nlink = iter->nlink;
256 + } while (iter != files);
258 + best->next->prev = best->prev;
259 + best->prev->next = best->next;
261 + files = files->next;
262 + kill_hashfiles(files);
267 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
270 + char *hashname = make_hash_name(file);
275 + if (F_LENGTH(file) == 0)
276 + return robust_rename(fnametmp, fname, NULL, 0644);
278 + if (do_stat(hashname, &st) == -1) {
281 + /* Directory does not exist. */
282 + dirname = strdup(hashname);
283 + *strrchr(dirname,'/') = 0;
284 + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
285 + rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
288 + return robust_rename(fnametmp, fname, NULL, 0644);
292 + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
293 + rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
295 + return robust_rename(fnametmp, fname, NULL, 0644);
299 + asprintf(&linkname,"%s/0",hashname);
300 + rprintf(FINFO, "(1) linkname = %s\n", linkname);
302 + struct hashfile_struct *hashfiles, *hashfile;
304 + if (do_stat(fnametmp,&st) == -1) {
305 + rsyserr(FERROR, errno, "stat failed: %s", fname);
308 + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
310 + if (hashfiles == NULL) {
312 + asprintf(&linkname,"%s/0",hashname);
313 + rprintf(FINFO, "(2) linkname = %s\n", linkname);
316 + /* Search for one identical to us. */
317 + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
318 + rsyserr(FERROR, errno, "open failed: %s", fnametmp);
319 + kill_hashfiles(hashfiles);
322 + hashfile = compare_hashfiles(fd, hashfiles);
328 + linkname = strdup(hashfile->name);
329 + rprintf(FINFO, "(3) linkname = %s\n", linkname);
330 + kill_hashfile(hashfile);
333 + asprintf(&linkname, "%s/%ld", hashname,
335 + rprintf(FINFO, "(4) linkname = %s\n", linkname);
341 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
342 + linkname, full_fname(fname));
343 + robust_unlink(fname);
344 + rc = do_link(linkname, fname);
346 + if (errno == EMLINK) {
349 + asprintf(&linkname,"%s/%ld",hashname,
351 + rprintf(FINFO, "(5) linkname = %s\n", linkname);
352 + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
354 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
355 + linkname, full_fname(fname));
356 + rc = robust_rename(fnametmp, fname, NULL, 0644);
359 + do_unlink(fnametmp);
364 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
365 + full_fname(fname),linkname);
367 + rc = robust_rename(fnametmp, fname, NULL, 0644);
369 + rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
370 + full_fname(fnametmp), full_fname(fname));
372 + rc = do_link(fname,linkname);
374 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
375 + full_fname(fname), linkname);
386 @@ -146,6 +146,7 @@ char *backup_suffix = NULL;
388 char *partial_dir = NULL;
389 char *basis_dir[MAX_BASIS_DIRS+1];
390 +char *link_by_hash_dir = NULL;
391 char *config_file = NULL;
392 char *shell_cmd = NULL;
393 char *logfile_name = NULL;
394 @@ -362,6 +363,7 @@ void usage(enum logcode F)
395 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
396 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
397 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
398 + rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
399 rprintf(F," -z, --compress compress file data during the transfer\n");
400 rprintf(F," --compress-level=NUM explicitly set compression level\n");
401 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
402 @@ -411,7 +413,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
403 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
404 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
405 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
407 + OPT_NO_D, OPT_LINK_BY_HASH,
408 OPT_SERVER, OPT_REFUSED_BASE = 9000};
410 static struct poptOption long_options[] = {
411 @@ -523,6 +525,7 @@ static struct poptOption long_options[]
412 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
413 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
414 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
415 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
416 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
417 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
418 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
419 @@ -1148,6 +1151,21 @@ int parse_arguments(int *argc, const cha
423 + case OPT_LINK_BY_HASH:
425 + arg = poptGetOptArg(pc);
426 + if (sanitize_paths)
427 + arg = sanitize_path(NULL, arg, NULL, 0, NULL);
428 + link_by_hash_dir = (char *)arg;
431 + snprintf(err_buf, sizeof err_buf,
432 + "hard links are not supported on this %s\n",
433 + am_server ? "server" : "client");
434 + rprintf(FERROR, "ERROR: %s", err_buf);
439 /* A large opt value means that set_refuse_options()
440 * turned this option off. */
441 @@ -1814,6 +1832,11 @@ void server_options(char **args,int *arg
445 + if (link_by_hash_dir && am_sender) {
446 + args[ac++] = "--link-by-hash";
447 + args[ac++] = link_by_hash_dir;
450 if (files_from && (!am_sender || filesfrom_host)) {
451 if (filesfrom_host) {
452 args[ac++] = "--files-from";
455 @@ -125,12 +125,14 @@ int get_tmpname(char *fnametmp, char *fn
458 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
459 - const char *fname, int fd, OFF_T total_size)
460 + const char *fname, int fd, OFF_T total_size,
463 static char file_sum1[MAX_DIGEST_LEN];
464 static char file_sum2[MAX_DIGEST_LEN];
465 struct map_struct *mapbuf;
466 struct sum_struct sum;
467 + md_context mdfour_data;
471 @@ -150,6 +152,9 @@ static int receive_data(int f_in, char *
476 + mdfour_begin(&mdfour_data);
478 sum_init(checksum_seed);
480 if (append_mode > 0) {
481 @@ -192,6 +197,8 @@ static int receive_data(int f_in, char *
482 cleanup_got_literal = 1;
486 + mdfour_update(&mdfour_data, (uchar*)data, i);
488 if (fd != -1 && write_file(fd,data,i) != i)
489 goto report_write_error;
490 @@ -218,6 +225,8 @@ static int receive_data(int f_in, char *
493 sum_update(map, len);
495 + mdfour_update(&mdfour_data, (uchar*)map, len);
498 if (updating_basis) {
499 @@ -260,6 +269,8 @@ static int receive_data(int f_in, char *
502 sum_len = sum_end(file_sum1);
504 + mdfour_result(&mdfour_data, (uchar*)md4);
508 @@ -275,7 +286,7 @@ static int receive_data(int f_in, char *
510 static void discard_receive_data(int f_in, OFF_T length)
512 - receive_data(f_in, NULL, -1, 0, NULL, -1, length);
513 + receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
516 static void handle_delayed_updates(char *local_name)
517 @@ -646,7 +657,7 @@ int recv_files(int f_in, char *local_nam
520 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
521 - fname, fd2, F_LENGTH(file));
522 + fname, fd2, F_LENGTH(file), F_SUM(file));
524 log_item(log_code, file, &initial_stats, iflags, NULL);
528 @@ -51,6 +51,7 @@ extern int inplace;
529 extern int flist_eof;
530 extern int keep_dirlinks;
531 extern int make_backups;
532 +extern char *link_by_hash_dir;
533 extern struct file_list *cur_flist, *first_flist, *dir_flist;
534 extern struct chmod_mode_struct *daemon_chmod_modes;
536 @@ -410,8 +411,15 @@ void finish_transfer(const char *fname,
537 /* move tmp file over real file */
539 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
540 - ret = robust_rename(fnametmp, fname, partialptr,
541 - file->mode & INITACCESSPERMS);
543 + if (link_by_hash_dir)
544 + ret = link_by_hash(fnametmp, fname, file);
548 + ret = robust_rename(fnametmp, fname, partialptr,
549 + file->mode & INITACCESSPERMS);
552 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
553 ret == -2 ? "copy" : "rename",
556 @@ -758,6 +758,14 @@ struct stats {
557 int current_file_index;
560 +struct hashfile_struct {
561 + struct hashfile_struct *next;
562 + struct hashfile_struct *prev;
568 struct chmod_mode_struct;
570 #define EMPTY_ITEM_LIST {NULL, 0, 0}
573 @@ -369,6 +369,7 @@ to the detailed description below for a
574 --compare-dest=DIR also compare received files relative to DIR
575 --copy-dest=DIR ... and include copies of unchanged files
576 --link-dest=DIR hardlink to files in DIR when unchanged
577 + --link-by-hash=DIR create hardlinks by hash into DIR
578 -z, --compress compress file data during the transfer
579 --compress-level=NUM explicitly set compression level
580 -C, --cvs-exclude auto-ignore files in the same way CVS does