1 After applying this patch, run these commands for a successful build:
4 ./configure (optional if already run)
7 Jason M. Felice writes:
9 This patch adds the --link-by-hash=DIR option, which hard-links received
10 files into a link farm arranged by MD4 file hash. The result is that the system
11 stores only one copy of the unique contents of each file, regardless of
17 @@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18 main.o checksum.o match.o syscall.o log.o backup.o
19 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20 fileio.o batch.o clientname.o chmod.o
21 -OBJS3=progress.o pipe.o
22 +OBJS3=progress.o pipe.o hashlink.o
23 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
25 popt/popthelp.o popt/poptparse.o
30 + Copyright (C) Cronosys, LLC 2004
32 + This program is free software; you can redistribute it and/or modify
33 + it under the terms of the GNU General Public License as published by
34 + the Free Software Foundation; either version 2 of the License, or
35 + (at your option) any later version.
37 + This program is distributed in the hope that it will be useful,
38 + but WITHOUT ANY WARRANTY; without even the implied warranty of
39 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40 + GNU General Public License for more details.
42 + You should have received a copy of the GNU General Public License
43 + along with this program; if not, write to the Free Software
44 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
47 +/* This file contains code used by the --link-by-hash option. */
51 +extern char *link_by_hash_dir;
55 +char* make_hash_name(struct file_struct *file)
57 + char hash[33], *dst;
62 + src = (unsigned char*)file->u.sum;
63 + for (dst = hash, i = 0; i < 4; i++, src++) {
65 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
70 + for (i = 0; i < 12; i++, src++) {
72 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
78 + asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
83 +void kill_hashfile(struct hashfile_struct *hashfile)
87 + free(hashfile->name);
88 + close(hashfile->fd);
93 +void kill_hashfiles(struct hashfile_struct *hashfiles)
95 + struct hashfile_struct *iter, *next;
96 + if ((iter = hashfiles) != NULL) {
99 + kill_hashfile(iter);
101 + } while (iter != hashfiles);
106 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
110 + struct hashfile_struct *hashfiles = NULL, *hashfile;
116 + /* Build a list of potential candidates and open
118 + if ((d = opendir(hashname)) == NULL) {
119 + rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
123 + while ((di = readdir(d)) != NULL) {
124 + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
128 + /* We need to have the largest fnbr in case we need to store
130 + this_fnbr = atol(di->d_name);
131 + if (this_fnbr > *fnbr)
134 + hashfile = new_array(struct hashfile_struct, 1);
135 + asprintf(&hashfile->name,"%s/%s",hashname,
137 + if (do_stat(hashfile->name,&st) == -1) {
138 + rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
139 + kill_hashfile(hashfile);
142 + if (st.st_size != size) {
143 + kill_hashfile(hashfile);
146 + hashfile->nlink = st.st_nlink;
147 + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
148 + if (hashfile->fd == -1) {
149 + rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
150 + kill_hashfile(hashfile);
153 + if (hashfiles == NULL)
154 + hashfiles = hashfile->next = hashfile->prev = hashfile;
156 + hashfile->next = hashfiles;
157 + hashfile->prev = hashfiles->prev;
158 + hashfile->next->prev = hashfile;
159 + hashfile->prev->next = hashfile;
168 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
171 + char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
172 + struct hashfile_struct *iter, *next, *best;
178 + iter = files; /* in case files are 0 bytes */
179 + while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
182 + /* Icky bit to resync when we steal the first node. */
188 + hamt = read(iter->fd, cmpbuffer, BUFSIZ);
189 + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
190 + if (iter == files) {
191 + files = files->prev;
193 + if (iter->next == iter) {
194 + files = next = NULL;
197 + if (iter == files) {
198 + /* So we know to resync */
202 + iter->next->prev = iter->prev;
203 + iter->prev->next = iter->next;
204 + kill_hashfile(iter);
208 + } while (iter != files);
210 + if (iter == NULL && files == NULL) {
211 + /* There are no matches. */
217 + rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
218 + kill_hashfiles(files);
222 + /* If we only have one file left, use it. */
223 + if (files == files->next) {
227 + /* All files which remain in the list are identical and should have
228 + * the same size. We pick the one with the lowest link count (we
229 + * may have rolled over because we hit the maximum link count for
230 + * the filesystem). */
231 + best = iter = files;
232 + nlink = iter->nlink;
234 + if (iter->nlink < nlink) {
235 + nlink = iter->nlink;
239 + } while (iter != files);
241 + best->next->prev = best->prev;
242 + best->prev->next = best->next;
244 + files = files->next;
245 + kill_hashfiles(files);
250 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
253 + char *hashname = make_hash_name(file);
258 + if (file->length == 0) {
259 + return robust_rename(fnametmp,fname,0644);
262 + if (do_stat(hashname, &st) == -1) {
265 + /* Directory does not exist. */
266 + dirname = strdup(hashname);
267 + *strrchr(dirname,'/') = 0;
268 + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
269 + rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
272 + return robust_rename(fnametmp,fname,0644);
276 + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
277 + rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
279 + return robust_rename(fnametmp,fname,0644);
283 + asprintf(&linkname,"%s/0",hashname);
284 + rprintf(FINFO, "(1) linkname = %s\n", linkname);
286 + struct hashfile_struct *hashfiles, *hashfile;
288 + if (do_stat(fnametmp,&st) == -1) {
289 + rsyserr(FERROR, errno, "stat failed: %s", fname);
292 + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
294 + if (hashfiles == NULL) {
296 + asprintf(&linkname,"%s/0",hashname);
297 + rprintf(FINFO, "(2) linkname = %s\n", linkname);
300 + /* Search for one identical to us. */
301 + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
302 + rsyserr(FERROR, errno, "open failed: %s", fnametmp);
303 + kill_hashfiles(hashfiles);
306 + hashfile = compare_hashfiles(fd, hashfiles);
312 + linkname = strdup(hashfile->name);
313 + rprintf(FINFO, "(3) linkname = %s\n", linkname);
314 + kill_hashfile(hashfile);
317 + asprintf(&linkname, "%s/%ld", hashname,
319 + rprintf(FINFO, "(4) linkname = %s\n", linkname);
325 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
326 + linkname, full_fname(fname));
327 + robust_unlink(fname);
328 + rc = do_link(linkname, fname);
330 + if (errno == EMLINK) {
333 + asprintf(&linkname,"%s/%ld",hashname,
335 + rprintf(FINFO, "(5) linkname = %s\n", linkname);
336 + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
338 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
339 + linkname, full_fname(fname));
340 + rc = robust_rename(fnametmp,fname,0644);
343 + do_unlink(fnametmp);
348 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
349 + full_fname(fname),linkname);
351 + rc = robust_rename(fnametmp,fname,0644);
353 + rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
354 + full_fname(fnametmp), full_fname(fname));
356 + rc = do_link(fname,linkname);
358 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
359 + full_fname(fname), linkname);
371 @@ -145,6 +145,7 @@ char *backup_suffix = NULL;
373 char *partial_dir = NULL;
374 char *basis_dir[MAX_BASIS_DIRS+1];
375 +char *link_by_hash_dir = NULL;
376 char *config_file = NULL;
377 char *shell_cmd = NULL;
378 char *log_format = NULL;
379 @@ -338,6 +339,7 @@ void usage(enum logcode F)
380 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
381 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
382 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
383 + rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
384 rprintf(F," -z, --compress compress file data during the transfer\n");
385 rprintf(F," --compress-level=NUM explicitly set compression level\n");
386 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
387 @@ -385,7 +387,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
388 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
389 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
390 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
392 + OPT_NO_D, OPT_LINK_BY_HASH,
393 OPT_SERVER, OPT_REFUSED_BASE = 9000};
395 static struct poptOption long_options[] = {
396 @@ -480,6 +482,7 @@ static struct poptOption long_options[]
397 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
398 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
399 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
400 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
401 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
402 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
403 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
404 @@ -1060,6 +1063,21 @@ int parse_arguments(int *argc, const cha
408 + case OPT_LINK_BY_HASH:
410 + arg = poptGetOptArg(pc);
411 + if (sanitize_paths)
412 + arg = sanitize_path(NULL, arg, NULL, 0);
413 + link_by_hash_dir = (char *)arg;
416 + snprintf(err_buf, sizeof err_buf,
417 + "hard links are not supported on this %s\n",
418 + am_server ? "server" : "client");
419 + rprintf(FERROR, "ERROR: %s", err_buf);
424 /* A large opt value means that set_refuse_options()
425 * turned this option off. */
426 @@ -1708,6 +1726,11 @@ void server_options(char **args,int *arg
430 + if (link_by_hash_dir && am_sender) {
431 + args[ac++] = "--link-by-hash";
432 + args[ac++] = link_by_hash_dir;
435 if (files_from && (!am_sender || filesfrom_host)) {
436 if (filesfrom_host) {
437 args[ac++] = "--files-from";
440 @@ -54,6 +54,7 @@ extern int delay_updates;
441 extern struct stats stats;
442 extern char *log_format;
444 +extern char *link_by_hash_dir;
445 extern char *partial_dir;
446 extern char *basis_dir[];
447 extern struct file_list *the_file_list;
448 @@ -125,12 +126,13 @@ static int get_tmpname(char *fnametmp, c
451 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
452 - char *fname, int fd, OFF_T total_size)
453 + char *fname, int fd, OFF_T total_size, char *md4)
455 static char file_sum1[MD4_SUM_LENGTH];
456 static char file_sum2[MD4_SUM_LENGTH];
457 struct map_struct *mapbuf;
458 struct sum_struct sum;
459 + struct mdfour mdfour_data;
463 @@ -150,6 +152,9 @@ static int receive_data(int f_in, char *
468 + mdfour_begin(&mdfour_data);
470 sum_init(checksum_seed);
473 @@ -192,6 +197,8 @@ static int receive_data(int f_in, char *
474 cleanup_got_literal = 1;
478 + mdfour_update(&mdfour_data,data,i);
480 if (fd != -1 && write_file(fd,data,i) != i)
481 goto report_write_error;
482 @@ -218,6 +225,8 @@ static int receive_data(int f_in, char *
485 sum_update(map, len);
487 + mdfour_update(&mdfour_data,map,len);
491 @@ -258,6 +267,8 @@ static int receive_data(int f_in, char *
496 + mdfour_result(&mdfour_data, (unsigned char*)md4);
500 @@ -273,7 +284,7 @@ static int receive_data(int f_in, char *
502 static void discard_receive_data(int f_in, OFF_T length)
504 - receive_data(f_in, NULL, -1, 0, NULL, -1, length);
505 + receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
508 static void handle_delayed_updates(struct file_list *flist, char *local_name)
509 @@ -605,8 +616,12 @@ int recv_files(int f_in, struct file_lis
510 rprintf(FINFO, "%s\n", fname);
514 + if (link_by_hash_dir)
515 + file->u.sum = new_array(char, MD4_SUM_LENGTH);
517 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
518 - fname, fd2, file->length);
519 + fname, fd2, file->length, file->u.sum);
521 if (!log_before_transfer)
522 log_item(file, &initial_stats, iflags, NULL);
525 @@ -50,6 +50,7 @@ extern int inplace;
526 extern int keep_dirlinks;
527 extern int make_backups;
528 extern struct stats stats;
529 +extern char *link_by_hash_dir;
531 #if defined HAVE_ICONV_OPEN && defined HAVE_ICONV_H
532 iconv_t ic_chck = (iconv_t)-1;
533 @@ -266,8 +267,15 @@ void finish_transfer(char *fname, char *
534 /* move tmp file over real file */
536 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
537 - ret = robust_rename(fnametmp, fname, partialptr,
538 - file->mode & INITACCESSPERMS);
540 + if (link_by_hash_dir)
541 + ret = link_by_hash(fnametmp, fname, file);
545 + ret = robust_rename(fnametmp, fname, partialptr,
546 + file->mode & INITACCESSPERMS);
549 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
550 ret == -2 ? "copy" : "rename",
553 @@ -640,6 +640,14 @@ struct stats {
554 int current_file_index;
557 +struct hashfile_struct {
558 + struct hashfile_struct *next;
559 + struct hashfile_struct *prev;
565 struct chmod_mode_struct;
567 #include "byteorder.h"
570 @@ -361,6 +361,7 @@ to the detailed description below for a
571 --compare-dest=DIR also compare received files relative to DIR
572 --copy-dest=DIR ... and include copies of unchanged files
573 --link-dest=DIR hardlink to files in DIR when unchanged
574 + --link-by-hash=DIR create hardlinks by hash into DIR
575 -z, --compress compress file data during the transfer
576 --compress-level=NUM explicitly set compression level
577 -C, --cvs-exclude auto-ignore files in the same way CVS does