1 After applying this patch and running configure, you MUST run this
6 Jason M. Felice writes:
8 This patch adds the --link-by-hash=DIR option, which hard-links received
9 files into a link farm arranged by MD4 file hash. As a result, the system
10 stores only one copy of the unique contents of each file, regardless of
14 --- Makefile.in 2 May 2004 17:04:14 -0000 1.100
15 +++ Makefile.in 13 May 2004 19:04:49 -0000
16 @@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20 -OBJS3=progress.o pipe.o
21 +OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
25 --- /dev/null 1 Jan 1970 00:00:00 -0000
26 +++ hashlink.c 13 May 2004 19:04:49 -0000
29 + Copyright (C) Cronosys, LLC 2004
31 + This program is free software; you can redistribute it and/or modify
32 + it under the terms of the GNU General Public License as published by
33 + the Free Software Foundation; either version 2 of the License, or
34 + (at your option) any later version.
36 + This program is distributed in the hope that it will be useful,
37 + but WITHOUT ANY WARRANTY; without even the implied warranty of
38 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39 + GNU General Public License for more details.
41 + You should have received a copy of the GNU General Public License
42 + along with this program; if not, write to the Free Software
43 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
46 +/* This file contains code used by the --link-by-hash option. */
50 +extern char *link_by_hash_dir;
54 +char* make_hash_name(struct file_struct *file)
56 + char hash[33], *dst;
61 + src = (unsigned char*)file->u.sum;
62 + for (dst = hash, i = 0; i < 4; i++, src++) {
64 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
66 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
69 + for (i = 0; i < 12; i++, src++) {
71 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
73 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
77 + asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
82 +void kill_hashfile(struct hashfile_struct *hashfile)
86 + free(hashfile->name);
87 + close(hashfile->fd);
92 +void kill_hashfiles(struct hashfile_struct *hashfiles)
94 + struct hashfile_struct *iter, *next;
95 + if ((iter = hashfiles) != NULL) {
98 + kill_hashfile(iter);
100 + } while (iter != hashfiles);
105 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
109 + struct hashfile_struct *hashfiles = NULL, *hashfile;
115 + /* Build a list of potential candidates and open
117 + if ((d = opendir(hashname)) == NULL) {
118 + rprintf(FERROR,"opendir \"%s\": %s\n",
119 + hashname, strerror(errno));
123 + while ((di = readdir(d)) != NULL) {
124 + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
128 + /* We need to have the largest fnbr in case we need to store
130 + this_fnbr = atol(di->d_name);
131 + if (this_fnbr > *fnbr)
134 + hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
135 + asprintf(&hashfile->name,"%s/%s",hashname,
137 + if (do_stat(hashfile->name,&st) == -1) {
138 + rprintf(FERROR,"%s: %s", hashfile->name,
140 + kill_hashfile(hashfile);
143 + if (st.st_size != size) {
144 + kill_hashfile(hashfile);
147 + hashfile->nlink = st.st_nlink;
148 + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
149 + if (hashfile->fd == -1) {
150 + rprintf(FERROR,"%s: %s\n", hashfile->name,
152 + kill_hashfile(hashfile);
155 + if (hashfiles == NULL)
156 + hashfiles = hashfile->next = hashfile->prev = hashfile;
158 + hashfile->next = hashfiles;
159 + hashfile->prev = hashfiles->prev;
160 + hashfile->next->prev = hashfile;
161 + hashfile->prev->next = hashfile;
170 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
173 + char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
174 + struct hashfile_struct *iter, *next, *best;
180 + iter = files; /* in case files are 0 bytes */
181 + while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
184 + /* Icky bit to resync when we steal the first node. */
190 + hamt = read(iter->fd, cmpbuffer, BUFSIZ);
191 + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
192 + if (iter == files) {
193 + files = files->prev;
195 + if (iter->next == iter) {
196 + files = next = NULL;
199 + if (iter == files) {
200 + /* So we know to resync */
204 + iter->next->prev = iter->prev;
205 + iter->prev->next = iter->next;
206 + kill_hashfile(iter);
210 + } while (iter != files);
212 + if (iter == NULL && files == NULL) {
213 + /* There are no matches. */
220 + rprintf(FERROR,"%s",strerror(errno));
221 + kill_hashfiles(files);
225 + /* If we only have one file left, use it. */
226 + if (files == files->next) {
230 + /* All files which remain in the list are identical and should have
231 + * the same size. We pick the one with the lowest link count (we
232 + * may have rolled over because we hit the maximum link count for
233 + * the filesystem). */
234 + best = iter = files;
235 + nlink = iter->nlink;
237 + if (iter->nlink < nlink) {
238 + nlink = iter->nlink;
242 + } while (iter != files);
244 + best->next->prev = best->prev;
245 + best->prev->next = best->next;
247 + files = files->next;
248 + kill_hashfiles(files);
253 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
256 + char *hashname = make_hash_name(file);
261 + if (file->length == 0) {
262 + return robust_rename(fnametmp,fname,0644);
265 + if (do_stat(hashname, &st) == -1) {
268 + /* Directory does not exist. */
269 + dirname = strdup(hashname);
270 + *strrchr(dirname,'/') = 0;
271 + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
272 + rprintf(FERROR, "mkdir %s: %s\n", dirname,
276 + return robust_rename(fnametmp,fname,0644);
280 + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
281 + rprintf(FERROR, "mkdir %s: %s\n", hashname,
284 + return robust_rename(fnametmp,fname,0644);
288 + asprintf(&linkname,"%s/0",hashname);
289 + rprintf(FINFO, "(1) linkname = %s\n", linkname);
292 + struct hashfile_struct *hashfiles, *hashfile;
295 + if (do_stat(fnametmp,&st) == -1) {
296 + rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
299 + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
301 + if (hashfiles == NULL) {
303 + asprintf(&linkname,"%s/0",hashname);
304 + rprintf(FINFO, "(2) linkname = %s\n", linkname);
307 + /* Search for one identical to us. */
308 + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
309 + rprintf(FERROR,"%s: %s\n",fnametmp,
311 + kill_hashfiles(hashfiles);
314 + hashfile = compare_hashfiles(fd, hashfiles);
319 + linkname = strdup(hashfile->name);
320 + rprintf(FINFO, "(3) linkname = %s\n", linkname);
321 + kill_hashfile(hashfile);
324 + asprintf(&linkname, "%s/%ld", hashname,
326 + rprintf(FINFO, "(4) linkname = %s\n", linkname);
332 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
333 + linkname, full_fname(fname));
334 + rc = do_link(linkname, fname);
336 + if (errno == EMLINK) {
339 + asprintf(&linkname,"%s/%ld",hashname,
341 + rprintf(FINFO, "(5) linkname = %s\n", linkname);
342 + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
344 + rprintf(FERROR,"link \"%s\" -> %s: %s\n",
345 + linkname,full_fname(fname),
347 + robust_unlink(fname);
348 + rc = robust_rename(fnametmp,fname,0644);
351 + do_unlink(fnametmp);
356 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
357 + full_fname(fname),linkname);
359 + rc = robust_rename(fnametmp,fname,0644);
361 + rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
362 + full_fname(fnametmp),full_fname(fname),
365 + rc = do_link(fname,linkname);
367 + rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
368 + full_fname(fname),linkname,
379 --- options.c 6 May 2004 21:08:01 -0000 1.148
380 +++ options.c 13 May 2004 19:04:49 -0000
381 @@ -121,6 +121,7 @@ char *log_format = NULL;
382 char *password_file = NULL;
383 char *rsync_path = RSYNC_PATH;
384 char *backup_dir = NULL;
385 +char *link_by_hash_dir = NULL;
386 char backup_dir_buf[MAXPATHLEN];
387 int rsync_port = RSYNC_PORT;
389 @@ -266,6 +267,7 @@ void usage(enum logcode F)
390 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
391 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
392 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
393 + rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
394 rprintf(F," -P equivalent to --partial --progress\n");
395 rprintf(F," -z, --compress compress file data\n");
396 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
397 @@ -305,7 +307,7 @@ void usage(enum logcode F)
398 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
399 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
400 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
401 - OPT_READ_BATCH, OPT_WRITE_BATCH,
402 + OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
403 OPT_REFUSED_BASE = 9000};
405 static struct poptOption long_options[] = {
406 @@ -362,6 +364,7 @@ static struct poptOption long_options[]
407 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
408 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
409 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
410 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
411 /* TODO: Should this take an optional int giving the compression level? */
412 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
413 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
414 @@ -584,6 +587,19 @@ int parse_arguments(int *argc, const cha
418 + case OPT_LINK_BY_HASH:
420 + link_by_hash_dir = (char *)poptGetOptArg(pc);
421 + checksum_seed = FIXED_CHECKSUM_SEED;
424 + snprintf(err_buf, sizeof err_buf,
425 + "hard links are not supported on this %s\n",
426 + am_server ? "server" : "client");
427 + rprintf(FERROR, "ERROR: %s", err_buf);
432 /* A large opt value means that set_refuse_options()
433 * turned this option off (opt-BASE is its index). */
434 @@ -951,6 +967,11 @@ void server_options(char **args,int *arg
436 args[ac++] = link_dest ? "--link-dest" : "--compare-dest";
437 args[ac++] = compare_dest;
440 + if (link_by_hash_dir && am_sender) {
441 + args[ac++] = "--link-by-hash";
442 + args[ac++] = link_by_hash_dir;
445 if (files_from && (!am_sender || remote_filesfrom_file)) {
446 --- receiver.c 13 May 2004 07:08:22 -0000 1.77
447 +++ receiver.c 13 May 2004 19:04:49 -0000
448 @@ -46,6 +46,7 @@ extern int module_id;
449 extern int ignore_errors;
450 extern int orig_umask;
451 extern int keep_partial;
452 +extern char *link_by_hash_dir;
454 static void delete_one(char *fn, int is_dir)
456 @@ -191,10 +192,11 @@ static int get_tmpname(char *fnametmp, c
459 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
461 + OFF_T total_size,char *md4)
464 struct sum_struct sum;
465 + struct mdfour mdfour_data;
469 @@ -204,7 +206,9 @@ static int receive_data(int f_in,struct
472 read_sum_head(f_in, &sum);
475 + mdfour_begin(&mdfour_data);
479 while ((i = recv_token(f_in, &data)) != 0) {
480 @@ -221,6 +225,8 @@ static int receive_data(int f_in,struct
481 cleanup_got_literal = 1;
485 + mdfour_update(&mdfour_data,data,i);
487 if (fd != -1 && write_file(fd,data,i) != i) {
488 rprintf(FERROR, "write failed on %s: %s\n",
489 @@ -248,6 +254,8 @@ static int receive_data(int f_in,struct
494 + mdfour_update(&mdfour_data,map,len);
497 if (fd != -1 && write_file(fd,map,len) != (int) len) {
498 @@ -270,6 +278,8 @@ static int receive_data(int f_in,struct
503 + mdfour_result(&mdfour_data, (unsigned char*)md4);
505 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
507 @@ -373,7 +383,7 @@ int recv_files(int f_in,struct file_list
508 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
509 rprintf(FERROR, "fstat %s failed: %s\n",
510 full_fname(fnamecmp), strerror(errno));
511 - receive_data(f_in,NULL,-1,NULL,file->length);
512 + receive_data(f_in,NULL,-1,NULL,file->length,NULL);
516 @@ -386,7 +396,7 @@ int recv_files(int f_in,struct file_list
518 rprintf(FERROR,"recv_files: %s is a directory\n",
519 full_fname(fnamecmp));
520 - receive_data(f_in, NULL, -1, NULL, file->length);
521 + receive_data(f_in,NULL,-1,NULL,file->length,NULL);
525 @@ -438,7 +448,7 @@ int recv_files(int f_in,struct file_list
527 rprintf(FERROR, "mkstemp %s failed: %s\n",
528 full_fname(fnametmp), strerror(errno));
529 - receive_data(f_in,mapbuf,-1,NULL,file->length);
530 + receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
531 if (mapbuf) unmap_file(mapbuf);
532 if (fd1 != -1) close(fd1);
534 @@ -451,7 +461,12 @@ int recv_files(int f_in,struct file_list
538 - recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
540 + if (link_by_hash_dir) {
541 + file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
544 + recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
546 log_recv(file, &initial_stats);
548 --- rsync.c 13 May 2004 18:51:22 -0000 1.138
549 +++ rsync.c 13 May 2004 19:04:49 -0000
550 @@ -31,6 +31,7 @@ extern int am_generator;
551 extern int preserve_uid;
552 extern int preserve_gid;
553 extern int make_backups;
554 +extern char *link_by_hash_dir;
558 @@ -236,8 +237,12 @@ void finish_transfer(char *fname, char *
559 if (make_backups && !make_backup(fname))
562 - /* move tmp file over real file */
563 - ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
565 + if (link_by_hash_dir)
566 + ret = link_by_hash(fnametmp,fname,file);
569 + ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
571 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
572 ret == -2 ? "copy" : "rename",
573 --- rsync.h 13 May 2004 18:51:22 -0000 1.203
574 +++ rsync.h 13 May 2004 19:04:50 -0000
575 @@ -521,6 +521,14 @@ struct stats {
576 int current_file_index;
579 +struct hashfile_struct {
580 + struct hashfile_struct *next;
581 + struct hashfile_struct *prev;
588 /* we need this function because of the silly way in which duplicate
589 entries are handled in the file lists - we can't change this