1 To: rsync@lists.samba.org
2 From: "Jason M. Felice" <jfelice@cronosys.com>
3 Subject: [patch] Add `--link-by-hash' option (rev 5).
4 Date: Mon, 23 Feb 2004 13:29:08 -0500
6 This patch adds the --link-by-hash=DIR option, which hard-links received
7 files into a link farm arranged by MD4 file hash. The result is that the system
8 will only store one copy of the unique contents of each file, regardless of the file's name.
12 * Fixed silly logic error.
15 * Updated for committed robust_rename() patch, other changes in CVS.
18 * Don't link empty files.
19 * Roll over to new file when filesystem maximum link count is reached.
20 * If link fails for another reason, leave non-linked file there.
21 * Depends on rsync-rename.diff
24 * This revision is actually against CVS HEAD (I didn't realize I was working
25 from a stale rsync'd copy of CVS).
26 * Apply permissions after linking (permissions were lost if we already had
27 a copy of the file in the link farm).
32 -0 +351 hashlink.c (new)
39 --- hashlink.c 1969-12-31 19:00:00.000000000 -0500
40 +++ hashlink.c 2004-02-23 10:30:45.000000000 -0500
43 + Copyright (C) Cronosys, LLC 2004
45 + This program is free software; you can redistribute it and/or modify
46 + it under the terms of the GNU General Public License as published by
47 + the Free Software Foundation; either version 2 of the License, or
48 + (at your option) any later version.
50 + This program is distributed in the hope that it will be useful,
51 + but WITHOUT ANY WARRANTY; without even the implied warranty of
52 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
53 + GNU General Public License for more details.
55 + You should have received a copy of the GNU General Public License
56 + along with this program; if not, write to the Free Software
57 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
60 +/* This file contains code used by the --link-by-hash option. */
64 +extern char *link_by_hash_dir;
68 +char* make_hash_name(struct file_struct *file)
70 + char hash[33], *dst;
75 + src = (unsigned char*)file->u.sum;
76 + for (dst = hash, i = 0; i < 4; i++, src++) {
78 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
80 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
83 + for (i = 0; i < 12; i++, src++) {
85 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
87 + *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
91 + asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
96 +void kill_hashfile(struct hashfile_struct *hashfile)
100 + free(hashfile->name);
101 + close(hashfile->fd);
106 +void kill_hashfiles(struct hashfile_struct *hashfiles)
108 + struct hashfile_struct *iter, *next;
109 + if ((iter = hashfiles) != NULL) {
112 + kill_hashfile(iter);
114 + } while (iter != hashfiles);
119 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
123 + struct hashfile_struct *hashfiles = NULL, *hashfile;
129 + /* Build a list of potential candidates and open
131 + if ((d = opendir(hashname)) == NULL) {
132 + rprintf(FERROR,"opendir \"%s\": %s\n",
133 + hashname, strerror(errno));
137 + while ((di = readdir(d)) != NULL) {
138 + if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
142 + /* We need to have the largest fnbr in case we need to store
144 + this_fnbr = atol(di->d_name);
145 + if (this_fnbr > *fnbr)
148 + hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
149 + asprintf(&hashfile->name,"%s/%s",hashname,
151 + if (do_stat(hashfile->name,&st) == -1) {
152 + rprintf(FERROR,"%s: %s", hashfile->name,
154 + kill_hashfile(hashfile);
157 + if (st.st_size != size) {
158 + kill_hashfile(hashfile);
161 + hashfile->nlink = st.st_nlink;
162 + hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
163 + if (hashfile->fd == -1) {
164 + rprintf(FERROR,"%s: %s\n", hashfile->name,
166 + kill_hashfile(hashfile);
169 + if (hashfiles == NULL)
170 + hashfiles = hashfile->next = hashfile->prev = hashfile;
172 + hashfile->next = hashfiles;
173 + hashfile->prev = hashfiles->prev;
174 + hashfile->next->prev = hashfile;
175 + hashfile->prev->next = hashfile;
184 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
187 + char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
188 + struct hashfile_struct *iter, *next, *best;
194 + iter = files; /* in case files are 0 bytes */
195 + while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
198 + /* Icky bit to resync when we steal the first node. */
204 + hamt = read(iter->fd, cmpbuffer, BUFSIZ);
205 + if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
206 + if (iter == files) {
207 + files = files->prev;
209 + if (iter->next == iter) {
210 + files = next = NULL;
213 + if (iter == files) {
214 + /* So we know to resync */
218 + iter->next->prev = iter->prev;
219 + iter->prev->next = iter->next;
220 + kill_hashfile(iter);
224 + } while (iter != files);
226 + if (iter == NULL && files == NULL) {
227 + /* There are no matches. */
234 + rprintf(FERROR,"%s",strerror(errno));
235 + kill_hashfiles(files);
239 + /* If we only have one file left, use it. */
240 + if (files == files->next) {
244 + /* All files which remain in the list are identical and should have
245 + * the same size. We pick the one with the lowest link count (we
246 + * may have rolled over because we hit the maximum link count for
247 + * the filesystem). */
248 + best = iter = files;
249 + nlink = iter->nlink;
251 + if (iter->nlink < nlink) {
252 + nlink = iter->nlink;
256 + } while (iter != files);
258 + best->next->prev = best->prev;
259 + best->prev->next = best->next;
261 + files = files->next;
262 + kill_hashfiles(files);
267 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
270 + char *hashname = make_hash_name(file);
275 + if (file->length == 0) {
276 + return robust_rename(fnametmp,fname,0644);
279 + if (do_stat(hashname, &st) == -1) {
282 + /* Directory does not exist. */
283 + dirname = strdup(hashname);
284 + *strrchr(dirname,'/') = 0;
285 + if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
286 + rprintf(FERROR, "mkdir %s: %s\n", dirname,
290 + return robust_rename(fnametmp,fname,0644);
294 + if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
295 + rprintf(FERROR, "mkdir %s: %s\n", hashname,
298 + return robust_rename(fnametmp,fname,0644);
302 + asprintf(&linkname,"%s/0",hashname);
303 + rprintf(FINFO, "(1) linkname = %s\n", linkname);
306 + struct hashfile_struct *hashfiles, *hashfile;
309 + if (do_stat(fnametmp,&st) == -1) {
310 + rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
313 + hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
315 + if (hashfiles == NULL) {
317 + asprintf(&linkname,"%s/0",hashname);
318 + rprintf(FINFO, "(2) linkname = %s\n", linkname);
321 + /* Search for one identical to us. */
322 + if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
323 + rprintf(FERROR,"%s: %s\n",fnametmp,
325 + kill_hashfiles(hashfiles);
328 + hashfile = compare_hashfiles(fd, hashfiles);
333 + linkname = strdup(hashfile->name);
334 + rprintf(FINFO, "(3) linkname = %s\n", linkname);
335 + kill_hashfile(hashfile);
338 + asprintf(&linkname, "%s/%ld", hashname,
340 + rprintf(FINFO, "(4) linkname = %s\n", linkname);
346 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
347 + linkname, full_fname(fname));
348 + rc = do_link(linkname, fname);
350 + if (errno == EMLINK) {
353 + asprintf(&linkname,"%s/%ld",hashname,
355 + rprintf(FINFO, "(5) linkname = %s\n", linkname);
356 + rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
358 + rprintf(FERROR,"link \"%s\" -> %s: %s\n",
359 + linkname,full_fname(fname),
361 + robust_unlink(fname);
362 + rc = robust_rename(fnametmp,fname,0644);
365 + do_unlink(fnametmp);
370 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
371 + full_fname(fname),linkname);
373 + rc = robust_rename(fnametmp,fname,0644);
375 + rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
376 + full_fname(fnametmp),full_fname(fname),
379 + rc = do_link(fname,linkname);
381 + rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
382 + full_fname(fname),linkname,
393 --- Makefile.in 2004-02-23 10:22:51.000000000 -0500
394 +++ Makefile.in 2004-02-23 10:22:51.000000000 -0500
396 main.o checksum.o match.o syscall.o log.o backup.o
397 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
398 fileio.o batch.o clientname.o
399 -OBJS3=progress.o pipe.o
400 +OBJS3=progress.o pipe.o hashlink.o
401 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
402 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
403 popt/popthelp.o popt/poptparse.o
404 --- options.c 2004-02-23 10:22:51.000000000 -0500
405 +++ options.c 2004-02-23 10:29:14.000000000 -0500
407 char *password_file = NULL;
408 char *rsync_path = RSYNC_PATH;
409 char *backup_dir = NULL;
410 +char *link_by_hash_dir = NULL;
411 char backup_dir_buf[MAXPATHLEN];
412 int rsync_port = RSYNC_PORT;
415 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
416 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
417 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
418 + rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
419 rprintf(F," -P equivalent to --partial --progress\n");
420 rprintf(F," -z, --compress compress file data\n");
421 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
423 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
424 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
425 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
426 - OPT_READ_BATCH, OPT_WRITE_BATCH,
427 + OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
428 OPT_REFUSED_BASE = 9000};
430 static struct poptOption long_options[] = {
432 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
433 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
434 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
435 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
436 /* TODO: Should this take an optional int giving the compression level? */
437 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
438 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
443 + case OPT_LINK_BY_HASH:
445 + link_by_hash_dir = (char *)poptGetOptArg(pc);
446 + checksum_seed = FIXED_CHECKSUM_SEED;
449 + snprintf(err_buf, sizeof err_buf,
450 + "hard links are not supported on this %s\n",
451 + am_server ? "server" : "client");
452 + rprintf(FERROR, "ERROR: %s", err_buf);
457 /* A large opt value means that set_refuse_options()
458 * turned this option off (opt-BASE is its index). */
460 args[ac++] = compare_dest;
463 + if (link_by_hash_dir && am_sender) {
464 + args[ac++] = "--link-by-hash";
465 + args[ac++] = link_by_hash_dir;
468 if (files_from && (!am_sender || remote_filesfrom_file)) {
469 if (remote_filesfrom_file) {
470 args[ac++] = "--files-from";
471 --- proto.h 2004-02-23 10:22:51.000000000 -0500
472 +++ proto.h 2004-02-23 11:06:03.000000000 -0500
474 void write_sum_head(int f, struct sum_struct *sum);
475 void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
476 void generate_files(int f, struct file_list *flist, char *local_name);
477 +char* make_hash_name(struct file_struct *file);
478 +void kill_hashfile(struct hashfile_struct *hashfile);
479 +void kill_hashfiles(struct hashfile_struct *hashfiles);
480 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
481 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
482 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
483 void init_hard_links(struct file_list *flist);
484 int hard_link_check(struct file_struct *file, int skip);
485 void do_hard_links(void);
486 --- receiver.c 2004-02-23 10:22:51.000000000 -0500
487 +++ receiver.c 2004-02-23 10:22:51.000000000 -0500
488 @@ -186,10 +186,11 @@
491 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
493 + OFF_T total_size,char *md4)
496 struct sum_struct sum;
497 + struct mdfour mdfour_data;
504 read_sum_head(f_in, &sum);
507 + mdfour_begin(&mdfour_data);
511 while ((i = recv_token(f_in, &data)) != 0) {
513 cleanup_got_literal = 1;
517 + mdfour_update(&mdfour_data,data,i);
519 if (fd != -1 && write_file(fd,data,i) != i) {
520 rprintf(FERROR, "write failed on %s: %s\n",
526 + mdfour_update(&mdfour_data,map,len);
529 if (fd != -1 && write_file(fd,map,len) != (int) len) {
535 + mdfour_result(&mdfour_data, (unsigned char*)md4);
537 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
540 extern int preserve_perms;
541 extern int delete_after;
542 extern int orig_umask;
543 + extern char *link_by_hash_dir;
544 struct stats initial_stats;
548 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
549 rprintf(FERROR, "fstat %s failed: %s\n",
550 full_fname(fnamecmp), strerror(errno));
551 - receive_data(f_in,NULL,-1,NULL,file->length);
552 + receive_data(f_in,NULL,-1,NULL,file->length,NULL);
558 rprintf(FERROR,"recv_files: %s is a directory\n",
559 full_fname(fnamecmp));
560 - receive_data(f_in, NULL, -1, NULL, file->length);
561 + receive_data(f_in,NULL,-1,NULL,file->length,NULL);
567 rprintf(FERROR, "mkstemp %s failed: %s\n",
568 full_fname(fnametmp), strerror(errno));
569 - receive_data(f_in,mapbuf,-1,NULL,file->length);
570 + receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
571 if (mapbuf) unmap_file(mapbuf);
572 if (fd1 != -1) close(fd1);
578 - recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
580 + if (link_by_hash_dir) {
581 + file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
584 + recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
586 log_recv(file, &initial_stats);
588 --- rsync.c 2004-02-23 10:22:51.000000000 -0500
589 +++ rsync.c 2004-02-23 12:49:33.000000000 -0500
591 extern int preserve_gid;
592 extern int preserve_perms;
593 extern int make_backups;
594 +extern char *link_by_hash_dir;
599 if (make_backups && !make_backup(fname))
602 - /* move tmp file over real file */
603 - ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
605 + if (link_by_hash_dir)
606 + ret = link_by_hash(fnametmp,fname,file);
609 + ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
612 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
613 ret == -2 ? "copy" : "rename",
614 --- rsync.h 2004-02-23 10:22:51.000000000 -0500
615 +++ rsync.h 2004-02-23 12:42:59.000000000 -0500
617 int current_file_index;
620 +struct hashfile_struct {
621 + struct hashfile_struct *next;
622 + struct hashfile_struct *prev;
629 /* we need this function because of the silly way in which duplicate
630 entries are handled in the file lists - we can't change this