Added an exclude-list convenience function.
[rsync/rsync-patches.git] / link-by-hash.diff
... / ...
CommitLineData
1To: rsync@lists.samba.org
2From: "Jason M. Felice" <jfelice@cronosys.com>
3Subject: [patch] Add `--link-by-hash' option (rev 5).
4Date: Mon, 23 Feb 2004 13:29:08 -0500
5
6This patch adds the --link-by-hash=DIR option, which hard links received
7files in a link farm arranged by MD4 file hash. The result is that the system
8will only store one copy of the unique contents of each file, regardless of
9the file's name.
10
11(rev 5)
12* Fixed silly logic error.
13
14(rev 4)
15* Updated for committed robust_rename() patch, other changes in CVS.
16
17(rev 3)
18* Don't link empty files.
19* Roll over to new file when filesystem maximum link count is reached.
20* If link fails for another reason, leave non-linked file there.
21* Depends on rsync-rename.diff
22
23(rev 2)
24* This revision is actually against CVS HEAD (I didn't realize I was working
25 from a stale rsync'd CVS).
26* Apply permissions after linking (permissions were lost if we already had
27 a copy of the file in the link farm).
28
29Patch Summary:
30
31 -1 +1 Makefile.in
32 -0 +351 hashlink.c (new)
33 -1 +22 options.c
34 -0 +6 proto.h
35 -6 +21 receiver.c
36 -2 +8 rsync.c
37 -0 +8 rsync.h
38
39--- hashlink.c 1969-12-31 19:00:00.000000000 -0500
40+++ hashlink.c 2004-02-23 10:30:45.000000000 -0500
41@@ -0,0 +1,351 @@
42+/*
43+ Copyright (C) Cronosys, LLC 2004
44+
45+ This program is free software; you can redistribute it and/or modify
46+ it under the terms of the GNU General Public License as published by
47+ the Free Software Foundation; either version 2 of the License, or
48+ (at your option) any later version.
49+
50+ This program is distributed in the hope that it will be useful,
51+ but WITHOUT ANY WARRANTY; without even the implied warranty of
52+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
53+ GNU General Public License for more details.
54+
55+ You should have received a copy of the GNU General Public License
56+ along with this program; if not, write to the Free Software
57+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
58+*/
59+
60+/* This file contains code used by the --link-by-hash option. */
61+
62+#include "rsync.h"
63+
64+extern char *link_by_hash_dir;
65+
66+#ifdef HAVE_LINK
67+
68+char* make_hash_name(struct file_struct *file)
69+{
70+ char hash[33], *dst;
71+ unsigned char *src;
72+ unsigned char c;
73+ int i;
74+
75+ src = (unsigned char*)file->u.sum;
76+ for (dst = hash, i = 0; i < 4; i++, src++) {
77+ c = *src >> 4;
78+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
79+ c = *src & 0x0f;
80+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
81+ }
82+ *dst++ = '/';
83+ for (i = 0; i < 12; i++, src++) {
84+ c = *src >> 4;
85+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
86+ c = *src & 0x0f;
87+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
88+ }
89+ *dst = 0;
90+
91+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
92+ return dst;
93+}
94+
95+
96+void kill_hashfile(struct hashfile_struct *hashfile)
97+{
98+ if (!hashfile)
99+ return;
100+ free(hashfile->name);
101+ close(hashfile->fd);
102+ free(hashfile);
103+}
104+
105+
106+void kill_hashfiles(struct hashfile_struct *hashfiles)
107+{
108+ struct hashfile_struct *iter, *next;
109+ if ((iter = hashfiles) != NULL) {
110+ do {
111+ next = iter->next;
112+ kill_hashfile(iter);
113+ iter = next;
114+ } while (iter != hashfiles);
115+ }
116+}
117+
118+
119+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
120+{
121+ DIR *d;
122+ struct dirent *di;
123+ struct hashfile_struct *hashfiles = NULL, *hashfile;
124+ STRUCT_STAT st;
125+ long this_fnbr;
126+
127+ *fnbr = 0;
128+
129+ /* Build a list of potential candidates and open
130+ * them. */
131+ if ((d = opendir(hashname)) == NULL) {
132+ rprintf(FERROR,"opendir \"%s\": %s\n",
133+ hashname, strerror(errno));
134+ free(hashname);
135+ return NULL;
136+ }
137+ while ((di = readdir(d)) != NULL) {
138+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
139+ continue;
140+ }
141+
142+ /* We need to have the largest fnbr in case we need to store
143+ * a new file. */
144+ this_fnbr = atol(di->d_name);
145+ if (this_fnbr > *fnbr)
146+ *fnbr = this_fnbr;
147+
148+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
149+ asprintf(&hashfile->name,"%s/%s",hashname,
150+ di->d_name);
151+ if (do_stat(hashfile->name,&st) == -1) {
152+ rprintf(FERROR,"%s: %s", hashfile->name,
153+ strerror(errno));
154+ kill_hashfile(hashfile);
155+ continue;
156+ }
157+ if (st.st_size != size) {
158+ kill_hashfile(hashfile);
159+ continue;
160+ }
161+ hashfile->nlink = st.st_nlink;
162+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
163+ if (hashfile->fd == -1) {
164+ rprintf(FERROR,"%s: %s\n", hashfile->name,
165+ strerror(errno));
166+ kill_hashfile(hashfile);
167+ continue;
168+ }
169+ if (hashfiles == NULL)
170+ hashfiles = hashfile->next = hashfile->prev = hashfile;
171+ else {
172+ hashfile->next = hashfiles;
173+ hashfile->prev = hashfiles->prev;
174+ hashfile->next->prev = hashfile;
175+ hashfile->prev->next = hashfile;
176+ }
177+ }
178+ closedir(d);
179+
180+ return hashfiles;
181+}
182+
183+
184+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
185+{
186+ int amt, hamt;
187+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
188+ struct hashfile_struct *iter, *next, *best;
189+ uint32 nlink;
190+
191+ if (!files)
192+ return NULL;
193+
194+ iter = files; /* in case files are 0 bytes */
195+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
196+ iter = files;
197+ do {
198+ /* Icky bit to resync when we steal the first node. */
199+ if (!files)
200+ files = iter;
201+
202+ next = iter->next;
203+
204+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
205+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
206+ if (iter == files) {
207+ files = files->prev;
208+ }
209+ if (iter->next == iter) {
210+ files = next = NULL;
211+ } else {
212+ next = iter->next;
213+ if (iter == files) {
214+ /* So we know to resync */
215+ files = NULL;
216+ }
217+ }
218+ iter->next->prev = iter->prev;
219+ iter->prev->next = iter->next;
220+ kill_hashfile(iter);
221+ }
222+
223+ iter = next;
224+ } while (iter != files);
225+
226+ if (iter == NULL && files == NULL) {
227+ /* There are no matches. */
228+ return NULL;
229+ }
230+
231+ }
232+
233+ if (amt == -1) {
234+ rprintf(FERROR,"%s",strerror(errno));
235+ kill_hashfiles(files);
236+ return NULL;
237+ }
238+
239+ /* If we only have one file left, use it. */
240+ if (files == files->next) {
241+ return files;
242+ }
243+
244+ /* All files which remain in the list are identical and should have
245+ * the same size. We pick the one with the lowest link count (we
246+ * may have rolled over because we hit the maximum link count for
247+ * the filesystem). */
248+ best = iter = files;
249+ nlink = iter->nlink;
250+ do {
251+ if (iter->nlink < nlink) {
252+ nlink = iter->nlink;
253+ best = iter;
254+ }
255+ iter = iter->next;
256+ } while (iter != files);
257+
258+ best->next->prev = best->prev;
259+ best->prev->next = best->next;
260+ if (files == best)
261+ files = files->next;
262+ kill_hashfiles(files);
263+ return best;
264+}
265+
266+
267+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
268+{
269+ STRUCT_STAT st;
270+ char *hashname = make_hash_name(file);
271+ int first = 0, rc;
272+ char *linkname;
273+ long last_fnbr;
274+
275+ if (file->length == 0) {
276+ return robust_rename(fnametmp,fname,0644);
277+ }
278+
279+ if (do_stat(hashname, &st) == -1) {
280+ char *dirname;
281+
282+ /* Directory does not exist. */
283+ dirname = strdup(hashname);
284+ *strrchr(dirname,'/') = 0;
285+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
286+ rprintf(FERROR, "mkdir %s: %s\n", dirname,
287+ strerror(errno));
288+ free(hashname);
289+ free(dirname);
290+ return robust_rename(fnametmp,fname,0644);
291+ }
292+ free(dirname);
293+
294+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
295+ rprintf(FERROR, "mkdir %s: %s\n", hashname,
296+ strerror(errno));
297+ free(hashname);
298+ return robust_rename(fnametmp,fname,0644);
299+ }
300+
301+ first = 1;
302+ asprintf(&linkname,"%s/0",hashname);
303+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
304+
305+ } else {
306+ struct hashfile_struct *hashfiles, *hashfile;
307+ int fd;
308+
309+ if (do_stat(fnametmp,&st) == -1) {
310+ rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
311+ return -1;
312+ }
313+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
314+
315+ if (hashfiles == NULL) {
316+ first = 1;
317+ asprintf(&linkname,"%s/0",hashname);
318+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
319+ } else {
320+
321+ /* Search for one identical to us. */
322+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
323+ rprintf(FERROR,"%s: %s\n",fnametmp,
324+ strerror(errno));
325+ kill_hashfiles(hashfiles);
326+ return -1;
327+ }
328+ hashfile = compare_hashfiles(fd, hashfiles);
329+ hashfiles = NULL;
330+
331+ if (hashfile) {
332+ first = 0;
333+ linkname = strdup(hashfile->name);
334+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
335+ kill_hashfile(hashfile);
336+ } else {
337+ first = 1;
338+ asprintf(&linkname, "%s/%ld", hashname,
339+ last_fnbr + 1);
340+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
341+ }
342+ }
343+ }
344+
345+ if (!first) {
346+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
347+ linkname, full_fname(fname));
348+ rc = do_link(linkname, fname);
349+ if (rc == -1) {
350+ if (errno == EMLINK) {
351+ first = 1;
352+ free(linkname);
353+ asprintf(&linkname,"%s/%ld",hashname,
354+ last_fnbr + 1);
355+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
356+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
357+ } else {
358+ rprintf(FERROR,"link \"%s\" -> %s: %s\n",
359+ linkname,full_fname(fname),
360+ strerror(errno));
361+ robust_unlink(fname);
362+ rc = robust_rename(fnametmp,fname,0644);
363+ }
364+ } else {
365+ do_unlink(fnametmp);
366+ }
367+ }
368+
369+ if (first) {
370+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
371+ full_fname(fname),linkname);
372+
373+ rc = robust_rename(fnametmp,fname,0644);
374+ if (rc != 0) {
375+ rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
376+ full_fname(fnametmp),full_fname(fname),
377+ strerror(errno));
378+ }
379+ rc = do_link(fname,linkname);
380+ if (rc != 0) {
381+ rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
382+ full_fname(fname),linkname,
383+ strerror(errno));
384+ }
385+ }
386+
387+ free(linkname);
388+ free(hashname);
389+ return rc;
390+}
391+
392+#endif
393--- Makefile.in 10 Feb 2004 17:06:11 -0000 1.98
394+++ Makefile.in 15 Apr 2004 19:18:59 -0000
395@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
396 main.o checksum.o match.o syscall.o log.o backup.o
397 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
398 fileio.o batch.o clientname.o
399-OBJS3=progress.o pipe.o
400+OBJS3=progress.o pipe.o hashlink.o
401 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
402 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
403 popt/popthelp.o popt/poptparse.o
404--- options.c 14 Apr 2004 23:33:34 -0000 1.146
405+++ options.c 15 Apr 2004 19:19:00 -0000
406@@ -121,6 +121,7 @@ char *log_format = NULL;
407 char *password_file = NULL;
408 char *rsync_path = RSYNC_PATH;
409 char *backup_dir = NULL;
410+char *link_by_hash_dir = NULL;
411 char backup_dir_buf[MAXPATHLEN];
412 int rsync_port = RSYNC_PORT;
413 int link_dest = 0;
414@@ -266,6 +267,7 @@ void usage(enum logcode F)
415 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
416 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
417 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
418+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
419 rprintf(F," -P equivalent to --partial --progress\n");
420 rprintf(F," -z, --compress compress file data\n");
421 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
422@@ -305,7 +307,7 @@ void usage(enum logcode F)
423 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
424 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
425 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
426- OPT_READ_BATCH, OPT_WRITE_BATCH,
427+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
428 OPT_REFUSED_BASE = 9000};
429
430 static struct poptOption long_options[] = {
431@@ -362,6 +364,7 @@ static struct poptOption long_options[]
432 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
433 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
434 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
435+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
436 /* TODO: Should this take an optional int giving the compression level? */
437 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
438 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
439@@ -584,6 +587,19 @@ int parse_arguments(int *argc, const cha
440 return 0;
441 #endif
442
443+ case OPT_LINK_BY_HASH:
444+#if HAVE_LINK
445+ link_by_hash_dir = (char *)poptGetOptArg(pc);
446+ checksum_seed = FIXED_CHECKSUM_SEED;
447+ break;
448+#else
449+ snprintf(err_buf, sizeof err_buf,
450+ "hard links are not supported on this %s\n",
451+ am_server ? "server" : "client");
452+ rprintf(FERROR, "ERROR: %s", err_buf);
453+ return 0;
454+#endif
455+
456 default:
457 /* A large opt value means that set_refuse_options()
458 * turned this option off (opt-BASE is its index). */
459@@ -951,6 +967,11 @@ void server_options(char **args,int *arg
460 */
461 args[ac++] = link_dest ? "--link-dest" : "--compare-dest";
462 args[ac++] = compare_dest;
463+ }
464+
465+ if (link_by_hash_dir && am_sender) {
466+ args[ac++] = "--link-by-hash";
467+ args[ac++] = link_by_hash_dir;
468 }
469
470 if (files_from && (!am_sender || remote_filesfrom_file)) {
471--- proto.h 14 Apr 2004 23:33:30 -0000 1.188
472+++ proto.h 15 Apr 2004 19:19:00 -0000
473@@ -92,6 +92,12 @@ char *f_name(struct file_struct *f);
474 void write_sum_head(int f, struct sum_struct *sum);
475 void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
476 void generate_files(int f, struct file_list *flist, char *local_name);
477+char* make_hash_name(struct file_struct *file);
478+void kill_hashfile(struct hashfile_struct *hashfile);
479+void kill_hashfiles(struct hashfile_struct *hashfiles);
480+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
481+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
482+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
483 void init_hard_links(struct file_list *flist);
484 int hard_link_check(struct file_struct *file, int skip);
485 void do_hard_links(void);
486--- receiver.c 23 Mar 2004 16:50:40 -0000 1.75
487+++ receiver.c 15 Apr 2004 19:19:00 -0000
488@@ -45,6 +45,7 @@ extern int cleanup_got_literal;
489 extern int module_id;
490 extern int ignore_errors;
491 extern int orig_umask;
492+extern char *link_by_hash_dir;
493
494 static void delete_one(char *fn, int is_dir)
495 {
496@@ -190,10 +191,11 @@ static int get_tmpname(char *fnametmp, c
497
498
499 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
500- OFF_T total_size)
501+ OFF_T total_size,char *md4)
502 {
503 int i;
504 struct sum_struct sum;
505+ struct mdfour mdfour_data;
506 unsigned int len;
507 OFF_T offset = 0;
508 OFF_T offset2;
509@@ -203,7 +205,9 @@ static int receive_data(int f_in,struct
510 char *map=NULL;
511
512 read_sum_head(f_in, &sum);
513-
514+ if (md4)
515+ mdfour_begin(&mdfour_data);
516+
517 sum_init();
518
519 while ((i = recv_token(f_in, &data)) != 0) {
520@@ -220,6 +224,8 @@ static int receive_data(int f_in,struct
521 cleanup_got_literal = 1;
522
523 sum_update(data,i);
524+ if (md4)
525+ mdfour_update(&mdfour_data,data,i);
526
527 if (fd != -1 && write_file(fd,data,i) != i) {
528 rprintf(FERROR, "write failed on %s: %s\n",
529@@ -247,6 +253,8 @@ static int receive_data(int f_in,struct
530
531 see_token(map, len);
532 sum_update(map,len);
533+ if (md4)
534+ mdfour_update(&mdfour_data,map,len);
535 }
536
537 if (fd != -1 && write_file(fd,map,len) != (int) len) {
538@@ -269,6 +277,8 @@ static int receive_data(int f_in,struct
539 }
540
541 sum_end(file_sum1);
542+ if (md4)
543+ mdfour_result(&mdfour_data, (unsigned char*)md4);
544
545 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
546 if (verbose > 2) {
547@@ -372,7 +382,7 @@ int recv_files(int f_in,struct file_list
548 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
549 rprintf(FERROR, "fstat %s failed: %s\n",
550 full_fname(fnamecmp), strerror(errno));
551- receive_data(f_in,NULL,-1,NULL,file->length);
552+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
553 close(fd1);
554 continue;
555 }
556@@ -385,7 +395,7 @@ int recv_files(int f_in,struct file_list
557 */
558 rprintf(FERROR,"recv_files: %s is a directory\n",
559 full_fname(fnamecmp));
560- receive_data(f_in, NULL, -1, NULL, file->length);
561+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
562 close(fd1);
563 continue;
564 }
565@@ -437,7 +447,7 @@ int recv_files(int f_in,struct file_list
566 if (fd2 == -1) {
567 rprintf(FERROR, "mkstemp %s failed: %s\n",
568 full_fname(fnametmp), strerror(errno));
569- receive_data(f_in,mapbuf,-1,NULL,file->length);
570+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
571 if (mapbuf) unmap_file(mapbuf);
572 if (fd1 != -1) close(fd1);
573 continue;
574@@ -450,7 +460,12 @@ int recv_files(int f_in,struct file_list
575 }
576
577 /* recv file data */
578- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
579+#ifdef HAVE_LINK
580+ if (link_by_hash_dir) {
581+ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
582+ }
583+#endif
584+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
585
586 log_recv(file, &initial_stats);
587
588--- rsync.c 23 Mar 2004 16:16:15 -0000 1.135
589+++ rsync.c 15 Apr 2004 19:19:00 -0000
590@@ -33,6 +33,7 @@ extern int preserve_uid;
591 extern int preserve_gid;
592 extern int preserve_perms;
593 extern int make_backups;
594+extern char *link_by_hash_dir;
595
596
597 /*
598@@ -235,8 +236,12 @@ void finish_transfer(char *fname, char *
599 if (make_backups && !make_backup(fname))
600 return;
601
602- /* move tmp file over real file */
603- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
604+#ifdef HAVE_LINK
605+ if (link_by_hash_dir)
606+ ret = link_by_hash(fnametmp,fname,file);
607+ else
608+#endif
609+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
610 if (ret < 0) {
611 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
612 ret == -2 ? "copy" : "rename",
613--- rsync.h 14 Apr 2004 23:33:37 -0000 1.196
614+++ rsync.h 15 Apr 2004 19:19:00 -0000
615@@ -519,6 +519,14 @@ struct stats {
616 int current_file_index;
617 };
618
619+struct hashfile_struct {
620+ struct hashfile_struct *next;
621+ struct hashfile_struct *prev;
622+ char *name;
623+ int fd;
624+ uint32 nlink;
625+};
626+
627
628 /* we need this function because of the silly way in which duplicate
629 entries are handled in the file lists - we can't change this