- Improved the recently-added exclude-list comment.
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
8a529471
WD
1After applying this patch and running configure, you MUST run this
2command before "make":
3
4 make proto
5
6Jason M. Felice writes:
2eb075b2
WD
7
8This patch adds the --link-by-hash=DIR option, which hard-links received
9files into a link farm arranged by MD4 file hash. The result is that the
10system stores only one copy of each unique file's contents, regardless of
11the file's name.
12
2eb075b2 13
8a529471
WD
14--- Makefile.in 2 May 2004 17:04:14 -0000 1.100
15+++ Makefile.in 13 May 2004 19:04:49 -0000
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
25--- /dev/null 1 Jan 1970 00:00:00 -0000
26+++ hashlink.c 13 May 2004 19:04:49 -0000
c57f4101
WD
27@@ -0,0 +1,351 @@
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#ifdef HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
114+
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
118+ rprintf(FERROR,"opendir \"%s\": %s\n",
119+ hashname, strerror(errno));
120+ free(hashname);
121+ return NULL;
122+ }
123+ while ((di = readdir(d)) != NULL) {
124+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
125+ continue;
126+ }
127+
128+ /* We need to have the largest fnbr in case we need to store
129+ * a new file. */
130+ this_fnbr = atol(di->d_name);
131+ if (this_fnbr > *fnbr)
132+ *fnbr = this_fnbr;
133+
134+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
135+ asprintf(&hashfile->name,"%s/%s",hashname,
136+ di->d_name);
137+ if (do_stat(hashfile->name,&st) == -1) {
138+ rprintf(FERROR,"%s: %s", hashfile->name,
139+ strerror(errno));
140+ kill_hashfile(hashfile);
141+ continue;
142+ }
143+ if (st.st_size != size) {
144+ kill_hashfile(hashfile);
145+ continue;
146+ }
147+ hashfile->nlink = st.st_nlink;
148+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
149+ if (hashfile->fd == -1) {
150+ rprintf(FERROR,"%s: %s\n", hashfile->name,
151+ strerror(errno));
152+ kill_hashfile(hashfile);
153+ continue;
154+ }
155+ if (hashfiles == NULL)
156+ hashfiles = hashfile->next = hashfile->prev = hashfile;
157+ else {
158+ hashfile->next = hashfiles;
159+ hashfile->prev = hashfiles->prev;
160+ hashfile->next->prev = hashfile;
161+ hashfile->prev->next = hashfile;
162+ }
163+ }
164+ closedir(d);
165+
166+ return hashfiles;
167+}
168+
169+
170+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
171+{
172+ int amt, hamt;
173+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
174+ struct hashfile_struct *iter, *next, *best;
175+ uint32 nlink;
176+
177+ if (!files)
178+ return NULL;
179+
180+ iter = files; /* in case files are 0 bytes */
181+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
182+ iter = files;
183+ do {
184+ /* Icky bit to resync when we steal the first node. */
185+ if (!files)
186+ files = iter;
187+
188+ next = iter->next;
189+
190+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
191+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
192+ if (iter == files) {
193+ files = files->prev;
194+ }
195+ if (iter->next == iter) {
196+ files = next = NULL;
197+ } else {
198+ next = iter->next;
199+ if (iter == files) {
200+ /* So we know to resync */
201+ files = NULL;
202+ }
203+ }
204+ iter->next->prev = iter->prev;
205+ iter->prev->next = iter->next;
206+ kill_hashfile(iter);
207+ }
208+
209+ iter = next;
210+ } while (iter != files);
211+
212+ if (iter == NULL && files == NULL) {
213+ /* There are no matches. */
214+ return NULL;
215+ }
216+
217+ }
218+
219+ if (amt == -1) {
220+ rprintf(FERROR,"%s",strerror(errno));
221+ kill_hashfiles(files);
222+ return NULL;
223+ }
224+
225+ /* If we only have one file left, use it. */
226+ if (files == files->next) {
227+ return files;
228+ }
229+
230+ /* All files which remain in the list are identical and should have
231+ * the same size. We pick the one with the lowest link count (we
232+ * may have rolled over because we hit the maximum link count for
233+ * the filesystem). */
234+ best = iter = files;
235+ nlink = iter->nlink;
236+ do {
237+ if (iter->nlink < nlink) {
238+ nlink = iter->nlink;
239+ best = iter;
240+ }
241+ iter = iter->next;
242+ } while (iter != files);
243+
244+ best->next->prev = best->prev;
245+ best->prev->next = best->next;
246+ if (files == best)
247+ files = files->next;
248+ kill_hashfiles(files);
249+ return best;
250+}
251+
252+
253+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
254+{
255+ STRUCT_STAT st;
256+ char *hashname = make_hash_name(file);
257+ int first = 0, rc;
258+ char *linkname;
259+ long last_fnbr;
260+
261+ if (file->length == 0) {
262+ return robust_rename(fnametmp,fname,0644);
263+ }
264+
265+ if (do_stat(hashname, &st) == -1) {
266+ char *dirname;
267+
268+ /* Directory does not exist. */
269+ dirname = strdup(hashname);
270+ *strrchr(dirname,'/') = 0;
271+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
272+ rprintf(FERROR, "mkdir %s: %s\n", dirname,
273+ strerror(errno));
274+ free(hashname);
275+ free(dirname);
276+ return robust_rename(fnametmp,fname,0644);
277+ }
278+ free(dirname);
279+
280+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
281+ rprintf(FERROR, "mkdir %s: %s\n", hashname,
282+ strerror(errno));
283+ free(hashname);
284+ return robust_rename(fnametmp,fname,0644);
285+ }
286+
287+ first = 1;
288+ asprintf(&linkname,"%s/0",hashname);
289+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
290+
291+ } else {
292+ struct hashfile_struct *hashfiles, *hashfile;
293+ int fd;
294+
295+ if (do_stat(fnametmp,&st) == -1) {
296+ rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
297+ return -1;
298+ }
299+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
300+
301+ if (hashfiles == NULL) {
302+ first = 1;
303+ asprintf(&linkname,"%s/0",hashname);
304+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
305+ } else {
306+
307+ /* Search for one identical to us. */
308+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
309+ rprintf(FERROR,"%s: %s\n",fnametmp,
310+ strerror(errno));
311+ kill_hashfiles(hashfiles);
312+ return -1;
313+ }
314+ hashfile = compare_hashfiles(fd, hashfiles);
315+ hashfiles = NULL;
316+
317+ if (hashfile) {
318+ first = 0;
319+ linkname = strdup(hashfile->name);
320+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
321+ kill_hashfile(hashfile);
322+ } else {
323+ first = 1;
324+ asprintf(&linkname, "%s/%ld", hashname,
325+ last_fnbr + 1);
326+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
327+ }
328+ }
329+ }
330+
331+ if (!first) {
332+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
333+ linkname, full_fname(fname));
334+ rc = do_link(linkname, fname);
335+ if (rc == -1) {
336+ if (errno == EMLINK) {
337+ first = 1;
338+ free(linkname);
339+ asprintf(&linkname,"%s/%ld",hashname,
340+ last_fnbr + 1);
341+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
342+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
343+ } else {
344+ rprintf(FERROR,"link \"%s\" -> %s: %s\n",
345+ linkname,full_fname(fname),
346+ strerror(errno));
347+ robust_unlink(fname);
348+ rc = robust_rename(fnametmp,fname,0644);
349+ }
350+ } else {
351+ do_unlink(fnametmp);
352+ }
353+ }
354+
355+ if (first) {
356+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
357+ full_fname(fname),linkname);
358+
359+ rc = robust_rename(fnametmp,fname,0644);
360+ if (rc != 0) {
361+ rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
362+ full_fname(fnametmp),full_fname(fname),
363+ strerror(errno));
364+ }
365+ rc = do_link(fname,linkname);
366+ if (rc != 0) {
367+ rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
368+ full_fname(fname),linkname,
369+ strerror(errno));
370+ }
371+ }
372+
373+ free(linkname);
374+ free(hashname);
375+ return rc;
376+}
377+
378+#endif
8a529471
WD
379--- options.c 6 May 2004 21:08:01 -0000 1.148
380+++ options.c 13 May 2004 19:04:49 -0000
54691942 381@@ -121,6 +121,7 @@ char *log_format = NULL;
c57f4101
WD
382 char *password_file = NULL;
383 char *rsync_path = RSYNC_PATH;
384 char *backup_dir = NULL;
385+char *link_by_hash_dir = NULL;
386 char backup_dir_buf[MAXPATHLEN];
387 int rsync_port = RSYNC_PORT;
388 int link_dest = 0;
54691942 389@@ -266,6 +267,7 @@ void usage(enum logcode F)
c57f4101
WD
390 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
391 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
392 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
393+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
394 rprintf(F," -P equivalent to --partial --progress\n");
395 rprintf(F," -z, --compress compress file data\n");
396 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
54691942 397@@ -305,7 +307,7 @@ void usage(enum logcode F)
c57f4101
WD
398 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
399 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
400 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
401- OPT_READ_BATCH, OPT_WRITE_BATCH,
402+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
403 OPT_REFUSED_BASE = 9000};
404
405 static struct poptOption long_options[] = {
54691942 406@@ -362,6 +364,7 @@ static struct poptOption long_options[]
c57f4101
WD
407 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
408 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
54691942 409 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
c57f4101
WD
410+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
411 /* TODO: Should this take an optional int giving the compression level? */
412 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
413 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
54691942 414@@ -584,6 +587,19 @@ int parse_arguments(int *argc, const cha
c57f4101
WD
415 return 0;
416 #endif
417
418+ case OPT_LINK_BY_HASH:
419+#if HAVE_LINK
420+ link_by_hash_dir = (char *)poptGetOptArg(pc);
421+ checksum_seed = FIXED_CHECKSUM_SEED;
422+ break;
423+#else
424+ snprintf(err_buf, sizeof err_buf,
425+ "hard links are not supported on this %s\n",
426+ am_server ? "server" : "client");
427+ rprintf(FERROR, "ERROR: %s", err_buf);
428+ return 0;
429+#endif
430+
431 default:
432 /* A large opt value means that set_refuse_options()
433 * turned this option off (opt-BASE is its index). */
54691942
WD
434@@ -951,6 +967,11 @@ void server_options(char **args,int *arg
435 */
436 args[ac++] = link_dest ? "--link-dest" : "--compare-dest";
c57f4101 437 args[ac++] = compare_dest;
54691942
WD
438+ }
439+
c57f4101
WD
440+ if (link_by_hash_dir && am_sender) {
441+ args[ac++] = "--link-by-hash";
442+ args[ac++] = link_by_hash_dir;
54691942
WD
443 }
444
c57f4101 445 if (files_from && (!am_sender || remote_filesfrom_file)) {
8a529471
WD
446--- receiver.c 13 May 2004 07:08:22 -0000 1.77
447+++ receiver.c 13 May 2004 19:04:49 -0000
448@@ -46,6 +46,7 @@ extern int module_id;
54691942
WD
449 extern int ignore_errors;
450 extern int orig_umask;
8a529471 451 extern int keep_partial;
54691942
WD
452+extern char *link_by_hash_dir;
453
454 static void delete_one(char *fn, int is_dir)
455 {
8a529471 456@@ -191,10 +192,11 @@ static int get_tmpname(char *fnametmp, c
c57f4101
WD
457
458
459 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
460- OFF_T total_size)
461+ OFF_T total_size,char *md4)
462 {
463 int i;
464 struct sum_struct sum;
465+ struct mdfour mdfour_data;
466 unsigned int len;
467 OFF_T offset = 0;
468 OFF_T offset2;
8a529471 469@@ -204,7 +206,9 @@ static int receive_data(int f_in,struct
c57f4101
WD
470 char *map=NULL;
471
472 read_sum_head(f_in, &sum);
473-
474+ if (md4)
475+ mdfour_begin(&mdfour_data);
476+
477 sum_init();
478
479 while ((i = recv_token(f_in, &data)) != 0) {
8a529471 480@@ -221,6 +225,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
481 cleanup_got_literal = 1;
482
483 sum_update(data,i);
484+ if (md4)
485+ mdfour_update(&mdfour_data,data,i);
486
487 if (fd != -1 && write_file(fd,data,i) != i) {
488 rprintf(FERROR, "write failed on %s: %s\n",
8a529471 489@@ -248,6 +254,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
490
491 see_token(map, len);
492 sum_update(map,len);
493+ if (md4)
494+ mdfour_update(&mdfour_data,map,len);
495 }
496
497 if (fd != -1 && write_file(fd,map,len) != (int) len) {
8a529471 498@@ -270,6 +278,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
499 }
500
501 sum_end(file_sum1);
502+ if (md4)
503+ mdfour_result(&mdfour_data, (unsigned char*)md4);
504
505 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
506 if (verbose > 2) {
8a529471 507@@ -373,7 +383,7 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
508 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
509 rprintf(FERROR, "fstat %s failed: %s\n",
510 full_fname(fnamecmp), strerror(errno));
511- receive_data(f_in,NULL,-1,NULL,file->length);
512+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
513 close(fd1);
514 continue;
515 }
8a529471 516@@ -386,7 +396,7 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
517 */
518 rprintf(FERROR,"recv_files: %s is a directory\n",
519 full_fname(fnamecmp));
520- receive_data(f_in, NULL, -1, NULL, file->length);
521+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
522 close(fd1);
523 continue;
524 }
8a529471 525@@ -438,7 +448,7 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
526 if (fd2 == -1) {
527 rprintf(FERROR, "mkstemp %s failed: %s\n",
528 full_fname(fnametmp), strerror(errno));
529- receive_data(f_in,mapbuf,-1,NULL,file->length);
530+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
531 if (mapbuf) unmap_file(mapbuf);
532 if (fd1 != -1) close(fd1);
533 continue;
8a529471 534@@ -451,7 +461,12 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
535 }
536
537 /* recv file data */
538- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
539+#ifdef HAVE_LINK
540+ if (link_by_hash_dir) {
541+ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
542+ }
543+#endif
544+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
545
546 log_recv(file, &initial_stats);
547
8a529471
WD
548--- rsync.c 13 May 2004 18:51:22 -0000 1.138
549+++ rsync.c 13 May 2004 19:04:49 -0000
550@@ -31,6 +31,7 @@ extern int am_generator;
551 extern int preserve_uid;
c57f4101 552 extern int preserve_gid;
c57f4101
WD
553 extern int make_backups;
554+extern char *link_by_hash_dir;
555
556
557 /*
8a529471 558@@ -236,8 +237,12 @@ void finish_transfer(char *fname, char *
c57f4101
WD
559 if (make_backups && !make_backup(fname))
560 return;
561
2eb075b2
WD
562- /* move tmp file over real file */
563- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
c57f4101 564+#ifdef HAVE_LINK
2eb075b2
WD
565+ if (link_by_hash_dir)
566+ ret = link_by_hash(fnametmp,fname,file);
567+ else
c57f4101 568+#endif
2eb075b2 569+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
54691942 570 if (ret < 0) {
2eb075b2
WD
571 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
572 ret == -2 ? "copy" : "rename",
8a529471
WD
573--- rsync.h 13 May 2004 18:51:22 -0000 1.203
574+++ rsync.h 13 May 2004 19:04:50 -0000
575@@ -521,6 +521,14 @@ struct stats {
c57f4101
WD
576 int current_file_index;
577 };
578
579+struct hashfile_struct {
580+ struct hashfile_struct *next;
581+ struct hashfile_struct *prev;
582+ char *name;
583+ int fd;
584+ uint32 nlink;
585+};
586+
587
588 /* we need this function because of the silly way in which duplicate
589 entries are handled in the file lists - we can't change this