Fixed some patch fuzz.
[rsync/rsync-patches.git] / link-by-hash.diff
... / ...
CommitLineData
1After applying this patch and running configure, you MUST run this
2command before "make":
3
4 make proto
5
6Jason M. Felice writes:
7
8This patch adds the --link-by-hash=DIR option, which hard links received
9files in a link farm arranged by MD4 file hash. The result is that the system
10will only store one copy of the unique contents of each file, regardless of
11the file's name.
12
13
14--- orig/Makefile.in 2004-11-03 11:56:03
15+++ Makefile.in 2004-07-03 20:20:15
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
25--- orig/hashlink.c 2004-09-24 16:44:25
26+++ hashlink.c 2004-09-24 16:44:25
27@@ -0,0 +1,340 @@
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#if HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
114+
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
118+ rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
119+ free(hashname);
120+ return NULL;
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+ continue;
125+ }
126+
127+ /* We need to have the largest fnbr in case we need to store
128+ * a new file. */
129+ this_fnbr = atol(di->d_name);
130+ if (this_fnbr > *fnbr)
131+ *fnbr = this_fnbr;
132+
133+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134+ asprintf(&hashfile->name,"%s/%s",hashname,
135+ di->d_name);
136+ if (do_stat(hashfile->name,&st) == -1) {
137+ rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
138+ kill_hashfile(hashfile);
139+ continue;
140+ }
141+ if (st.st_size != size) {
142+ kill_hashfile(hashfile);
143+ continue;
144+ }
145+ hashfile->nlink = st.st_nlink;
146+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147+ if (hashfile->fd == -1) {
148+ rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
149+ kill_hashfile(hashfile);
150+ continue;
151+ }
152+ if (hashfiles == NULL)
153+ hashfiles = hashfile->next = hashfile->prev = hashfile;
154+ else {
155+ hashfile->next = hashfiles;
156+ hashfile->prev = hashfiles->prev;
157+ hashfile->next->prev = hashfile;
158+ hashfile->prev->next = hashfile;
159+ }
160+ }
161+ closedir(d);
162+
163+ return hashfiles;
164+}
165+
166+
167+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168+{
169+ int amt, hamt;
170+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171+ struct hashfile_struct *iter, *next, *best;
172+ uint32 nlink;
173+
174+ if (!files)
175+ return NULL;
176+
177+ iter = files; /* in case files are 0 bytes */
178+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179+ iter = files;
180+ do {
181+ /* Icky bit to resync when we steal the first node. */
182+ if (!files)
183+ files = iter;
184+
185+ next = iter->next;
186+
187+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189+ if (iter == files) {
190+ files = files->prev;
191+ }
192+ if (iter->next == iter) {
193+ files = next = NULL;
194+ } else {
195+ next = iter->next;
196+ if (iter == files) {
197+ /* So we know to resync */
198+ files = NULL;
199+ }
200+ }
201+ iter->next->prev = iter->prev;
202+ iter->prev->next = iter->next;
203+ kill_hashfile(iter);
204+ }
205+
206+ iter = next;
207+ } while (iter != files);
208+
209+ if (iter == NULL && files == NULL) {
210+ /* There are no matches. */
211+ return NULL;
212+ }
213+ }
214+
215+ if (amt == -1) {
216+ rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
217+ kill_hashfiles(files);
218+ return NULL;
219+ }
220+
221+ /* If we only have one file left, use it. */
222+ if (files == files->next) {
223+ return files;
224+ }
225+
226+ /* All files which remain in the list are identical and should have
227+ * the same size. We pick the one with the lowest link count (we
228+ * may have rolled over because we hit the maximum link count for
229+ * the filesystem). */
230+ best = iter = files;
231+ nlink = iter->nlink;
232+ do {
233+ if (iter->nlink < nlink) {
234+ nlink = iter->nlink;
235+ best = iter;
236+ }
237+ iter = iter->next;
238+ } while (iter != files);
239+
240+ best->next->prev = best->prev;
241+ best->prev->next = best->next;
242+ if (files == best)
243+ files = files->next;
244+ kill_hashfiles(files);
245+ return best;
246+}
247+
248+
249+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
250+{
251+ STRUCT_STAT st;
252+ char *hashname = make_hash_name(file);
253+ int first = 0, rc;
254+ char *linkname;
255+ long last_fnbr;
256+
257+ if (file->length == 0) {
258+ return robust_rename(fnametmp,fname,0644);
259+ }
260+
261+ if (do_stat(hashname, &st) == -1) {
262+ char *dirname;
263+
264+ /* Directory does not exist. */
265+ dirname = strdup(hashname);
266+ *strrchr(dirname,'/') = 0;
267+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
268+ rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
269+ free(hashname);
270+ free(dirname);
271+ return robust_rename(fnametmp,fname,0644);
272+ }
273+ free(dirname);
274+
275+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
276+ rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
277+ free(hashname);
278+ return robust_rename(fnametmp,fname,0644);
279+ }
280+
281+ first = 1;
282+ asprintf(&linkname,"%s/0",hashname);
283+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
284+ } else {
285+ struct hashfile_struct *hashfiles, *hashfile;
286+
287+ if (do_stat(fnametmp,&st) == -1) {
288+ rsyserr(FERROR, errno, "stat failed: %s", fname);
289+ return -1;
290+ }
291+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
292+
293+ if (hashfiles == NULL) {
294+ first = 1;
295+ asprintf(&linkname,"%s/0",hashname);
296+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
297+ } else {
298+ int fd;
299+ /* Search for one identical to us. */
300+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
301+ rsyserr(FERROR, errno, "open failed: %s", fnametmp);
302+ kill_hashfiles(hashfiles);
303+ return -1;
304+ }
305+ hashfile = compare_hashfiles(fd, hashfiles);
306+ hashfiles = NULL;
307+ close(fd);
308+
309+ if (hashfile) {
310+ first = 0;
311+ linkname = strdup(hashfile->name);
312+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
313+ kill_hashfile(hashfile);
314+ } else {
315+ first = 1;
316+ asprintf(&linkname, "%s/%ld", hashname,
317+ last_fnbr + 1);
318+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
319+ }
320+ }
321+ }
322+
323+ if (!first) {
324+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
325+ linkname, full_fname(fname));
326+ robust_unlink(fname);
327+ rc = do_link(linkname, fname);
328+ if (rc == -1) {
329+ if (errno == EMLINK) {
330+ first = 1;
331+ free(linkname);
332+ asprintf(&linkname,"%s/%ld",hashname,
333+ last_fnbr + 1);
334+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
335+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
336+ } else {
337+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
338+ linkname, full_fname(fname));
339+ rc = robust_rename(fnametmp,fname,0644);
340+ }
341+ } else {
342+ do_unlink(fnametmp);
343+ }
344+ }
345+
346+ if (first) {
347+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
348+ full_fname(fname),linkname);
349+
350+ rc = robust_rename(fnametmp,fname,0644);
351+ if (rc != 0) {
352+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
353+ full_fname(fnametmp), full_fname(fname));
354+ }
355+ rc = do_link(fname,linkname);
356+ if (rc != 0) {
357+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
358+ full_fname(fname), linkname);
359+ }
360+ }
361+
362+ free(linkname);
363+ free(hashname);
364+ return rc;
365+}
366+
367+#endif
368--- orig/options.c 2005-02-11 18:21:45
369+++ options.c 2005-01-28 19:32:26
370@@ -131,6 +131,7 @@ char *log_format = NULL;
371 char *password_file = NULL;
372 char *rsync_path = RSYNC_PATH;
373 char *backup_dir = NULL;
374+char *link_by_hash_dir = NULL;
375 char backup_dir_buf[MAXPATHLEN];
376 int rsync_port = 0;
377 int compare_dest = 0;
378@@ -305,6 +306,7 @@ void usage(enum logcode F)
379 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
380 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
381 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
382+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
383 rprintf(F," -z, --compress compress file data\n");
384 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
385 rprintf(F," -f, --filter=RULE add a file-filtering RULE\n");
386@@ -343,7 +345,7 @@ void usage(enum logcode F)
387 enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
388 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST,
389 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
390- OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_MAX_SIZE,
391+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_MAX_SIZE, OPT_LINK_BY_HASH,
392 OPT_REFUSED_BASE = 9000};
393
394 static struct poptOption long_options[] = {
395@@ -411,6 +413,7 @@ static struct poptOption long_options[]
396 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
397 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
398 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
399+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
400 /* TODO: Should this take an optional int giving the compression level? */
401 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
402 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
403@@ -831,6 +834,21 @@ int parse_arguments(int *argc, const cha
404 basis_dir[basis_dir_cnt++] = (char *)arg;
405 break;
406
407+ case OPT_LINK_BY_HASH:
408+#if HAVE_LINK
409+ arg = poptGetOptArg(pc);
410+ if (sanitize_paths)
411+ arg = sanitize_path(NULL, arg, NULL, 0);
412+ link_by_hash_dir = (char *)arg;
413+ break;
414+#else
415+ snprintf(err_buf, sizeof err_buf,
416+ "hard links are not supported on this %s\n",
417+ am_server ? "server" : "client");
418+ rprintf(FERROR, "ERROR: %s", err_buf);
419+ return 0;
420+#endif
421+
422 default:
423 /* A large opt value means that set_refuse_options()
424 * turned this option off. */
425@@ -1366,6 +1384,11 @@ void server_options(char **args,int *arg
426 }
427 }
428
429+ if (link_by_hash_dir && am_sender) {
430+ args[ac++] = "--link-by-hash";
431+ args[ac++] = link_by_hash_dir;
432+ }
433+
434 if (files_from && (!am_sender || remote_filesfrom_file)) {
435 if (remote_filesfrom_file) {
436 args[ac++] = "--files-from";
437--- orig/receiver.c 2005-02-11 10:53:14
438+++ receiver.c 2005-01-15 21:29:13
439@@ -35,6 +35,7 @@ extern int preserve_hard_links;
440 extern int preserve_perms;
441 extern int io_error;
442 extern char *tmpdir;
443+extern char *link_by_hash_dir;
444 extern char *partial_dir;
445 extern char *basis_dir[];
446 extern int basis_dir_cnt;
447@@ -137,12 +138,13 @@ static int get_tmpname(char *fnametmp, c
448
449
450 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
451- char *fname, int fd, OFF_T total_size)
452+ char *fname, int fd, OFF_T total_size, char *md4)
453 {
454 static char file_sum1[MD4_SUM_LENGTH];
455 static char file_sum2[MD4_SUM_LENGTH];
456 struct map_struct *mapbuf;
457 struct sum_struct sum;
458+ struct mdfour mdfour_data;
459 int32 len;
460 OFF_T offset = 0;
461 OFF_T offset2;
462@@ -162,6 +164,9 @@ static int receive_data(int f_in, char *
463 } else
464 mapbuf = NULL;
465
466+ if (md4)
467+ mdfour_begin(&mdfour_data);
468+
469 sum_init(checksum_seed);
470
471 while ((i = recv_token(f_in, &data)) != 0) {
472@@ -178,6 +183,8 @@ static int receive_data(int f_in, char *
473 cleanup_got_literal = 1;
474
475 sum_update(data, i);
476+ if (md4)
477+ mdfour_update(&mdfour_data,data,i);
478
479 if (fd != -1 && write_file(fd,data,i) != i)
480 goto report_write_error;
481@@ -204,6 +211,8 @@ static int receive_data(int f_in, char *
482
483 see_token(map, len);
484 sum_update(map, len);
485+ if (md4)
486+ mdfour_update(&mdfour_data,map,len);
487 }
488
489 if (inplace) {
490@@ -244,6 +253,8 @@ static int receive_data(int f_in, char *
491 }
492
493 sum_end(file_sum1);
494+ if (md4)
495+ mdfour_result(&mdfour_data, (unsigned char*)md4);
496
497 if (mapbuf)
498 unmap_file(mapbuf);
499@@ -259,7 +270,7 @@ static int receive_data(int f_in, char *
500
501 static void discard_receive_data(int f_in, OFF_T length)
502 {
503- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
504+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
505 }
506
507
508@@ -501,8 +512,12 @@ int recv_files(int f_in, struct file_lis
509 rprintf(FINFO, "%s\n", safe_fname(fname));
510
511 /* recv file data */
512+#if HAVE_LINK
513+ if (link_by_hash_dir)
514+ file->u.sum = (char*)malloc(MD4_SUM_LENGTH);
515+#endif
516 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
517- fname, fd2, file->length);
518+ fname, fd2, file->length, file->u.sum);
519
520 log_recv(file, &initial_stats);
521
522--- orig/rsync.c 2005-02-07 20:41:57
523+++ rsync.c 2005-02-07 21:11:30
524@@ -36,6 +36,7 @@ extern int force_delete;
525 extern int recurse;
526 extern int keep_dirlinks;
527 extern int make_backups;
528+extern char *link_by_hash_dir;
529 extern char *backup_dir;
530 extern int inplace;
531
532@@ -287,7 +288,12 @@ void finish_transfer(char *fname, char *
533 rprintf(FINFO, "renaming %s to %s\n",
534 safe_fname(fnametmp), safe_fname(fname));
535 }
536- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
537+#if HAVE_LINK
538+ if (link_by_hash_dir)
539+ ret = link_by_hash(fnametmp, fname, file);
540+ else
541+#endif
542+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
543 if (ret < 0) {
544 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
545 ret == -2 ? "copy" : "rename",
546--- orig/rsync.h 2005-02-07 20:41:57
547+++ rsync.h 2004-07-03 20:20:15
548@@ -600,6 +600,14 @@ struct stats {
549 int current_file_index;
550 };
551
552+struct hashfile_struct { /* one link-farm entry being considered as a match; nodes form a circular doubly linked list */
553+ struct hashfile_struct *next; /* following node in the ring (the node itself when it is alone) */
554+ struct hashfile_struct *prev; /* preceding node in the ring */
555+ char *name; /* heap-allocated path of the entry; freed by kill_hashfile() */
556+ int fd; /* descriptor opened read-only on name; closed by kill_hashfile() */
557+ uint32 nlink; /* st_nlink of the entry as seen when it was stat'ed */
558+};
559+
560
561 #include "byteorder.h"
562 #include "lib/mdfour.h"
563--- orig/rsync.yo 2005-02-11 23:14:49
564+++ rsync.yo 2005-01-28 19:32:45
565@@ -354,6 +354,7 @@ to the detailed description below for a
566 --compare-dest=DIR also compare received files relative to DIR
567 --copy-dest=DIR ... and include copies of unchanged files
568 --link-dest=DIR hardlink to files in DIR when unchanged
569+ --link-by-hash=DIR create hardlinks by hash to DIR for regular files
570 -z, --compress compress file data
571 -C, --cvs-exclude auto-ignore files in the same way CVS does
572 -f, --filter=RULE add a file-filtering RULE