Changed the style of the diff headers (use "patch -p1" now).
[rsync/rsync-patches.git] / link-by-hash.diff
... / ...
CommitLineData
1After applying this patch, run these commands for a successful build:
2
3 ./prepare-source
4 ./configure (optional if already run)
5 make
6
7Jason M. Felice writes:
8
9This patch adds the --link-by-hash=DIR option, which hard-links each received
10file into a link farm under DIR that is arranged by the file's MD4 hash. As a
11result, the system stores only one copy of each distinct file content, no
12matter how many file names refer to it.
13
14
15--- old/Makefile.in
16+++ new/Makefile.in
17@@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18 main.o checksum.o match.o syscall.o log.o backup.o
19 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20 fileio.o batch.o clientname.o chmod.o
21-OBJS3=progress.o pipe.o
22+OBJS3=progress.o pipe.o hashlink.o
23 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
25 popt/popthelp.o popt/poptparse.o
26--- old/hashlink.c
27+++ new/hashlink.c
28@@ -0,0 +1,340 @@
29+/*
30+ Copyright (C) Cronosys, LLC 2004
31+
32+ This program is free software; you can redistribute it and/or modify
33+ it under the terms of the GNU General Public License as published by
34+ the Free Software Foundation; either version 2 of the License, or
35+ (at your option) any later version.
36+
37+ This program is distributed in the hope that it will be useful,
38+ but WITHOUT ANY WARRANTY; without even the implied warranty of
39+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40+ GNU General Public License for more details.
41+
42+ You should have received a copy of the GNU General Public License
43+ along with this program; if not, write to the Free Software
44+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45+*/
46+
47+/* This file contains code used by the --link-by-hash option. */
48+
49+#include "rsync.h"
50+
51+extern char *link_by_hash_dir;
52+
53+#if HAVE_LINK
54+/* Build the farm path "<link_by_hash_dir>/<8 hex>/<24 hex>" from the 16 bytes at file->u.sum; caller frees it (NULL on failure). */
55+char* make_hash_name(struct file_struct *file)
56+{
57+	static const char hexdigits[] = "0123456789abcdef";
58+	/* 8 hex digits + '/' + 24 hex digits + NUL: 34 bytes (33 overflowed by one). */
59+	char hash[8 + 1 + 24 + 1], *dst = hash;
60+	const unsigned char *src = (const unsigned char *)file->u.sum;
61+	char *name;
62+	int i;
63+
64+	/* Sum bytes 0-3 name the top-level directory, bytes 4-15 the one below it. */
65+	for (i = 0; i < 4; i++, src++) {
66+		*dst++ = hexdigits[*src >> 4];
67+		*dst++ = hexdigits[*src & 0x0f];
68+	}
69+	*dst++ = '/';
70+	for (i = 0; i < 12; i++, src++) {
71+		*dst++ = hexdigits[*src >> 4];
72+		*dst++ = hexdigits[*src & 0x0f];
73+	}
74+	*dst = '\0';
75+
76+	/* asprintf() leaves its output pointer unspecified on failure, so report NULL. */
77+	if (asprintf(&name, "%s/%s", link_by_hash_dir, hash) < 0)
78+		name = NULL;
79+	return name;
80+}
81+
82+/* Release one hashfile node: its name string, its file descriptor and the node itself (NULL is ignored). */
83+void kill_hashfile(struct hashfile_struct *hashfile)
84+{
85+	if (hashfile != NULL) {
86+		free(hashfile->name);
87+		close(hashfile->fd);
88+		free(hashfile);
89+	}
90+}
91+
92+/* Destroy every node of the circular list that starts at hashfiles (NULL means an empty list). */
93+void kill_hashfiles(struct hashfile_struct *hashfiles)
94+{
95+	struct hashfile_struct *node = hashfiles, *following;
96+	if (node == NULL)
97+		return;
98+	do {
99+		/* Fetch the successor first: kill_hashfile() frees the node. */
100+		following = node->next;
101+		kill_hashfile(node);
102+		node = following;
103+	} while (node != hashfiles);
103+}
104+
105+/* Return a circular list of the files in directory hashname whose size equals size, each opened read-only (NULL if none); *fnbr gets the largest numeric entry name. */
106+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
107+{
108+	DIR *d;
109+	struct dirent *di;
110+	struct hashfile_struct *hashfiles = NULL, *hashfile;
111+	STRUCT_STAT st;
112+	long this_fnbr;
113+
114+	*fnbr = 0;
115+
116+	/* Build a list of potential candidates and open them.  hashname is not
117+	 * freed on failure: the caller still uses it and frees it itself. */
118+	if ((d = opendir(hashname)) == NULL) {
119+		rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
120+		return NULL;
121+	}
122+	while ((di = readdir(d)) != NULL) {
123+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+			continue;
125+		}
126+
127+		/* We need to have the largest fnbr in case we need to store
128+		 * a new file. */
129+		this_fnbr = atol(di->d_name);
130+		if (this_fnbr > *fnbr)
131+			*fnbr = this_fnbr;
132+
133+		hashfile = new_array(struct hashfile_struct, 1);
134+		/* Nothing is open yet: kill_hashfile() must not close a garbage fd. */
135+		hashfile->fd = -1;
136+		asprintf(&hashfile->name,"%s/%s",hashname, di->d_name);
137+		if (do_stat(hashfile->name,&st) == -1) {
138+			rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
139+			kill_hashfile(hashfile);
140+			continue;
141+		}
142+		if (st.st_size != size) {
143+			kill_hashfile(hashfile);
144+			continue;
145+		}
146+		hashfile->nlink = st.st_nlink;
147+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
148+		if (hashfile->fd == -1) {
149+			rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
150+			kill_hashfile(hashfile);
151+			continue;
152+		}
153+		if (hashfiles == NULL)
154+			hashfiles = hashfile->next = hashfile->prev = hashfile;
155+		else {
156+			hashfile->next = hashfiles;
157+			hashfile->prev = hashfiles->prev;
158+			hashfile->next->prev = hashfile;
159+			hashfile->prev->next = hashfile;
160+		}
161+	}
162+	closedir(d);
163+
164+	return hashfiles;
165+}
166+
167+/* Compare the data read from fd with every candidate in the circular list files; return the identical node with the fewest links (all others are killed), or NULL. */
168+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
169+{
170+	int amt, hamt;
171+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
172+	struct hashfile_struct *iter, *next, *best;
173+	uint32 nlink;
174+
175+	if (!files)
176+		return NULL;
177+
178+	iter = files; /* in case files are 0 bytes */
179+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
180+		iter = files;
181+		do {
182+			/* Icky bit to resync when we steal the first node. */
183+			if (!files)
184+				files = iter;
185+
186+			next = iter->next;
187+
188+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
189+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
190+				/* Drop this candidate.  The head must not be moved to its
191+				 * predecessor here: the pass would then stop at the tail
192+				 * without reading it, leaving that file one buffer behind. */
193+				if (iter->next == iter) {
194+					files = next = NULL;
195+				} else {
196+					next = iter->next;
197+					if (iter == files) {
198+						/* So we know to resync */
199+						files = NULL;
200+					}
201+				}
202+				iter->next->prev = iter->prev;
203+				iter->prev->next = iter->next;
204+				kill_hashfile(iter);
205+			}
206+
207+			iter = next;
208+		} while (iter != files);
209+
210+		if (iter == NULL && files == NULL) {
211+			/* There are no matches. */
212+			return NULL;
213+		}
214+	}
215+
216+	if (amt == -1) {
217+		rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
218+		kill_hashfiles(files);
219+		return NULL;
220+	}
221+
222+	/* If we only have one file left, use it. */
223+	if (files == files->next) {
224+		return files;
225+	}
226+
227+	/* All files which remain in the list are identical and should have
228+	 * the same size. We pick the one with the lowest link count (we
229+	 * may have rolled over because we hit the maximum link count for
230+	 * the filesystem). */
231+	best = iter = files;
232+	nlink = iter->nlink;
233+	do {
234+		if (iter->nlink < nlink) {
235+			nlink = iter->nlink;
236+			best = iter;
237+		}
238+		iter = iter->next;
239+	} while (iter != files);
240+
241+	best->next->prev = best->prev;
242+	best->prev->next = best->next;
243+	if (files == best)
244+		files = files->next;
245+	kill_hashfiles(files);
246+	return best;
247+}
248+
249+/* Move fnametmp into place as fname and hard-link it into the hash farm, reusing an identical stored file when one exists; returns the rename/link status (0 on success). */
250+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251+{
252+	STRUCT_STAT st;
253+	char *hashname, *linkname;
254+	int first = 0, rc;
255+	long last_fnbr;
256+
257+	/* Empty files are never put in the farm.  robust_rename() is called the
258+	 * way finish_transfer() calls it, with no partial-dir path to fall back on. */
259+	if (file->length == 0)
260+		return robust_rename(fnametmp, fname, NULL, 0644);
261+
262+	/* Built only after the early return above, which used to leak it. */
263+	hashname = make_hash_name(file);
264+	if (do_stat(hashname, &st) == -1) {
265+		char *dirname;
266+
267+		/* Directory does not exist. */
268+		dirname = strdup(hashname);
269+		*strrchr(dirname,'/') = 0;
270+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
271+			rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
272+			free(hashname);
273+			free(dirname);
274+			return robust_rename(fnametmp, fname, NULL, 0644);
275+		}
276+		free(dirname);
277+
278+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
279+			rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
280+			free(hashname);
281+			return robust_rename(fnametmp, fname, NULL, 0644);
282+		}
283+
284+		first = 1;
285+		asprintf(&linkname,"%s/0",hashname);
286+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
287+	} else {
288+		struct hashfile_struct *hashfiles, *hashfile;
289+
290+		if (do_stat(fnametmp,&st) == -1) {
291+			rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
292+			free(hashname);
293+			return -1;
294+		}
295+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
296+
297+		if (hashfiles == NULL) {
298+			first = 1;
299+			asprintf(&linkname,"%s/0",hashname);
300+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
301+		} else {
302+			int fd;
303+			/* Search for one identical to us. */
304+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
305+				rsyserr(FERROR, errno, "open failed: %s", fnametmp);
306+				kill_hashfiles(hashfiles);
307+				free(hashname);
308+				return -1;
309+			}
310+			hashfile = compare_hashfiles(fd, hashfiles);
311+			hashfiles = NULL;
312+			close(fd);
313+
314+			if (hashfile) {
315+				first = 0;
316+				linkname = strdup(hashfile->name);
317+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
318+				kill_hashfile(hashfile);
319+			} else {
320+				first = 1;
321+				asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1);
322+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
323+			}
324+		}
325+	}
326+
327+	if (!first) {
328+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
329+			linkname, full_fname(fname));
330+		robust_unlink(fname);
331+		rc = do_link(linkname, fname);
332+		if (rc == -1) {
333+			if (errno == EMLINK) {
334+				first = 1;
335+				free(linkname);
336+				asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1);
337+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
338+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
339+			} else {
340+				rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
341+					linkname, full_fname(fname));
342+				rc = robust_rename(fnametmp, fname, NULL, 0644);
343+			}
344+		} else {
345+			do_unlink(fnametmp);
346+		}
347+	}
348+
349+	if (first) {
350+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
351+			full_fname(fname),linkname);
352+		/* Link into the farm only once the data is at fname, so a failed rename never stores a stale file (and its status is kept). */
353+		rc = robust_rename(fnametmp, fname, NULL, 0644);
354+		if (rc != 0) {
355+			rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
356+				full_fname(fnametmp), full_fname(fname));
357+		} else if ((rc = do_link(fname,linkname)) != 0) {
358+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
359+				full_fname(fname), linkname);
360+		}
361+	}
362+
363+	free(linkname);
364+	free(hashname);
365+	return rc;
366+}
367+
368+#endif
369--- old/options.c
370+++ new/options.c
371@@ -144,6 +144,7 @@ char *backup_suffix = NULL;
372 char *tmpdir = NULL;
373 char *partial_dir = NULL;
374 char *basis_dir[MAX_BASIS_DIRS+1];
375+char *link_by_hash_dir = NULL;
376 char *config_file = NULL;
377 char *shell_cmd = NULL;
378 char *log_format = NULL;
379@@ -337,6 +338,7 @@ void usage(enum logcode F)
380 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
381 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
382 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
383+ rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
384 rprintf(F," -z, --compress compress file data during the transfer\n");
385 rprintf(F," --compress-level=NUM explicitly set compression level\n");
386 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
387@@ -383,7 +385,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
388 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
389 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
390 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
391- OPT_NO_D,
392+ OPT_NO_D, OPT_LINK_BY_HASH,
393 OPT_SERVER, OPT_REFUSED_BASE = 9000};
394
395 static struct poptOption long_options[] = {
396@@ -478,6 +480,7 @@ static struct poptOption long_options[]
397 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
398 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
399 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
400+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
401 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
402 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
403 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
404@@ -1057,6 +1060,21 @@ int parse_arguments(int *argc, const cha
405 usage(FINFO);
406 exit_cleanup(0);
407
408+ case OPT_LINK_BY_HASH:
409+#if HAVE_LINK
410+ arg = poptGetOptArg(pc);
411+ if (sanitize_paths)
412+ arg = sanitize_path(NULL, arg, NULL, 0);
413+ link_by_hash_dir = (char *)arg;
414+ break;
415+#else
416+ snprintf(err_buf, sizeof err_buf,
417+ "hard links are not supported on this %s\n",
418+ am_server ? "server" : "client");
419+ rprintf(FERROR, "ERROR: %s", err_buf);
420+ return 0;
421+#endif
422+
423 default:
424 /* A large opt value means that set_refuse_options()
425 * turned this option off. */
426@@ -1706,6 +1724,11 @@ void server_options(char **args,int *arg
427 }
428 }
429
430+ if (link_by_hash_dir && am_sender) {
431+ args[ac++] = "--link-by-hash";
432+ args[ac++] = link_by_hash_dir;
433+ }
434+
435 if (files_from && (!am_sender || filesfrom_host)) {
436 if (filesfrom_host) {
437 args[ac++] = "--files-from";
438--- old/receiver.c
439+++ new/receiver.c
440@@ -54,6 +54,7 @@ extern int delay_updates;
441 extern struct stats stats;
442 extern char *log_format;
443 extern char *tmpdir;
444+extern char *link_by_hash_dir;
445 extern char *partial_dir;
446 extern char *basis_dir[];
447 extern struct file_list *the_file_list;
448@@ -186,12 +187,13 @@ static int get_tmpname(char *fnametmp, c
449
450
451 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
452- char *fname, int fd, OFF_T total_size)
453+ char *fname, int fd, OFF_T total_size, char *md4)
454 {
455 static char file_sum1[MD4_SUM_LENGTH];
456 static char file_sum2[MD4_SUM_LENGTH];
457 struct map_struct *mapbuf;
458 struct sum_struct sum;
459+ struct mdfour mdfour_data;
460 int32 len;
461 OFF_T offset = 0;
462 OFF_T offset2;
463@@ -211,6 +213,9 @@ static int receive_data(int f_in, char *
464 } else
465 mapbuf = NULL;
466
467+ if (md4)
468+ mdfour_begin(&mdfour_data);
469+
470 sum_init(checksum_seed);
471
472 if (append_mode) {
473@@ -253,6 +258,8 @@ static int receive_data(int f_in, char *
474 cleanup_got_literal = 1;
475
476 sum_update(data, i);
477+ if (md4)
478+ mdfour_update(&mdfour_data,data,i);
479
480 if (fd != -1 && write_file(fd,data,i) != i)
481 goto report_write_error;
482@@ -279,6 +286,8 @@ static int receive_data(int f_in, char *
483
484 see_token(map, len);
485 sum_update(map, len);
486+ if (md4)
487+ mdfour_update(&mdfour_data,map,len);
488 }
489
490 if (inplace) {
491@@ -319,6 +328,8 @@ static int receive_data(int f_in, char *
492 }
493
494 sum_end(file_sum1);
495+ if (md4)
496+ mdfour_result(&mdfour_data, (unsigned char*)md4);
497
498 if (mapbuf)
499 unmap_file(mapbuf);
500@@ -334,7 +345,7 @@ static int receive_data(int f_in, char *
501
502 static void discard_receive_data(int f_in, OFF_T length)
503 {
504- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
505+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
506 }
507
508 static void handle_delayed_updates(struct file_list *flist, char *local_name)
509@@ -666,8 +677,12 @@ int recv_files(int f_in, struct file_lis
510 rprintf(FINFO, "%s\n", fname);
511
512 /* recv file data */
513+#if HAVE_LINK
514+ if (link_by_hash_dir)
515+ file->u.sum = new_array(char, MD4_SUM_LENGTH);
516+#endif
517 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
518- fname, fd2, file->length);
519+ fname, fd2, file->length, file->u.sum);
520
521 if (!log_before_transfer)
522 log_item(file, &initial_stats, iflags, NULL);
523--- old/rsync.c
524+++ new/rsync.c
525@@ -49,6 +49,7 @@ extern int inplace;
526 extern int keep_dirlinks;
527 extern int make_backups;
528 extern struct stats stats;
529+extern char *link_by_hash_dir;
530
531 #if defined HAVE_ICONV_OPEN && defined HAVE_ICONV_H
532 iconv_t ic_chck = (iconv_t)-1;
533@@ -257,8 +258,15 @@ void finish_transfer(char *fname, char *
534 /* move tmp file over real file */
535 if (verbose > 2)
536 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
537- ret = robust_rename(fnametmp, fname, partialptr,
538- file->mode & INITACCESSPERMS);
539+#if HAVE_LINK
540+ if (link_by_hash_dir)
541+ ret = link_by_hash(fnametmp, fname, file);
542+ else
543+#endif
544+ {
545+ ret = robust_rename(fnametmp, fname, partialptr,
546+ file->mode & INITACCESSPERMS);
547+ }
548 if (ret < 0) {
549 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
550 ret == -2 ? "copy" : "rename",
551--- old/rsync.h
552+++ new/rsync.h
553@@ -640,6 +640,14 @@ struct stats {
554 int current_file_index;
555 };
556
557+struct hashfile_struct { /* one candidate file in a hash directory; find_hashfiles() links these into a circular list */
558+ struct hashfile_struct *next; /* following node; a lone node points at itself */
559+ struct hashfile_struct *prev; /* preceding node; a lone node points at itself */
560+ char *name; /* allocated path of the candidate, freed by kill_hashfile() */
561+ int fd; /* descriptor from opening name read-only, closed by kill_hashfile() */
562+ uint32 nlink; /* st_nlink recorded when the candidate was stat'ed */
563+};
564+
565 struct chmod_mode_struct;
566
567 #include "byteorder.h"
568--- old/rsync.yo
569+++ new/rsync.yo
570@@ -361,6 +361,7 @@ to the detailed description below for a
571 --compare-dest=DIR also compare received files relative to DIR
572 --copy-dest=DIR ... and include copies of unchanged files
573 --link-dest=DIR hardlink to files in DIR when unchanged
574+ --link-by-hash=DIR create hardlinks by hash into DIR
575 -z, --compress compress file data during the transfer
576 --compress-level=NUM explicitly set compression level
577 -C, --cvs-exclude auto-ignore files in the same way CVS does