Make sure that any weird mode bits (if they exist up beyond the
[rsync/rsync-patches.git] / link-by-hash.diff
... / ...
CommitLineData
1Jason M. Felice wrote:
2
3This patch adds the --link-by-hash=DIR option, which hard-links each received
4file into a link farm under DIR that is organized by the file's MD4 hash. As a
5result, the system stores only one copy of each unique file content, regardless
6of the file's name.
7
8To use this patch, run these commands for a successful build:
9
10 patch -p1 <patches/link-by-hash.diff
11 ./prepare-source
12 ./configure
13 make
14
15--- old/Makefile.in
16+++ new/Makefile.in
17@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18 main.o checksum.o match.o syscall.o log.o backup.o
19 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20 fileio.o batch.o clientname.o chmod.o
21-OBJS3=progress.o pipe.o
22+OBJS3=progress.o pipe.o hashlink.o
23 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
25 popt/popthelp.o popt/poptparse.o
26--- old/hashlink.c
27+++ new/hashlink.c
28@@ -0,0 +1,339 @@
29+/*
30+ Copyright (C) Cronosys, LLC 2004
31+
32+ This program is free software; you can redistribute it and/or modify
33+ it under the terms of the GNU General Public License as published by
34+ the Free Software Foundation; either version 2 of the License, or
35+ (at your option) any later version.
36+
37+ This program is distributed in the hope that it will be useful,
38+ but WITHOUT ANY WARRANTY; without even the implied warranty of
39+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40+ GNU General Public License for more details.
41+
42+ You should have received a copy of the GNU General Public License
43+ along with this program; if not, write to the Free Software
44+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45+*/
46+
47+/* This file contains code used by the --link-by-hash option. */
48+
49+#include "rsync.h"
50+
51+extern char *link_by_hash_dir;
52+
53+#if HAVE_LINK
54+
55+/* Build the malloc'd path "<link_by_hash_dir>/<8 hex digits>/<24 hex digits>" from the 16 bytes at file->u.sum; the caller frees it. Returns NULL if asprintf() fails. */
56+char* make_hash_name(struct file_struct *file)
57+{
58+ char hash[34], *dst; /* 8 hex digits + '/' + 24 hex digits + NUL = 34 bytes; 33 overflowed by one */
59+ unsigned char *src, c;
60+ int i;
61+
62+ src = (unsigned char*)file->u.sum;
63+ for (dst = hash, i = 0; i < 4; i++, src++) { /* first 4 sum bytes form the leading path component */
64+ c = *src >> 4;
65+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
66+ c = *src & 0x0f;
67+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
68+ }
69+ *dst++ = '/';
70+ for (i = 0; i < 12; i++, src++) { /* remaining 12 sum bytes form the second path component */
71+ c = *src >> 4;
72+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
73+ c = *src & 0x0f;
74+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
75+ }
76+ *dst = 0;
77+ if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
78+ return NULL; /* the pointer is not usable after a failed asprintf() */
79+ return dst;
80+}
81+
82+
83+void kill_hashfile(struct hashfile_struct *hashfile) /* Release one candidate: free its name, close its fd, free the node. Does not touch next/prev, so unlink it from any list first. */
84+{
85+ if (!hashfile) /* NULL is accepted and ignored */
86+ return;
87+ free(hashfile->name);
88+ close(hashfile->fd); /* result of close() is not checked */
89+ free(hashfile);
90+}
91+
92+
93+void kill_hashfiles(struct hashfile_struct *hashfiles) /* Release every node of the circular list that starts at hashfiles; NULL means an empty list. */
94+{
95+ struct hashfile_struct *iter, *next;
96+ if ((iter = hashfiles) != NULL) {
97+ do {
98+ next = iter->next; /* fetch the successor before iter is freed */
99+ kill_hashfile(iter);
100+ iter = next;
101+ } while (iter != hashfiles); /* stop when the walk wraps around to the starting node (only its address is compared) */
102+ }
103+}
104+
105+
106+/* Scan directory hashname and return a circular list of its entries whose size equals size, each opened read-only (NULL if none or on opendir failure). *fnbr receives the largest numeric entry name seen. hashname remains owned by the caller. */
107+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
108+{
109+ DIR *d;
110+ struct dirent *di;
111+ struct hashfile_struct *hashfiles = NULL, *hashfile;
112+ STRUCT_STAT st;
113+ long this_fnbr;
114+
115+ *fnbr = 0;
116+
117+ /* Build a list of potential candidates and open them. */
118+ if ((d = opendir(hashname)) == NULL) {
119+ rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
120+ return NULL; /* hashname is NOT freed here: link_by_hash() still uses it and frees it itself */
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
124+ continue;
125+
126+ /* We need the largest fnbr in case we must store a new file. */
127+ this_fnbr = atol(di->d_name);
128+ if (this_fnbr > *fnbr)
129+ *fnbr = this_fnbr;
130+
131+ hashfile = new_array(struct hashfile_struct, 1);
132+ if (!hashfile || asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0) {
133+ free(hashfile); /* name is not valid after a failed asprintf(), so free only the node */
134+ continue;
135+ }
136+ hashfile->fd = -1; /* kill_hashfile() closes fd; never hand it an uninitialized descriptor */
137+ if (do_stat(hashfile->name,&st) == -1) {
138+ rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
139+ kill_hashfile(hashfile);
140+ continue;
141+ }
142+ if (st.st_size != size) {
143+ kill_hashfile(hashfile);
144+ continue;
145+ }
146+ hashfile->nlink = st.st_nlink;
147+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
148+ if (hashfile->fd == -1) {
149+ rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
150+ kill_hashfile(hashfile);
151+ continue;
152+ }
153+ if (hashfiles == NULL)
154+ hashfiles = hashfile->next = hashfile->prev = hashfile;
155+ else {
156+ hashfile->next = hashfiles;
157+ hashfile->prev = hashfiles->prev;
158+ hashfile->next->prev = hashfile;
159+ hashfile->prev->next = hashfile;
160+ }
161+ }
162+ closedir(d);
163+
164+ return hashfiles;
165+}
166+
167+
168+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files) /* Compare the data read from fd, BUFSIZ bytes at a time, with every open candidate in the circular list files; candidates that differ are unlinked and killed. Returns one surviving candidate (the one with the lowest nlink, detached, the rest killed) or NULL if none match or reading fd fails. */
169+{
170+ int amt, hamt;
171+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
172+ struct hashfile_struct *iter, *next, *best;
173+ uint32 nlink;
174+
175+ if (!files)
176+ return NULL;
177+
178+ iter = files; /* in case files are 0 bytes */
179+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) { /* one pass over all candidates per chunk of the new file */
180+ iter = files;
181+ do {
182+ /* Icky bit to resync when we steal the first node. */
183+ if (!files)
184+ files = iter;
185+
186+ next = iter->next;
187+
188+ hamt = read(iter->fd, cmpbuffer, BUFSIZ); /* NOTE(review): a short read here is treated as a mismatch — confirm that is intended */
189+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) { /* candidate differs in this chunk: drop it */
190+ if (iter == files) {
191+ files = files->prev; /* head is being removed, so move the list head off it */
192+ }
193+ if (iter->next == iter) { /* iter is the only node left */
194+ files = next = NULL;
195+ } else {
196+ next = iter->next;
197+ if (iter == files) { /* NOTE(review): looks unreachable after files was moved to files->prev above — confirm */
198+ /* So we know to resync */
199+ files = NULL;
200+ }
201+ }
202+ iter->next->prev = iter->prev; /* unlink iter from the ring before it is freed */
203+ iter->prev->next = iter->next;
204+ kill_hashfile(iter);
205+ }
206+
207+ iter = next;
208+ } while (iter != files); /* NOTE(review): if files was moved to the old tail, the pass ends before that node is compared for this chunk — verify */
209+
210+ if (iter == NULL && files == NULL) {
211+ /* There are no matches. */
212+ return NULL;
213+ }
214+ }
215+
216+ if (amt == -1) { /* read() on fd failed: discard every remaining candidate */
217+ rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
218+ kill_hashfiles(files);
219+ return NULL;
220+ }
221+
222+ /* If we only have one file left, use it. */
223+ if (files == files->next) {
224+ return files;
225+ }
226+
227+ /* All files which remain in the list are identical and should have
228+ * the same size. We pick the one with the lowest link count (we
229+ * may have rolled over because we hit the maximum link count for
230+ * the filesystem). */
231+ best = iter = files;
232+ nlink = iter->nlink;
233+ do {
234+ if (iter->nlink < nlink) {
235+ nlink = iter->nlink;
236+ best = iter;
237+ }
238+ iter = iter->next;
239+ } while (iter != files);
240+
241+ best->next->prev = best->prev; /* detach best so kill_hashfiles() below does not free it */
242+ best->prev->next = best->next;
243+ if (files == best)
244+ files = files->next;
245+ kill_hashfiles(files);
246+ return best; /* caller owns best and releases it with kill_hashfile() */
247+}
248+
249+
250+/* Move the received temp file fnametmp into place as fname and hard-link it with a content-identical file in the hash directory for file's sum, storing it there as a new entry when no identical file exists. Returns the result of the final rename/link call, or -1 when fnametmp cannot be stat'ed or opened. */
251+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
252+{
253+ STRUCT_STAT st;
254+ char *hashname;
255+ int first = 0, rc;
256+ char *linkname;
257+ long last_fnbr;
258+
259+ if (file->length == 0)
260+ return robust_rename(fnametmp, fname, NULL, 0644);
261+
262+ /* Build the name only now, so the empty-file return above cannot leak it. */
263+ if ((hashname = make_hash_name(file)) == NULL)
264+ return robust_rename(fnametmp, fname, NULL, 0644);
265+ if (do_stat(hashname, &st) == -1) {
266+ char *dirname;
267+ /* Directory does not exist. */
268+ dirname = strdup(hashname);
269+ *strrchr(dirname,'/') = 0;
270+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
271+ rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
272+ free(hashname);
273+ free(dirname);
274+ return robust_rename(fnametmp, fname, NULL, 0644);
275+ }
276+ free(dirname);
277+
278+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
279+ rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
280+ free(hashname);
281+ return robust_rename(fnametmp, fname, NULL, 0644);
282+ }
283+
284+ first = 1;
285+ asprintf(&linkname,"%s/0",hashname);
286+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
287+ } else {
288+ struct hashfile_struct *hashfiles, *hashfile;
289+
290+ if (do_stat(fnametmp,&st) == -1) {
291+ rsyserr(FERROR, errno, "stat failed: %s", fnametmp); /* report the path that was actually stat'ed */
292+ free(hashname);
293+ return -1;
294+ }
295+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
296+
297+ if (hashfiles == NULL) {
298+ first = 1;
299+ asprintf(&linkname,"%s/0",hashname);
300+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
301+ } else {
302+ int fd;
303+ /* Search for one identical to us. */
304+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
305+ rsyserr(FERROR, errno, "open failed: %s", fnametmp);
306+ kill_hashfiles(hashfiles);
307+ free(hashname);
308+ return -1;
309+ }
310+ hashfile = compare_hashfiles(fd, hashfiles);
311+ hashfiles = NULL;
312+ close(fd);
313+
314+ if (hashfile) {
315+ first = 0;
316+ linkname = strdup(hashfile->name);
317+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
318+ kill_hashfile(hashfile);
319+ } else {
320+ first = 1;
321+ asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1);
322+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
323+ }
324+ }
325+ }
326+
327+ if (!first) {
328+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
329+ linkname, full_fname(fname));
330+ robust_unlink(fname);
331+ rc = do_link(linkname, fname);
332+ if (rc == -1) {
333+ if (errno == EMLINK) {
334+ first = 1;
335+ free(linkname);
336+ asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1);
337+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
338+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
339+ } else {
340+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
341+ linkname, full_fname(fname));
342+ rc = robust_rename(fnametmp, fname, NULL, 0644);
343+ }
344+ } else {
345+ do_unlink(fnametmp);
346+ }
347+ }
348+
349+ if (first) {
350+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
351+ full_fname(fname),linkname);
352+ rc = robust_rename(fnametmp, fname, NULL, 0644);
353+ if (rc != 0) {
354+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
355+ full_fname(fnametmp), full_fname(fname));
356+ } else if ((rc = do_link(fname,linkname)) != 0) { /* link only after a successful rename, so stale fname content is never filed under this hash */
357+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
358+ full_fname(fname), linkname);
359+ }
360+ }
361+
362+ free(linkname);
363+ free(hashname);
364+ return rc;
365+}
366+
367+#endif
368--- old/options.c
369+++ new/options.c
370@@ -145,6 +145,7 @@ char *backup_suffix = NULL;
371 char *tmpdir = NULL;
372 char *partial_dir = NULL;
373 char *basis_dir[MAX_BASIS_DIRS+1];
374+char *link_by_hash_dir = NULL;
375 char *config_file = NULL;
376 char *shell_cmd = NULL;
377 char *logfile_name = NULL;
378@@ -349,6 +350,7 @@ void usage(enum logcode F)
379 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
380 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
381 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
382+ rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
383 rprintf(F," -z, --compress compress file data during the transfer\n");
384 rprintf(F," --compress-level=NUM explicitly set compression level\n");
385 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
386@@ -398,7 +400,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
387 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
388 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
389 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
390- OPT_NO_D,
391+ OPT_NO_D, OPT_LINK_BY_HASH,
392 OPT_SERVER, OPT_REFUSED_BASE = 9000};
393
394 static struct poptOption long_options[] = {
395@@ -499,6 +501,7 @@ static struct poptOption long_options[]
396 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
397 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
398 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
399+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
400 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
401 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
402 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
403@@ -1089,6 +1092,21 @@ int parse_arguments(int *argc, const cha
404 usage(FINFO);
405 exit_cleanup(0);
406
407+ case OPT_LINK_BY_HASH:
408+#if HAVE_LINK
409+ arg = poptGetOptArg(pc);
410+ if (sanitize_paths)
411+ arg = sanitize_path(NULL, arg, NULL, 0, NULL);
412+ link_by_hash_dir = (char *)arg;
413+ break;
414+#else
415+ snprintf(err_buf, sizeof err_buf,
416+ "hard links are not supported on this %s\n",
417+ am_server ? "server" : "client");
418+ rprintf(FERROR, "ERROR: %s", err_buf);
419+ return 0;
420+#endif
421+
422 default:
423 /* A large opt value means that set_refuse_options()
424 * turned this option off. */
425@@ -1739,6 +1757,11 @@ void server_options(char **args,int *arg
426 }
427 }
428
429+ if (link_by_hash_dir && am_sender) {
430+ args[ac++] = "--link-by-hash";
431+ args[ac++] = link_by_hash_dir;
432+ }
433+
434 if (files_from && (!am_sender || filesfrom_host)) {
435 if (filesfrom_host) {
436 args[ac++] = "--files-from";
437--- old/receiver.c
438+++ new/receiver.c
439@@ -50,6 +50,7 @@ extern int delay_updates;
440 extern struct stats stats;
441 extern char *stdout_format;
442 extern char *tmpdir;
443+extern char *link_by_hash_dir;
444 extern char *partial_dir;
445 extern char *basis_dir[];
446 extern struct file_list *the_file_list;
447@@ -124,12 +125,13 @@ static int get_tmpname(char *fnametmp, c
448
449
450 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
451- char *fname, int fd, OFF_T total_size)
452+ char *fname, int fd, OFF_T total_size, char *md4)
453 {
454 static char file_sum1[MD4_SUM_LENGTH];
455 static char file_sum2[MD4_SUM_LENGTH];
456 struct map_struct *mapbuf;
457 struct sum_struct sum;
458+ struct mdfour mdfour_data;
459 int32 len;
460 OFF_T offset = 0;
461 OFF_T offset2;
462@@ -149,6 +151,9 @@ static int receive_data(int f_in, char *
463 } else
464 mapbuf = NULL;
465
466+ if (md4)
467+ mdfour_begin(&mdfour_data);
468+
469 sum_init(checksum_seed);
470
471 if (append_mode) {
472@@ -191,6 +196,8 @@ static int receive_data(int f_in, char *
473 cleanup_got_literal = 1;
474
475 sum_update(data, i);
476+ if (md4)
477+ mdfour_update(&mdfour_data, (uchar*)data, i);
478
479 if (fd != -1 && write_file(fd,data,i) != i)
480 goto report_write_error;
481@@ -217,6 +224,8 @@ static int receive_data(int f_in, char *
482
483 see_token(map, len);
484 sum_update(map, len);
485+ if (md4)
486+ mdfour_update(&mdfour_data, (uchar*)map, len);
487 }
488
489 if (updating_basis) {
490@@ -259,6 +268,8 @@ static int receive_data(int f_in, char *
491 }
492
493 sum_end(file_sum1);
494+ if (md4)
495+ mdfour_result(&mdfour_data, (unsigned char*)md4);
496
497 if (mapbuf)
498 unmap_file(mapbuf);
499@@ -274,7 +285,7 @@ static int receive_data(int f_in, char *
500
501 static void discard_receive_data(int f_in, OFF_T length)
502 {
503- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
504+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
505 }
506
507 static void handle_delayed_updates(struct file_list *flist, char *local_name)
508@@ -611,8 +622,12 @@ int recv_files(int f_in, struct file_lis
509 rprintf(FINFO, "%s\n", fname);
510
511 /* recv file data */
512+#if HAVE_LINK
513+ if (link_by_hash_dir)
514+ file->u.sum = new_array(char, MD4_SUM_LENGTH);
515+#endif
516 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
517- fname, fd2, file->length);
518+ fname, fd2, file->length, file->u.sum);
519
520 log_item(log_code, file, &initial_stats, iflags, NULL);
521
522--- old/rsync.c
523+++ new/rsync.c
524@@ -48,6 +48,7 @@ extern int inplace;
525 extern int keep_dirlinks;
526 extern int make_backups;
527 extern mode_t orig_umask;
528+extern char *link_by_hash_dir;
529 extern struct stats stats;
530 extern struct chmod_mode_struct *daemon_chmod_modes;
531
532@@ -271,8 +272,15 @@ void finish_transfer(char *fname, char *
533 /* move tmp file over real file */
534 if (verbose > 2)
535 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
536- ret = robust_rename(fnametmp, fname, partialptr,
537- file->mode & INITACCESSPERMS);
538+#if HAVE_LINK
539+ if (link_by_hash_dir)
540+ ret = link_by_hash(fnametmp, fname, file);
541+ else
542+#endif
543+ {
544+ ret = robust_rename(fnametmp, fname, partialptr,
545+ file->mode & INITACCESSPERMS);
546+ }
547 if (ret < 0) {
548 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
549 ret == -2 ? "copy" : "rename",
550--- old/rsync.h
551+++ new/rsync.h
552@@ -651,6 +651,14 @@ struct stats {
553 int current_file_index;
554 };
555
556+struct hashfile_struct { /* One open link-farm candidate file; nodes form a circular doubly-linked list. */
557+ struct hashfile_struct *next; /* following node; a lone node points at itself */
558+ struct hashfile_struct *prev; /* preceding node; a lone node points at itself */
559+ char *name; /* heap-allocated path of the candidate; freed by kill_hashfile() */
560+ int fd; /* descriptor opened read-only on name; closed by kill_hashfile() */
561+ uint32 nlink; /* st_nlink of the candidate at the time it was stat'ed */
562+};
563+
564 struct chmod_mode_struct;
565
566 #include "byteorder.h"
567--- old/rsync.yo
568+++ new/rsync.yo
569@@ -366,6 +366,7 @@ to the detailed description below for a
570 --compare-dest=DIR also compare received files relative to DIR
571 --copy-dest=DIR ... and include copies of unchanged files
572 --link-dest=DIR hardlink to files in DIR when unchanged
573+ --link-by-hash=DIR create hardlinks by hash into DIR
574 -z, --compress compress file data during the transfer
575 --compress-level=NUM explicitly set compression level
576 -C, --cvs-exclude auto-ignore files in the same way CVS does