Has been applied to CVS.
[rsync/rsync-patches.git] / link-by-hash.diff
... / ...
CommitLineData
1After applying this patch and running configure, you MUST run this
2command before "make":
3
4 make proto
5
6Jason M. Felice writes:
7
8This patch adds the --link-by-hash=DIR option, which hard links received
9files in a link farm arranged by MD4 file hash. The result is that the system
10will only store one copy of the unique contents of each file, regardless of
11the file's name.
12
13
14--- orig/Makefile.in 2004-07-04 08:59:17
15+++ Makefile.in 2004-07-03 20:20:15
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
25--- orig/hashlink.c 2004-07-02 21:41:54
26+++ hashlink.c 2004-07-02 21:41:54
27@@ -0,0 +1,342 @@
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#ifdef HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
114+
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
118+ rsyserr(FERROR, errno, "opendir \"%s\"", hashname);
119+ free(hashname);
120+ return NULL;
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+ continue;
125+ }
126+
127+ /* We need to have the largest fnbr in case we need to store
128+ * a new file. */
129+ this_fnbr = atol(di->d_name);
130+ if (this_fnbr > *fnbr)
131+ *fnbr = this_fnbr;
132+
133+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134+ asprintf(&hashfile->name,"%s/%s",hashname,
135+ di->d_name);
136+ if (do_stat(hashfile->name,&st) == -1) {
137+ rsyserr(FERROR, errno, "%s: %s", hashfile->name);
138+ kill_hashfile(hashfile);
139+ continue;
140+ }
141+ if (st.st_size != size) {
142+ kill_hashfile(hashfile);
143+ continue;
144+ }
145+ hashfile->nlink = st.st_nlink;
146+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147+ if (hashfile->fd == -1) {
148+ rsyserr(FERROR, errno, "%s", hashfile->name);
149+ kill_hashfile(hashfile);
150+ continue;
151+ }
152+ if (hashfiles == NULL)
153+ hashfiles = hashfile->next = hashfile->prev = hashfile;
154+ else {
155+ hashfile->next = hashfiles;
156+ hashfile->prev = hashfiles->prev;
157+ hashfile->next->prev = hashfile;
158+ hashfile->prev->next = hashfile;
159+ }
160+ }
161+ closedir(d);
162+
163+ return hashfiles;
164+}
165+
166+
167+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168+{
169+ int amt, hamt;
170+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171+ struct hashfile_struct *iter, *next, *best;
172+ uint32 nlink;
173+
174+ if (!files)
175+ return NULL;
176+
177+ iter = files; /* in case files are 0 bytes */
178+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179+ iter = files;
180+ do {
181+ /* Icky bit to resync when we steal the first node. */
182+ if (!files)
183+ files = iter;
184+
185+ next = iter->next;
186+
187+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189+ if (iter == files) {
190+ files = files->prev;
191+ }
192+ if (iter->next == iter) {
193+ files = next = NULL;
194+ } else {
195+ next = iter->next;
196+ if (iter == files) {
197+ /* So we know to resync */
198+ files = NULL;
199+ }
200+ }
201+ iter->next->prev = iter->prev;
202+ iter->prev->next = iter->next;
203+ kill_hashfile(iter);
204+ }
205+
206+ iter = next;
207+ } while (iter != files);
208+
209+ if (iter == NULL && files == NULL) {
210+ /* There are no matches. */
211+ return NULL;
212+ }
213+
214+ }
215+
216+ if (amt == -1) {
217+ rsyserr(FERROR, errno, "%s");
218+ kill_hashfiles(files);
219+ return NULL;
220+ }
221+
222+ /* If we only have one file left, use it. */
223+ if (files == files->next) {
224+ return files;
225+ }
226+
227+ /* All files which remain in the list are identical and should have
228+ * the same size. We pick the one with the lowest link count (we
229+ * may have rolled over because we hit the maximum link count for
230+ * the filesystem). */
231+ best = iter = files;
232+ nlink = iter->nlink;
233+ do {
234+ if (iter->nlink < nlink) {
235+ nlink = iter->nlink;
236+ best = iter;
237+ }
238+ iter = iter->next;
239+ } while (iter != files);
240+
241+ best->next->prev = best->prev;
242+ best->prev->next = best->next;
243+ if (files == best)
244+ files = files->next;
245+ kill_hashfiles(files);
246+ return best;
247+}
248+
249+
250+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251+{
252+ STRUCT_STAT st;
253+ char *hashname = make_hash_name(file);
254+ int first = 0, rc;
255+ char *linkname;
256+ long last_fnbr;
257+
258+ if (file->length == 0) {
259+ return robust_rename(fnametmp,fname,0644);
260+ }
261+
262+ if (do_stat(hashname, &st) == -1) {
263+ char *dirname;
264+
265+ /* Directory does not exist. */
266+ dirname = strdup(hashname);
267+ *strrchr(dirname,'/') = 0;
268+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
269+ rsyserr(FERROR, errno, "mkdir %s", dirname);
270+ free(hashname);
271+ free(dirname);
272+ return robust_rename(fnametmp,fname,0644);
273+ }
274+ free(dirname);
275+
276+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
277+ rsyserr(FERROR, errno, "mkdir %s", hashname);
278+ free(hashname);
279+ return robust_rename(fnametmp,fname,0644);
280+ }
281+
282+ first = 1;
283+ asprintf(&linkname,"%s/0",hashname);
284+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
285+
286+ } else {
287+ struct hashfile_struct *hashfiles, *hashfile;
288+ int fd;
289+
290+ if (do_stat(fnametmp,&st) == -1) {
291+ rsyserr(FERROR, errno, "%s", fname);
292+ return -1;
293+ }
294+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
295+
296+ if (hashfiles == NULL) {
297+ first = 1;
298+ asprintf(&linkname,"%s/0",hashname);
299+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
300+ } else {
301+
302+ /* Search for one identical to us. */
303+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
304+ rsyserr(FERROR, errno, "%s", fnametmp);
305+ kill_hashfiles(hashfiles);
306+ return -1;
307+ }
308+ hashfile = compare_hashfiles(fd, hashfiles);
309+ hashfiles = NULL;
310+
311+ if (hashfile) {
312+ first = 0;
313+ linkname = strdup(hashfile->name);
314+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
315+ kill_hashfile(hashfile);
316+ } else {
317+ first = 1;
318+ asprintf(&linkname, "%s/%ld", hashname,
319+ last_fnbr + 1);
320+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
321+ }
322+ }
323+ }
324+
325+ if (!first) {
326+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
327+ linkname, full_fname(fname));
328+ rc = do_link(linkname, fname);
329+ if (rc == -1) {
330+ if (errno == EMLINK) {
331+ first = 1;
332+ free(linkname);
333+ asprintf(&linkname,"%s/%ld",hashname,
334+ last_fnbr + 1);
335+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
336+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
337+ } else {
338+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
339+ linkname, full_fname(fname));
340+ robust_unlink(fname);
341+ rc = robust_rename(fnametmp,fname,0644);
342+ }
343+ } else {
344+ do_unlink(fnametmp);
345+ }
346+ }
347+
348+ if (first) {
349+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
350+ full_fname(fname),linkname);
351+
352+ rc = robust_rename(fnametmp,fname,0644);
353+ if (rc != 0) {
354+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
355+ full_fname(fnametmp), full_fname(fname));
356+ }
357+ rc = do_link(fname,linkname);
358+ if (rc != 0) {
359+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
360+ full_fname(fname), linkname);
361+ }
362+ }
363+
364+ free(linkname);
365+ free(hashname);
366+ return rc;
367+}
368+
369+#endif
370--- orig/options.c 2004-07-15 16:51:50
371+++ options.c 2004-07-03 20:20:15
372@@ -124,6 +124,7 @@ char *log_format = NULL;
373 char *password_file = NULL;
374 char *rsync_path = RSYNC_PATH;
375 char *backup_dir = NULL;
376+char *link_by_hash_dir = NULL;
377 char backup_dir_buf[MAXPATHLEN];
378 int rsync_port = RSYNC_PORT;
379 int link_dest = 0;
380@@ -269,6 +270,7 @@ void usage(enum logcode F)
381 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
382 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
383 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
384+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
385 rprintf(F," -P equivalent to --partial --progress\n");
386 rprintf(F," -z, --compress compress file data\n");
387 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
388@@ -309,7 +311,7 @@ void usage(enum logcode F)
389 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
390 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
391 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
392- OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT,
393+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_LINK_BY_HASH,
394 OPT_REFUSED_BASE = 9000};
395
396 static struct poptOption long_options[] = {
397@@ -367,6 +369,7 @@ static struct poptOption long_options[]
398 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
399 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
400 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
401+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
402 /* TODO: Should this take an optional int giving the compression level? */
403 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
404 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
405@@ -593,6 +596,19 @@ int parse_arguments(int *argc, const cha
406 return 0;
407 #endif
408
409+ case OPT_LINK_BY_HASH:
410+#if HAVE_LINK
411+ link_by_hash_dir = (char *)poptGetOptArg(pc);
412+ checksum_seed = FIXED_CHECKSUM_SEED;
413+ break;
414+#else
415+ snprintf(err_buf, sizeof err_buf,
416+ "hard links are not supported on this %s\n",
417+ am_server ? "server" : "client");
418+ rprintf(FERROR, "ERROR: %s", err_buf);
419+ return 0;
420+#endif
421+
422 default:
423 /* A large opt value means that set_refuse_options()
424 * turned this option off (opt-BASE is its index). */
425@@ -969,6 +985,11 @@ void server_options(char **args,int *arg
426 args[ac++] = compare_dest;
427 }
428
429+ if (link_by_hash_dir && am_sender) {
430+ args[ac++] = "--link-by-hash";
431+ args[ac++] = link_by_hash_dir;
432+ }
433+
434 if (files_from && (!am_sender || remote_filesfrom_file)) {
435 if (remote_filesfrom_file) {
436 args[ac++] = "--files-from";
437--- orig/receiver.c 2004-07-14 17:12:06
438+++ receiver.c 2004-07-03 20:20:15
439@@ -48,6 +48,7 @@ extern int ignore_errors;
440 extern int orig_umask;
441 extern int keep_partial;
442 extern int checksum_seed;
443+extern char *link_by_hash_dir;
444
445 static void delete_one(char *fn, int is_dir)
446 {
447@@ -196,10 +197,11 @@ static int get_tmpname(char *fnametmp, c
448
449
450 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
451- OFF_T total_size)
452+ OFF_T total_size,char *md4)
453 {
454 int i;
455 struct sum_struct sum;
456+ struct mdfour mdfour_data;
457 unsigned int len;
458 OFF_T offset = 0;
459 OFF_T offset2;
460@@ -210,6 +212,9 @@ static int receive_data(int f_in,struct
461
462 read_sum_head(f_in, &sum);
463
464+ if (md4)
465+ mdfour_begin(&mdfour_data);
466+
467 sum_init(checksum_seed);
468
469 while ((i = recv_token(f_in, &data)) != 0) {
470@@ -226,6 +231,8 @@ static int receive_data(int f_in,struct
471 cleanup_got_literal = 1;
472
473 sum_update(data,i);
474+ if (md4)
475+ mdfour_update(&mdfour_data,data,i);
476
477 if (fd != -1 && write_file(fd,data,i) != i) {
478 rsyserr(FERROR, errno, "write failed on %s",
479@@ -253,6 +260,8 @@ static int receive_data(int f_in,struct
480
481 see_token(map, len);
482 sum_update(map,len);
483+ if (md4)
484+ mdfour_update(&mdfour_data,map,len);
485 }
486
487 if (fd != -1 && write_file(fd, map, len) != (int)len) {
488@@ -275,6 +284,8 @@ static int receive_data(int f_in,struct
489 }
490
491 sum_end(file_sum1);
492+ if (md4)
493+ mdfour_result(&mdfour_data, (unsigned char*)md4);
494
495 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
496 if (verbose > 2)
497@@ -374,7 +385,7 @@ int recv_files(int f_in, struct file_lis
498 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
499 rsyserr(FERROR, errno, "fstat %s failed",
500 full_fname(fnamecmp));
501- receive_data(f_in,NULL,-1,NULL,file->length);
502+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
503 close(fd1);
504 continue;
505 }
506@@ -387,7 +398,7 @@ int recv_files(int f_in, struct file_lis
507 */
508 rprintf(FERROR,"recv_files: %s is a directory\n",
509 full_fname(fnamecmp));
510- receive_data(f_in, NULL, -1, NULL, file->length);
511+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
512 close(fd1);
513 continue;
514 }
515@@ -443,7 +454,7 @@ int recv_files(int f_in, struct file_lis
516 if (fd2 == -1) {
517 rsyserr(FERROR, errno, "mkstemp %s failed",
518 full_fname(fnametmp));
519- receive_data(f_in,mapbuf,-1,NULL,file->length);
520+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
521 if (mapbuf)
522 unmap_file(mapbuf);
523 if (fd1 != -1)
524@@ -457,7 +468,11 @@ int recv_files(int f_in, struct file_lis
525 rprintf(FINFO, "%s\n", fname);
526
527 /* recv file data */
528- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
529+#ifdef HAVE_LINK
530+ if (link_by_hash_dir)
531+ file->u.sum = (char*)malloc(MD4_SUM_LENGTH);
532+#endif
533+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
534
535 log_recv(file, &initial_stats);
536
537--- orig/rsync.c 2004-07-02 18:06:32
538+++ rsync.c 2004-07-03 20:20:15
539@@ -34,6 +34,7 @@ extern int force_delete;
540 extern int recurse;
541 extern int make_backups;
542 extern char *backup_dir;
543+extern char *link_by_hash_dir;
544
545
546 /*
547@@ -242,6 +243,12 @@ void finish_transfer(char *fname, char *
548 /* move tmp file over real file */
549 if (verbose > 2)
550 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
551+#ifdef HAVE_LINK
552+ if (link_by_hash_dir)
553+ ret = link_by_hash(fnametmp, fname, file);
554+ else
555+#endif
556+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
557 ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
558 if (ret < 0) {
559 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
560--- orig/rsync.h 2004-07-07 08:27:00
561+++ rsync.h 2004-07-03 20:20:15
562@@ -522,6 +522,14 @@ struct stats {
563 int current_file_index;
564 };
565
566+struct hashfile_struct {
567+ struct hashfile_struct *next;
568+ struct hashfile_struct *prev;
569+ char *name;
570+ int fd;
571+ uint32 nlink;
572+};
573+
574
575 /* we need this function because of the silly way in which duplicate
576 entries are handled in the file lists - we can't change this