Fixed failing hunk.
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
8a529471
WD
After applying this patch and running configure, you MUST run this
command before "make":

    make proto

Jason M. Felice writes:
2eb075b2
WD
7
This patch adds the --link-by-hash=DIR option, which hard-links each received
file into a link farm under DIR arranged by MD4 file hash. The result is that
the system will only store one copy of the unique contents of each file,
regardless of the file's name.
12
2eb075b2 13
d0320a46 14--- orig/Makefile.in 2004-08-13 07:18:58
13bed3dd 15+++ Makefile.in 2004-07-03 20:20:15
8a529471
WD
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
47841496
WD
25--- orig/hashlink.c 2004-09-24 16:44:25
26+++ hashlink.c 2004-09-24 16:44:25
27@@ -0,0 +1,340 @@
c57f4101
WD
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#ifdef HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
47841496 114+
c57f4101
WD
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
d0320a46 118+ rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
c57f4101
WD
119+ free(hashname);
120+ return NULL;
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+ continue;
125+ }
126+
127+ /* We need to have the largest fnbr in case we need to store
128+ * a new file. */
129+ this_fnbr = atol(di->d_name);
130+ if (this_fnbr > *fnbr)
131+ *fnbr = this_fnbr;
132+
133+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134+ asprintf(&hashfile->name,"%s/%s",hashname,
135+ di->d_name);
136+ if (do_stat(hashfile->name,&st) == -1) {
d0320a46 137+ rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
c57f4101
WD
138+ kill_hashfile(hashfile);
139+ continue;
140+ }
141+ if (st.st_size != size) {
142+ kill_hashfile(hashfile);
143+ continue;
144+ }
145+ hashfile->nlink = st.st_nlink;
146+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147+ if (hashfile->fd == -1) {
d0320a46 148+ rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
c57f4101
WD
149+ kill_hashfile(hashfile);
150+ continue;
151+ }
152+ if (hashfiles == NULL)
153+ hashfiles = hashfile->next = hashfile->prev = hashfile;
154+ else {
155+ hashfile->next = hashfiles;
156+ hashfile->prev = hashfiles->prev;
157+ hashfile->next->prev = hashfile;
158+ hashfile->prev->next = hashfile;
159+ }
160+ }
161+ closedir(d);
162+
163+ return hashfiles;
164+}
165+
166+
167+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168+{
169+ int amt, hamt;
170+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171+ struct hashfile_struct *iter, *next, *best;
172+ uint32 nlink;
173+
174+ if (!files)
175+ return NULL;
176+
177+ iter = files; /* in case files are 0 bytes */
178+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179+ iter = files;
180+ do {
181+ /* Icky bit to resync when we steal the first node. */
182+ if (!files)
183+ files = iter;
184+
185+ next = iter->next;
186+
187+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189+ if (iter == files) {
190+ files = files->prev;
191+ }
192+ if (iter->next == iter) {
193+ files = next = NULL;
194+ } else {
195+ next = iter->next;
196+ if (iter == files) {
197+ /* So we know to resync */
198+ files = NULL;
199+ }
200+ }
201+ iter->next->prev = iter->prev;
202+ iter->prev->next = iter->next;
203+ kill_hashfile(iter);
204+ }
205+
206+ iter = next;
207+ } while (iter != files);
208+
209+ if (iter == NULL && files == NULL) {
210+ /* There are no matches. */
211+ return NULL;
212+ }
c57f4101
WD
213+ }
214+
215+ if (amt == -1) {
d0320a46 216+ rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
c57f4101
WD
217+ kill_hashfiles(files);
218+ return NULL;
219+ }
220+
221+ /* If we only have one file left, use it. */
222+ if (files == files->next) {
223+ return files;
224+ }
225+
226+ /* All files which remain in the list are identical and should have
227+ * the same size. We pick the one with the lowest link count (we
228+ * may have rolled over because we hit the maximum link count for
229+ * the filesystem). */
230+ best = iter = files;
231+ nlink = iter->nlink;
232+ do {
233+ if (iter->nlink < nlink) {
234+ nlink = iter->nlink;
235+ best = iter;
236+ }
237+ iter = iter->next;
238+ } while (iter != files);
239+
240+ best->next->prev = best->prev;
241+ best->prev->next = best->next;
242+ if (files == best)
243+ files = files->next;
244+ kill_hashfiles(files);
245+ return best;
246+}
247+
248+
249+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
250+{
251+ STRUCT_STAT st;
47841496 252+ char *hashname = make_hash_name(file);
c57f4101
WD
253+ int first = 0, rc;
254+ char *linkname;
255+ long last_fnbr;
256+
257+ if (file->length == 0) {
258+ return robust_rename(fnametmp,fname,0644);
259+ }
260+
261+ if (do_stat(hashname, &st) == -1) {
262+ char *dirname;
263+
264+ /* Directory does not exist. */
265+ dirname = strdup(hashname);
266+ *strrchr(dirname,'/') = 0;
267+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
d0320a46 268+ rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
c57f4101
WD
269+ free(hashname);
270+ free(dirname);
271+ return robust_rename(fnametmp,fname,0644);
272+ }
273+ free(dirname);
274+
275+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
d0320a46 276+ rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
c57f4101
WD
277+ free(hashname);
278+ return robust_rename(fnametmp,fname,0644);
279+ }
280+
281+ first = 1;
282+ asprintf(&linkname,"%s/0",hashname);
283+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
c57f4101
WD
284+ } else {
285+ struct hashfile_struct *hashfiles, *hashfile;
c57f4101
WD
286+
287+ if (do_stat(fnametmp,&st) == -1) {
d0320a46 288+ rsyserr(FERROR, errno, "stat failed: %s", fname);
c57f4101
WD
289+ return -1;
290+ }
291+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
292+
293+ if (hashfiles == NULL) {
294+ first = 1;
295+ asprintf(&linkname,"%s/0",hashname);
296+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
297+ } else {
47841496 298+ int fd;
c57f4101
WD
299+ /* Search for one identical to us. */
300+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
d0320a46 301+ rsyserr(FERROR, errno, "open failed: %s", fnametmp);
c57f4101
WD
302+ kill_hashfiles(hashfiles);
303+ return -1;
304+ }
305+ hashfile = compare_hashfiles(fd, hashfiles);
306+ hashfiles = NULL;
47841496 307+ close(fd);
c57f4101
WD
308+
309+ if (hashfile) {
310+ first = 0;
311+ linkname = strdup(hashfile->name);
312+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
313+ kill_hashfile(hashfile);
314+ } else {
315+ first = 1;
316+ asprintf(&linkname, "%s/%ld", hashname,
317+ last_fnbr + 1);
318+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
319+ }
320+ }
321+ }
322+
323+ if (!first) {
324+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
325+ linkname, full_fname(fname));
cad12f62 326+ robust_unlink(fname);
c57f4101
WD
327+ rc = do_link(linkname, fname);
328+ if (rc == -1) {
329+ if (errno == EMLINK) {
330+ first = 1;
331+ free(linkname);
332+ asprintf(&linkname,"%s/%ld",hashname,
333+ last_fnbr + 1);
334+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
335+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
336+ } else {
fe6407b5
WD
337+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
338+ linkname, full_fname(fname));
c57f4101
WD
339+ rc = robust_rename(fnametmp,fname,0644);
340+ }
341+ } else {
342+ do_unlink(fnametmp);
343+ }
344+ }
345+
346+ if (first) {
347+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
348+ full_fname(fname),linkname);
349+
350+ rc = robust_rename(fnametmp,fname,0644);
351+ if (rc != 0) {
fe6407b5
WD
352+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
353+ full_fname(fnametmp), full_fname(fname));
c57f4101
WD
354+ }
355+ rc = do_link(fname,linkname);
356+ if (rc != 0) {
fe6407b5
WD
357+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
358+ full_fname(fname), linkname);
c57f4101
WD
359+ }
360+ }
361+
362+ free(linkname);
363+ free(hashname);
364+ return rc;
365+}
366+
367+#endif
47841496 368--- orig/options.c 2004-09-23 17:42:07
d0320a46 369+++ options.c 2004-08-13 18:13:18
f635ed27 370@@ -126,6 +126,7 @@ char *log_format = NULL;
c57f4101
WD
371 char *password_file = NULL;
372 char *rsync_path = RSYNC_PATH;
373 char *backup_dir = NULL;
374+char *link_by_hash_dir = NULL;
375 char backup_dir_buf[MAXPATHLEN];
376 int rsync_port = RSYNC_PORT;
377 int link_dest = 0;
f635ed27 378@@ -279,6 +280,7 @@ void usage(enum logcode F)
d4e89c6a 379 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
c57f4101
WD
380 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
381 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
382+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
383 rprintf(F," -P equivalent to --partial --progress\n");
384 rprintf(F," -z, --compress compress file data\n");
385 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
f635ed27 386@@ -319,7 +321,7 @@ void usage(enum logcode F)
c57f4101
WD
387 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
388 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
389 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
125d7fca
WD
390- OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT,
391+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_LINK_BY_HASH,
c57f4101
WD
392 OPT_REFUSED_BASE = 9000};
393
394 static struct poptOption long_options[] = {
f635ed27 395@@ -378,6 +380,7 @@ static struct poptOption long_options[]
c57f4101
WD
396 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
397 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
54691942 398 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
c57f4101
WD
399+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
400 /* TODO: Should this take an optional int giving the compression level? */
401 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
402 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
47841496 403@@ -616,6 +619,21 @@ int parse_arguments(int *argc, const cha
c57f4101
WD
404 return 0;
405 #endif
406
407+ case OPT_LINK_BY_HASH:
408+#if HAVE_LINK
d0320a46
WD
409+ arg = poptGetOptArg(pc);
410+ if (sanitize_paths)
411+ arg = sanitize_path(NULL, arg, NULL, 0);
412+ link_by_hash_dir = (char *)arg;
c57f4101
WD
413+ break;
414+#else
415+ snprintf(err_buf, sizeof err_buf,
416+ "hard links are not supported on this %s\n",
417+ am_server ? "server" : "client");
418+ rprintf(FERROR, "ERROR: %s", err_buf);
419+ return 0;
420+#endif
421+
422 default:
423 /* A large opt value means that set_refuse_options()
424 * turned this option off (opt-BASE is its index). */
47841496 425@@ -1087,6 +1105,11 @@ void server_options(char **args,int *arg
c57f4101 426 args[ac++] = compare_dest;
7b675ff5
WD
427 }
428
c57f4101
WD
429+ if (link_by_hash_dir && am_sender) {
430+ args[ac++] = "--link-by-hash";
431+ args[ac++] = link_by_hash_dir;
7b675ff5
WD
432+ }
433+
c57f4101 434 if (files_from && (!am_sender || remote_filesfrom_file)) {
7b675ff5
WD
435 if (remote_filesfrom_file) {
436 args[ac++] = "--files-from";
a6587818 437--- orig/receiver.c 2004-09-21 09:40:27
dc3ae351 438+++ receiver.c 2004-07-20 21:44:05
d5753a22 439@@ -39,6 +39,7 @@ extern int io_error;
f6c3b300 440 extern char *tmpdir;
afbebe13 441 extern char *partial_dir;
f6c3b300 442 extern char *compare_dest;
54691942 443+extern char *link_by_hash_dir;
f6c3b300
WD
444 extern int make_backups;
445 extern int do_progress;
446 extern char *backup_dir;
a6587818 447@@ -202,12 +203,13 @@ static int get_tmpname(char *fnametmp, c
c57f4101
WD
448
449
dc3ae351
WD
450 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
451- char *fname, int fd, OFF_T total_size)
452+ char *fname, int fd, OFF_T total_size, char *md4)
c57f4101 453 {
dc3ae351
WD
454 static char file_sum1[MD4_SUM_LENGTH];
455 static char file_sum2[MD4_SUM_LENGTH];
456 struct map_struct *mapbuf;
c57f4101
WD
457 struct sum_struct sum;
458+ struct mdfour mdfour_data;
459 unsigned int len;
460 OFF_T offset = 0;
461 OFF_T offset2;
a6587818 462@@ -227,6 +229,9 @@ static int receive_data(int f_in, char *
dc3ae351
WD
463 } else
464 mapbuf = NULL;
7b675ff5 465
c57f4101
WD
466+ if (md4)
467+ mdfour_begin(&mdfour_data);
7b675ff5
WD
468+
469 sum_init(checksum_seed);
c57f4101
WD
470
471 while ((i = recv_token(f_in, &data)) != 0) {
a6587818 472@@ -243,6 +248,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
473 cleanup_got_literal = 1;
474
475 sum_update(data,i);
476+ if (md4)
477+ mdfour_update(&mdfour_data,data,i);
478
afbebe13
WD
479 if (fd != -1 && write_file(fd,data,i) != i)
480 goto report_write_error;
a6587818 481@@ -267,6 +274,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
482
483 see_token(map, len);
484 sum_update(map,len);
485+ if (md4)
486+ mdfour_update(&mdfour_data,map,len);
487 }
488
afbebe13 489 if (inplace) {
a6587818 490@@ -306,6 +315,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
491 }
492
493 sum_end(file_sum1);
494+ if (md4)
495+ mdfour_result(&mdfour_data, (unsigned char*)md4);
496
dc3ae351
WD
497 if (mapbuf)
498 unmap_file(mapbuf);
a6587818 499@@ -321,7 +332,7 @@ static int receive_data(int f_in, char *
5823d322
WD
500
501 static void discard_receive_data(int f_in, OFF_T length)
502 {
dc3ae351
WD
503- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
504+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
5823d322
WD
505 }
506
507
a6587818 508@@ -542,8 +553,12 @@ int recv_files(int f_in, struct file_lis
982426b8 509 rprintf(FINFO, "%s\n", safe_fname(fname));
c57f4101
WD
510
511 /* recv file data */
c57f4101 512+#ifdef HAVE_LINK
7b675ff5
WD
513+ if (link_by_hash_dir)
514+ file->u.sum = (char*)malloc(MD4_SUM_LENGTH);
c57f4101 515+#endif
dc3ae351
WD
516 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
517- fname, fd2, file->length);
518+ fname, fd2, file->length, file->u.sum);
c57f4101
WD
519
520 log_recv(file, &initial_stats);
521
a6587818 522--- orig/rsync.c 2004-09-07 21:45:30
d0320a46 523+++ rsync.c 2004-08-13 18:14:34
d5753a22 524@@ -34,6 +34,7 @@ extern int force_delete;
7b675ff5 525 extern int recurse;
d5753a22 526 extern int keep_dirlinks;
c57f4101
WD
527 extern int make_backups;
528+extern char *link_by_hash_dir;
f6c3b300
WD
529 extern char *backup_dir;
530 extern int inplace;
c57f4101 531
d0320a46 532@@ -254,7 +255,12 @@ void finish_transfer(char *fname, char *
13bed3dd
WD
533 /* move tmp file over real file */
534 if (verbose > 2)
535 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
d0320a46 536- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
c57f4101 537+#ifdef HAVE_LINK
2eb075b2 538+ if (link_by_hash_dir)
7b675ff5 539+ ret = link_by_hash(fnametmp, fname, file);
2eb075b2 540+ else
c57f4101 541+#endif
2eb075b2 542+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
54691942 543 if (ret < 0) {
fe6407b5 544 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
d0320a46 545 ret == -2 ? "copy" : "rename",
47841496 546--- orig/rsync.h 2004-09-22 08:47:31
13bed3dd 547+++ rsync.h 2004-07-03 20:20:15
47841496 548@@ -526,6 +526,14 @@ struct stats {
c57f4101
WD
549 int current_file_index;
550 };
551
552+struct hashfile_struct {
553+ struct hashfile_struct *next;
554+ struct hashfile_struct *prev;
555+ char *name;
556+ int fd;
557+ uint32 nlink;
558+};
559+
560
561 /* we need this function because of the silly way in which duplicate
562 entries are handled in the file lists - we can't change this