Improved the default (suggested) commands depending on how a
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
8a529471
WD
1After applying this patch and running configure, you MUST run this
2command before "make":
3
4 make proto
5
6Jason M. Felice writes:
2eb075b2
WD
7
8This patch adds the --link-by-hash=DIR option, which hard links received
9files in a link farm arranged by MD4 file hash. The result is that the system
10will only store one copy of the unique contents of each file, regardless of
11the file's name.
12
2eb075b2 13
d0320a46 14--- orig/Makefile.in 2004-08-13 07:18:58
13bed3dd 15+++ Makefile.in 2004-07-03 20:20:15
8a529471
WD
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
d0320a46
WD
25--- orig/hashlink.c 2004-08-13 18:04:59
26+++ hashlink.c 2004-08-13 18:04:59
fe6407b5 27@@ -0,0 +1,342 @@
c57f4101
WD
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#ifdef HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
114+
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
d0320a46 118+ rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
c57f4101
WD
119+ free(hashname);
120+ return NULL;
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+ continue;
125+ }
126+
127+ /* We need to have the largest fnbr in case we need to store
128+ * a new file. */
129+ this_fnbr = atol(di->d_name);
130+ if (this_fnbr > *fnbr)
131+ *fnbr = this_fnbr;
132+
133+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134+ asprintf(&hashfile->name,"%s/%s",hashname,
135+ di->d_name);
136+ if (do_stat(hashfile->name,&st) == -1) {
d0320a46 137+ rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
c57f4101
WD
138+ kill_hashfile(hashfile);
139+ continue;
140+ }
141+ if (st.st_size != size) {
142+ kill_hashfile(hashfile);
143+ continue;
144+ }
145+ hashfile->nlink = st.st_nlink;
146+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147+ if (hashfile->fd == -1) {
d0320a46 148+ rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
c57f4101
WD
149+ kill_hashfile(hashfile);
150+ continue;
151+ }
152+ if (hashfiles == NULL)
153+ hashfiles = hashfile->next = hashfile->prev = hashfile;
154+ else {
155+ hashfile->next = hashfiles;
156+ hashfile->prev = hashfiles->prev;
157+ hashfile->next->prev = hashfile;
158+ hashfile->prev->next = hashfile;
159+ }
160+ }
161+ closedir(d);
162+
163+ return hashfiles;
164+}
165+
166+
167+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168+{
169+ int amt, hamt;
170+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171+ struct hashfile_struct *iter, *next, *best;
172+ uint32 nlink;
173+
174+ if (!files)
175+ return NULL;
176+
177+ iter = files; /* in case files are 0 bytes */
178+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179+ iter = files;
180+ do {
181+ /* Icky bit to resync when we steal the first node. */
182+ if (!files)
183+ files = iter;
184+
185+ next = iter->next;
186+
187+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189+ if (iter == files) {
190+ files = files->prev;
191+ }
192+ if (iter->next == iter) {
193+ files = next = NULL;
194+ } else {
195+ next = iter->next;
196+ if (iter == files) {
197+ /* So we know to resync */
198+ files = NULL;
199+ }
200+ }
201+ iter->next->prev = iter->prev;
202+ iter->prev->next = iter->next;
203+ kill_hashfile(iter);
204+ }
205+
206+ iter = next;
207+ } while (iter != files);
208+
209+ if (iter == NULL && files == NULL) {
210+ /* There are no matches. */
211+ return NULL;
212+ }
213+
214+ }
215+
216+ if (amt == -1) {
d0320a46 217+ rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
c57f4101
WD
218+ kill_hashfiles(files);
219+ return NULL;
220+ }
221+
222+ /* If we only have one file left, use it. */
223+ if (files == files->next) {
224+ return files;
225+ }
226+
227+ /* All files which remain in the list are identical and should have
228+ * the same size. We pick the one with the lowest link count (we
229+ * may have rolled over because we hit the maximum link count for
230+ * the filesystem). */
231+ best = iter = files;
232+ nlink = iter->nlink;
233+ do {
234+ if (iter->nlink < nlink) {
235+ nlink = iter->nlink;
236+ best = iter;
237+ }
238+ iter = iter->next;
239+ } while (iter != files);
240+
241+ best->next->prev = best->prev;
242+ best->prev->next = best->next;
243+ if (files == best)
244+ files = files->next;
245+ kill_hashfiles(files);
246+ return best;
247+}
248+
249+
250+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251+{
252+ STRUCT_STAT st;
253+ char *hashname = make_hash_name(file);
254+ int first = 0, rc;
255+ char *linkname;
256+ long last_fnbr;
257+
258+ if (file->length == 0) {
259+ return robust_rename(fnametmp,fname,0644);
260+ }
261+
262+ if (do_stat(hashname, &st) == -1) {
263+ char *dirname;
264+
265+ /* Directory does not exist. */
266+ dirname = strdup(hashname);
267+ *strrchr(dirname,'/') = 0;
268+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
d0320a46 269+ rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
c57f4101
WD
270+ free(hashname);
271+ free(dirname);
272+ return robust_rename(fnametmp,fname,0644);
273+ }
274+ free(dirname);
275+
276+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
d0320a46 277+ rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
c57f4101
WD
278+ free(hashname);
279+ return robust_rename(fnametmp,fname,0644);
280+ }
281+
282+ first = 1;
283+ asprintf(&linkname,"%s/0",hashname);
284+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
285+
286+ } else {
287+ struct hashfile_struct *hashfiles, *hashfile;
288+ int fd;
289+
290+ if (do_stat(fnametmp,&st) == -1) {
d0320a46 291+ rsyserr(FERROR, errno, "stat failed: %s", fname);
c57f4101
WD
292+ return -1;
293+ }
294+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
295+
296+ if (hashfiles == NULL) {
297+ first = 1;
298+ asprintf(&linkname,"%s/0",hashname);
299+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
300+ } else {
301+
302+ /* Search for one identical to us. */
303+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
d0320a46 304+ rsyserr(FERROR, errno, "open failed: %s", fnametmp);
c57f4101
WD
305+ kill_hashfiles(hashfiles);
306+ return -1;
307+ }
308+ hashfile = compare_hashfiles(fd, hashfiles);
309+ hashfiles = NULL;
310+
311+ if (hashfile) {
312+ first = 0;
313+ linkname = strdup(hashfile->name);
314+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
315+ kill_hashfile(hashfile);
316+ } else {
317+ first = 1;
318+ asprintf(&linkname, "%s/%ld", hashname,
319+ last_fnbr + 1);
320+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
321+ }
322+ }
323+ }
324+
325+ if (!first) {
326+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
327+ linkname, full_fname(fname));
328+ rc = do_link(linkname, fname);
329+ if (rc == -1) {
330+ if (errno == EMLINK) {
331+ first = 1;
332+ free(linkname);
333+ asprintf(&linkname,"%s/%ld",hashname,
334+ last_fnbr + 1);
335+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
336+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
337+ } else {
fe6407b5
WD
338+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
339+ linkname, full_fname(fname));
c57f4101
WD
340+ robust_unlink(fname);
341+ rc = robust_rename(fnametmp,fname,0644);
342+ }
343+ } else {
344+ do_unlink(fnametmp);
345+ }
346+ }
347+
348+ if (first) {
349+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
350+ full_fname(fname),linkname);
351+
352+ rc = robust_rename(fnametmp,fname,0644);
353+ if (rc != 0) {
fe6407b5
WD
354+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
355+ full_fname(fnametmp), full_fname(fname));
c57f4101
WD
356+ }
357+ rc = do_link(fname,linkname);
358+ if (rc != 0) {
fe6407b5
WD
359+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
360+ full_fname(fname), linkname);
c57f4101
WD
361+ }
362+ }
363+
364+ free(linkname);
365+ free(hashname);
366+ return rc;
367+}
368+
369+#endif
a6587818 370--- orig/options.c 2004-09-20 05:10:48
d0320a46 371+++ options.c 2004-08-13 18:13:18
f635ed27 372@@ -126,6 +126,7 @@ char *log_format = NULL;
c57f4101
WD
373 char *password_file = NULL;
374 char *rsync_path = RSYNC_PATH;
375 char *backup_dir = NULL;
376+char *link_by_hash_dir = NULL;
377 char backup_dir_buf[MAXPATHLEN];
378 int rsync_port = RSYNC_PORT;
379 int link_dest = 0;
f635ed27 380@@ -279,6 +280,7 @@ void usage(enum logcode F)
d4e89c6a 381 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
c57f4101
WD
382 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
383 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
384+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
385 rprintf(F," -P equivalent to --partial --progress\n");
386 rprintf(F," -z, --compress compress file data\n");
387 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
f635ed27 388@@ -319,7 +321,7 @@ void usage(enum logcode F)
c57f4101
WD
389 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
390 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
391 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
125d7fca
WD
392- OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT,
393+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_LINK_BY_HASH,
c57f4101
WD
394 OPT_REFUSED_BASE = 9000};
395
396 static struct poptOption long_options[] = {
f635ed27 397@@ -378,6 +380,7 @@ static struct poptOption long_options[]
c57f4101
WD
398 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
399 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
54691942 400 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
c57f4101
WD
401+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
402 /* TODO: Should this take an optional int giving the compression level? */
403 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
404 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
a6587818 405@@ -616,6 +619,22 @@ int parse_arguments(int *argc, const cha
c57f4101
WD
406 return 0;
407 #endif
408
409+ case OPT_LINK_BY_HASH:
410+#if HAVE_LINK
d0320a46
WD
411+ arg = poptGetOptArg(pc);
412+ if (sanitize_paths)
413+ arg = sanitize_path(NULL, arg, NULL, 0);
414+ link_by_hash_dir = (char *)arg;
415+ checksum_seed = 12345;
c57f4101
WD
416+ break;
417+#else
418+ snprintf(err_buf, sizeof err_buf,
419+ "hard links are not supported on this %s\n",
420+ am_server ? "server" : "client");
421+ rprintf(FERROR, "ERROR: %s", err_buf);
422+ return 0;
423+#endif
424+
425 default:
426 /* A large opt value means that set_refuse_options()
427 * turned this option off (opt-BASE is its index). */
a6587818 428@@ -1083,6 +1102,11 @@ void server_options(char **args,int *arg
c57f4101 429 args[ac++] = compare_dest;
7b675ff5
WD
430 }
431
c57f4101
WD
432+ if (link_by_hash_dir && am_sender) {
433+ args[ac++] = "--link-by-hash";
434+ args[ac++] = link_by_hash_dir;
7b675ff5
WD
435+ }
436+
c57f4101 437 if (files_from && (!am_sender || remote_filesfrom_file)) {
7b675ff5
WD
438 if (remote_filesfrom_file) {
439 args[ac++] = "--files-from";
a6587818 440--- orig/receiver.c 2004-09-21 09:40:27
dc3ae351 441+++ receiver.c 2004-07-20 21:44:05
d5753a22 442@@ -39,6 +39,7 @@ extern int io_error;
f6c3b300 443 extern char *tmpdir;
afbebe13 444 extern char *partial_dir;
f6c3b300 445 extern char *compare_dest;
54691942 446+extern char *link_by_hash_dir;
f6c3b300
WD
447 extern int make_backups;
448 extern int do_progress;
449 extern char *backup_dir;
a6587818 450@@ -202,12 +203,13 @@ static int get_tmpname(char *fnametmp, c
c57f4101
WD
451
452
dc3ae351
WD
453 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
454- char *fname, int fd, OFF_T total_size)
455+ char *fname, int fd, OFF_T total_size, char *md4)
c57f4101 456 {
dc3ae351
WD
457 static char file_sum1[MD4_SUM_LENGTH];
458 static char file_sum2[MD4_SUM_LENGTH];
459 struct map_struct *mapbuf;
c57f4101
WD
460 struct sum_struct sum;
461+ struct mdfour mdfour_data;
462 unsigned int len;
463 OFF_T offset = 0;
464 OFF_T offset2;
a6587818 465@@ -227,6 +229,9 @@ static int receive_data(int f_in, char *
dc3ae351
WD
466 } else
467 mapbuf = NULL;
7b675ff5 468
c57f4101
WD
469+ if (md4)
470+ mdfour_begin(&mdfour_data);
7b675ff5
WD
471+
472 sum_init(checksum_seed);
c57f4101
WD
473
474 while ((i = recv_token(f_in, &data)) != 0) {
a6587818 475@@ -243,6 +248,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
476 cleanup_got_literal = 1;
477
478 sum_update(data,i);
479+ if (md4)
480+ mdfour_update(&mdfour_data,data,i);
481
afbebe13
WD
482 if (fd != -1 && write_file(fd,data,i) != i)
483 goto report_write_error;
a6587818 484@@ -267,6 +274,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
485
486 see_token(map, len);
487 sum_update(map,len);
488+ if (md4)
489+ mdfour_update(&mdfour_data,map,len);
490 }
491
afbebe13 492 if (inplace) {
a6587818 493@@ -306,6 +315,8 @@ static int receive_data(int f_in, char *
c57f4101
WD
494 }
495
496 sum_end(file_sum1);
497+ if (md4)
498+ mdfour_result(&mdfour_data, (unsigned char*)md4);
499
dc3ae351
WD
500 if (mapbuf)
501 unmap_file(mapbuf);
a6587818 502@@ -321,7 +332,7 @@ static int receive_data(int f_in, char *
5823d322
WD
503
504 static void discard_receive_data(int f_in, OFF_T length)
505 {
dc3ae351
WD
506- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
507+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
5823d322
WD
508 }
509
510
a6587818 511@@ -542,8 +553,12 @@ int recv_files(int f_in, struct file_lis
982426b8 512 rprintf(FINFO, "%s\n", safe_fname(fname));
c57f4101
WD
513
514 /* recv file data */
c57f4101 515+#ifdef HAVE_LINK
7b675ff5
WD
516+ if (link_by_hash_dir)
517+ file->u.sum = (char*)malloc(MD4_SUM_LENGTH);
c57f4101 518+#endif
dc3ae351
WD
519 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
520- fname, fd2, file->length);
521+ fname, fd2, file->length, file->u.sum);
c57f4101
WD
522
523 log_recv(file, &initial_stats);
524
a6587818 525--- orig/rsync.c 2004-09-07 21:45:30
d0320a46 526+++ rsync.c 2004-08-13 18:14:34
d5753a22 527@@ -34,6 +34,7 @@ extern int force_delete;
7b675ff5 528 extern int recurse;
d5753a22 529 extern int keep_dirlinks;
c57f4101
WD
530 extern int make_backups;
531+extern char *link_by_hash_dir;
f6c3b300
WD
532 extern char *backup_dir;
533 extern int inplace;
c57f4101 534
d0320a46 535@@ -254,7 +255,12 @@ void finish_transfer(char *fname, char *
13bed3dd
WD
536 /* move tmp file over real file */
537 if (verbose > 2)
538 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
d0320a46 539- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
c57f4101 540+#ifdef HAVE_LINK
2eb075b2 541+ if (link_by_hash_dir)
7b675ff5 542+ ret = link_by_hash(fnametmp, fname, file);
2eb075b2 543+ else
c57f4101 544+#endif
2eb075b2 545+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
54691942 546 if (ret < 0) {
fe6407b5 547 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
d0320a46 548 ret == -2 ? "copy" : "rename",
d5753a22 549--- orig/rsync.h 2004-08-03 15:41:32
13bed3dd 550+++ rsync.h 2004-07-03 20:20:15
afbebe13 551@@ -525,6 +525,14 @@ struct stats {
c57f4101
WD
552 int current_file_index;
553 };
554
555+struct hashfile_struct {
556+ struct hashfile_struct *next;
557+ struct hashfile_struct *prev;
558+ char *name;
559+ int fd;
560+ uint32 nlink;
561+};
562+
563
564 /* we need this function because of the silly way in which duplicate
565 entries are handled in the file lists - we can't change this