Removing applied patch.
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
8a529471
WD
1After applying this patch and running configure, you MUST run this
2command before "make":
3
4 make proto
5
6Jason M. Felice writes:
2eb075b2
WD
7
8This patch adds the --link-by-hash=DIR option, which hard links received
9files in a link farm arranged by MD4 file hash. The result is that the system
10will only store one copy of the unique contents of each file, regardless of
11the file's name.
12
2eb075b2 13
fe6407b5
WD
14--- Makefile.in 15 May 2004 00:48:11 -0000 1.101
15+++ Makefile.in 15 May 2004 20:29:17 -0000
8a529471
WD
16@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17 main.o checksum.o match.o syscall.o log.o backup.o
18 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19 fileio.o batch.o clientname.o
20-OBJS3=progress.o pipe.o
21+OBJS3=progress.o pipe.o hashlink.o
22 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
24 popt/popthelp.o popt/poptparse.o
25--- /dev/null 1 Jan 1970 00:00:00 -0000
fe6407b5
WD
26+++ hashlink.c 15 May 2004 20:29:17 -0000
27@@ -0,0 +1,342 @@
c57f4101
WD
28+/*
29+ Copyright (C) Cronosys, LLC 2004
30+
31+ This program is free software; you can redistribute it and/or modify
32+ it under the terms of the GNU General Public License as published by
33+ the Free Software Foundation; either version 2 of the License, or
34+ (at your option) any later version.
35+
36+ This program is distributed in the hope that it will be useful,
37+ but WITHOUT ANY WARRANTY; without even the implied warranty of
38+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39+ GNU General Public License for more details.
40+
41+ You should have received a copy of the GNU General Public License
42+ along with this program; if not, write to the Free Software
43+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44+*/
45+
46+/* This file contains code used by the --link-by-hash option. */
47+
48+#include "rsync.h"
49+
50+extern char *link_by_hash_dir;
51+
52+#ifdef HAVE_LINK
53+
54+char* make_hash_name(struct file_struct *file)
55+{
56+ char hash[33], *dst;
57+ unsigned char *src;
58+ unsigned char c;
59+ int i;
60+
61+ src = (unsigned char*)file->u.sum;
62+ for (dst = hash, i = 0; i < 4; i++, src++) {
63+ c = *src >> 4;
64+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65+ c = *src & 0x0f;
66+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67+ }
68+ *dst++ = '/';
69+ for (i = 0; i < 12; i++, src++) {
70+ c = *src >> 4;
71+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72+ c = *src & 0x0f;
73+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74+ }
75+ *dst = 0;
76+
77+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78+ return dst;
79+}
80+
81+
82+void kill_hashfile(struct hashfile_struct *hashfile)
83+{
84+ if (!hashfile)
85+ return;
86+ free(hashfile->name);
87+ close(hashfile->fd);
88+ free(hashfile);
89+}
90+
91+
92+void kill_hashfiles(struct hashfile_struct *hashfiles)
93+{
94+ struct hashfile_struct *iter, *next;
95+ if ((iter = hashfiles) != NULL) {
96+ do {
97+ next = iter->next;
98+ kill_hashfile(iter);
99+ iter = next;
100+ } while (iter != hashfiles);
101+ }
102+}
103+
104+
105+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106+{
107+ DIR *d;
108+ struct dirent *di;
109+ struct hashfile_struct *hashfiles = NULL, *hashfile;
110+ STRUCT_STAT st;
111+ long this_fnbr;
112+
113+ *fnbr = 0;
114+
115+ /* Build a list of potential candidates and open
116+ * them. */
117+ if ((d = opendir(hashname)) == NULL) {
fe6407b5 118+ rsyserr(FERROR, errno, "opendir \"%s\"", hashname);
c57f4101
WD
119+ free(hashname);
120+ return NULL;
121+ }
122+ while ((di = readdir(d)) != NULL) {
123+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124+ continue;
125+ }
126+
127+ /* We need to have the largest fnbr in case we need to store
128+ * a new file. */
129+ this_fnbr = atol(di->d_name);
130+ if (this_fnbr > *fnbr)
131+ *fnbr = this_fnbr;
132+
133+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134+ asprintf(&hashfile->name,"%s/%s",hashname,
135+ di->d_name);
136+ if (do_stat(hashfile->name,&st) == -1) {
fe6407b5 137+ rsyserr(FERROR, errno, "%s: %s", hashfile->name);
c57f4101
WD
138+ kill_hashfile(hashfile);
139+ continue;
140+ }
141+ if (st.st_size != size) {
142+ kill_hashfile(hashfile);
143+ continue;
144+ }
145+ hashfile->nlink = st.st_nlink;
146+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147+ if (hashfile->fd == -1) {
fe6407b5 148+ rsyserr(FERROR, errno, "%s", hashfile->name);
c57f4101
WD
149+ kill_hashfile(hashfile);
150+ continue;
151+ }
152+ if (hashfiles == NULL)
153+ hashfiles = hashfile->next = hashfile->prev = hashfile;
154+ else {
155+ hashfile->next = hashfiles;
156+ hashfile->prev = hashfiles->prev;
157+ hashfile->next->prev = hashfile;
158+ hashfile->prev->next = hashfile;
159+ }
160+ }
161+ closedir(d);
162+
163+ return hashfiles;
164+}
165+
166+
167+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168+{
169+ int amt, hamt;
170+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171+ struct hashfile_struct *iter, *next, *best;
172+ uint32 nlink;
173+
174+ if (!files)
175+ return NULL;
176+
177+ iter = files; /* in case files are 0 bytes */
178+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179+ iter = files;
180+ do {
181+ /* Icky bit to resync when we steal the first node. */
182+ if (!files)
183+ files = iter;
184+
185+ next = iter->next;
186+
187+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189+ if (iter == files) {
190+ files = files->prev;
191+ }
192+ if (iter->next == iter) {
193+ files = next = NULL;
194+ } else {
195+ next = iter->next;
196+ if (iter == files) {
197+ /* So we know to resync */
198+ files = NULL;
199+ }
200+ }
201+ iter->next->prev = iter->prev;
202+ iter->prev->next = iter->next;
203+ kill_hashfile(iter);
204+ }
205+
206+ iter = next;
207+ } while (iter != files);
208+
209+ if (iter == NULL && files == NULL) {
210+ /* There are no matches. */
211+ return NULL;
212+ }
213+
214+ }
215+
216+ if (amt == -1) {
fe6407b5 217+ rsyserr(FERROR, errno, "%s");
c57f4101
WD
218+ kill_hashfiles(files);
219+ return NULL;
220+ }
221+
222+ /* If we only have one file left, use it. */
223+ if (files == files->next) {
224+ return files;
225+ }
226+
227+ /* All files which remain in the list are identical and should have
228+ * the same size. We pick the one with the lowest link count (we
229+ * may have rolled over because we hit the maximum link count for
230+ * the filesystem). */
231+ best = iter = files;
232+ nlink = iter->nlink;
233+ do {
234+ if (iter->nlink < nlink) {
235+ nlink = iter->nlink;
236+ best = iter;
237+ }
238+ iter = iter->next;
239+ } while (iter != files);
240+
241+ best->next->prev = best->prev;
242+ best->prev->next = best->next;
243+ if (files == best)
244+ files = files->next;
245+ kill_hashfiles(files);
246+ return best;
247+}
248+
249+
250+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251+{
252+ STRUCT_STAT st;
253+ char *hashname = make_hash_name(file);
254+ int first = 0, rc;
255+ char *linkname;
256+ long last_fnbr;
257+
258+ if (file->length == 0) {
259+ return robust_rename(fnametmp,fname,0644);
260+ }
261+
262+ if (do_stat(hashname, &st) == -1) {
263+ char *dirname;
264+
265+ /* Directory does not exist. */
266+ dirname = strdup(hashname);
267+ *strrchr(dirname,'/') = 0;
268+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
fe6407b5 269+ rsyserr(FERROR, errno, "mkdir %s", dirname);
c57f4101
WD
270+ free(hashname);
271+ free(dirname);
272+ return robust_rename(fnametmp,fname,0644);
273+ }
274+ free(dirname);
275+
276+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
fe6407b5 277+ rsyserr(FERROR, errno, "mkdir %s", hashname);
c57f4101
WD
278+ free(hashname);
279+ return robust_rename(fnametmp,fname,0644);
280+ }
281+
282+ first = 1;
283+ asprintf(&linkname,"%s/0",hashname);
284+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
285+
286+ } else {
287+ struct hashfile_struct *hashfiles, *hashfile;
288+ int fd;
289+
290+ if (do_stat(fnametmp,&st) == -1) {
fe6407b5 291+ rsyserr(FERROR, errno, "%s", fname);
c57f4101
WD
292+ return -1;
293+ }
294+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
295+
296+ if (hashfiles == NULL) {
297+ first = 1;
298+ asprintf(&linkname,"%s/0",hashname);
299+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
300+ } else {
301+
302+ /* Search for one identical to us. */
303+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
fe6407b5 304+ rsyserr(FERROR, errno, "%s", fnametmp);
c57f4101
WD
305+ kill_hashfiles(hashfiles);
306+ return -1;
307+ }
308+ hashfile = compare_hashfiles(fd, hashfiles);
309+ hashfiles = NULL;
310+
311+ if (hashfile) {
312+ first = 0;
313+ linkname = strdup(hashfile->name);
314+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
315+ kill_hashfile(hashfile);
316+ } else {
317+ first = 1;
318+ asprintf(&linkname, "%s/%ld", hashname,
319+ last_fnbr + 1);
320+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
321+ }
322+ }
323+ }
324+
325+ if (!first) {
326+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
327+ linkname, full_fname(fname));
328+ rc = do_link(linkname, fname);
329+ if (rc == -1) {
330+ if (errno == EMLINK) {
331+ first = 1;
332+ free(linkname);
333+ asprintf(&linkname,"%s/%ld",hashname,
334+ last_fnbr + 1);
335+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
336+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
337+ } else {
fe6407b5
WD
338+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
339+ linkname, full_fname(fname));
c57f4101
WD
340+ robust_unlink(fname);
341+ rc = robust_rename(fnametmp,fname,0644);
342+ }
343+ } else {
344+ do_unlink(fnametmp);
345+ }
346+ }
347+
348+ if (first) {
349+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
350+ full_fname(fname),linkname);
351+
352+ rc = robust_rename(fnametmp,fname,0644);
353+ if (rc != 0) {
fe6407b5
WD
354+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
355+ full_fname(fnametmp), full_fname(fname));
c57f4101
WD
356+ }
357+ rc = do_link(fname,linkname);
358+ if (rc != 0) {
fe6407b5
WD
359+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
360+ full_fname(fname), linkname);
c57f4101
WD
361+ }
362+ }
363+
364+ free(linkname);
365+ free(hashname);
366+ return rc;
367+}
368+
369+#endif
8a529471 370--- options.c 6 May 2004 21:08:01 -0000 1.148
fe6407b5 371+++ options.c 15 May 2004 20:29:17 -0000
54691942 372@@ -121,6 +121,7 @@ char *log_format = NULL;
c57f4101
WD
373 char *password_file = NULL;
374 char *rsync_path = RSYNC_PATH;
375 char *backup_dir = NULL;
376+char *link_by_hash_dir = NULL;
377 char backup_dir_buf[MAXPATHLEN];
378 int rsync_port = RSYNC_PORT;
379 int link_dest = 0;
54691942 380@@ -266,6 +267,7 @@ void usage(enum logcode F)
c57f4101
WD
381 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
382 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
383 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
384+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
385 rprintf(F," -P equivalent to --partial --progress\n");
386 rprintf(F," -z, --compress compress file data\n");
387 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
54691942 388@@ -305,7 +307,7 @@ void usage(enum logcode F)
c57f4101
WD
389 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
390 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
391 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
392- OPT_READ_BATCH, OPT_WRITE_BATCH,
393+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
394 OPT_REFUSED_BASE = 9000};
395
396 static struct poptOption long_options[] = {
54691942 397@@ -362,6 +364,7 @@ static struct poptOption long_options[]
c57f4101
WD
398 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
399 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
54691942 400 {"link-dest", 0, POPT_ARG_STRING, &compare_dest, OPT_LINK_DEST, 0, 0 },
c57f4101
WD
401+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
402 /* TODO: Should this take an optional int giving the compression level? */
403 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
404 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
54691942 405@@ -584,6 +587,19 @@ int parse_arguments(int *argc, const cha
c57f4101
WD
406 return 0;
407 #endif
408
409+ case OPT_LINK_BY_HASH:
410+#if HAVE_LINK
411+ link_by_hash_dir = (char *)poptGetOptArg(pc);
412+ checksum_seed = FIXED_CHECKSUM_SEED;
413+ break;
414+#else
415+ snprintf(err_buf, sizeof err_buf,
416+ "hard links are not supported on this %s\n",
417+ am_server ? "server" : "client");
418+ rprintf(FERROR, "ERROR: %s", err_buf);
419+ return 0;
420+#endif
421+
422 default:
423 /* A large opt value means that set_refuse_options()
424 * turned this option off (opt-BASE is its index). */
54691942
WD
425@@ -951,6 +967,11 @@ void server_options(char **args,int *arg
426 */
427 args[ac++] = link_dest ? "--link-dest" : "--compare-dest";
c57f4101 428 args[ac++] = compare_dest;
54691942
WD
429+ }
430+
c57f4101
WD
431+ if (link_by_hash_dir && am_sender) {
432+ args[ac++] = "--link-by-hash";
433+ args[ac++] = link_by_hash_dir;
54691942
WD
434 }
435
c57f4101 436 if (files_from && (!am_sender || remote_filesfrom_file)) {
fe6407b5
WD
437--- receiver.c 15 May 2004 19:31:10 -0000 1.78
438+++ receiver.c 15 May 2004 20:29:17 -0000
8a529471 439@@ -46,6 +46,7 @@ extern int module_id;
54691942
WD
440 extern int ignore_errors;
441 extern int orig_umask;
8a529471 442 extern int keep_partial;
54691942
WD
443+extern char *link_by_hash_dir;
444
445 static void delete_one(char *fn, int is_dir)
446 {
fe6407b5 447@@ -192,10 +193,11 @@ static int get_tmpname(char *fnametmp, c
c57f4101
WD
448
449
450 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
451- OFF_T total_size)
452+ OFF_T total_size,char *md4)
453 {
454 int i;
455 struct sum_struct sum;
456+ struct mdfour mdfour_data;
457 unsigned int len;
458 OFF_T offset = 0;
459 OFF_T offset2;
fe6407b5 460@@ -205,7 +207,9 @@ static int receive_data(int f_in,struct
c57f4101
WD
461 char *map=NULL;
462
463 read_sum_head(f_in, &sum);
464-
465+ if (md4)
466+ mdfour_begin(&mdfour_data);
467+
468 sum_init();
469
470 while ((i = recv_token(f_in, &data)) != 0) {
fe6407b5 471@@ -222,6 +226,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
472 cleanup_got_literal = 1;
473
474 sum_update(data,i);
475+ if (md4)
476+ mdfour_update(&mdfour_data,data,i);
477
478 if (fd != -1 && write_file(fd,data,i) != i) {
fe6407b5
WD
479 rsyserr(FERROR, errno, "write failed on %s",
480@@ -249,6 +255,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
481
482 see_token(map, len);
483 sum_update(map,len);
484+ if (md4)
485+ mdfour_update(&mdfour_data,map,len);
486 }
487
488 if (fd != -1 && write_file(fd,map,len) != (int) len) {
fe6407b5 489@@ -271,6 +279,8 @@ static int receive_data(int f_in,struct
c57f4101
WD
490 }
491
492 sum_end(file_sum1);
493+ if (md4)
494+ mdfour_result(&mdfour_data, (unsigned char*)md4);
495
496 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
497 if (verbose > 2) {
fe6407b5 498@@ -374,7 +384,7 @@ int recv_files(int f_in,struct file_list
c57f4101 499 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
fe6407b5
WD
500 rsyserr(FERROR, errno, "fstat %s failed",
501 full_fname(fnamecmp));
c57f4101
WD
502- receive_data(f_in,NULL,-1,NULL,file->length);
503+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
504 close(fd1);
505 continue;
506 }
fe6407b5 507@@ -387,7 +397,7 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
508 */
509 rprintf(FERROR,"recv_files: %s is a directory\n",
510 full_fname(fnamecmp));
511- receive_data(f_in, NULL, -1, NULL, file->length);
512+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
513 close(fd1);
514 continue;
515 }
fe6407b5 516@@ -439,7 +449,7 @@ int recv_files(int f_in,struct file_list
c57f4101 517 if (fd2 == -1) {
fe6407b5
WD
518 rsyserr(FERROR, errno, "mkstemp %s failed",
519 full_fname(fnametmp));
c57f4101
WD
520- receive_data(f_in,mapbuf,-1,NULL,file->length);
521+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
522 if (mapbuf) unmap_file(mapbuf);
523 if (fd1 != -1) close(fd1);
524 continue;
fe6407b5 525@@ -452,7 +462,12 @@ int recv_files(int f_in,struct file_list
c57f4101
WD
526 }
527
528 /* recv file data */
529- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
530+#ifdef HAVE_LINK
531+ if (link_by_hash_dir) {
532+ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
533+ }
534+#endif
535+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
536
537 log_recv(file, &initial_stats);
538
fe6407b5
WD
539--- rsync.c 15 May 2004 19:31:10 -0000 1.139
540+++ rsync.c 15 May 2004 20:29:17 -0000
8a529471
WD
541@@ -31,6 +31,7 @@ extern int am_generator;
542 extern int preserve_uid;
c57f4101 543 extern int preserve_gid;
c57f4101
WD
544 extern int make_backups;
545+extern char *link_by_hash_dir;
546
547
548 /*
8a529471 549@@ -236,8 +237,12 @@ void finish_transfer(char *fname, char *
c57f4101
WD
550 if (make_backups && !make_backup(fname))
551 return;
552
2eb075b2
WD
553- /* move tmp file over real file */
554- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
c57f4101 555+#ifdef HAVE_LINK
2eb075b2
WD
556+ if (link_by_hash_dir)
557+ ret = link_by_hash(fnametmp,fname,file);
558+ else
c57f4101 559+#endif
2eb075b2 560+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
54691942 561 if (ret < 0) {
fe6407b5 562 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
2eb075b2 563 ret == -2 ? "copy" : "rename",
8a529471 564--- rsync.h 13 May 2004 18:51:22 -0000 1.203
fe6407b5 565+++ rsync.h 15 May 2004 20:29:18 -0000
8a529471 566@@ -521,6 +521,14 @@ struct stats {
c57f4101
WD
567 int current_file_index;
568 };
569
570+struct hashfile_struct {
571+ struct hashfile_struct *next;
572+ struct hashfile_struct *prev;
573+ char *name;
574+ int fd;
575+ uint32 nlink;
576+};
577+
578
579 /* we need this function because of the silly way in which duplicate
580 entries are handled in the file lists - we can't change this