Updated rsync.yo hunk.
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
2eb075b2
WD
1To: rsync@lists.samba.org
2From: "Jason M. Felice" <jfelice@cronosys.com>
3Subject: [patch] Add `--link-by-hash' option (rev 5).
4Date: Mon, 23 Feb 2004 13:29:08 -0500
5
6This patch adds the --link-by-hash=DIR option, which hard links received
7files in a link farm arranged by MD4 file hash. The result is that the system
8will only store one copy of the unique contents of each file, regardless of
9the file's name.
10
11(rev 5)
12* Fixed silly logic error.
13
14(rev 4)
15* Updated for committed robust_rename() patch, other changes in CVS.
16
17(rev 3)
18* Don't link empty files.
19* Roll over to new file when filesystem maximum link count is reached.
20* If link fails for another reason, leave non-linked file there.
21* Depends on rsync-rename.diff
22
23(rev 2)
24* This revision is actually against CVS HEAD (I didn't realize I was working
25 from a stale rsync'd CVS).
26* Apply permissions after linking (permissions were lost if we already had
27 a copy of the file in the link farm).
28
29Patch Summary:
30
31 -1 +1 Makefile.in
32 -0 +351 hashlink.c (new)
33 -1 +22 options.c
34 -0 +6 proto.h
35 -6 +21 receiver.c
36 -2 +8 rsync.c
37 -0 +8 rsync.h
38
c57f4101
WD
39--- hashlink.c 1969-12-31 19:00:00.000000000 -0500
40+++ hashlink.c 2004-02-23 10:30:45.000000000 -0500
41@@ -0,0 +1,351 @@
42+/*
43+ Copyright (C) Cronosys, LLC 2004
44+
45+ This program is free software; you can redistribute it and/or modify
46+ it under the terms of the GNU General Public License as published by
47+ the Free Software Foundation; either version 2 of the License, or
48+ (at your option) any later version.
49+
50+ This program is distributed in the hope that it will be useful,
51+ but WITHOUT ANY WARRANTY; without even the implied warranty of
52+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
53+ GNU General Public License for more details.
54+
55+ You should have received a copy of the GNU General Public License
56+ along with this program; if not, write to the Free Software
57+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
58+*/
59+
60+/* This file contains code used by the --link-by-hash option. */
61+
62+#include "rsync.h"
63+
64+extern char *link_by_hash_dir;
65+
66+#ifdef HAVE_LINK
67+/* Build "<link_by_hash_dir>/<8 hex>/<24 hex>" from the 16 bytes at file->u.sum; the result comes from asprintf() and the caller frees it. */
68+char* make_hash_name(struct file_struct *file)
69+{
70+ char hash[34], *dst; /* 8 hex digits + '/' + 24 hex digits + NUL = 34 bytes */
71+ unsigned char *src;
72+ unsigned char c;
73+ int i;
74+
75+ src = (unsigned char*)file->u.sum;
76+ for (dst = hash, i = 0; i < 4; i++, src++) { /* first 4 bytes name the upper directory level */
77+ c = *src >> 4;
78+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
79+ c = *src & 0x0f;
80+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
81+ }
82+ *dst++ = '/';
83+ for (i = 0; i < 12; i++, src++) { /* remaining 12 bytes name the lower directory level */
84+ c = *src >> 4;
85+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
86+ c = *src & 0x0f;
87+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
88+ }
89+ *dst = 0;
90+
91+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
92+ return dst;
93+}
94+
95+/* Release one candidate node: free its name, close its descriptor, free the node. Does not unlink it from any list. */
96+void kill_hashfile(struct hashfile_struct *hashfile)
97+{
98+ if (!hashfile) /* a NULL node is accepted and ignored */
99+ return;
100+ free(hashfile->name);
101+ close(hashfile->fd);
102+ free(hashfile);
103+}
104+
105+/* Release every node of the circular list starting at hashfiles; a NULL argument is an empty list. */
106+void kill_hashfiles(struct hashfile_struct *hashfiles)
107+{
108+ struct hashfile_struct *iter, *next;
109+ if ((iter = hashfiles) != NULL) {
110+ do {
111+ next = iter->next; /* fetch the link before the node is freed */
112+ kill_hashfile(iter);
113+ iter = next;
114+ } while (iter != hashfiles); /* pointer comparison only: stop when the walk wraps to the start */
115+ }
116+}
117+
118+/* Scan directory hashname and return a circular list of opened entries whose size equals size (NULL if none); *fnbr receives the largest atol() value of any entry name. */
119+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
120+{
121+ DIR *d;
122+ struct dirent *di;
123+ struct hashfile_struct *hashfiles = NULL, *hashfile;
124+ STRUCT_STAT st;
125+ long this_fnbr;
126+
127+ *fnbr = 0;
128+
129+ /* Build a list of potential candidates and open
130+ * them. */
131+ if ((d = opendir(hashname)) == NULL) {
132+ rprintf(FERROR,"opendir \"%s\": %s\n",
133+ hashname, strerror(errno));
134+ /* hashname belongs to the caller, which still uses and frees it. */
135+ return NULL;
136+ }
137+ while ((di = readdir(d)) != NULL) {
138+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
139+ continue;
140+ }
141+
142+ /* We need to have the largest fnbr in case we need to store
143+ * a new file. */
144+ this_fnbr = atol(di->d_name);
145+ if (this_fnbr > *fnbr)
146+ *fnbr = this_fnbr;
147+
148+ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
149+ hashfile->fd = -1; /* nothing opened yet, so kill_hashfile() closes no stray descriptor */
150+ asprintf(&hashfile->name,"%s/%s",hashname,di->d_name);
151+ if (do_stat(hashfile->name,&st) == -1) {
152+ rprintf(FERROR,"%s: %s\n", hashfile->name,
153+ strerror(errno));
154+ kill_hashfile(hashfile);
155+ continue;
156+ }
157+ if (st.st_size != size) {
158+ kill_hashfile(hashfile);
159+ continue;
160+ }
161+ hashfile->nlink = st.st_nlink;
162+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
163+ if (hashfile->fd == -1) {
164+ rprintf(FERROR,"%s: %s\n", hashfile->name,
165+ strerror(errno));
166+ kill_hashfile(hashfile);
167+ continue;
168+ }
169+ if (hashfiles == NULL)
170+ hashfiles = hashfile->next = hashfile->prev = hashfile;
171+ else {
172+ hashfile->next = hashfiles;
173+ hashfile->prev = hashfiles->prev;
174+ hashfile->next->prev = hashfile;
175+ hashfile->prev->next = hashfile;
176+ }
177+ }
178+ closedir(d);
179+
180+ return hashfiles;
181+}
182+
183+/* Compare the data read from fd with every candidate in the circular list files. Mismatching candidates are freed; returns the surviving match with the lowest link count, or NULL if none survives. */
184+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
185+{
186+ int amt, hamt;
187+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
188+ struct hashfile_struct *iter, *next, *best;
189+ uint32 nlink;
190+ int remaining = 0;
191+ int todo;
192+
193+ if (!files)
194+ return NULL;
195+
196+ /* Count the candidates so that each pass visits every node exactly
197+ * once, even while nodes (the list head included) are removed. */
198+ iter = files;
199+ do {
200+ remaining++;
201+ iter = iter->next;
202+ } while (iter != files);
203+
204+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
205+ iter = files;
206+ for (todo = remaining; todo > 0; todo--) {
207+ next = iter->next;
208+
209+ /* Every surviving candidate reads the same chunk, so all
210+ * descriptors stay at the same offset as fd. A short read
211+ * on either side is treated as a mismatch. */
212+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
213+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
214+ /* Mismatch: unlink and free this candidate. */
215+ if (iter == files)
216+ files = next; /* keep files pointing at a live node */
217+ iter->next->prev = iter->prev;
218+ iter->prev->next = iter->next;
219+ kill_hashfile(iter);
220+ if (--remaining == 0) {
221+ /* There are no matches. */
222+ return NULL;
223+ }
224+ }
225+
226+ iter = next;
227+ }
228+ }
229+
230+ /* read() on fd itself failed: give up and free what is left. */
231+ if (amt == -1) {
232+ rprintf(FERROR,"%s\n",strerror(errno));
233+ kill_hashfiles(files);
234+ return NULL;
235+ }
236+
237+ /* If we only have one file left, use it. */
238+ if (files == files->next) {
239+ return files;
240+ }
241+
242+ /* All files which remain in the list are identical and should have
243+ * the same size. We pick the one with the lowest link count (we
244+ * may have rolled over because we hit the maximum link count for
245+ * the filesystem). */
246+ best = iter = files;
247+ nlink = iter->nlink;
248+ do {
249+ if (iter->nlink < nlink) {
250+ nlink = iter->nlink;
251+ best = iter;
252+ }
253+ iter = iter->next;
254+ } while (iter != files);
255+
256+ /* Detach the winner, then free every other remaining candidate. */
257+ best->next->prev = best->prev;
258+ best->prev->next = best->next;
259+ if (files == best)
260+ files = files->next;
261+ kill_hashfiles(files);
262+ return best;
263+}
264+
265+
266+/* Put fnametmp in place as fname: hard-link fname to an identical file in the hash link farm when one exists, otherwise rename and register the file there. Returns the status of the last rename/link attempted, or -1 on an early failure. */
267+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
268+{
269+ STRUCT_STAT st;
270+ char *hashname;
271+ int first = 0, rc;
272+ char *linkname;
273+ long last_fnbr;
274+
275+ if (file->length == 0) /* empty files are never linked, so no hash name is built for them */
276+ return robust_rename(fnametmp,fname,0644);
277+ hashname = make_hash_name(file);
278+
279+ if (do_stat(hashname, &st) == -1) {
280+ char *dirname;
281+
282+ /* Directory does not exist. */
283+ dirname = strdup(hashname);
284+ *strrchr(dirname,'/') = 0;
285+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
286+ rprintf(FERROR, "mkdir %s: %s\n", dirname,
287+ strerror(errno));
288+ free(hashname);
289+ free(dirname);
290+ return robust_rename(fnametmp,fname,0644);
291+ }
292+ free(dirname);
293+
294+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
295+ rprintf(FERROR, "mkdir %s: %s\n", hashname,
296+ strerror(errno));
297+ free(hashname);
298+ return robust_rename(fnametmp,fname,0644);
299+ }
300+
301+ first = 1;
302+ asprintf(&linkname,"%s/0",hashname);
303+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
304+
305+ } else {
306+ struct hashfile_struct *hashfiles, *hashfile;
307+ int fd;
308+
309+ if (do_stat(fnametmp,&st) == -1) {
310+ rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
311+ free(hashname);
312+ return -1;
313+ }
314+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
315+ if (hashfiles == NULL) {
316+ first = 1;
317+ asprintf(&linkname,"%s/0",hashname);
318+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
319+ } else {
320+
321+ /* Search for one identical to us. */
322+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
323+ rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
324+ free(hashname);
325+ kill_hashfiles(hashfiles);
326+ return -1;
327+ }
328+ hashfile = compare_hashfiles(fd, hashfiles);
329+ close(fd); /* comparison is finished; do not leak the descriptor */
330+ hashfiles = NULL;
331+ if (hashfile) {
332+ first = 0;
333+ linkname = strdup(hashfile->name);
334+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
335+ kill_hashfile(hashfile);
336+ } else {
337+ first = 1;
338+ asprintf(&linkname, "%s/%ld", hashname,
339+ last_fnbr + 1);
340+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
341+ }
342+ }
343+ }
344+
345+ if (!first) {
346+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
347+ linkname, full_fname(fname));
348+ rc = do_link(linkname, fname);
349+ if (rc == -1) {
350+ if (errno == EMLINK) {
351+ first = 1;
352+ free(linkname);
353+ asprintf(&linkname,"%s/%ld",hashname,
354+ last_fnbr + 1);
355+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
356+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
357+ } else {
358+ rprintf(FERROR,"link \"%s\" -> %s: %s\n",
359+ linkname,full_fname(fname),
360+ strerror(errno));
361+ robust_unlink(fname);
362+ rc = robust_rename(fnametmp,fname,0644);
363+ }
364+ } else {
365+ do_unlink(fnametmp);
366+ }
367+ }
368+
369+ if (first) {
370+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
371+ full_fname(fname),linkname);
372+
373+ rc = robust_rename(fnametmp,fname,0644);
374+ if (rc != 0) {
375+ rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
376+ full_fname(fnametmp),full_fname(fname),
377+ strerror(errno));
378+ } else if ((rc = do_link(fname,linkname)) != 0) {
379+ /* The farm link is attempted only after the rename has put the
380+ * new contents at fname; a failed rename keeps its own status. */
381+ rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
382+ full_fname(fname),linkname,
383+ strerror(errno));
384+ }
385+ }
386+
387+ free(linkname);
388+ free(hashname);
389+ return rc;
390+}
391+
392+#endif
393--- Makefile.in 2004-02-23 10:22:51.000000000 -0500
394+++ Makefile.in 2004-02-23 10:22:51.000000000 -0500
395@@ -35,7 +35,7 @@
396 main.o checksum.o match.o syscall.o log.o backup.o
397 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
398 fileio.o batch.o clientname.o
399-OBJS3=progress.o pipe.o
400+OBJS3=progress.o pipe.o hashlink.o
401 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
402 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
403 popt/popthelp.o popt/poptparse.o
404--- options.c 2004-02-23 10:22:51.000000000 -0500
405+++ options.c 2004-02-23 10:29:14.000000000 -0500
406@@ -119,6 +119,7 @@
407 char *password_file = NULL;
408 char *rsync_path = RSYNC_PATH;
409 char *backup_dir = NULL;
410+char *link_by_hash_dir = NULL;
411 char backup_dir_buf[MAXPATHLEN];
412 int rsync_port = RSYNC_PORT;
413 int link_dest = 0;
414@@ -264,6 +265,7 @@
415 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
416 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
417 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
418+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
419 rprintf(F," -P equivalent to --partial --progress\n");
420 rprintf(F," -z, --compress compress file data\n");
421 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
422@@ -303,7 +305,7 @@
423 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
424 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
425 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
426- OPT_READ_BATCH, OPT_WRITE_BATCH,
427+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
428 OPT_REFUSED_BASE = 9000};
429
430 static struct poptOption long_options[] = {
431@@ -360,6 +362,7 @@
432 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
433 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
434 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
435+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
436 /* TODO: Should this take an optional int giving the compression level? */
437 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
438 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
439@@ -577,6 +580,19 @@
440 return 0;
441 #endif
442
443+ case OPT_LINK_BY_HASH:
444+#if HAVE_LINK
445+ link_by_hash_dir = (char *)poptGetOptArg(pc);
446+ checksum_seed = FIXED_CHECKSUM_SEED;
447+ break;
448+#else
449+ snprintf(err_buf, sizeof err_buf,
450+ "hard links are not supported on this %s\n",
451+ am_server ? "server" : "client");
452+ rprintf(FERROR, "ERROR: %s", err_buf);
453+ return 0;
454+#endif
455+
456 default:
457 /* A large opt value means that set_refuse_options()
458 * turned this option off (opt-BASE is its index). */
459@@ -934,6 +950,11 @@
460 args[ac++] = compare_dest;
461 }
462
463+ if (link_by_hash_dir && am_sender) {
464+ args[ac++] = "--link-by-hash";
465+ args[ac++] = link_by_hash_dir;
466+ }
467+
468 if (files_from && (!am_sender || remote_filesfrom_file)) {
469 if (remote_filesfrom_file) {
470 args[ac++] = "--files-from";
471--- proto.h 2004-02-23 10:22:51.000000000 -0500
2eb075b2 472+++ proto.h 2004-02-23 11:06:03.000000000 -0500
c57f4101
WD
473@@ -93,6 +93,12 @@
474 void write_sum_head(int f, struct sum_struct *sum);
475 void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
476 void generate_files(int f, struct file_list *flist, char *local_name);
477+char* make_hash_name(struct file_struct *file);
478+void kill_hashfile(struct hashfile_struct *hashfile);
479+void kill_hashfiles(struct hashfile_struct *hashfiles);
480+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
481+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
482+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
483 void init_hard_links(struct file_list *flist);
484 int hard_link_check(struct file_struct *file, int skip);
485 void do_hard_links(void);
486--- receiver.c 2004-02-23 10:22:51.000000000 -0500
487+++ receiver.c 2004-02-23 10:22:51.000000000 -0500
488@@ -186,10 +186,11 @@
489
490
491 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
492- OFF_T total_size)
493+ OFF_T total_size,char *md4)
494 {
495 int i;
496 struct sum_struct sum;
497+ struct mdfour mdfour_data;
498 unsigned int len;
499 OFF_T offset = 0;
500 OFF_T offset2;
501@@ -199,7 +200,9 @@
502 char *map=NULL;
503
504 read_sum_head(f_in, &sum);
505-
506+ if (md4)
507+ mdfour_begin(&mdfour_data);
508+
509 sum_init();
510
511 while ((i = recv_token(f_in, &data)) != 0) {
512@@ -216,6 +219,8 @@
513 cleanup_got_literal = 1;
514
515 sum_update(data,i);
516+ if (md4)
517+ mdfour_update(&mdfour_data,data,i);
518
519 if (fd != -1 && write_file(fd,data,i) != i) {
520 rprintf(FERROR, "write failed on %s: %s\n",
521@@ -243,6 +248,8 @@
522
523 see_token(map, len);
524 sum_update(map,len);
525+ if (md4)
526+ mdfour_update(&mdfour_data,map,len);
527 }
528
529 if (fd != -1 && write_file(fd,map,len) != (int) len) {
530@@ -265,6 +272,8 @@
531 }
532
533 sum_end(file_sum1);
534+ if (md4)
535+ mdfour_result(&mdfour_data, (unsigned char*)md4);
536
537 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
538 if (verbose > 2) {
539@@ -299,6 +308,7 @@
540 extern int preserve_perms;
541 extern int delete_after;
542 extern int orig_umask;
543+ extern char *link_by_hash_dir;
544 struct stats initial_stats;
545
546 if (verbose > 2) {
547@@ -372,7 +382,7 @@
548 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
549 rprintf(FERROR, "fstat %s failed: %s\n",
550 full_fname(fnamecmp), strerror(errno));
551- receive_data(f_in,NULL,-1,NULL,file->length);
552+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
553 close(fd1);
554 continue;
555 }
556@@ -385,7 +395,7 @@
557 */
558 rprintf(FERROR,"recv_files: %s is a directory\n",
559 full_fname(fnamecmp));
560- receive_data(f_in, NULL, -1, NULL, file->length);
561+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
562 close(fd1);
563 continue;
564 }
565@@ -437,7 +447,7 @@
566 if (fd2 == -1) {
567 rprintf(FERROR, "mkstemp %s failed: %s\n",
568 full_fname(fnametmp), strerror(errno));
569- receive_data(f_in,mapbuf,-1,NULL,file->length);
570+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
571 if (mapbuf) unmap_file(mapbuf);
572 if (fd1 != -1) close(fd1);
573 continue;
574@@ -450,7 +460,12 @@
575 }
576
577 /* recv file data */
578- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
579+#ifdef HAVE_LINK
580+ if (link_by_hash_dir) {
581+ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
582+ }
583+#endif
584+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
585
586 log_recv(file, &initial_stats);
587
588--- rsync.c 2004-02-23 10:22:51.000000000 -0500
2eb075b2 589+++ rsync.c 2004-02-23 12:49:33.000000000 -0500
c57f4101
WD
590@@ -33,6 +33,7 @@
591 extern int preserve_gid;
592 extern int preserve_perms;
593 extern int make_backups;
594+extern char *link_by_hash_dir;
595
596
597 /*
2eb075b2 598@@ -236,8 +237,13 @@
c57f4101
WD
599 if (make_backups && !make_backup(fname))
600 return;
601
2eb075b2
WD
602- /* move tmp file over real file */
603- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
c57f4101 604+#ifdef HAVE_LINK
2eb075b2
WD
605+ if (link_by_hash_dir)
606+ ret = link_by_hash(fnametmp,fname,file);
607+ else
c57f4101 608+#endif
2eb075b2
WD
609+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
610+
c57f4101 611 if (ret != 0) {
2eb075b2
WD
612 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
613 ret == -2 ? "copy" : "rename",
c57f4101 614--- rsync.h 2004-02-23 10:22:51.000000000 -0500
2eb075b2 615+++ rsync.h 2004-02-23 12:42:59.000000000 -0500
c57f4101
WD
616@@ -513,6 +513,14 @@
617 int current_file_index;
618 };
619
620+struct hashfile_struct { /* one candidate file found in a --link-by-hash hash directory */
621+ struct hashfile_struct *next; /* next node of the circular list */
622+ struct hashfile_struct *prev; /* previous node of the circular list */
623+ char *name; /* heap-allocated path of the candidate; freed by kill_hashfile() */
624+ int fd; /* descriptor opened read-only on name; closed by kill_hashfile() */
625+ uint32 nlink; /* st_nlink recorded when the candidate was stat'ed */
626+};
627+
628
629 /* we need this function because of the silly way in which duplicate
630 entries are handled in the file lists - we can't change this