Adding a new feature diff.
[rsync/rsync-patches.git] / link-by-hash.diff
CommitLineData
c57f4101
WD
1--- hashlink.c 1969-12-31 19:00:00.000000000 -0500
2+++ hashlink.c 2004-02-23 10:30:45.000000000 -0500
3@@ -0,0 +1,351 @@
4+/*
5+ Copyright (C) Cronosys, LLC 2004
6+
7+ This program is free software; you can redistribute it and/or modify
8+ it under the terms of the GNU General Public License as published by
9+ the Free Software Foundation; either version 2 of the License, or
10+ (at your option) any later version.
11+
12+ This program is distributed in the hope that it will be useful,
13+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+ GNU General Public License for more details.
16+
17+ You should have received a copy of the GNU General Public License
18+ along with this program; if not, write to the Free Software
19+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20+*/
21+
22+/* This file contains code used by the --link-by-hash option. */
23+
24+#include "rsync.h"
25+
26+extern char *link_by_hash_dir;
27+
28+#ifdef HAVE_LINK
29+
30+char* make_hash_name(struct file_struct *file)
31+{
32+	/* Build the hash-directory path for file from the 16 bytes at
33+	 * file->u.sum: "<link_by_hash_dir>/<8 hex digits>/<24 hex digits>".
34+	 * Returns a malloc'd string that the caller frees, or NULL when
35+	 * asprintf() cannot allocate it. */
36+	static const char hexdigits[] = "0123456789abcdef";
37+	unsigned char *src = (unsigned char*)file->u.sum;
38+	/* 8 digits + '/' + 24 digits + NUL = 34 bytes; the former size of 33
39+	 * made the terminating NUL land one byte past the array. */
40+	char hash[34], *dst = hash;
41+	int i;
42+
43+	for (i = 0; i < 16; i++, src++) {
44+		/* The first 4 bytes name the parent directory. */
45+		if (i == 4)
46+			*dst++ = '/';
47+		*dst++ = hexdigits[*src >> 4];
48+		*dst++ = hexdigits[*src & 0x0f];
49+	}
50+	*dst = 0;
51+
52+	if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
53+		return NULL;
54+	return dst;
55+}
56+
57+
58+void kill_hashfile(struct hashfile_struct *hashfile)
59+{	/* Release one list node (name, fd, struct); NULL is a no-op. */
60+	if (hashfile != NULL) {
61+		free(hashfile->name);
62+		close(hashfile->fd);
63+		free(hashfile);
64+	}
65+}
66+
67+
68+void kill_hashfiles(struct hashfile_struct *hashfiles)
69+{	/* Free every node of the circular list headed by hashfiles. */
70+	struct hashfile_struct *cur = hashfiles, *following;
71+	if (cur == NULL)
72+		return;
73+	do {
74+		following = cur->next;
75+		kill_hashfile(cur);
76+		cur = following;
77+	} while (cur != hashfiles);
78+}
79+
80+
81+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
82+{
83+	/* Scan directory hashname and return a circular list of the entries
84+	 * whose size equals size, each opened read-only; NULL if there are
85+	 * none or the directory cannot be opened.  *fnbr receives the largest
86+	 * numeric entry name seen.  hashname remains owned by the caller. */
87+	DIR *d;
88+	struct dirent *di;
89+	struct hashfile_struct *hashfiles = NULL, *hashfile;
90+	STRUCT_STAT st;
91+	long this_fnbr;
92+
93+	*fnbr = 0;
94+	if ((d = opendir(hashname)) == NULL) {
95+		rprintf(FERROR,"opendir \"%s\": %s\n", hashname, strerror(errno));
96+		/* Not freed here: the caller still uses hashname and frees it. */
97+		return NULL;
98+	}
99+	while ((di = readdir(d)) != NULL) {
100+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
101+			continue;
102+
103+		/* Track the largest fnbr in case a new file has to be stored. */
104+		this_fnbr = atol(di->d_name);
105+		if (this_fnbr > *fnbr)
106+			*fnbr = this_fnbr;
107+
108+		hashfile = (struct hashfile_struct*)malloc(sizeof *hashfile);
109+		if (hashfile != NULL)
110+			hashfile->fd = -1;	/* nothing open yet for kill_hashfile() */
111+		if (!hashfile || asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0) {
112+			rprintf(FERROR,"find_hashfiles: out of memory\n");
113+			free(hashfile);
114+			break;
115+		}
116+		if (do_stat(hashfile->name,&st) == -1) {
117+			rprintf(FERROR,"%s: %s\n", hashfile->name, strerror(errno));
118+			kill_hashfile(hashfile);
119+			continue;
120+		}
121+		if (st.st_size != size) {
122+			kill_hashfile(hashfile);
123+			continue;
124+		}
125+		hashfile->nlink = st.st_nlink;
126+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
127+		if (hashfile->fd == -1) {
128+			rprintf(FERROR,"%s: %s\n", hashfile->name, strerror(errno));
129+			kill_hashfile(hashfile);
130+			continue;
131+		}
132+		if (hashfiles == NULL)
133+			hashfiles = hashfile->next = hashfile->prev = hashfile;
134+		else {
135+			hashfile->next = hashfiles;
136+			hashfile->prev = hashfiles->prev;
137+			hashfile->next->prev = hashfile;
138+			hashfile->prev->next = hashfile;
139+		}
140+	}
141+	closedir(d);
142+	return hashfiles;
143+}
144+
145+
146+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
147+{
148+	/* Compare the data read from fd with every open candidate in the
149+	 * circular list files, one BUFSIZ chunk at a time; candidates that
150+	 * differ are unlinked and destroyed.  Returns the surviving node with
151+	 * the lowest link count, detached from the list, or NULL when files
152+	 * is NULL, nothing matches, or reading fd fails.  All other nodes are
153+	 * destroyed; fd is left open.  Every candidate is read exactly once
154+	 * per chunk: the earlier loop retargeted the list head when dropping
155+	 * it, which ended the pass before the last node had been read. */
156+	int amt, hamt, remaining, todo;
157+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
158+	struct hashfile_struct *iter, *next, *best;
159+	uint32 nlink;
160+
161+	if (!files)
162+		return NULL;
163+
164+	/* Count the candidates so that each pass visits a fixed number of
165+	 * nodes, whatever gets unlinked along the way. */
166+	remaining = 0;
167+	iter = files;
168+	do {
169+		remaining++;
170+		iter = iter->next;
171+	} while (iter != files);
172+
173+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
174+		iter = files;
175+		for (todo = remaining; todo > 0; todo--, iter = next) {
176+			next = iter->next;
177+
178+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
179+			if (amt == hamt && !memcmp(buffer, cmpbuffer, amt))
180+				continue;
181+
182+			/* Mismatch (or read error): drop this candidate. */
183+			if (--remaining == 0) {
184+				/* There are no matches. */
185+				kill_hashfile(iter);
186+				return NULL;
187+			}
188+			iter->next->prev = iter->prev;
189+			iter->prev->next = iter->next;
190+			if (files == iter)
191+				files = next;
192+			kill_hashfile(iter);
193+		}
194+	}
195+
196+	if (amt == -1) {
197+		rprintf(FERROR,"read: %s\n",strerror(errno));
198+		kill_hashfiles(files);
199+		return NULL;
200+	}
201+
202+	/* If we only have one file left, use it. */
203+	if (files == files->next)
204+		return files;
205+
206+	/* All files which remain in the list are identical and should have
207+	 * the same size. We pick the one with the lowest link count (we
208+	 * may have rolled over because we hit the maximum link count for
209+	 * the filesystem). */
210+	best = iter = files;
211+	nlink = iter->nlink;
212+	do {
213+		if (iter->nlink < nlink) {
214+			nlink = iter->nlink;
215+			best = iter;
216+		}
217+		iter = iter->next;
218+	} while (iter != files);
219+
220+	best->next->prev = best->prev;
221+	best->prev->next = best->next;
222+	if (files == best)
223+		files = files->next;
224+	kill_hashfiles(files);
225+	return best;
226+}
227+
228+
229+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
230+{
231+	/* Put the received temp file fnametmp in place as fname, sharing its
232+	 * storage through the hash directory named by make_hash_name(file):
233+	 * if an identical file is stored there fname becomes a hard link to it,
234+	 * otherwise fnametmp is renamed to fname and fname is linked into that
235+	 * directory.  Empty files are only renamed.  Returns the result of the
236+	 * last rename/link attempted, or -1 if fnametmp cannot be examined. */
237+	STRUCT_STAT st;
238+	char *hashname, *linkname = NULL, *slash;
239+	int first = 1, step = 0, rc = -1;
240+	long last_fnbr = 0, new_fnbr = 0;
241+
242+	/* Test the length before building the name so nothing is leaked. */
243+	if (file->length == 0)
244+		return robust_rename(fnametmp,fname,0644);
245+	if ((hashname = make_hash_name(file)) == NULL)
246+		return robust_rename(fnametmp,fname,0644);
247+
248+	if (do_stat(hashname, &st) == -1) {
249+		/* Directory does not exist: create its parent, then itself.
250+		 * The parent's name is hashname cut at its last '/'. */
251+		slash = strrchr(hashname,'/');
252+		*slash = 0;
253+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
254+			rprintf(FERROR, "mkdir %s: %s\n", hashname, strerror(errno));
255+			rc = robust_rename(fnametmp,fname,0644);
256+			goto out;
257+		}
258+		*slash = '/';
259+
260+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
261+			rprintf(FERROR, "mkdir %s: %s\n", hashname, strerror(errno));
262+			rc = robust_rename(fnametmp,fname,0644);
263+			goto out;
264+		}
265+		step = 1;
266+	} else {
267+		struct hashfile_struct *hashfiles, *hashfile;
268+		int fd;
269+
270+		if (do_stat(fnametmp,&st) == -1) {
271+			rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
272+			goto out;
273+		}
274+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
275+		step = 2;
276+
277+		if (hashfiles != NULL) {
278+			/* Search for one identical to us. */
279+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
280+				rprintf(FERROR,"%s: %s\n",fnametmp, strerror(errno));
281+				kill_hashfiles(hashfiles);
282+				goto out;
283+			}
284+			hashfile = compare_hashfiles(fd, hashfiles);
285+			close(fd);
286+
287+			if (hashfile) {
288+				/* Take over the name instead of copying it. */
289+				first = 0;
290+				linkname = hashfile->name;
291+				hashfile->name = NULL;
292+				kill_hashfile(hashfile);
293+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
294+			} else {
295+				step = 4;
296+				new_fnbr = last_fnbr + 1;
297+			}
298+		}
299+	}
300+
301+	if (!first) {
302+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
303+			linkname, full_fname(fname));
304+		rc = do_link(linkname, fname);
305+		if (rc != -1) {
306+			do_unlink(fnametmp);
307+		} else if (errno == EMLINK) {
308+			/* Stored copy is at its link limit: start a new one. */
309+			first = 1;
310+			step = 5;
311+			new_fnbr = last_fnbr + 1;
312+			free(linkname);
313+			linkname = NULL;
314+		} else {
315+			rprintf(FERROR,"link \"%s\" -> %s: %s\n",
316+				linkname,full_fname(fname), strerror(errno));
317+			robust_unlink(fname);
318+			rc = robust_rename(fnametmp,fname,0644);
319+		}
320+	}
321+
322+	if (first) {
323+		if (asprintf(&linkname,"%s/%ld",hashname,new_fnbr) < 0) {
324+			linkname = NULL;	/* left undefined on failure */
325+			rprintf(FERROR,"link-by-hash: out of memory\n");
326+			rc = robust_rename(fnametmp,fname,0644);
327+			goto out;
328+		}
329+		rprintf(FINFO, "(%d) linkname = %s\n", step, linkname);
330+		if (step == 5)
331+			rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
332+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
333+			full_fname(fname),linkname);
334+		/* full_fname() once per rprintf(): its result may be reused (confirm). */
335+		rc = robust_rename(fnametmp,fname,0644);
336+		if (rc != 0) {
337+			rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
338+				fnametmp,full_fname(fname), strerror(errno));
339+			goto out;	/* never link a stale fname into the hash dir */
340+		}
341+		rc = do_link(fname,linkname);
342+		if (rc != 0) {
343+			rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
344+				full_fname(fname),linkname, strerror(errno));
345+		}
346+	}
347+
348+out:
349+	free(linkname);
350+	free(hashname);
351+	return rc;
352+}
353+
354+#endif
355--- Makefile.in 2004-02-23 10:22:51.000000000 -0500
356+++ Makefile.in 2004-02-23 10:22:51.000000000 -0500
357@@ -35,7 +35,7 @@
358 main.o checksum.o match.o syscall.o log.o backup.o
359 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
360 fileio.o batch.o clientname.o
361-OBJS3=progress.o pipe.o
362+OBJS3=progress.o pipe.o hashlink.o
363 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
364 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
365 popt/popthelp.o popt/poptparse.o
366--- options.c 2004-02-23 10:22:51.000000000 -0500
367+++ options.c 2004-02-23 10:29:14.000000000 -0500
368@@ -119,6 +119,7 @@
369 char *password_file = NULL;
370 char *rsync_path = RSYNC_PATH;
371 char *backup_dir = NULL;
372+char *link_by_hash_dir = NULL;
373 char backup_dir_buf[MAXPATHLEN];
374 int rsync_port = RSYNC_PORT;
375 int link_dest = 0;
376@@ -264,6 +265,7 @@
377 rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
378 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
379 rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
380+ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
381 rprintf(F," -P equivalent to --partial --progress\n");
382 rprintf(F," -z, --compress compress file data\n");
383 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
384@@ -303,7 +305,7 @@
385 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
386 OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
387 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
388- OPT_READ_BATCH, OPT_WRITE_BATCH,
389+ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
390 OPT_REFUSED_BASE = 9000};
391
392 static struct poptOption long_options[] = {
393@@ -360,6 +362,7 @@
394 {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
395 {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
396 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
397+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
398 /* TODO: Should this take an optional int giving the compression level? */
399 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
400 {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
401@@ -577,6 +580,19 @@
402 return 0;
403 #endif
404
405+ case OPT_LINK_BY_HASH:
406+#if HAVE_LINK
407+ link_by_hash_dir = (char *)poptGetOptArg(pc);
408+ checksum_seed = FIXED_CHECKSUM_SEED;
409+ break;
410+#else
411+ snprintf(err_buf, sizeof err_buf,
412+ "hard links are not supported on this %s\n",
413+ am_server ? "server" : "client");
414+ rprintf(FERROR, "ERROR: %s", err_buf);
415+ return 0;
416+#endif
417+
418 default:
419 /* A large opt value means that set_refuse_options()
420 * turned this option off (opt-BASE is its index). */
421@@ -934,6 +950,11 @@
422 args[ac++] = compare_dest;
423 }
424
425+ if (link_by_hash_dir && am_sender) {
426+ args[ac++] = "--link-by-hash";
427+ args[ac++] = link_by_hash_dir;
428+ }
429+
430 if (files_from && (!am_sender || remote_filesfrom_file)) {
431 if (remote_filesfrom_file) {
432 args[ac++] = "--files-from";
433--- proto.h 2004-02-23 10:22:51.000000000 -0500
434+++ proto.h 2004-02-23 10:22:51.000000000 -0500
435@@ -93,6 +93,12 @@
436 void write_sum_head(int f, struct sum_struct *sum);
437 void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
438 void generate_files(int f, struct file_list *flist, char *local_name);
439+char* make_hash_name(struct file_struct *file);
440+void kill_hashfile(struct hashfile_struct *hashfile);
441+void kill_hashfiles(struct hashfile_struct *hashfiles);
442+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
443+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
444+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
445 void init_hard_links(struct file_list *flist);
446 int hard_link_check(struct file_struct *file, int skip);
447 void do_hard_links(void);
448--- receiver.c 2004-02-23 10:22:51.000000000 -0500
449+++ receiver.c 2004-02-23 10:22:51.000000000 -0500
450@@ -186,10 +186,11 @@
451
452
453 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
454- OFF_T total_size)
455+ OFF_T total_size,char *md4)
456 {
457 int i;
458 struct sum_struct sum;
459+ struct mdfour mdfour_data;
460 unsigned int len;
461 OFF_T offset = 0;
462 OFF_T offset2;
463@@ -199,7 +200,9 @@
464 char *map=NULL;
465
466 read_sum_head(f_in, &sum);
467-
468+ if (md4)
469+ mdfour_begin(&mdfour_data);
470+
471 sum_init();
472
473 while ((i = recv_token(f_in, &data)) != 0) {
474@@ -216,6 +219,8 @@
475 cleanup_got_literal = 1;
476
477 sum_update(data,i);
478+ if (md4)
479+ mdfour_update(&mdfour_data,data,i);
480
481 if (fd != -1 && write_file(fd,data,i) != i) {
482 rprintf(FERROR, "write failed on %s: %s\n",
483@@ -243,6 +248,8 @@
484
485 see_token(map, len);
486 sum_update(map,len);
487+ if (md4)
488+ mdfour_update(&mdfour_data,map,len);
489 }
490
491 if (fd != -1 && write_file(fd,map,len) != (int) len) {
492@@ -265,6 +272,8 @@
493 }
494
495 sum_end(file_sum1);
496+ if (md4)
497+ mdfour_result(&mdfour_data, (unsigned char*)md4);
498
499 read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
500 if (verbose > 2) {
501@@ -299,6 +308,7 @@
502 extern int preserve_perms;
503 extern int delete_after;
504 extern int orig_umask;
505+ extern char *link_by_hash_dir;
506 struct stats initial_stats;
507
508 if (verbose > 2) {
509@@ -372,7 +382,7 @@
510 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
511 rprintf(FERROR, "fstat %s failed: %s\n",
512 full_fname(fnamecmp), strerror(errno));
513- receive_data(f_in,NULL,-1,NULL,file->length);
514+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
515 close(fd1);
516 continue;
517 }
518@@ -385,7 +395,7 @@
519 */
520 rprintf(FERROR,"recv_files: %s is a directory\n",
521 full_fname(fnamecmp));
522- receive_data(f_in, NULL, -1, NULL, file->length);
523+ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
524 close(fd1);
525 continue;
526 }
527@@ -437,7 +447,7 @@
528 if (fd2 == -1) {
529 rprintf(FERROR, "mkstemp %s failed: %s\n",
530 full_fname(fnametmp), strerror(errno));
531- receive_data(f_in,mapbuf,-1,NULL,file->length);
532+ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
533 if (mapbuf) unmap_file(mapbuf);
534 if (fd1 != -1) close(fd1);
535 continue;
536@@ -450,7 +460,12 @@
537 }
538
539 /* recv file data */
540- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
541+#ifdef HAVE_LINK
542+ if (link_by_hash_dir) {
543+ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
544+ }
545+#endif
546+ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
547
548 log_recv(file, &initial_stats);
549
550--- rsync.c 2004-02-23 10:22:51.000000000 -0500
551+++ rsync.c 2004-02-23 10:27:15.000000000 -0500
552@@ -33,6 +33,7 @@
553 extern int preserve_gid;
554 extern int preserve_perms;
555 extern int make_backups;
556+extern char *link_by_hash_dir;
557
558
559 /*
560@@ -236,6 +237,12 @@
561 if (make_backups && !make_backup(fname))
562 return;
563
564+#ifdef HAVE_LINK
565+ if (link_by_hash_dir) {
566+ if ((ret = link_by_hash(fnametmp,fname,file)) != 0)
567+ return;
568+ } else
569+#endif
570 /* move tmp file over real file */
571 ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
572 if (ret != 0) {
573--- rsync.h 2004-02-23 10:22:51.000000000 -0500
574+++ rsync.h 2004-02-23 10:22:51.000000000 -0500
575@@ -513,6 +513,14 @@
576 int current_file_index;
577 };
578
579+struct hashfile_struct { /* one candidate file in a --link-by-hash directory */
580+ struct hashfile_struct *next; /* next node; find_hashfiles() links the nodes into a ring */
581+ struct hashfile_struct *prev; /* previous node in the same ring */
582+ char *name; /* path built with asprintf(); freed by kill_hashfile() */
583+ int fd; /* descriptor from open(name, O_RDONLY|O_BINARY); closed by kill_hashfile() */
584+ uint32 nlink; /* st_nlink of the file as reported by do_stat() */
585+};
586+
587
588 /* we need this function because of the silly way in which duplicate
589 entries are handled in the file lists - we can't change this