Fixed failing hunks.
[rsync/rsync-patches.git] / link-by-hash.diff
1 After applying this patch and running configure, you MUST run this
2 command before "make":
3
4     make proto
5
6 Jason M. Felice writes:
7
8 This patch adds the --link-by-hash=DIR option, which hard links received
9 files in a link farm arranged by MD4 file hash.  The result is that the system
10 will only store one copy of the unique contents of each file, regardless of
11 the file's name.
12
13
14 --- orig/Makefile.in    2006-01-14 08:14:29
15 +++ Makefile.in 2005-11-07 04:37:17
16 @@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17         main.o checksum.o match.o syscall.o log.o backup.o
18  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19         fileio.o batch.o clientname.o chmod.o
20 -OBJS3=progress.o pipe.o
21 +OBJS3=progress.o pipe.o hashlink.o
22  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
24         popt/popthelp.o popt/poptparse.o
25 --- orig/hashlink.c     2004-09-24 16:44:25
26 +++ hashlink.c  2004-09-24 16:44:25
27 @@ -0,0 +1,340 @@
28 +/*
29 +   Copyright (C) Cronosys, LLC 2004
30 +
31 +   This program is free software; you can redistribute it and/or modify
32 +   it under the terms of the GNU General Public License as published by
33 +   the Free Software Foundation; either version 2 of the License, or
34 +   (at your option) any later version.
35 +
36 +   This program is distributed in the hope that it will be useful,
37 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
38 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
39 +   GNU General Public License for more details.
40 +
41 +   You should have received a copy of the GNU General Public License
42 +   along with this program; if not, write to the Free Software
43 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44 +*/
45 +
46 +/* This file contains code used by the --link-by-hash option. */
47 +
48 +#include "rsync.h"
49 +
50 +extern char *link_by_hash_dir;
51 +
52 +#if HAVE_LINK
53 +
54 +char* make_hash_name(struct file_struct *file)
55 +{
56 +       /* Returns "<link_by_hash_dir>/<8 hex>/<24 hex>" built from the 16 bytes
57 +        * at file->u.sum; the string is asprintf()-allocated, the caller frees. */
58 +       char hash[34], *dst; /* 8 + '/' + 24 + NUL = 34; [33] overflowed by one */
59 +       unsigned char c, *src = (unsigned char*)file->u.sum;
60 +       int i;
61 +
62 +       for (dst = hash, i = 0; i < 4; i++, src++) {
63 +               c = *src >> 4;
64 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65 +               c = *src & 0x0f;
66 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67 +       }
68 +       *dst++ = '/';
69 +       for (i = 0; i < 12; i++, src++) {
70 +               c = *src >> 4;
71 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72 +               c = *src & 0x0f;
73 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74 +       }
75 +       *dst = 0;
76 +
77 +       asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78 +       return dst;
79 +}
80 +
81 +
82 +void kill_hashfile(struct hashfile_struct *hashfile)
83 +{
84 +       if (hashfile != NULL) { /* a NULL node is silently ignored */
85 +               close(hashfile->fd);
86 +               free(hashfile->name);
87 +               free(hashfile); /* the node itself goes last */
88 +       }
89 +}
90 +
91 +
92 +void kill_hashfiles(struct hashfile_struct *hashfiles)
93 +{
94 +       struct hashfile_struct *node = hashfiles, *following;
95 +       if (node == NULL)
96 +               return;
97 +       /* The list is circular: release nodes until we are back at the head. */
98 +       do {
99 +               following = node->next;
100 +               kill_hashfile(node);
101 +       } while ((node = following) != hashfiles);
102 +}
103 +
104 +
105 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106 +{
107 +       DIR *d;
108 +       struct dirent *di;
109 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
110 +       STRUCT_STAT st;
111 +       long this_fnbr;
112 +
113 +       *fnbr = 0;
114 +
115 +       /* Build a circular list of the entries of directory hashname whose
116 +        * size equals size, each opened O_RDONLY; NULL when there are none. */
117 +       if ((d = opendir(hashname)) == NULL) {
118 +               rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
119 +               /* Do not free hashname: link_by_hash() still uses and frees it. */
120 +               return NULL;
121 +       }
122 +       while ((di = readdir(d)) != NULL) {
123 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124 +                       continue;
125 +               }
126 +
127 +               /* We need to have the largest fnbr in case we need to store
128 +                * a new file. */
129 +               this_fnbr = atol(di->d_name);
130 +               if (this_fnbr > *fnbr)
131 +                       *fnbr = this_fnbr;
132 +
133 +               hashfile = new_array(struct hashfile_struct, 1);
134 +               hashfile->fd = -1; /* not open yet; kill_hashfile() will close() it */
135 +               asprintf(&hashfile->name,"%s/%s",hashname,di->d_name);
136 +               if (do_stat(hashfile->name,&st) == -1) {
137 +                       rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
138 +                       kill_hashfile(hashfile);
139 +                       continue;
140 +               }
141 +               if (st.st_size != size) {
142 +                       kill_hashfile(hashfile);
143 +                       continue;
144 +               }
145 +               hashfile->nlink = st.st_nlink;
146 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147 +               if (hashfile->fd == -1) {
148 +                       rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
149 +                       kill_hashfile(hashfile);
150 +                       continue;
151 +               }
152 +               if (hashfiles == NULL)
153 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
154 +               else {
155 +                       hashfile->next = hashfiles;
156 +                       hashfile->prev = hashfiles->prev;
157 +                       hashfile->next->prev = hashfile;
158 +                       hashfile->prev->next = hashfile;
159 +               }
160 +       }
161 +       closedir(d);
162 +
163 +       return hashfiles;
164 +}
165 +
166 +
167 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168 +{
169 +       int amt, hamt;
170 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171 +       struct hashfile_struct *iter, *next, *best;
172 +       uint32 nlink;
173 +       /* Compares fd, BUFSIZ bytes at a time, against every node of the circular list. */
174 +       if (!files)
175 +               return NULL;
176 +
177 +       iter = files; /* in case files are 0 bytes */
178 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179 +               iter = files;
180 +               do {
181 +                       /* files was set to NULL below when the head was removed: resync it. */
182 +                       if (!files)
183 +                               files = iter;
184 +
185 +                       next = iter->next;
186 +
187 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189 +                               if (iter == files) {
190 +                                       files = files->prev; /* head becomes the old tail */
191 +                               }
192 +                               if (iter->next == iter) {
193 +                                       files = next = NULL; /* iter was the only node left */
194 +                               } else {
195 +                                       next = iter->next;
196 +                                       if (iter == files) {
197 +                                               /* So we know to resync */
198 +                                               files = NULL;
199 +                                       }
200 +                               }
201 +                               iter->next->prev = iter->prev;
202 +                               iter->prev->next = iter->next;
203 +                               kill_hashfile(iter); /* mismatching node is unlinked and freed */
204 +                       }
205 +                       /* NOTE(review): that tail may then be skipped this pass - confirm. */
206 +                       iter = next;
207 +               } while (iter != files);
208 +
209 +               if (iter == NULL && files == NULL) {
210 +                       /* There are no matches. */
211 +                       return NULL;
212 +               }
213 +       }
214 +
215 +       if (amt == -1) {
216 +               rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
217 +               kill_hashfiles(files); /* every remaining node is freed on a read error */
218 +               return NULL;
219 +       }
220 +
221 +       /* If we only have one file left, use it (the caller then owns that node). */
222 +       if (files == files->next) {
223 +               return files;
224 +       }
225 +
226 +       /* All files which remain in the list are identical and should have
227 +        * the same size.  We pick the one with the lowest link count (we
228 +        * may have rolled over because we hit the maximum link count for
229 +        * the filesystem).  The others are freed; only best is returned. */
230 +       best = iter = files;
231 +       nlink = iter->nlink;
232 +       do {
233 +               if (iter->nlink < nlink) {
234 +                       nlink = iter->nlink;
235 +                       best = iter;
236 +               }
237 +               iter = iter->next;
238 +       } while (iter != files);
239 +
240 +       best->next->prev = best->prev;
241 +       best->prev->next = best->next;
242 +       if (files == best)
243 +               files = files->next;
244 +       kill_hashfiles(files);
245 +       return best;
246 +}
247 +
248 +
249 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
250 +{
251 +       /* Moves fnametmp to fname: hard-links fname to an identical file found
252 +        * in the hash directory, or else renames it and adds it there.  Returns
253 +        * the rc of the last rename/link step, or -1 if fnametmp is unreadable. */
254 +       STRUCT_STAT st;
255 +       char *hashname, *linkname;
256 +       int first = 0, rc;
257 +       long last_fnbr;
258 +
259 +       if (file->length == 0) /* tested before hashname exists, so no leak */
260 +               return robust_rename(fnametmp,fname,0644);
261 +       hashname = make_hash_name(file);
262 +
263 +       if (do_stat(hashname, &st) == -1) {
264 +               /* Directory does not exist. */
265 +               char *dirname = strdup(hashname);
266 +               *strrchr(dirname,'/') = 0;
267 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
268 +                       rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
269 +                       free(hashname);
270 +                       free(dirname);
271 +                       return robust_rename(fnametmp,fname,0644);
272 +               }
273 +               free(dirname);
274 +
275 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
276 +                       rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
277 +                       free(hashname);
278 +                       return robust_rename(fnametmp,fname,0644);
279 +               }
280 +
281 +               first = 1;
282 +               asprintf(&linkname,"%s/0",hashname);
283 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
284 +       } else {
285 +               struct hashfile_struct *hashfiles, *hashfile;
286 +
287 +               if (do_stat(fnametmp,&st) == -1) {
288 +                       rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
289 +                       free(hashname);
290 +                       return -1;
291 +               }
292 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
293 +
294 +               if (hashfiles == NULL) {
295 +                       first = 1;
296 +                       asprintf(&linkname,"%s/0",hashname);
297 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
298 +               } else {
299 +                       int fd;
300 +                       /* Search for one identical to us. */
301 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
302 +                               rsyserr(FERROR, errno, "open failed: %s", fnametmp);
303 +                               kill_hashfiles(hashfiles);
304 +                               free(hashname);
305 +                               return -1;
306 +                       }
307 +                       hashfile = compare_hashfiles(fd, hashfiles);
308 +                       hashfiles = NULL;
309 +                       close(fd);
310 +
311 +                       if (hashfile) {
312 +                               first = 0;
313 +                               linkname = strdup(hashfile->name);
314 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
315 +                               kill_hashfile(hashfile);
316 +                       } else {
317 +                               first = 1;
318 +                               asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1);
319 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
320 +                       }
321 +               }
322 +       }
323 +
324 +       if (!first) {
325 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
326 +                               linkname, full_fname(fname));
327 +               robust_unlink(fname);
328 +               rc = do_link(linkname, fname);
329 +               if (rc == -1) {
330 +                       if (errno == EMLINK) {
331 +                               first = 1;
332 +                               free(linkname);
333 +                               asprintf(&linkname,"%s/%ld",hashname,last_fnbr + 1);
334 +                               rprintf(FINFO, "(5) linkname = %s\n", linkname);
335 +                               rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
336 +                       } else {
337 +                               rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
338 +                                       linkname, full_fname(fname));
339 +                               rc = robust_rename(fnametmp,fname,0644);
340 +                       }
341 +               } else {
342 +                       do_unlink(fnametmp);
343 +               }
344 +       }
345 +
346 +       if (first) {
347 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
348 +                               full_fname(fname),linkname);
349 +
350 +               rc = robust_rename(fnametmp,fname,0644);
351 +               if (rc != 0) {
352 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
353 +                               full_fname(fnametmp), full_fname(fname));
354 +               }
355 +               /* A failed rename (rc < 0) must not link a stale fname or hide rc. */
356 +               if (rc >= 0 && (rc = do_link(fname,linkname)) != 0) {
357 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
358 +                               full_fname(fname), linkname);
359 +               }
360 +       }
361 +
362 +       free(linkname);
363 +       free(hashname);
364 +       return rc;
365 +}
366 +
367 +#endif
368 --- orig/options.c      2006-01-21 21:02:30
369 +++ options.c   2006-01-21 21:12:04
370 @@ -141,6 +141,7 @@ char *backup_suffix = NULL;
371  char *tmpdir = NULL;
372  char *partial_dir = NULL;
373  char *basis_dir[MAX_BASIS_DIRS+1];
374 +char *link_by_hash_dir = NULL;
375  char *config_file = NULL;
376  char *shell_cmd = NULL;
377  char *log_format = NULL;
378 @@ -328,6 +329,7 @@ void usage(enum logcode F)
379    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
380    rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
381    rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
382 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
383    rprintf(F," -z, --compress              compress file data during the transfer\n");
384    rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
385    rprintf(F," -C, --cvs-exclude           auto-ignore files the same way CVS does\n");
386 @@ -373,6 +375,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
387        OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
388        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
389        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
390 +      OPT_LINK_BY_HASH,
391        OPT_SERVER, OPT_REFUSED_BASE = 9000};
392  
393  static struct poptOption long_options[] = {
394 @@ -461,6 +464,7 @@ static struct poptOption long_options[] 
395    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
396    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
397    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
398 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
399    {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
400    {"compress",        'z', POPT_ARG_NONE,   0, 'z', 0, 0 },
401    {"compress-level",   0,  POPT_ARG_INT,    &def_compress_level, 'z', 0, 0 },
402 @@ -1027,6 +1031,21 @@ int parse_arguments(int *argc, const cha
403                         usage(FINFO);
404                         exit_cleanup(0);
405  
406 +                case OPT_LINK_BY_HASH:
407 +#if HAVE_LINK
408 +                       arg = poptGetOptArg(pc);
409 +                       if (sanitize_paths)
410 +                               arg = sanitize_path(NULL, arg, NULL, 0);
411 +                       link_by_hash_dir = (char *)arg;
412 +                       break;
413 +#else
414 +                       snprintf(err_buf, sizeof err_buf,
415 +                                "hard links are not supported on this %s\n",
416 +                                am_server ? "server" : "client");
417 +                       rprintf(FERROR, "ERROR: %s", err_buf);
418 +                       return 0;
419 +#endif
420 +
421                 default:
422                         /* A large opt value means that set_refuse_options()
423                          * turned this option off. */
424 @@ -1660,6 +1679,11 @@ void server_options(char **args,int *arg
425                 }
426         }
427  
428 +       if (link_by_hash_dir && am_sender) {
429 +               args[ac++] = "--link-by-hash";
430 +               args[ac++] = link_by_hash_dir;
431 +       }
432 +
433         if (files_from && (!am_sender || filesfrom_host)) {
434                 if (filesfrom_host) {
435                         args[ac++] = "--files-from";
436 --- orig/receiver.c     2006-01-14 20:27:09
437 +++ receiver.c  2005-01-15 21:29:13
438 @@ -54,6 +54,7 @@ extern int delay_updates;
439  extern struct stats stats;
440  extern char *log_format;
441  extern char *tmpdir;
442 +extern char *link_by_hash_dir;
443  extern char *partial_dir;
444  extern char *basis_dir[];
445  extern struct file_list *the_file_list;
446 @@ -186,12 +187,13 @@ static int get_tmpname(char *fnametmp, c
447  
448  
449  static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
450 -                       char *fname, int fd, OFF_T total_size)
451 +                       char *fname, int fd, OFF_T total_size, char *md4)
452  {
453         static char file_sum1[MD4_SUM_LENGTH];
454         static char file_sum2[MD4_SUM_LENGTH];
455         struct map_struct *mapbuf;
456         struct sum_struct sum;
457 +       struct mdfour mdfour_data;
458         int32 len;
459         OFF_T offset = 0;
460         OFF_T offset2;
461 @@ -211,6 +213,9 @@ static int receive_data(int f_in, char *
462         } else
463                 mapbuf = NULL;
464  
465 +       if (md4)
466 +               mdfour_begin(&mdfour_data);
467 +
468         sum_init(checksum_seed);
469  
470         if (append_mode) {
471 @@ -253,6 +258,8 @@ static int receive_data(int f_in, char *
472                         cleanup_got_literal = 1;
473  
474                         sum_update(data, i);
475 +                       if (md4)
476 +                               mdfour_update(&mdfour_data,data,i);
477  
478                         if (fd != -1 && write_file(fd,data,i) != i)
479                                 goto report_write_error;
480 @@ -279,6 +286,8 @@ static int receive_data(int f_in, char *
481  
482                         see_token(map, len);
483                         sum_update(map, len);
484 +                       if (md4)
485 +                               mdfour_update(&mdfour_data,map,len);
486                 }
487  
488                 if (inplace) {
489 @@ -319,6 +328,8 @@ static int receive_data(int f_in, char *
490         }
491  
492         sum_end(file_sum1);
493 +       if (md4)
494 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
495  
496         if (mapbuf)
497                 unmap_file(mapbuf);
498 @@ -334,7 +345,7 @@ static int receive_data(int f_in, char *
499  
500  static void discard_receive_data(int f_in, OFF_T length)
501  {
502 -       receive_data(f_in, NULL, -1, 0, NULL, -1, length);
503 +       receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
504  }
505  
506  static void handle_delayed_updates(struct file_list *flist, char *local_name)
507 @@ -666,8 +677,12 @@ int recv_files(int f_in, struct file_lis
508                         rprintf(FINFO, "%s\n", fname);
509  
510                 /* recv file data */
511 +#if HAVE_LINK
512 +               if (link_by_hash_dir)
513 +                       file->u.sum = new_array(char, MD4_SUM_LENGTH);
514 +#endif
515                 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
516 -                                      fname, fd2, file->length);
517 +                                      fname, fd2, file->length, file->u.sum);
518  
519                 if (!log_before_transfer)
520                         log_item(file, &initial_stats, iflags, NULL);
521 --- orig/rsync.c        2006-01-14 08:14:31
522 +++ rsync.c     2006-01-14 08:26:01
523 @@ -38,6 +38,7 @@ extern int inplace;
524  extern int keep_dirlinks;
525  extern int make_backups;
526  extern struct stats stats;
527 +extern char *link_by_hash_dir;
528  
529  
530  /*
531 @@ -188,7 +189,12 @@ void finish_transfer(char *fname, char *
532         /* move tmp file over real file */
533         if (verbose > 2)
534                 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
535 -       ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
536 +#if HAVE_LINK
537 +       if (link_by_hash_dir)
538 +               ret = link_by_hash(fnametmp, fname, file);
539 +       else
540 +#endif
541 +               ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
542         if (ret < 0) {
543                 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
544                         ret == -2 ? "copy" : "rename",
545 --- orig/rsync.h        2006-01-21 21:02:30
546 +++ rsync.h     2004-07-03 20:20:15
547 @@ -640,6 +640,14 @@ struct stats {
548         int current_file_index;
549  };
550  
551 +struct hashfile_struct { /* one candidate file in a --link-by-hash directory */
552 +       struct hashfile_struct *next; /* circular list: a lone node points at itself */
553 +       struct hashfile_struct *prev;
554 +       char *name; /* asprintf()-allocated path; freed by kill_hashfile() */
555 +       int fd; /* descriptor opened O_RDONLY; closed by kill_hashfile() */
556 +       uint32 nlink; /* st_nlink as seen when find_hashfiles() stat'ed the file */
557 +};
558 +
559  
560  #include "byteorder.h"
561  #include "lib/mdfour.h"
562 --- orig/rsync.yo       2006-01-21 21:02:31
563 +++ rsync.yo    2005-02-13 06:58:47
564 @@ -356,6 +356,7 @@ to the detailed description below for a 
565       --compare-dest=DIR      also compare received files relative to DIR
566       --copy-dest=DIR         ... and include copies of unchanged files
567       --link-dest=DIR         hardlink to files in DIR when unchanged
568 +     --link-by-hash=DIR      create hardlinks by hash into DIR
569   -z, --compress              compress file data during the transfer
570       --compress-level=NUM    explicitly set compression level
571   -C, --cvs-exclude           auto-ignore files in the same way CVS does