Updated the opening comments to mention how to apply the patch
[rsync/rsync-patches.git] / link-by-hash.diff
1 Jason M. Felice wrote:
2
3 This patch adds the --link-by-hash=DIR option, which hard links received
4 files in a link farm arranged by MD4 file hash.  The result is that the system
5 will only store one copy of the unique contents of each file, regardless of
6 the file's name.
7
8 To use this patch, run these commands for a successful build:
9
10     patch -p1 <patches/link-by-hash.diff
11     ./prepare-source
12     ./configure
13     make
14
15 --- old/Makefile.in
16 +++ new/Makefile.in
17 @@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18         main.o checksum.o match.o syscall.o log.o backup.o
19  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20         fileio.o batch.o clientname.o chmod.o
21 -OBJS3=progress.o pipe.o
22 +OBJS3=progress.o pipe.o hashlink.o
23  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
25         popt/popthelp.o popt/poptparse.o
26 --- old/hashlink.c
27 +++ new/hashlink.c
28 @@ -0,0 +1,339 @@
29 +/*
30 +   Copyright (C) Cronosys, LLC 2004
31 +
32 +   This program is free software; you can redistribute it and/or modify
33 +   it under the terms of the GNU General Public License as published by
34 +   the Free Software Foundation; either version 2 of the License, or
35 +   (at your option) any later version.
36 +
37 +   This program is distributed in the hope that it will be useful,
38 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
39 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
40 +   GNU General Public License for more details.
41 +
42 +   You should have received a copy of the GNU General Public License
43 +   along with this program; if not, write to the Free Software
44 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45 +*/
46 +
47 +/* This file contains code used by the --link-by-hash option. */
48 +
49 +#include "rsync.h"
50 +
51 +extern char *link_by_hash_dir;
52 +
53 +#if HAVE_LINK
54 +
55 +/* Map file->u.sum (16 checksum bytes) to a malloc'ed path of the form
56 + * "<link_by_hash_dir>/<8 hex digits>/<24 hex digits>".  The caller owns
57 + * the returned string and must free() it. */
58 +char* make_hash_name(struct file_struct *file)
59 +{
60 +       static const char hexdigits[] = "0123456789abcdef";
61 +       /* 8 digits + '/' + 24 digits + NUL needs 34 bytes; a 33-byte
62 +        * buffer is overrun by the terminating NUL. */
63 +       char hash[34], *dst = hash;
64 +       unsigned char *src = (unsigned char*)file->u.sum;
65 +       int i;
66 +
67 +       for (i = 0; i < 16; i++, src++) {
68 +               /* The first 4 bytes name the top-level directory. */
69 +               if (i == 4)
70 +                       *dst++ = '/';
71 +               *dst++ = hexdigits[*src >> 4];
72 +               *dst++ = hexdigits[*src & 0x0f];
73 +       }
74 +       *dst = '\0';
75 +
76 +       /* asprintf() leaves dst undefined on failure; don't return it. */
77 +       if (asprintf(&dst, "%s/%s", link_by_hash_dir, hash) < 0)
78 +               out_of_memory("make_hash_name");
79 +       return dst;
80 +}
81 +
82 +/* Release one candidate: its name, its descriptor and the node itself. */
83 +void kill_hashfile(struct hashfile_struct *hashfile)
84 +{
85 +       if (hashfile != NULL) {
86 +               free(hashfile->name);
87 +               close(hashfile->fd);
88 +               free(hashfile);
89 +       }
90 +}
91 +
92 +/* Free every node of the circular list that starts at hashfiles. */
93 +void kill_hashfiles(struct hashfile_struct *hashfiles)
94 +{
95 +       struct hashfile_struct *node = hashfiles, *following;
96 +       if (node == NULL)
97 +               return;
98 +       do {
99 +               following = node->next;
100 +               kill_hashfile(node);
101 +               node = following;
102 +       } while (node != hashfiles);
103 +}
104 +
105 +/* Open every entry of the directory hashname whose size equals size and
106 + * return them as a circular list (NULL if there are none).  *fnbr receives
107 + * the largest numeric entry name seen.  hashname stays owned by the caller. */
108 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
109 +{
110 +       DIR *d;
111 +       struct dirent *di;
112 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
113 +       STRUCT_STAT st;
114 +       long this_fnbr;
115 +
116 +       *fnbr = 0;
117 +       if ((d = opendir(hashname)) == NULL) {
118 +               /* Don't free hashname: the caller still uses and frees it. */
119 +               rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
120 +               return NULL;
121 +       }
122 +       while ((di = readdir(d)) != NULL) {
123 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
124 +                       continue;
125 +
126 +               /* Track the largest fnbr in case a new file must be stored. */
127 +               this_fnbr = atol(di->d_name);
128 +               if (this_fnbr > *fnbr)
129 +                       *fnbr = this_fnbr;
130 +
131 +               if (!(hashfile = new_array(struct hashfile_struct, 1)))
132 +                       out_of_memory("find_hashfiles");
133 +               /* kill_hashfile() closes fd, so it must never be garbage. */
134 +               hashfile->fd = -1;
135 +               if (asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0)
136 +                       out_of_memory("find_hashfiles");
137 +               if (do_stat(hashfile->name,&st) == -1) {
138 +                       rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
139 +                       kill_hashfile(hashfile);
140 +                       continue;
141 +               }
142 +               if (st.st_size != size) {
143 +                       kill_hashfile(hashfile);
144 +                       continue;
145 +               }
146 +               hashfile->nlink = st.st_nlink;
147 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
148 +               if (hashfile->fd == -1) {
149 +                       rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
150 +                       kill_hashfile(hashfile);
151 +                       continue;
152 +               }
153 +               if (hashfiles == NULL)
154 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
155 +               else {
156 +                       hashfile->next = hashfiles;
157 +                       hashfile->prev = hashfiles->prev;
158 +                       hashfile->next->prev = hashfile;
159 +                       hashfile->prev->next = hashfile;
160 +               }
161 +       }
162 +       closedir(d);
163 +
164 +       return hashfiles;
165 +}
166 +
167 +/* Compare the data read from fd, chunk by chunk, with every candidate in
168 + * the circular list files, freeing each candidate as soon as it differs.
169 + * Returns the surviving candidate with the lowest link count, detached from
170 + * the list (the other survivors are freed), or NULL when nothing matches
171 + * or reading fd fails.  The list passed in is consumed either way. */
172 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
173 +{
174 +       int amt, hamt, at_end;
175 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
176 +       struct hashfile_struct *iter, *next, *last, *best;
177 +       uint32 nlink;
178 +
179 +       if (!files)
180 +               return NULL;
181 +
182 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
183 +               /* Remember the tail up front so that every candidate is
184 +                * read exactly once per chunk even when the head is dropped
185 +                * during the walk; a skipped candidate would fall a chunk
186 +                * behind and could be returned without being compared. */
187 +               iter = files;
188 +               last = files->prev;
189 +               do {
190 +                       next = iter->next;
191 +                       /* Decide this before iter can be freed. */
192 +                       at_end = iter == last;
193 +
194 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
195 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
196 +                               /* Mismatch (or short/failed read): unlink it. */
197 +                               iter->next->prev = iter->prev;
198 +                               iter->prev->next = iter->next;
199 +                               /* Keep files on a live node (NULL if none). */
200 +                               if (iter == files)
201 +                                       files = next == iter ? NULL : next;
202 +                               kill_hashfile(iter);
203 +                       }
204 +
205 +                       iter = next;
206 +               } while (!at_end);
207 +
208 +               if (files == NULL) {
209 +                       /* There are no matches. */
210 +                       return NULL;
211 +               }
212 +       }
213 +
214 +       /* The source itself could not be read, so nothing can be verified. */
215 +       if (amt == -1) {
216 +               rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
217 +               kill_hashfiles(files);
218 +               return NULL;
219 +       }
220 +
221 +       /* If we only have one file left, use it. */
222 +       if (files == files->next) {
223 +               return files;
224 +       }
225 +
226 +       /* All files which remain in the list are identical and should have
227 +        * the same size.  We pick the one with the lowest link count (we
228 +        * may have rolled over because we hit the maximum link count for
229 +        * the filesystem). */
230 +       best = iter = files;
231 +       nlink = iter->nlink;
232 +       do {
233 +               if (iter->nlink < nlink) {
234 +                       nlink = iter->nlink;
235 +                       best = iter;
236 +               }
237 +               iter = iter->next;
238 +       } while (iter != files);
239 +
240 +       /* Detach the winner from the ring, then free everything else. */
241 +       best->next->prev = best->prev;
242 +       best->prev->next = best->next;
243 +       if (files == best)
244 +               files = files->next;
245 +       kill_hashfiles(files);
246 +       return best;
247 +}
248 +
249 +/* Install fnametmp as fname while pooling identical content under
250 + * link_by_hash_dir: reuse (hard link to) a matching pooled file if there is
251 + * one, else rename fnametmp into place and add it to the pool.  The result
252 + * follows robust_rename(): negative on failure, which the caller reports. */
253 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
254 +{
255 +       STRUCT_STAT st;
256 +       char *hashname, *linkname = NULL;
257 +       int first = 0, rc, mode = file->mode & INITACCESSPERMS;
258 +       long last_fnbr = 0;
259 +
260 +       /* Empty files are not pooled; return before hashname is allocated. */
261 +       if (file->length == 0)
262 +               return robust_rename(fnametmp, fname, NULL, mode);
263 +       hashname = make_hash_name(file);
264 +       if (do_stat(hashname, &st) == -1) {
265 +               char *dirname;
266 +               /* Directory does not exist. */
267 +               if (!(dirname = strdup(hashname)))
268 +                       out_of_memory("link_by_hash");
269 +               *strrchr(dirname,'/') = 0;
270 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
271 +                       rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
272 +                       free(dirname);
273 +                       rc = robust_rename(fnametmp, fname, NULL, mode);
274 +                       goto cleanup;
275 +               }
276 +               free(dirname);
277 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
278 +                       rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
279 +                       rc = robust_rename(fnametmp, fname, NULL, mode);
280 +                       goto cleanup;
281 +               }
282 +               first = 1;
283 +               if (asprintf(&linkname,"%s/0",hashname) < 0)
284 +                       out_of_memory("link_by_hash");
285 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
286 +       } else {
287 +               struct hashfile_struct *hashfiles, *hashfile;
288 +               int fd;
289 +               if (do_stat(fnametmp,&st) == -1) {
290 +                       rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
291 +                       rc = -1;
292 +                       goto cleanup;
293 +               }
294 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
295 +               if (hashfiles == NULL) {
296 +                       first = 1;
297 +                       if (asprintf(&linkname,"%s/0",hashname) < 0)
298 +                               out_of_memory("link_by_hash");
299 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
300 +               } else {
301 +                       /* Search for one identical to us. */
302 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
303 +                               rsyserr(FERROR, errno, "open failed: %s", fnametmp);
304 +                               kill_hashfiles(hashfiles);
305 +                               rc = -1;
306 +                               goto cleanup;
307 +                       }
308 +                       hashfile = compare_hashfiles(fd, hashfiles);
309 +                       close(fd);
310 +                       if (hashfile) {
311 +                               if (!(linkname = strdup(hashfile->name)))
312 +                                       out_of_memory("link_by_hash");
313 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
314 +                               kill_hashfile(hashfile);
315 +                       } else {
316 +                               first = 1;
317 +                               if (asprintf(&linkname, "%s/%ld", hashname,
318 +                                            last_fnbr + 1) < 0)
319 +                                       out_of_memory("link_by_hash");
320 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
321 +                       }
322 +               }
323 +       }
324 +
325 +       if (!first) {
326 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
327 +                               linkname, full_fname(fname));
328 +               robust_unlink(fname);
329 +               rc = do_link(linkname, fname);
330 +               if (rc != -1) {
331 +                       do_unlink(fnametmp);
332 +               } else if (errno == EMLINK) {
333 +                       first = 1;
334 +                       free(linkname);
335 +                       if (asprintf(&linkname,"%s/%ld",hashname,
336 +                                    last_fnbr + 1) < 0)
337 +                               out_of_memory("link_by_hash");
338 +                       rprintf(FINFO, "(5) linkname = %s\n", linkname);
339 +                       rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
340 +               } else {
341 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
342 +                               linkname, full_fname(fname));
343 +                       rc = robust_rename(fnametmp, fname, NULL, mode);
344 +               }
345 +       }
346 +
347 +       if (first) {
348 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
349 +                               full_fname(fname),linkname);
350 +               rc = robust_rename(fnametmp, fname, NULL, mode);
351 +               if (rc < 0) {
352 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
353 +                               full_fname(fnametmp), full_fname(fname));
354 +               } else if (do_link(fname,linkname) != 0) {
355 +                       /* fname is in place; keep the rename status in rc. */
356 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
357 +                               full_fname(fname), linkname);
358 +               }
359 +       }
360 +
361 +cleanup:
362 +       free(linkname);
363 +       free(hashname);
364 +       return rc;
365 +}
366 +
367 +#endif
368 --- old/options.c
369 +++ new/options.c
370 @@ -145,6 +145,7 @@ char *backup_suffix = NULL;
371  char *tmpdir = NULL;
372  char *partial_dir = NULL;
373  char *basis_dir[MAX_BASIS_DIRS+1];
374 +char *link_by_hash_dir = NULL;
375  char *config_file = NULL;
376  char *shell_cmd = NULL;
377  char *logfile_name = NULL;
378 @@ -349,6 +350,7 @@ void usage(enum logcode F)
379    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
380    rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
381    rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
382 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
383    rprintf(F," -z, --compress              compress file data during the transfer\n");
384    rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
385    rprintf(F," -C, --cvs-exclude           auto-ignore files the same way CVS does\n");
386 @@ -398,7 +400,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
387        OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
388        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
389        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
390 -      OPT_NO_D,
391 +      OPT_NO_D, OPT_LINK_BY_HASH,
392        OPT_SERVER, OPT_REFUSED_BASE = 9000};
393  
394  static struct poptOption long_options[] = {
395 @@ -499,6 +501,7 @@ static struct poptOption long_options[] 
396    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
397    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
398    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
399 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
400    {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
401    {"compress",        'z', POPT_ARG_NONE,   0, 'z', 0, 0 },
402    {"compress-level",   0,  POPT_ARG_INT,    &def_compress_level, 'z', 0, 0 },
403 @@ -1089,6 +1092,21 @@ int parse_arguments(int *argc, const cha
404                         usage(FINFO);
405                         exit_cleanup(0);
406  
407 +                case OPT_LINK_BY_HASH:
408 +#if HAVE_LINK
409 +                       arg = poptGetOptArg(pc);
410 +                       if (sanitize_paths)
411 +                               arg = sanitize_path(NULL, arg, NULL, 0, NULL);
412 +                       link_by_hash_dir = (char *)arg;
413 +                       break;
414 +#else
415 +                       snprintf(err_buf, sizeof err_buf,
416 +                                "hard links are not supported on this %s\n",
417 +                                am_server ? "server" : "client");
418 +                       rprintf(FERROR, "ERROR: %s", err_buf);
419 +                       return 0;
420 +#endif
421 +
422                 default:
423                         /* A large opt value means that set_refuse_options()
424                          * turned this option off. */
425 @@ -1739,6 +1757,11 @@ void server_options(char **args,int *arg
426                 }
427         }
428  
429 +       if (link_by_hash_dir && am_sender) {
430 +               args[ac++] = "--link-by-hash";
431 +               args[ac++] = link_by_hash_dir;
432 +       }
433 +
434         if (files_from && (!am_sender || filesfrom_host)) {
435                 if (filesfrom_host) {
436                         args[ac++] = "--files-from";
437 --- old/receiver.c
438 +++ new/receiver.c
439 @@ -50,6 +50,7 @@ extern int delay_updates;
440  extern struct stats stats;
441  extern char *stdout_format;
442  extern char *tmpdir;
443 +extern char *link_by_hash_dir;
444  extern char *partial_dir;
445  extern char *basis_dir[];
446  extern struct file_list *the_file_list;
447 @@ -124,12 +125,13 @@ static int get_tmpname(char *fnametmp, c
448  
449  
450  static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
451 -                       char *fname, int fd, OFF_T total_size)
452 +                       char *fname, int fd, OFF_T total_size, char *md4)
453  {
454         static char file_sum1[MD4_SUM_LENGTH];
455         static char file_sum2[MD4_SUM_LENGTH];
456         struct map_struct *mapbuf;
457         struct sum_struct sum;
458 +       struct mdfour mdfour_data;
459         int32 len;
460         OFF_T offset = 0;
461         OFF_T offset2;
462 @@ -149,6 +151,9 @@ static int receive_data(int f_in, char *
463         } else
464                 mapbuf = NULL;
465  
466 +       if (md4)
467 +               mdfour_begin(&mdfour_data);
468 +
469         sum_init(checksum_seed);
470  
471         if (append_mode) {
472 @@ -191,6 +196,8 @@ static int receive_data(int f_in, char *
473                         cleanup_got_literal = 1;
474  
475                         sum_update(data, i);
476 +                       if (md4)
477 +                               mdfour_update(&mdfour_data, (uchar*)data, i);
478  
479                         if (fd != -1 && write_file(fd,data,i) != i)
480                                 goto report_write_error;
481 @@ -217,6 +224,8 @@ static int receive_data(int f_in, char *
482  
483                         see_token(map, len);
484                         sum_update(map, len);
485 +                       if (md4)
486 +                               mdfour_update(&mdfour_data, (uchar*)map, len);
487                 }
488  
489                 if (updating_basis) {
490 @@ -259,6 +268,8 @@ static int receive_data(int f_in, char *
491         }
492  
493         sum_end(file_sum1);
494 +       if (md4)
495 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
496  
497         if (mapbuf)
498                 unmap_file(mapbuf);
499 @@ -274,7 +285,7 @@ static int receive_data(int f_in, char *
500  
501  static void discard_receive_data(int f_in, OFF_T length)
502  {
503 -       receive_data(f_in, NULL, -1, 0, NULL, -1, length);
504 +       receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
505  }
506  
507  static void handle_delayed_updates(struct file_list *flist, char *local_name)
508 @@ -611,8 +622,12 @@ int recv_files(int f_in, struct file_lis
509                         rprintf(FINFO, "%s\n", fname);
510  
511                 /* recv file data */
512 +#if HAVE_LINK
513 +               if (link_by_hash_dir)
514 +                       file->u.sum = new_array(char, MD4_SUM_LENGTH);
515 +#endif
516                 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
517 -                                      fname, fd2, file->length);
518 +                                      fname, fd2, file->length, file->u.sum);
519  
520                 log_item(log_code, file, &initial_stats, iflags, NULL);
521  
522 --- old/rsync.c
523 +++ new/rsync.c
524 @@ -48,6 +48,7 @@ extern int inplace;
525  extern int keep_dirlinks;
526  extern int make_backups;
527  extern mode_t orig_umask;
528 +extern char *link_by_hash_dir;
529  extern struct stats stats;
530  extern struct chmod_mode_struct *daemon_chmod_modes;
531  
532 @@ -271,8 +272,15 @@ void finish_transfer(char *fname, char *
533         /* move tmp file over real file */
534         if (verbose > 2)
535                 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
536 -       ret = robust_rename(fnametmp, fname, partialptr,
537 -                           file->mode & INITACCESSPERMS);
538 +#if HAVE_LINK
539 +       if (link_by_hash_dir)
540 +               ret = link_by_hash(fnametmp, fname, file);
541 +       else
542 +#endif
543 +       {
544 +               ret = robust_rename(fnametmp, fname, partialptr,
545 +                                   file->mode & INITACCESSPERMS);
546 +       }
547         if (ret < 0) {
548                 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
549                         ret == -2 ? "copy" : "rename",
550 --- old/rsync.h
551 +++ new/rsync.h
552 @@ -651,6 +651,14 @@ struct stats {
553         int current_file_index;
554  };
555  
556 +struct hashfile_struct {
557 +       struct hashfile_struct *next;   /* circular, doubly-linked list links */
558 +       struct hashfile_struct *prev;
559 +       char *name;                     /* malloc'ed path of the pooled file */
560 +       int fd;                         /* descriptor open on name for reading */
561 +       uint32 nlink;                   /* st_nlink sampled when it was found */
562 +};
563 +
564  struct chmod_mode_struct;
565  
566  #include "byteorder.h"
567 --- old/rsync.yo
568 +++ new/rsync.yo
569 @@ -366,6 +366,7 @@ to the detailed description below for a 
570       --compare-dest=DIR      also compare received files relative to DIR
571       --copy-dest=DIR         ... and include copies of unchanged files
572       --link-dest=DIR         hardlink to files in DIR when unchanged
573 +     --link-by-hash=DIR      create hardlinks by hash into DIR
574   -z, --compress              compress file data during the transfer
575       --compress-level=NUM    explicitly set compression level
576   -C, --cvs-exclude           auto-ignore files in the same way CVS does