Got rid of the changes to proto.h and mentioned the build instructions
[rsync/rsync-patches.git] / link-by-hash.diff
1 After applying this patch and running configure, you MUST run this
2 command before "make":
3
4     make proto
5
6 Jason M. Felice writes:
7
8 This patch adds the --link-by-hash=DIR option, which hard-links each received
9 file into a link farm under DIR that is arranged by the file's MD4 hash.  As a
10 result, the system stores only one copy of each unique file content, regardless
11 of the file's name.
12
13
14 --- Makefile.in 2 May 2004 17:04:14 -0000       1.100
15 +++ Makefile.in 13 May 2004 19:04:49 -0000
16 @@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17         main.o checksum.o match.o syscall.o log.o backup.o
18  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19         fileio.o batch.o clientname.o
20 -OBJS3=progress.o pipe.o
21 +OBJS3=progress.o pipe.o hashlink.o
22  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
24         popt/popthelp.o popt/poptparse.o
25 --- /dev/null   1 Jan 1970 00:00:00 -0000
26 +++ hashlink.c  13 May 2004 19:04:49 -0000
27 @@ -0,0 +1,351 @@
28 +/*
29 +   Copyright (C) Cronosys, LLC 2004
30 +
31 +   This program is free software; you can redistribute it and/or modify
32 +   it under the terms of the GNU General Public License as published by
33 +   the Free Software Foundation; either version 2 of the License, or
34 +   (at your option) any later version.
35 +
36 +   This program is distributed in the hope that it will be useful,
37 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
38 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
39 +   GNU General Public License for more details.
40 +
41 +   You should have received a copy of the GNU General Public License
42 +   along with this program; if not, write to the Free Software
43 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44 +*/
45 +
46 +/* This file contains code used by the --link-by-hash option. */
47 +
48 +#include "rsync.h"
49 +
50 +extern char *link_by_hash_dir;
51 +
52 +#ifdef HAVE_LINK
53 +
54 +/* Build the link-farm directory name for a file from the checksum in
55 + * file->u.sum: "<link_by_hash_dir>/<8 hex digits>/<remaining hex digits>".
56 + * Returns a malloc'd string that the caller must free. */
57 +char* make_hash_name(struct file_struct *file)
58 +{
59 +       static const char hexdigits[] = "0123456789abcdef";
60 +       /* Two hex digits per checksum byte, plus the '/' separator and the
61 +        * terminating NUL. */
62 +       char hash[2 * MD4_SUM_LENGTH + 2], *dst = hash;
63 +       unsigned char *src = (unsigned char*)file->u.sum;
64 +       int i;
65 +
66 +       for (i = 0; i < MD4_SUM_LENGTH; i++) {
67 +               /* The first 4 bytes name the top-level bucket directory. */
68 +               if (i == 4)
69 +                       *dst++ = '/';
70 +               *dst++ = hexdigits[src[i] >> 4];
71 +               *dst++ = hexdigits[src[i] & 0x0f];
72 +       }
73 +       *dst = 0;
74 +
75 +       /* asprintf() leaves dst unusable on failure, so never return it. */
76 +       if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
77 +               out_of_memory("make_hash_name");
78 +       return dst;
79 +}
80 +
81 +/* Release one candidate: its name, its descriptor and the node itself. */
82 +void kill_hashfile(struct hashfile_struct *hashfile)
83 +{
84 +       if (hashfile != NULL) {
85 +               free(hashfile->name);
86 +               close(hashfile->fd);
87 +               free(hashfile);
88 +       }
89 +}
90 +
91 +/* Destroy every node of the circular list that starts at hashfiles. */
92 +void kill_hashfiles(struct hashfile_struct *hashfiles)
93 +{
94 +       struct hashfile_struct *cur = hashfiles, *following;
95 +       if (cur == NULL)
96 +               return;
97 +       do {
98 +               following = cur->next;
99 +               kill_hashfile(cur);
100 +               cur = following;
101 +       } while (cur != hashfiles);
102 +}
103 +
104 +
105 +/* Collect the entries of directory hashname whose size equals size, each
106 + * opened read-only, into a circular doubly-linked list (NULL if none).
107 + * *fnbr gets the largest numeric entry name; hashname stays the caller's. */
108 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
109 +{
110 +       DIR *d;
111 +       struct dirent *di;
112 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
113 +       STRUCT_STAT st;
114 +       long this_fnbr;
115 +
116 +       *fnbr = 0;
117 +       if ((d = opendir(hashname)) == NULL) {
118 +               rprintf(FERROR,"opendir \"%s\": %s\n", hashname, strerror(errno));
119 +               return NULL;
120 +       }
121 +       while ((di = readdir(d)) != NULL) {
122 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
123 +                       continue;
124 +
125 +               /* We need to have the largest fnbr in case we need to store
126 +                * a new file. */
127 +               this_fnbr = atol(di->d_name);
128 +               if (this_fnbr > *fnbr)
129 +                       *fnbr = this_fnbr;
130 +
131 +               hashfile = malloc(sizeof *hashfile);
132 +               if (!hashfile)
133 +                       out_of_memory("find_hashfiles");
134 +               /* fd starts closed so kill_hashfile() never closes garbage. */
135 +               hashfile->fd = -1;
136 +               if (asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0)
137 +                       out_of_memory("find_hashfiles");
138 +               if (do_stat(hashfile->name,&st) == -1) {
139 +                       rprintf(FERROR,"%s: %s\n", hashfile->name,
140 +                               strerror(errno));
141 +                       kill_hashfile(hashfile);
142 +                       continue;
143 +               }
144 +               if (st.st_size != size) {
145 +                       kill_hashfile(hashfile);
146 +                       continue;
147 +               }
148 +               hashfile->nlink = st.st_nlink;
149 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
150 +               if (hashfile->fd == -1) {
151 +                       rprintf(FERROR,"%s: %s\n", hashfile->name,
152 +                               strerror(errno));
153 +                       kill_hashfile(hashfile);
154 +                       continue;
155 +               }
156 +               if (hashfiles == NULL)
157 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
158 +               else {
159 +                       hashfile->next = hashfiles;
160 +                       hashfile->prev = hashfiles->prev;
161 +                       hashfile->next->prev = hashfile;
162 +                       hashfile->prev->next = hashfile;
163 +               }
164 +       }
165 +       closedir(d);
166 +       return hashfiles;
167 +}
168 +
169 +
170 +/* Compare the data readable from fd against every candidate in the circular
171 + * list files, one BUFSIZ chunk at a time, destroying each candidate as soon
172 + * as it differs.  Returns the surviving candidate with the lowest link count
173 + * (the caller must kill_hashfile() it), or NULL when nothing matches or fd
174 + * cannot be read.  All other nodes of the list are destroyed. */
175 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
176 +{
177 +       int amt, hamt, last;
178 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
179 +       struct hashfile_struct *iter, *next, *best;
180 +       uint32 nlink;
181 +
182 +       if (!files)
183 +               return NULL;
184 +
185 +       /* One pass over all remaining candidates per chunk of the new file. */
186 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
187 +               iter = files;
188 +               do {
189 +                       /* Work out whether this is the final node of the
190 +                        * pass before any unlinking moves the list head, so
191 +                        * that every candidate is read exactly once per
192 +                        * chunk and all descriptors stay at the same offset. */
193 +                       next = iter->next;
194 +                       last = (next == files);
195 +
196 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
197 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
198 +                               if (next == iter) {
199 +                                       /* The only remaining candidate
200 +                                        * differs: there are no matches. */
201 +                                       kill_hashfile(iter);
202 +                                       return NULL;
203 +                               }
204 +                               iter->next->prev = iter->prev;
205 +                               iter->prev->next = iter->next;
206 +                               /* Keep files pointing at a live node. */
207 +                               if (iter == files)
208 +                                       files = next;
209 +                               kill_hashfile(iter);
210 +                       }
211 +
212 +                       iter = next;
213 +               } while (!last);
214 +       }
215 +       /* An empty fd never enters the loop, so every candidate survives. */
216 +
217 +       /* Reading the new file failed. */
218 +       if (amt == -1) {
219 +               rprintf(FERROR,"%s\n",strerror(errno));
220 +               kill_hashfiles(files);
221 +               return NULL;
222 +       }
223 +
224 +       /* If we only have one file left, use it. */
225 +       if (files == files->next) {
226 +               return files;
227 +       }
228 +
229 +       /* All files which remain in the list are identical and should have
230 +        * the same size.  We pick the one with the lowest link count (we
231 +        * may have rolled over because we hit the maximum link count for
232 +        * the filesystem). */
233 +       best = iter = files;
234 +       nlink = iter->nlink;
235 +       do {
236 +               if (iter->nlink < nlink) {
237 +                       nlink = iter->nlink;
238 +                       best = iter;
239 +               }
240 +               iter = iter->next;
241 +       } while (iter != files);
242 +
243 +       /* Detach the winner, then destroy whatever is left of the list. */
244 +       best->next->prev = best->prev;
245 +       best->prev->next = best->next;
246 +       if (files == best)
247 +               files = files->next;
248 +       kill_hashfiles(files);
249 +       return best;
250 +}
251 +
252 +/* Return a malloc'd "<hashname>/<fnbr>" path; exits on out-of-memory. */
253 +static char *hash_link_name(const char *hashname, long fnbr)
254 +{
255 +       char *path;
256 +       if (asprintf(&path, "%s/%ld", hashname, fnbr) < 0)
257 +               out_of_memory("link_by_hash");
258 +       return path;
259 +}
260 +
261 +/* Move the received temp file fnametmp to fname.  If the link farm already
262 + * holds identical data, fname becomes a hard link to it; otherwise fnametmp
263 + * is renamed to fname and linked into the farm.  Returns 0 on success. */
264 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
265 +{
266 +       STRUCT_STAT st;
267 +       char *hashname, *linkname = NULL;
268 +       int first = 0, rc = 0;
269 +       long last_fnbr = 0;
270 +
271 +       /* Empty files bypass the farm; nothing is allocated yet, so no leak. */
272 +       if (file->length == 0)
273 +               return robust_rename(fnametmp,fname,0644);
274 +       hashname = make_hash_name(file);
275 +       if (do_stat(hashname, &st) == -1) {
276 +               char *dirname;
277 +               /* Directory does not exist. */
278 +               dirname = strdup(hashname);
279 +               if (!dirname)
280 +                       out_of_memory("link_by_hash");
281 +               *strrchr(dirname,'/') = 0;
282 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
283 +                       rprintf(FERROR, "mkdir %s: %s\n", dirname, strerror(errno));
284 +                       free(dirname);
285 +                       rc = robust_rename(fnametmp,fname,0644);
286 +                       goto out;
287 +               }
288 +               free(dirname);
289 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
290 +                       rprintf(FERROR, "mkdir %s: %s\n", hashname, strerror(errno));
291 +                       rc = robust_rename(fnametmp,fname,0644);
292 +                       goto out;
293 +               }
294 +               first = 1;
295 +               linkname = hash_link_name(hashname, 0);
296 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
297 +       } else {
298 +               struct hashfile_struct *hashfiles, *hashfile;
299 +               int fd;
300 +               if (do_stat(fnametmp,&st) == -1) {
301 +                       rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
302 +                       rc = -1;
303 +                       goto out;
304 +               }
305 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
306 +               if (hashfiles == NULL) {
307 +                       first = 1;
308 +                       linkname = hash_link_name(hashname, 0);
309 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
310 +               } else {
311 +                       /* Search for one identical to us. */
312 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
313 +                               rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
314 +                               kill_hashfiles(hashfiles);
315 +                               rc = -1;
316 +                               goto out;
317 +                       }
318 +                       hashfile = compare_hashfiles(fd, hashfiles);
319 +                       close(fd); /* comparison done; do not leak the descriptor */
320 +                       if (hashfile) {
321 +                               linkname = strdup(hashfile->name);
322 +                               if (!linkname)
323 +                                       out_of_memory("link_by_hash");
324 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
325 +                               kill_hashfile(hashfile);
326 +                       } else {
327 +                               first = 1;
328 +                               linkname = hash_link_name(hashname, last_fnbr + 1);
329 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
330 +                       }
331 +               }
332 +       }
333 +
334 +       if (!first) {
335 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
336 +                               linkname, full_fname(fname));
337 +               rc = do_link(linkname, fname);
338 +               if (rc == -1) {
339 +                       if (errno == EMLINK) {
340 +                               first = 1;
341 +                               free(linkname);
342 +                               linkname = hash_link_name(hashname, last_fnbr + 1);
343 +                               rprintf(FINFO, "(5) linkname = %s\n", linkname);
344 +                               rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
345 +                       } else {
346 +                               rprintf(FERROR,"link \"%s\" -> %s: %s\n",
347 +                                       linkname,full_fname(fname),strerror(errno));
348 +                               robust_unlink(fname);
349 +                               rc = robust_rename(fnametmp,fname,0644);
350 +                       }
351 +               } else
352 +                       do_unlink(fnametmp);
353 +       }
354 +
355 +       if (first) {
356 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
357 +                               full_fname(fname),linkname);
358 +               rc = robust_rename(fnametmp,fname,0644);
359 +               if (rc != 0) {
360 +                       /* full_fname() presumably reuses its result buffer, so call it
361 +                        * once (TODO confirm).  Never link a stale fname into the farm. */
362 +                       rprintf(FERROR,"rename %s -> \"%s\": %s\n",
363 +                               full_fname(fnametmp),fname,strerror(errno));
364 +                       goto out;
365 +               }
366 +               rc = do_link(fname,linkname);
367 +               if (rc != 0) {
368 +                       rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
369 +                               full_fname(fname),linkname,strerror(errno));
370 +               }
371 +       }
372 +out:
373 +       free(linkname);
374 +       free(hashname);
375 +       return rc;
376 +}
377 +
378 +#endif
379 --- options.c   6 May 2004 21:08:01 -0000       1.148
380 +++ options.c   13 May 2004 19:04:49 -0000
381 @@ -121,6 +121,7 @@ char *log_format = NULL;
382  char *password_file = NULL;
383  char *rsync_path = RSYNC_PATH;
384  char *backup_dir = NULL;
385 +char *link_by_hash_dir = NULL;
386  char backup_dir_buf[MAXPATHLEN];
387  int rsync_port = RSYNC_PORT;
388  int link_dest = 0;
389 @@ -266,6 +267,7 @@ void usage(enum logcode F)
390    rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
391    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
392    rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
393 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash to DIR for regular files\n");
394    rprintf(F," -P                          equivalent to --partial --progress\n");
395    rprintf(F," -z, --compress              compress file data\n");
396    rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
397 @@ -305,7 +307,7 @@ void usage(enum logcode F)
398  enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
399        OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
400        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
401 -      OPT_READ_BATCH, OPT_WRITE_BATCH,
402 +      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
403        OPT_REFUSED_BASE = 9000};
404  
405  static struct poptOption long_options[] = {
406 @@ -362,6 +364,7 @@ static struct poptOption long_options[] 
407    {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
408    {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
409    {"link-dest",        0,  POPT_ARG_STRING, &compare_dest,  OPT_LINK_DEST, 0, 0 },
410 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0,              OPT_LINK_BY_HASH, 0, 0},
411    /* TODO: Should this take an optional int giving the compression level? */
412    {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
413    {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
414 @@ -584,6 +587,19 @@ int parse_arguments(int *argc, const cha
415                         return 0;
416  #endif
417  
418 +                case OPT_LINK_BY_HASH:
419 +#if HAVE_LINK
420 +                       link_by_hash_dir = (char *)poptGetOptArg(pc);
421 +                       checksum_seed = FIXED_CHECKSUM_SEED;
422 +                       break;
423 +#else
424 +                       snprintf(err_buf, sizeof err_buf,
425 +                                "hard links are not supported on this %s\n",
426 +                                am_server ? "server" : "client");
427 +                       rprintf(FERROR, "ERROR: %s", err_buf);
428 +                       return 0;
429 +#endif
430 +
431                 default:
432                         /* A large opt value means that set_refuse_options()
433                          * turned this option off (opt-BASE is its index). */
434 @@ -951,6 +967,11 @@ void server_options(char **args,int *arg
435                  */
436                 args[ac++] = link_dest ? "--link-dest" : "--compare-dest";
437                 args[ac++] = compare_dest;
438 +       }
439 +
440 +       if (link_by_hash_dir && am_sender) {
441 +               args[ac++] = "--link-by-hash";
442 +               args[ac++] = link_by_hash_dir;
443         }
444  
445         if (files_from && (!am_sender || remote_filesfrom_file)) {
446 --- receiver.c  13 May 2004 07:08:22 -0000      1.77
447 +++ receiver.c  13 May 2004 19:04:49 -0000
448 @@ -46,6 +46,7 @@ extern int module_id;
449  extern int ignore_errors;
450  extern int orig_umask;
451  extern int keep_partial;
452 +extern char *link_by_hash_dir;
453  
454  static void delete_one(char *fn, int is_dir)
455  {
456 @@ -191,10 +192,11 @@ static int get_tmpname(char *fnametmp, c
457  
458  
459  static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
460 -                       OFF_T total_size)
461 +                       OFF_T total_size,char *md4)
462  {
463         int i;
464         struct sum_struct sum;
465 +       struct mdfour mdfour_data;
466         unsigned int len;
467         OFF_T offset = 0;
468         OFF_T offset2;
469 @@ -204,7 +206,9 @@ static int receive_data(int f_in,struct 
470         char *map=NULL;
471  
472         read_sum_head(f_in, &sum);
473 -
474 +       if (md4)
475 +               mdfour_begin(&mdfour_data);
476 +       
477         sum_init();
478  
479         while ((i = recv_token(f_in, &data)) != 0) {
480 @@ -221,6 +225,8 @@ static int receive_data(int f_in,struct 
481                         cleanup_got_literal = 1;
482  
483                         sum_update(data,i);
484 +                       if (md4)
485 +                               mdfour_update(&mdfour_data,data,i);
486  
487                         if (fd != -1 && write_file(fd,data,i) != i) {
488                                 rprintf(FERROR, "write failed on %s: %s\n",
489 @@ -248,6 +254,8 @@ static int receive_data(int f_in,struct 
490  
491                         see_token(map, len);
492                         sum_update(map,len);
493 +                       if (md4)
494 +                               mdfour_update(&mdfour_data,map,len);
495                 }
496  
497                 if (fd != -1 && write_file(fd,map,len) != (int) len) {
498 @@ -270,6 +278,8 @@ static int receive_data(int f_in,struct 
499         }
500  
501         sum_end(file_sum1);
502 +       if (md4)
503 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
504  
505         read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
506         if (verbose > 2) {
507 @@ -373,7 +383,7 @@ int recv_files(int f_in,struct file_list
508                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
509                         rprintf(FERROR, "fstat %s failed: %s\n",
510                                 full_fname(fnamecmp), strerror(errno));
511 -                       receive_data(f_in,NULL,-1,NULL,file->length);
512 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
513                         close(fd1);
514                         continue;
515                 }
516 @@ -386,7 +396,7 @@ int recv_files(int f_in,struct file_list
517                          */
518                         rprintf(FERROR,"recv_files: %s is a directory\n",
519                                 full_fname(fnamecmp));
520 -                       receive_data(f_in, NULL, -1, NULL, file->length);
521 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
522                         close(fd1);
523                         continue;
524                 }
525 @@ -438,7 +448,7 @@ int recv_files(int f_in,struct file_list
526                 if (fd2 == -1) {
527                         rprintf(FERROR, "mkstemp %s failed: %s\n",
528                                 full_fname(fnametmp), strerror(errno));
529 -                       receive_data(f_in,mapbuf,-1,NULL,file->length);
530 +                       receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
531                         if (mapbuf) unmap_file(mapbuf);
532                         if (fd1 != -1) close(fd1);
533                         continue;
534 @@ -451,7 +461,12 @@ int recv_files(int f_in,struct file_list
535                 }
536  
537                 /* recv file data */
538 -               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
539 +#ifdef HAVE_LINK
540 +               if (link_by_hash_dir) {
541 +                       file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
542 +               }
543 +#endif
544 +               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
545  
546                 log_recv(file, &initial_stats);
547  
548 --- rsync.c     13 May 2004 18:51:22 -0000      1.138
549 +++ rsync.c     13 May 2004 19:04:49 -0000
550 @@ -31,6 +31,7 @@ extern int am_generator;
551  extern int preserve_uid;
552  extern int preserve_gid;
553  extern int make_backups;
554 +extern char *link_by_hash_dir;
555  
556  
557  /*
558 @@ -236,8 +237,12 @@ void finish_transfer(char *fname, char *
559         if (make_backups && !make_backup(fname))
560                 return;
561  
562 -       /* move tmp file over real file */
563 -       ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
564 +#ifdef HAVE_LINK
565 +       if (link_by_hash_dir)
566 +               ret = link_by_hash(fnametmp,fname,file);
567 +       else
568 +#endif
569 +               ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
570         if (ret < 0) {
571                 rprintf(FERROR, "%s %s -> \"%s\": %s\n",
572                     ret == -2 ? "copy" : "rename",
573 --- rsync.h     13 May 2004 18:51:22 -0000      1.203
574 +++ rsync.h     13 May 2004 19:04:50 -0000
575 @@ -521,6 +521,14 @@ struct stats {
576         int current_file_index;
577  };
578  
579 +struct hashfile_struct { /* one opened candidate file in a --link-by-hash directory */
580 +       struct hashfile_struct *next; /* next node of the circular list */
581 +       struct hashfile_struct *prev; /* previous node of the circular list */
582 +       char *name; /* malloc'd path of the candidate; freed by kill_hashfile() */
583 +       int fd; /* read-only descriptor on name; closed by kill_hashfile() */
584 +       uint32 nlink; /* st_nlink of the candidate when it was stat'ed */
585 +};
586 +
587  
588  /* we need this function because of the silly way in which duplicate
589     entries are handled in the file lists - we can't change this