Adding a new feature diff.
[rsync/rsync-patches.git] / link-by-hash.diff
1 --- hashlink.c  1969-12-31 19:00:00.000000000 -0500
2 +++ hashlink.c  2004-02-23 10:30:45.000000000 -0500
3 @@ -0,0 +1,351 @@
4 +/*
5 +   Copyright (C) Cronosys, LLC 2004
6 +
7 +   This program is free software; you can redistribute it and/or modify
8 +   it under the terms of the GNU General Public License as published by
9 +   the Free Software Foundation; either version 2 of the License, or
10 +   (at your option) any later version.
11 +
12 +   This program is distributed in the hope that it will be useful,
13 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
14 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 +   GNU General Public License for more details.
16 +
17 +   You should have received a copy of the GNU General Public License
18 +   along with this program; if not, write to the Free Software
19 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 +*/
21 +
22 +/* This file contains code used by the --link-by-hash option. */
23 +
24 +#include "rsync.h"
25 +
26 +extern char *link_by_hash_dir;
27 +
28 +#ifdef HAVE_LINK
29 +
+/*
+ * Build the path of the hash directory that belongs to FILE's checksum:
+ *
+ *     <link_by_hash_dir>/<8 hex digits>/<24 hex digits>
+ *
+ * The first four bytes of file->u.sum name the bucket directory and the
+ * following twelve bytes name the directory inside that bucket.
+ *
+ * Returns a malloc'd string which the caller must free, or NULL if the
+ * string could not be allocated.
+ */
+char* make_hash_name(struct file_struct *file)
+{
+	enum { PREFIX_BYTES = 4, SUFFIX_BYTES = 12 };
+	static const char hexdigits[] = "0123456789abcdef";
+	/* Two hex digits per byte, the '/' separator and the terminating
+	 * NUL.  (A 33-byte buffer is one byte too small for the NUL.) */
+	char hash[2 * (PREFIX_BYTES + SUFFIX_BYTES) + 1 + 1];
+	char *dst = hash, *path;
+	const unsigned char *src = (const unsigned char*)file->u.sum;
+	int i;
+
+	for (i = 0; i < PREFIX_BYTES + SUFFIX_BYTES; i++, src++) {
+		if (i == PREFIX_BYTES)
+			*dst++ = '/';
+		*dst++ = hexdigits[*src >> 4];
+		*dst++ = hexdigits[*src & 0x0f];
+	}
+	*dst = 0;
+
+	/* asprintf leaves the result pointer undefined on failure. */
+	if (asprintf(&path,"%s/%s",link_by_hash_dir,hash) < 0)
+		return NULL;
+	return path;
+}
56 +
57 +
+/*
+ * Dispose of a single candidate node: free its path string, close its
+ * descriptor and free the node itself.  The node is not unlinked from
+ * any list it may be part of.  A NULL pointer is ignored.
+ */
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (hashfile != NULL) {
+		free(hashfile->name);
+		close(hashfile->fd);
+		free(hashfile);
+	}
+}
66 +
67 +
+/*
+ * Dispose of every node of the circular list HASHFILES (see
+ * kill_hashfile).  A NULL list is ignored.
+ */
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *iter, *next;
+
+	if (hashfiles == NULL)
+		return;
+
+	/* Free the head last so that the loop never compares against, or
+	 * reads through, a pointer to a node that was already freed. */
+	for (iter = hashfiles->next; iter != hashfiles; iter = next) {
+		next = iter->next;
+		kill_hashfile(iter);
+	}
+	kill_hashfile(hashfiles);
+}
79 +
80 +
+/*
+ * Scan the hash directory HASHNAME and return, as a circular doubly
+ * linked list, every entry whose size equals SIZE.  Each returned node
+ * holds the entry's path, its link count and a descriptor opened for
+ * reading.
+ *
+ * *FNBR receives the largest entry name (parsed as a number) seen in the
+ * directory, or 0 if there is none, so that the caller can pick the name
+ * for a new entry.
+ *
+ * HASHNAME stays owned by the caller and is never freed here.  Returns
+ * NULL if the directory cannot be read or holds no usable candidate; the
+ * list must be released with kill_hashfiles/kill_hashfile.
+ */
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+	char *name;
+	int fd;
+
+	*fnbr = 0;
+
+	if ((d = opendir(hashname)) == NULL) {
+		rprintf(FERROR,"opendir \"%s\": %s\n",
+			hashname, strerror(errno));
+		return NULL;
+	}
+
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
+			continue;
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		/* Gather everything a node needs first, so that a node is
+		 * only ever created fully initialised. */
+		if (asprintf(&name,"%s/%s",hashname,di->d_name) < 0) {
+			rprintf(FERROR,"find_hashfiles: out of memory\n");
+			continue;
+		}
+		if (do_stat(name,&st) == -1) {
+			rprintf(FERROR,"%s: %s\n", name, strerror(errno));
+			free(name);
+			continue;
+		}
+		if (st.st_size != size) {
+			free(name);
+			continue;
+		}
+		fd = open(name,O_RDONLY|O_BINARY);
+		if (fd == -1) {
+			rprintf(FERROR,"%s: %s\n", name, strerror(errno));
+			free(name);
+			continue;
+		}
+		hashfile = malloc(sizeof *hashfile);
+		if (hashfile == NULL) {
+			rprintf(FERROR,"find_hashfiles: out of memory\n");
+			close(fd);
+			free(name);
+			continue;
+		}
+		hashfile->name = name;
+		hashfile->fd = fd;
+		hashfile->nlink = st.st_nlink;
+
+		/* Append to the ring (the head's prev is the tail). */
+		if (hashfiles == NULL)
+			hashfiles = hashfile->next = hashfile->prev = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
144 +
145 +
+/*
+ * Compare the data readable from FD, chunk by chunk, with every
+ * candidate in the circular list FILES (as built by find_hashfiles) and
+ * return a candidate whose contents are identical, or NULL if there is
+ * none or FD cannot be read.
+ *
+ * The list is consumed: every node that is not returned is freed.  The
+ * returned node is no longer part of a usable list and must be released
+ * with kill_hashfile.  FD is neither rewound nor closed.
+ */
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt, at_end;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next, *last, *best;
+	uint32 nlink;
+
+	if (!files)
+		return NULL;
+
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		/* Every surviving candidate must consume exactly one chunk
+		 * per round, otherwise its descriptor falls out of step.
+		 * Remember the tail up front: the ring may shrink (and the
+		 * head may move) while we walk it. */
+		last = files->prev;
+		iter = files;
+		do {
+			next = iter->next;
+			at_end = (iter == last);
+
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
+				if (next == iter) {
+					/* That was the last candidate. */
+					files = NULL;
+				} else {
+					if (iter == files)
+						files = next;
+					iter->next->prev = iter->prev;
+					iter->prev->next = iter->next;
+				}
+				kill_hashfile(iter);
+			}
+
+			iter = next;
+		} while (!at_end);
+
+		if (files == NULL) {
+			/* There are no matches. */
+			return NULL;
+		}
+	}
+
+	if (amt == -1) {
+		rprintf(FERROR,"read: %s\n",strerror(errno));
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* If we only have one file left, use it. */
+	if (files == files->next)
+		return files;
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We pick the one with the lowest link count (we
+	 * may have rolled over because we hit the maximum link count for
+	 * the filesystem). */
+	best = iter = files;
+	nlink = iter->nlink;
+	do {
+		if (iter->nlink < nlink) {
+			nlink = iter->nlink;
+			best = iter;
+		}
+		iter = iter->next;
+	} while (iter != files);
+
+	/* Detach the winner and free everybody else. */
+	best->next->prev = best->prev;
+	best->prev->next = best->next;
+	if (files == best)
+		files = files->next;
+	kill_hashfiles(files);
+	return best;
+}
227 +
228 +
+/* Return a malloc'd "<hashname>/<fnbr>", or NULL if allocation fails. */
+static char *hash_link_name(const char *hashname, long fnbr)
+{
+	char *name;
+
+	if (asprintf(&name,"%s/%ld",hashname,fnbr) < 0)
+		return NULL;
+	return name;
+}
+
+
+/*
+ * Move the freshly received temporary file FNAMETMP into place as FNAME
+ * and tie it into the hash store below link_by_hash_dir:
+ *
+ *  - if the hash directory for FILE's checksum already holds an entry
+ *    with identical contents, FNAME becomes a hard link to that entry
+ *    and the temporary file is removed;
+ *  - otherwise the temporary file is renamed to FNAME and a new numbered
+ *    entry in the hash directory is hard-linked to it.
+ *
+ * Zero-length files are just renamed.  If the hash store cannot be
+ * prepared (mkdir or allocation failure) the file is still renamed into
+ * place without being linked.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
+{
+	STRUCT_STAT st;
+	char *hashname = NULL;
+	char *linkname = NULL;
+	long last_fnbr = 0;
+	int first = 0, rc = -1;
+
+	/* Checked before the hash name is built so nothing leaks here. */
+	if (file->length == 0)
+		return robust_rename(fnametmp,fname,0644);
+
+	hashname = make_hash_name(file);
+	if (hashname == NULL)
+		goto plain_rename;
+
+	if (do_stat(hashname, &st) == -1) {
+		char *dirname, *slash;
+
+		/* Directory does not exist: create the bucket directory
+		 * and then the hash directory inside it. */
+		dirname = strdup(hashname);
+		slash = dirname ? strrchr(dirname,'/') : NULL;
+		if (slash == NULL) {
+			free(dirname);
+			goto plain_rename;
+		}
+		*slash = 0;
+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", dirname,
+				strerror(errno));
+			free(dirname);
+			goto plain_rename;
+		}
+		free(dirname);
+
+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", hashname,
+				strerror(errno));
+			goto plain_rename;
+		}
+
+		first = 1;
+		linkname = hash_link_name(hashname, 0);
+		if (linkname == NULL)
+			goto plain_rename;
+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
+	} else {
+		struct hashfile_struct *hashfiles, *hashfile;
+		int fd;
+
+		if (do_stat(fnametmp,&st) == -1) {
+			rprintf(FERROR,"%s: %s\n",fnametmp,strerror(errno));
+			goto out;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			first = 1;
+			linkname = hash_link_name(hashname, 0);
+			if (linkname == NULL)
+				goto plain_rename;
+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
+		} else {
+			/* Search for one identical to us. */
+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
+				rprintf(FERROR,"%s: %s\n",fnametmp,
+					strerror(errno));
+				kill_hashfiles(hashfiles);
+				goto out;
+			}
+			/* compare_hashfiles consumes the whole list. */
+			hashfile = compare_hashfiles(fd, hashfiles);
+			close(fd);
+
+			if (hashfile) {
+				first = 0;
+				linkname = strdup(hashfile->name);
+				kill_hashfile(hashfile);
+				if (linkname == NULL)
+					goto plain_rename;
+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
+			} else {
+				first = 1;
+				linkname = hash_link_name(hashname,
+							  last_fnbr + 1);
+				if (linkname == NULL)
+					goto plain_rename;
+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
+			}
+		}
+	}
+
+	if (!first) {
+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
+				linkname, full_fname(fname));
+		rc = do_link(linkname, fname);
+		if (rc == -1) {
+			if (errno == EMLINK) {
+				/* The stored copy cannot take another link:
+				 * start a new entry below. */
+				first = 1;
+				free(linkname);
+				linkname = hash_link_name(hashname,
+							  last_fnbr + 1);
+				if (linkname == NULL)
+					goto plain_rename;
+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
+			} else {
+				rprintf(FERROR,"link \"%s\" -> %s: %s\n",
+					linkname,full_fname(fname),
+					strerror(errno));
+				robust_unlink(fname);
+				rc = robust_rename(fnametmp,fname,0644);
+			}
+		} else {
+			do_unlink(fnametmp);
+		}
+	}
+
+	if (first) {
+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
+				full_fname(fname),linkname);
+
+		rc = robust_rename(fnametmp,fname,0644);
+		if (rc != 0) {
+			rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
+				full_fname(fnametmp),full_fname(fname),
+				strerror(errno));
+			/* Whatever is at fname now is not the received data;
+			 * it must not be filed under this hash. */
+			goto out;
+		}
+		rc = do_link(fname,linkname);
+		if (rc != 0) {
+			rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
+				full_fname(fname),linkname,
+				strerror(errno));
+		}
+	}
+	goto out;
+
+plain_rename:
+	rc = robust_rename(fnametmp,fname,0644);
+out:
+	free(linkname);
+	free(hashname);
+	return rc;
+}
353 +
354 +#endif
355 --- Makefile.in 2004-02-23 10:22:51.000000000 -0500
356 +++ Makefile.in 2004-02-23 10:22:51.000000000 -0500
357 @@ -35,7 +35,7 @@
358         main.o checksum.o match.o syscall.o log.o backup.o
359  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
360         fileio.o batch.o clientname.o
361 -OBJS3=progress.o pipe.o
362 +OBJS3=progress.o pipe.o hashlink.o
363  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
364  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
365         popt/popthelp.o popt/poptparse.o
366 --- options.c   2004-02-23 10:22:51.000000000 -0500
367 +++ options.c   2004-02-23 10:29:14.000000000 -0500
368 @@ -119,6 +119,7 @@
369  char *password_file = NULL;
370  char *rsync_path = RSYNC_PATH;
371  char *backup_dir = NULL;
372 +char *link_by_hash_dir = NULL;
373  char backup_dir_buf[MAXPATHLEN];
374  int rsync_port = RSYNC_PORT;
375  int link_dest = 0;
376 @@ -264,6 +265,7 @@
377    rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
378    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
379    rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
380 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash to DIR for regular files\n");
381    rprintf(F," -P                          equivalent to --partial --progress\n");
382    rprintf(F," -z, --compress              compress file data\n");
383    rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
384 @@ -303,7 +305,7 @@
385  enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
386        OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
387        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
388 -      OPT_READ_BATCH, OPT_WRITE_BATCH,
389 +      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
390        OPT_REFUSED_BASE = 9000};
391  
392  static struct poptOption long_options[] = {
393 @@ -360,6 +362,7 @@
394    {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
395    {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
396    {"link-dest",        0,  POPT_ARG_STRING, 0,              OPT_LINK_DEST, 0, 0 },
397 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0,              OPT_LINK_BY_HASH, 0, 0},
398    /* TODO: Should this take an optional int giving the compression level? */
399    {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
400    {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
401 @@ -577,6 +580,19 @@
402                         return 0;
403  #endif
404  
405 +                case OPT_LINK_BY_HASH:
406 +#if HAVE_LINK
407 +                       link_by_hash_dir = (char *)poptGetOptArg(pc);
408 +                       checksum_seed = FIXED_CHECKSUM_SEED;
409 +                       break;
410 +#else
411 +                       snprintf(err_buf, sizeof err_buf,
412 +                                "hard links are not supported on this %s\n",
413 +                                am_server ? "server" : "client");
414 +                       rprintf(FERROR, "ERROR: %s", err_buf);
415 +                       return 0;
416 +#endif
417 +
418                 default:
419                         /* A large opt value means that set_refuse_options()
420                          * turned this option off (opt-BASE is its index). */
421 @@ -934,6 +950,11 @@
422                 args[ac++] = compare_dest;
423         }
424  
425 +       if (link_by_hash_dir && am_sender) {
426 +               args[ac++] = "--link-by-hash";
427 +               args[ac++] = link_by_hash_dir;
428 +       }
429 +
430         if (files_from && (!am_sender || remote_filesfrom_file)) {
431                 if (remote_filesfrom_file) {
432                         args[ac++] = "--files-from";
433 --- proto.h     2004-02-23 10:22:51.000000000 -0500
434 +++ proto.h     2004-02-23 10:22:51.000000000 -0500
435 @@ -93,6 +93,12 @@
436  void write_sum_head(int f, struct sum_struct *sum);
437  void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
438  void generate_files(int f, struct file_list *flist, char *local_name);
439 +char* make_hash_name(struct file_struct *file);
440 +void kill_hashfile(struct hashfile_struct *hashfile);
441 +void kill_hashfiles(struct hashfile_struct *hashfiles);
442 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
443 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
444 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
445  void init_hard_links(struct file_list *flist);
446  int hard_link_check(struct file_struct *file, int skip);
447  void do_hard_links(void);
448 --- receiver.c  2004-02-23 10:22:51.000000000 -0500
449 +++ receiver.c  2004-02-23 10:22:51.000000000 -0500
450 @@ -186,10 +186,11 @@
451  
452  
453  static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
454 -                       OFF_T total_size)
455 +                       OFF_T total_size,char *md4)
456  {
457         int i;
458         struct sum_struct sum;
459 +       struct mdfour mdfour_data;
460         unsigned int len;
461         OFF_T offset = 0;
462         OFF_T offset2;
463 @@ -199,7 +200,9 @@
464         char *map=NULL;
465  
466         read_sum_head(f_in, &sum);
467 -
468 +       if (md4)
469 +               mdfour_begin(&mdfour_data);
470 +       
471         sum_init();
472  
473         while ((i = recv_token(f_in, &data)) != 0) {
474 @@ -216,6 +219,8 @@
475                         cleanup_got_literal = 1;
476  
477                         sum_update(data,i);
478 +                       if (md4)
479 +                               mdfour_update(&mdfour_data,data,i);
480  
481                         if (fd != -1 && write_file(fd,data,i) != i) {
482                                 rprintf(FERROR, "write failed on %s: %s\n",
483 @@ -243,6 +248,8 @@
484  
485                         see_token(map, len);
486                         sum_update(map,len);
487 +                       if (md4)
488 +                               mdfour_update(&mdfour_data,map,len);
489                 }
490  
491                 if (fd != -1 && write_file(fd,map,len) != (int) len) {
492 @@ -265,6 +272,8 @@
493         }
494  
495         sum_end(file_sum1);
496 +       if (md4)
497 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
498  
499         read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
500         if (verbose > 2) {
501 @@ -299,6 +308,7 @@
502         extern int preserve_perms;
503         extern int delete_after;
504         extern int orig_umask;
505 +       extern char *link_by_hash_dir;
506         struct stats initial_stats;
507  
508         if (verbose > 2) {
509 @@ -372,7 +382,7 @@
510                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
511                         rprintf(FERROR, "fstat %s failed: %s\n",
512                                 full_fname(fnamecmp), strerror(errno));
513 -                       receive_data(f_in,NULL,-1,NULL,file->length);
514 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
515                         close(fd1);
516                         continue;
517                 }
518 @@ -385,7 +395,7 @@
519                          */
520                         rprintf(FERROR,"recv_files: %s is a directory\n",
521                                 full_fname(fnamecmp));
522 -                       receive_data(f_in, NULL, -1, NULL, file->length);
523 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
524                         close(fd1);
525                         continue;
526                 }
527 @@ -437,7 +447,7 @@
528                 if (fd2 == -1) {
529                         rprintf(FERROR, "mkstemp %s failed: %s\n",
530                                 full_fname(fnametmp), strerror(errno));
531 -                       receive_data(f_in,mapbuf,-1,NULL,file->length);
532 +                       receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
533                         if (mapbuf) unmap_file(mapbuf);
534                         if (fd1 != -1) close(fd1);
535                         continue;
536 @@ -450,7 +460,12 @@
537                 }
538  
539                 /* recv file data */
540 -               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
541 +#ifdef HAVE_LINK
542 +               if (link_by_hash_dir) {
543 +                       file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
544 +               }
545 +#endif
546 +               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
547  
548                 log_recv(file, &initial_stats);
549  
550 --- rsync.c     2004-02-23 10:22:51.000000000 -0500
551 +++ rsync.c     2004-02-23 10:27:15.000000000 -0500
552 @@ -33,6 +33,7 @@
553  extern int preserve_gid;
554  extern int preserve_perms;
555  extern int make_backups;
556 +extern char *link_by_hash_dir;
557  
558  
559  /*
560 @@ -236,6 +237,12 @@
561         if (make_backups && !make_backup(fname))
562                 return;
563  
564 +#ifdef HAVE_LINK
565 +       if (link_by_hash_dir) {
566 +               if ((ret = link_by_hash(fnametmp,fname,file)) != 0)
567 +                       return;
568 +       } else
569 +#endif
570         /* move tmp file over real file */
571         ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
572         if (ret != 0) {
573 --- rsync.h     2004-02-23 10:22:51.000000000 -0500
574 +++ rsync.h     2004-02-23 10:22:51.000000000 -0500
575 @@ -513,6 +513,14 @@
576         int current_file_index;
577  };
578  
579 +struct hashfile_struct { /* one candidate entry of a hash directory (--link-by-hash) */
580 +       struct hashfile_struct *next; /* next node; find_hashfiles links the nodes into a ring */
581 +       struct hashfile_struct *prev; /* previous node of the ring */
582 +       char *name; /* malloc'd path "<hashname>/<entry>"; freed by kill_hashfile */
583 +       int fd; /* descriptor opened read-only on name; closed by kill_hashfile */
584 +       uint32 nlink; /* st_nlink of the entry when it was stat'ed */
585 +};
586 +
587  
588  /* we need this function because of the silly way in which duplicate
589     entries are handled in the file lists - we can't change this