This is now in CVS.
[rsync/rsync-patches.git] / fuzzy.diff
1 From rusty@rustcorp.com.au Wed Apr  3 17:18:42 2002
2 Return-Path: <rusty@rustcorp.com.au>
3 Delivered-To: mbp@samba.org
4 Received: from wagner.rustcorp.com.au (sydney1.au.ibm.com [202.135.142.193])
5         by lists.samba.org (Postfix) with ESMTP id EA7B849DC
6         for <mbp@samba.org>; Tue,  2 Apr 2002 23:06:29 -0800 (PST)
7 Received: from wagner.rustcorp.com.au ([127.0.0.1] helo=rustcorp.com.au)
8         by wagner.rustcorp.com.au with esmtp (Exim 3.35 #1 (Debian))
9         id 16set7-0000pL-00
10         for <mbp@samba.org>; Wed, 03 Apr 2002 17:08:57 +1000
11 From: Rusty Russell <rusty@rustcorp.com.au>
12 To: Martin Pool <mbp@samba.org>
13 Subject: Re: gzip patch 
14 In-reply-to: Your message of "Wed, 03 Apr 2002 12:04:59 +1000."
15              <20020403020455.GC18851@samba.org> 
16 Date: Wed, 03 Apr 2002 17:08:57 +1000
17 Sender: rusty@rustcorp.com.au
18 Message-Id: <E16set7-0000pL-00@wagner.rustcorp.com.au>
19 Status: RO
20 X-Status: A
21 Content-Length: 12810
22 Lines: 461
23
24 In message <20020403020455.GC18851@samba.org> you write:
25 > Hi,
26
27 > I think you said the other day that you had a working --rsyncable
28 > patch for gzip.  Could I have it please?
29
30 Hi Martin,
31
32         Just got your mail, sorry for the delay.  Found old patch on
33 google, and updated it for 2.5.4 (I know, but that's what apt-get
34 source gave me).
35
36 Compiles, otherwise untested.
37 Rusty.
38 --
39   Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
40
41 diff -urN rsync-2.5.4/Makefile.in rsync-2.5.4-fuzzy/Makefile.in
42 --- rsync-2.5.4/Makefile.in     Tue Feb 26 05:48:25 2002
43 +++ rsync-2.5.4-fuzzy/Makefile.in       Wed Apr  3 16:35:55 2002
44 @@ -28,7 +28,7 @@
45  ZLIBOBJ=zlib/deflate.o zlib/infblock.o zlib/infcodes.o zlib/inffast.o \
46         zlib/inflate.o zlib/inftrees.o zlib/infutil.o zlib/trees.o \
47         zlib/zutil.o zlib/adler32.o 
48 -OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o
49 +OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o alternate.o
50  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o fileio.o batch.o \
51         clientname.o
52  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
53 diff -urN rsync-2.5.4/alternate.c rsync-2.5.4-fuzzy/alternate.c
54 --- rsync-2.5.4/alternate.c     Thu Jan  1 10:00:00 1970
55 +++ rsync-2.5.4-fuzzy/alternate.c       Wed Apr  3 17:04:15 2002
56 @@ -0,0 +1,117 @@
57 +#include "rsync.h"
58 +
59 +extern char *compare_dest;
60 +extern int verbose;
61 +
62 +/* Alternate methods for opening a basis file, if the local file doesn't exist */
63 +/* Sanity check that we are about to open a regular file */
64 +int do_open_regular(char *fname)
65 +{
66 +       STRUCT_STAT st;
67 +
68 +       if (do_stat(fname, &st) == 0 && S_ISREG(st.st_mode))
69 +               return do_open(fname, O_RDONLY, 0);
70 +
71 +       return -1;
72 +}
73 +
74 +static void split_names(char *fname, char **dirname, char **basename)
75 +{
76 +       char *slash;
77 +
78 +       slash = strrchr(fname, '/');
79 +       if (slash) {
80 +               *dirname = fname;
81 +               *slash = '\0';
82 +               *basename = slash+1;
83 +       } else {
84 +               *basename = fname;
85 +               *dirname = ".";
86 +       }
87 +}
88 +
89 +static unsigned int measure_name(const char *name,
90 +                                const char *basename,
91 +                                const char *ext)
92 +{
93 +       int namelen = strlen(name);
94 +       int extlen = strlen(ext);
95 +       unsigned int score = 0;
96 +
97 +       /* Extensions must match */
98 +       if (namelen <= extlen || strcmp(name+namelen-extlen, ext) != 0)
99 +               return 0;
100 +
101 +       /* Now score depends on similarity of prefix */
102 +       for (; *name==*basename && *name; name++, basename++)
103 +               score++;
104 +       return score;
105 +}
106 +
107 +int open_alternate_base_fuzzy(const char *fname)
108 +{
109 +       DIR *d;
110 +       struct dirent *di;
111 +       char *basename, *dirname;
112 +       char mangled_name[MAXPATHLEN];
113 +       char bestname[MAXPATHLEN];
114 +       unsigned int bestscore = 0;
115 +       const char *ext;
116 +
117 +       /* FIXME: can we assume fname fits here? */
118 +       strcpy(mangled_name, fname);
119 +
120 +       split_names(mangled_name, &dirname, &basename);
121 +       d = opendir(dirname);
122 +       if (!d) {
123 +               rprintf(FERROR,"recv_generator opendir(%s): %s\n",
124 +                       dirname,strerror(errno));
125 +               return -1;
126 +       }
127 +
128 +       /* Get the final extension, e.g. ".gz"; never the full basename though. */
129 +       ext = strrchr(basename + 1, '.');
130 +       if (!ext)
131 +               ext = basename + strlen(basename); /* ext = "" */
132 +
133 +       while ((di = readdir(d)) != NULL) {
134 +               const char *dname = d_name(di);
135 +               unsigned int score;
136 +
137 +               if (strcmp(dname,".")==0 ||
138 +                   strcmp(dname,"..")==0)
139 +                       continue;
140 +               
141 +               score = measure_name(dname, basename, ext);
142 +               if (verbose > 4)
143 +                       rprintf(FINFO,"fuzzy score for %s = %u\n",
144 +                               dname, score);
145 +               if (score > bestscore) {
146 +                       strcpy(bestname, dname); 
147 +                       bestscore = score;
148 +               }
149 +       }
150 +       closedir(d);
151 +
152 +       /* Found a candidate. */
153 +       if (bestscore != 0) {
154 +               char fuzzyname[MAXPATHLEN];
155 +
156 +               snprintf(fuzzyname,MAXPATHLEN,"%s/%s", dirname, bestname);
157 +               if (verbose > 2)
158 +                       rprintf(FINFO,"fuzzy match %s->%s\n",
159 +                               fname, fuzzyname);
160 +               return do_open_regular(fuzzyname);
161 +       }
162 +       return -1;
163 +}
164 +
165 +int open_alternate_base_comparedir(const char *fname)
166 +{
167 +       char fnamebuf[MAXPATHLEN];
168 +       /* try the file at compare_dest instead */
169 +       snprintf(fnamebuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
170 +
171 +       /* FIXME: now follows symlinks... */
172 +       return do_open_regular(fnamebuf);
173 +}
174 diff -urN rsync-2.5.4/generator.c rsync-2.5.4-fuzzy/generator.c
175 --- rsync-2.5.4/generator.c     Fri Feb  8 03:36:12 2002
176 +++ rsync-2.5.4-fuzzy/generator.c       Wed Apr  3 17:00:06 2002
177 @@ -42,11 +42,12 @@
178  extern int always_checksum;
179  extern int modify_window;
180  extern char *compare_dest;
181 +extern int fuzzy;
182  
183  
184  /* choose whether to skip a particular file */
185  static int skip_file(char *fname,
186 -                    struct file_struct *file, STRUCT_STAT *st)
187 +                    struct file_struct *file, const STRUCT_STAT *st)
188  {
189         if (st->st_size != file->length) {
190                 return 0;
191 @@ -185,7 +186,61 @@
192         return s;
193  }
194  
195 +/* Returns an open fd on success, -1 if no basis file could be opened (treat as a null file), -2 if the file should be skipped */
196 +static int open_base_file(struct file_struct *file,
197 +                         char *fname, 
198 +                         int statret, 
199 +                         STRUCT_STAT *st)
200 +{
201 +       int fd = -1;
202 +
203 +       if (statret == 0) {
204 +               if (S_ISREG(st->st_mode)) {
205 +                       if (update_only
206 +                           && cmp_modtime(st->st_mtime, file->modtime) > 0) {
207 +                               if (verbose > 1)
208 +                                       rprintf(FINFO,"%s is newer\n",fname);
209 +                               return -2;
210 +                       }
211 +                       if (skip_file(fname, file, st)) {
212 +                               set_perms(fname, file, st, 1);
213 +                               return -2;
214 +                       }
215 +                       fd = do_open(fname, O_RDONLY, 0);
216 +                       if (fd == -1) {
217 +                               rprintf(FERROR,"failed to open %s, continuing : %s\n",fname,strerror(errno));
218 +                               return -1;
219 +                       } else
220 +                               return fd;
221 +               } else {
222 +                       /* Try to use symlink contents */
223 +                       if (S_ISLNK(st->st_mode)) {
224 +                               fd = do_open_regular(fname);
225 +                               /* Don't delete yet; receiver will need it */
226 +                       } else {
227 +                               if (delete_file(fname) != 0) {
228 +                                       if (fd != -1)
229 +                                               close(fd);
230 +                                       return -2;
231 +                               }
232 +                       }
233 +               }
234 +       }
235 +
236 +       if (fd == -1 && compare_dest != NULL)
237 +               fd = open_alternate_base_comparedir(fname);
238  
239 +       if (fd == -1 && fuzzy)
240 +               fd = open_alternate_base_fuzzy(fname);
241 +
242 +       /* Refresh *st from the opened basis file so the caller sees its size */
243 +       if (fd != -1) {
244 +               if (do_fstat(fd, st) != 0)
245 +                       rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
246 +       }
247 +
248 +       return fd;
249 +}
250  
251  /*
252   * Acts on file number I from FLIST, whose name is fname.
253 @@ -203,9 +258,6 @@
254         struct sum_struct *s;
255         int statret;
256         struct file_struct *file = flist->files[i];
257 -       char *fnamecmp;
258 -       char fnamecmpbuf[MAXPATHLEN];
259 -       extern char *compare_dest;
260         extern int list_only;
261         extern int preserve_perms;
262         extern int only_existing;
263 @@ -341,82 +393,29 @@
264                 return;
265         }
266  
267 -       fnamecmp = fname;
268 -
269 -       if ((statret == -1) && (compare_dest != NULL)) {
270 -               /* try the file at compare_dest instead */
271 -               int saveerrno = errno;
272 -               snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
273 -               statret = link_stat(fnamecmpbuf,&st);
274 -               if (!S_ISREG(st.st_mode))
275 -                       statret = -1;
276 -               if (statret == -1)
277 -                       errno = saveerrno;
278 -               else
279 -                       fnamecmp = fnamecmpbuf;
280 -       }
281 -
282 -       if (statret == -1) {
283 -               if (errno == ENOENT) {
284 -                       write_int(f_out,i);
285 -                       if (!dry_run) send_sums(NULL,f_out);
286 -               } else {
287 -                       if (verbose > 1)
288 -                               rprintf(FERROR, RSYNC_NAME
289 -                                       ": recv_generator failed to open \"%s\": %s\n",
290 -                                       fname, strerror(errno));
291 -               }
292 -               return;
293 -       }
294 -
295 -       if (!S_ISREG(st.st_mode)) {
296 -               if (delete_file(fname) != 0) {
297 -                       return;
298 -               }
299 -
300 -               /* now pretend the file didn't exist */
301 -               write_int(f_out,i);
302 -               if (!dry_run) send_sums(NULL,f_out);    
303 -               return;
304 -       }
305 -
306 -       if (opt_ignore_existing && fnamecmp == fname) { 
307 -               if (verbose > 1)
308 -                       rprintf(FINFO,"%s exists\n",fname);
309 -               return;
310 -       } 
311 -
312 -       if (update_only && cmp_modtime(st.st_mtime,file->modtime)>0 && fnamecmp == fname) {
313 +       /* Failed to stat for some other reason. */
314 +       if (statret == -1 && errno != ENOENT) {
315                 if (verbose > 1)
316 -                       rprintf(FINFO,"%s is newer\n",fname);
317 +                       rprintf(FERROR, RSYNC_NAME
318 +                               ": recv_generator failed to open \"%s\": %s\n",
319 +                               fname, strerror(errno));
320                 return;
321         }
322  
323 -       if (skip_file(fname, file, &st)) {
324 -               if (fnamecmp == fname)
325 -                       set_perms(fname,file,&st,1);
326 -               return;
327 -       }
328 -
329 -       if (dry_run) {
330 -               write_int(f_out,i);
331 +       fd = open_base_file(file, fname, statret, &st);
332 +       if (fd == -2)
333                 return;
334 -       }
335 -
336 -       if (whole_file) {
337 -               write_int(f_out,i);
338 -               send_sums(NULL,f_out);    
339 -               return;
340 -       }
341 -
342 -       /* open the file */  
343 -       fd = do_open(fnamecmp, O_RDONLY, 0);
344  
345 -       if (fd == -1) {
346 -               rprintf(FERROR,RSYNC_NAME": failed to open \"%s\", continuing : %s\n",fnamecmp,strerror(errno));
347 -               /* pretend the file didn't exist */
348 +       if ((whole_file || dry_run) && fd != -1) {
349 +               close(fd);
350 +               fd = -1;
351 +       }
352
353 +       if (fd == -1) {
354 +               /* the file didn't exist, or we can pretend it doesn't */
355                 write_int(f_out,i);
356 -               send_sums(NULL,f_out);
357 +               if (!dry_run)
358 +                       send_sums(NULL,f_out);
359                 return;
360         }
361  
362 @@ -427,7 +426,7 @@
363         }
364  
365         if (verbose > 3)
366 -               rprintf(FINFO,"gen mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
367 +               rprintf(FINFO,"gen mapped %s of size %.0f\n",fname,(double)st.st_size);
368  
369         s = generate_sums(buf,st.st_size,adapt_block_size(file, block_size));
370  
371 diff -urN rsync-2.5.4/options.c rsync-2.5.4-fuzzy/options.c
372 --- rsync-2.5.4/options.c       Thu Feb 28 09:49:57 2002
373 +++ rsync-2.5.4-fuzzy/options.c Wed Apr  3 16:43:54 2002
374 @@ -73,6 +73,7 @@
375  #else
376  int modify_window=0;
377  #endif
378 +int fuzzy=0;
379  int blocking_io=-1;
380  
381  /** Network address family. **/
382 @@ -245,6 +246,7 @@
383    rprintf(F,"     --bwlimit=KBPS          limit I/O bandwidth, KBytes per second\n");
384    rprintf(F,"     --write-batch=PREFIX    write batch fileset starting with PREFIX\n");
385    rprintf(F,"     --read-batch=PREFIX     read batch fileset starting with PREFIX\n");
386 +  rprintf(F,"     --fuzzy                use similar file as basis if it doesn't exist\n");
387    rprintf(F," -h, --help                  show this help screen\n");
388  #ifdef INET6
389    rprintf(F," -4                          prefer IPv4\n");
390 @@ -340,6 +342,7 @@
391    {"hard-links",      'H', POPT_ARG_NONE,   &preserve_hard_links},
392    {"read-batch",       0,  POPT_ARG_STRING, &batch_prefix, OPT_READ_BATCH},
393    {"write-batch",      0,  POPT_ARG_STRING, &batch_prefix, OPT_WRITE_BATCH},
394 +  {"fuzzy",           0,  POPT_ARG_NONE,   &fuzzy},
395  #ifdef INET6
396    {0,                '4', POPT_ARG_VAL,    &default_af_hint,   AF_INET },
397    {0,                '6', POPT_ARG_VAL,    &default_af_hint,   AF_INET6 },
398 @@ -757,7 +760,9 @@
399                 args[ac++] = "--compare-dest";
400                 args[ac++] = compare_dest;
401         }
402 -
403 +       
404 +       if (fuzzy && am_sender)
405 +               args[ac++] = "--fuzzy";
406  
407         *argc = ac;
408  }
409 diff -urN rsync-2.5.4/proto.h rsync-2.5.4-fuzzy/proto.h
410 --- rsync-2.5.4/proto.h Sat Feb 23 11:05:06 2002
411 +++ rsync-2.5.4-fuzzy/proto.h   Wed Apr  3 16:35:25 2002
412 @@ -256,3 +256,6 @@
413  int cmp_modtime(time_t file1, time_t file2);
414  int _Insure_trap_error(int a1, int a2, int a3, int a4, int a5, int a6);
415  int sys_gettimeofday(struct timeval *tv);
416 +int do_open_regular(char *fname);
417 +int open_alternate_base_fuzzy(const char *fname);
418 +int open_alternate_base_comparedir(const char *fname);
419 diff -urN rsync-2.5.4/receiver.c rsync-2.5.4-fuzzy/receiver.c
420 --- rsync-2.5.4/receiver.c      Thu Feb 14 05:42:20 2002
421 +++ rsync-2.5.4-fuzzy/receiver.c        Wed Apr  3 16:46:46 2002
422 @@ -36,6 +36,7 @@
423  extern char *compare_dest;
424  extern int make_backups;
425  extern char *backup_suffix;
426 +extern int fuzzy;
427  
428  static struct delete_list {
429         DEV64_T dev;
430 @@ -307,8 +308,6 @@
431         char *fname;
432         char template[MAXPATHLEN];
433         char fnametmp[MAXPATHLEN];
434 -       char *fnamecmp;
435 -       char fnamecmpbuf[MAXPATHLEN];
436         struct map_struct *buf;
437         int i;
438         struct file_struct *file;
439 @@ -366,28 +365,24 @@
440                 if (verbose > 2)
441                         rprintf(FINFO,"recv_files(%s)\n",fname);
442  
443 -               fnamecmp = fname;
444 -
445                 /* open the file */  
446 -               fd1 = do_open(fnamecmp, O_RDONLY, 0);
447 +               fd1 = do_open(fname, O_RDONLY, 0);
448  
449 -               if ((fd1 == -1) && (compare_dest != NULL)) {
450 -                       /* try the file at compare_dest instead */
451 -                       snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",
452 -                                               compare_dest,fname);
453 -                       fnamecmp = fnamecmpbuf;
454 -                       fd1 = do_open(fnamecmp, O_RDONLY, 0);
455 -               }
456 +               if (fd1 == -1 && compare_dest != NULL)
457 +                       fd1 = open_alternate_base_comparedir(fname);
458 +
459 +               if (fd1 == -1 && fuzzy)
460 +                       fd1 = open_alternate_base_fuzzy(fname);
461  
462                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
463 -                       rprintf(FERROR,"fstat %s : %s\n",fnamecmp,strerror(errno));
464 +                       rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
465                         receive_data(f_in,NULL,-1,NULL,file->length);
466                         close(fd1);
467                         continue;
468                 }
469  
470                 if (fd1 != -1 && !S_ISREG(st.st_mode)) {
471 -                       rprintf(FERROR,"%s : not a regular file (recv_files)\n",fnamecmp);
472 +                       rprintf(FERROR,"%s : not a regular file (recv_files)\n",fname);
473                         receive_data(f_in,NULL,-1,NULL,file->length);
474                         close(fd1);
475                         continue;
476 @@ -403,7 +398,7 @@
477                 if (fd1 != -1 && st.st_size > 0) {
478                         buf = map_file(fd1,st.st_size);
479                         if (verbose > 2)
480 -                               rprintf(FINFO,"recv mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
481 +                               rprintf(FINFO,"recv mapped %s of size %.0f\n",fname,(double)st.st_size);
482                 } else {
483                         buf = NULL;
484                 }
485