Updated to apply cleanly.
[rsync/rsync-patches.git] / fuzzy.diff
1 Wayne Davison wrote:
2
3 I greatly simplified the changes to generator.c, making the patch
4 easier to maintain and fixing the failing test in the testsuite.
5 Very lightly tested.
6
7 --- generator.c 30 Jun 2004 07:27:30 -0000      1.93
8 +++ generator.c 30 Jun 2004 07:43:46 -0000
9 @@ -41,6 +41,7 @@ extern int ignore_times;
10  extern int size_only;
11  extern int io_timeout;
12  extern int protocol_version;
13 +extern int fuzzy;
14  extern int always_checksum;
15  extern char *compare_dest;
16  extern int link_dest;
17 @@ -257,6 +258,94 @@ static void generate_and_send_sums(struc
18  }
19  
20  
21 +/* Split fname IN PLACE at its last '/': *dirname gets the directory part
22 + * ("." if there is no slash, "/" for a root-level name), *basename the rest. */
23 +static void split_names(char *fname, char **dirname, char **basename)
24 +{
25 +       char *slash = strrchr(fname, '/');
26 +       *basename = slash ? slash + 1 : fname;
27 +       if (slash) {
28 +               *dirname = slash == fname ? "/" : fname; /* never "" */
29 +               *slash = '\0'; /* cut the directory part off at the slash */
30 +       } else
31 +               *dirname = ".";
32 +}
33 +
34 +
35 +/* Rate `name' as a stand-in for `basename': 0 unless `name' is longer than
36 + * `ext' and ends with it, otherwise the length of their common prefix. */
37 +static unsigned int measure_name(const char *name, const char *basename,
38 +                                const char *ext)
39 +{
40 +       int name_len = strlen(name), ext_len = strlen(ext);
41 +       unsigned int common = 0;
42 +
43 +       if (name_len <= ext_len)
44 +               return 0;
45 +       if (strcmp(name + (name_len - ext_len), ext) != 0)
46 +               return 0;
47 +       while (name[common] != '\0' && name[common] == basename[common])
48 +               common++;       /* one point per shared leading byte */
49 +       return common;
50 +}
51 +
52 +
53 +/* Find the entry in fname's directory that best resembles it (same final
54 + * extension, longest shared prefix).  On a match, write its path to buf,
55 + * point *fname_ptr at buf and return link_stat()'s result; else return -1. */
56 +static int find_fuzzy(char **fname_ptr, char *buf, STRUCT_STAT *st_ptr)
57 +{
58 +       DIR *d;
59 +       struct dirent *di;
60 +       char *basename, *dirname;
61 +       char mangled_name[MAXPATHLEN], bestname[MAXPATHLEN];
62 +       unsigned int bestscore = 0;
63 +       const char *ext;
64 +       int save_errno = errno; /* restored below if nothing matches */
65 +
66 +       /* split_names() writes into its argument, so work on a copy. */
67 +       strlcpy(mangled_name, *fname_ptr, sizeof mangled_name);
68 +       split_names(mangled_name, &dirname, &basename);
69 +       if (!(d = opendir(dirname))) {
70 +               rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
71 +               return -1;
72 +       }
73 +
74 +       /* Get final extension, e.g. .gz; never the full basename though.
75 +        * An empty basename has no byte at basename + 1 to search from. */
76 +       ext = *basename ? strrchr(basename + 1, '.') : NULL;
77 +       if (!ext)
78 +               ext = basename + strlen(basename); /* ext = "" */
79 +
80 +       while ((di = readdir(d)) != NULL) {
81 +               const char *dname = d_name(di);
82 +               unsigned int score;
83 +
84 +               if (dname[0] == '.' && (dname[1] == '\0'
85 +                   || (dname[1] == '.' && dname[2] == '\0')))
86 +                       continue; /* skip "." and ".." */
87 +               score = measure_name(dname, basename, ext);
88 +               if (verbose > 4)
89 +                       rprintf(FINFO, "[%s] fuzzy score for %s = %u\n",
90 +                               who_am_i(), dname, score);
91 +               if (score > bestscore) { /* strict '>': first best entry wins */
92 +                       strlcpy(bestname, dname, sizeof bestname);
93 +                       bestscore = score;
94 +               }
95 +       }
96 +       closedir(d);
97 +       if (bestscore == 0) {
98 +               errno = save_errno; /* calls above may have changed it */
99 +               return -1;
100 +       }
101 +       pathjoin(buf, MAXPATHLEN, dirname, bestname);
102 +       if (verbose > 2)
103 +               rprintf(FINFO, "[%s] fuzzy match %s->%s\n",
104 +                       who_am_i(), *fname_ptr, buf);
105 +       *fname_ptr = buf;
106 +       return link_stat(buf, st_ptr, 0);
107 +}
108 +
109  
110  /*
111   * Acts on file number @p i from @p flist, whose name is @p fname.
112 @@ -267,12 +356,12 @@ static void generate_and_send_sums(struc
113   * out.  It might be wrong.
114   */
115  static void recv_generator(char *fname, struct file_struct *file, int i,
116 -                          int f_out)
117 +                          int f_out, int f_nameout)
118  {
119         int fd;
120         STRUCT_STAT st;
121         struct map_struct *mapbuf;
122 -       int statret;
123 +       int statret, fuzzy_file = 0;
124         char *fnamecmp;
125         char fnamecmpbuf[MAXPATHLEN];
126  
127 @@ -431,8 +520,10 @@ static void recv_generator(char *fname, 
128                 statret = link_stat(fnamecmpbuf, &st, 0);
129                 if (!S_ISREG(st.st_mode))
130                         statret = -1;
131 -               if (statret == -1)
132 +               if (statret < 0) {
133                         errno = saveerrno;
134 +                       *fnamecmpbuf = '\0';
135 +               }
136  #if HAVE_LINK
137                 else if (link_dest && !dry_run) {
138                         if (do_link(fnamecmpbuf, fname) != 0) {
139 @@ -440,18 +531,30 @@ static void recv_generator(char *fname, 
140                                         rsyserr(FINFO, errno, "link %s => %s",
141                                                 fnamecmpbuf, fname);
142                                 }
143 -                       }
144 -                       fnamecmp = fnamecmpbuf;
145 +                               fnamecmp = fnamecmpbuf;
146 +                       } else
147 +                               *fnamecmpbuf = '\0';
148                 }
149  #endif
150                 else
151                         fnamecmp = fnamecmpbuf;
152 +       } else
153 +               *fnamecmpbuf = '\0';
154 +
155 +       if (statret == -1 && fuzzy) {
156 +               statret = find_fuzzy(&fnamecmp, fnamecmpbuf, &st);
157 +               if (!S_ISREG(st.st_mode))
158 +                       statret = -1;
159 +               else
160 +                       fuzzy_file = 1;
161         }
162  
163         if (statret == -1) {
164                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
165                         return;
166                 if (errno == ENOENT) {
167 +                       if (f_nameout >= 0)
168 +                               write(f_nameout, "", 1);
169                         write_int(f_out,i);
170                         if (!dry_run)
171                                 write_sum_head(f_out, NULL);
172 @@ -471,37 +574,43 @@ static void recv_generator(char *fname, 
173                 /* now pretend the file didn't exist */
174                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
175                         return;
176 +               if (f_nameout >= 0)
177 +                       write(f_nameout, "", 1);
178                 write_int(f_out,i);
179                 if (!dry_run)
180                         write_sum_head(f_out, NULL);
181                 return;
182         }
183  
184 -       if (opt_ignore_existing && fnamecmp == fname) {
185 +       if (opt_ignore_existing && !*fnamecmpbuf) {
186                 if (verbose > 1)
187                         rprintf(FINFO,"%s exists\n",fname);
188                 return;
189         }
190  
191 -       if (update_only && fnamecmp == fname
192 +       if (update_only && !*fnamecmpbuf
193             && cmp_modtime(st.st_mtime, file->modtime) > 0) {
194                 if (verbose > 1)
195                         rprintf(FINFO,"%s is newer\n",fname);
196                 return;
197         }
198  
199 -       if (skip_file(fname, file, &st)) {
200 -               if (fnamecmp == fname)
201 +       if (!fuzzy_file && skip_file(fname, file, &st)) {
202 +               if (!*fnamecmpbuf)
203                         set_perms(fname, file, &st, PERMS_REPORT);
204                 return;
205         }
206  
207         if (dry_run) {
208 +               if (f_nameout >= 0)
209 +                       write(f_nameout, "", 1);
210                 write_int(f_out,i);
211                 return;
212         }
213  
214         if (disable_deltas_p()) {
215 +               if (f_nameout >= 0)
216 +                       write(f_nameout, "", 1);
217                 write_int(f_out,i);
218                 write_sum_head(f_out, NULL);
219                 return;
220 @@ -516,6 +625,8 @@ static void recv_generator(char *fname, 
221                 /* pretend the file didn't exist */
222                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
223                         return;
224 +               if (f_nameout >= 0)
225 +                       write(f_nameout, "", 1);
226                 write_int(f_out,i);
227                 write_sum_head(f_out, NULL);
228                 return;
229 @@ -534,6 +645,8 @@ static void recv_generator(char *fname, 
230         if (verbose > 2)
231                 rprintf(FINFO, "generating and sending sums for %d\n", i);
232  
233 +       if (f_nameout >= 0)
234 +               write(f_nameout, fnamecmpbuf, strlen(fnamecmpbuf) + 1);
235         write_int(f_out,i);
236         generate_and_send_sums(mapbuf, st.st_size, f_out);
237  
238 @@ -543,7 +656,8 @@ static void recv_generator(char *fname, 
239  }
240  
241  
242 -void generate_files(int f, struct file_list *flist, char *local_name)
243 +void generate_files(int f, struct file_list *flist, char *local_name,
244 +                   int f_nameout)
245  {
246         int i;
247         int phase = 0;
248 @@ -584,7 +698,7 @@ void generate_files(int f, struct file_l
249                 }
250  
251                 recv_generator(local_name ? local_name : f_name_to(file, fbuf),
252 -                              file, i, f);
253 +                              file, i, f, f_nameout);
254         }
255  
256         phase++;
257 @@ -601,7 +715,7 @@ void generate_files(int f, struct file_l
258         while ((i = get_redo_num()) != -1) {
259                 struct file_struct *file = flist->files[i];
260                 recv_generator(local_name ? local_name : f_name_to(file, fbuf),
261 -                              file, i, f);
262 +                              file, i, f, f_nameout);
263         }
264  
265         phase++;
266 @@ -620,7 +734,7 @@ void generate_files(int f, struct file_l
267                 if (!file->basename || !S_ISDIR(file->mode))
268                         continue;
269                 recv_generator(local_name ? local_name : f_name(file),
270 -                              file, i, -1);
271 +                              file, i, -1, -1);
272         }
273  
274         if (verbose > 2)
275 --- main.c      30 Jun 2004 07:27:30 -0000      1.202
276 +++ main.c      30 Jun 2004 07:43:47 -0000
277 @@ -429,7 +429,7 @@ static int do_recv(int f_in,int f_out,st
278  {
279         int pid;
280         int status = 0;
281 -       int error_pipe[2];
282 +       int error_pipe[2], name_pipe[2];
283  
284         if (preserve_hard_links)
285                 init_hard_links(flist);
286 @@ -441,8 +441,8 @@ static int do_recv(int f_in,int f_out,st
287                 }
288         }
289  
290 -       if (fd_pair(error_pipe) < 0) {
291 -               rprintf(FERROR,"error pipe failed in do_recv\n");
292 +       if (fd_pair(error_pipe) < 0 || fd_pair(name_pipe) < 0) {
293 +               rprintf(FERROR, "fd_pair() failed in do_recv\n");
294                 exit_cleanup(RERR_SOCKETIO);
295         }
296  
297 @@ -450,8 +450,10 @@ static int do_recv(int f_in,int f_out,st
298  
299         if ((pid = do_fork()) == 0) {
300                 close(error_pipe[0]);
301 +               close(name_pipe[1]);
302                 if (f_in != f_out)
303                         close(f_out);
304 +               set_blocking(name_pipe[0]);
305  
306                 /* we can't let two processes write to the socket at one time */
307                 io_multiplexing_close();
308 @@ -459,7 +461,7 @@ static int do_recv(int f_in,int f_out,st
309                 /* set place to send errors */
310                 set_msg_fd_out(error_pipe[1]);
311  
312 -               recv_files(f_in,flist,local_name);
313 +               recv_files(f_in, flist, local_name, name_pipe[0]);
314                 io_flush(FULL_FLUSH);
315                 report(f_in);
316  
317 @@ -475,14 +477,16 @@ static int do_recv(int f_in,int f_out,st
318         am_generator = 1;
319  
320         close(error_pipe[1]);
321 +       close(name_pipe[0]);
322         if (f_in != f_out)
323                 close(f_in);
324 +       set_blocking(name_pipe[1]);
325  
326         io_start_buffering_out(f_out);
327  
328         set_msg_fd_in(error_pipe[0]);
329  
330 -       generate_files(f_out, flist, local_name);
331 +       generate_files(f_out, flist, local_name, name_pipe[1]);
332  
333         get_redo_num(); /* Read final MSG_DONE and any prior messages. */
334         report(-1);
335 --- options.c   20 Jun 2004 19:47:05 -0000      1.157
336 +++ options.c   30 Jun 2004 07:43:47 -0000
337 @@ -94,6 +94,7 @@ int ignore_errors = 0;
338  int modify_window = 0;
339  int blocking_io = -1;
340  int checksum_seed = 0;
341 +int fuzzy = 0;
342  unsigned int block_size = 0;
343  
344  
345 @@ -270,6 +271,7 @@ void usage(enum logcode F)
346    rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
347    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
348    rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
349 +  rprintf(F,"     --fuzzy                 use similar file as basis if basis doesn't exist\n");
350    rprintf(F," -P                          equivalent to --partial --progress\n");
351    rprintf(F," -z, --compress              compress file data\n");
352    rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
353 @@ -368,6 +370,7 @@ static struct poptOption long_options[] 
354    {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
355    {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
356    {"link-dest",        0,  POPT_ARG_STRING, &compare_dest,  OPT_LINK_DEST, 0, 0 },
357 +  {"fuzzy",            0,  POPT_ARG_NONE,   &fuzzy, 0, 0, 0 },
358    /* TODO: Should this take an optional int giving the compression level? */
359    {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
360    {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
361 @@ -989,6 +992,9 @@ void server_options(char **args,int *arg
362                 }
363         }
364  
365 +       if (fuzzy && am_sender)
366 +               args[ac++] = "--fuzzy";
367 +
368         *argc = ac;
369         return;
370  
371 --- receiver.c  30 Jun 2004 07:27:30 -0000      1.84
372 +++ receiver.c  30 Jun 2004 07:43:47 -0000
373 @@ -36,7 +36,6 @@ extern int preserve_perms;
374  extern int cvs_exclude;
375  extern int io_error;
376  extern char *tmpdir;
377 -extern char *compare_dest;
378  extern int make_backups;
379  extern int do_progress;
380  extern char *backup_dir;
381 @@ -293,14 +292,15 @@ static int receive_data(int f_in,struct 
382   * main routine for receiver process.
383   *
384   * Receiver process runs on the same host as the generator process. */
385 -int recv_files(int f_in,struct file_list *flist,char *local_name)
386 +int recv_files(int f_in, struct file_list *flist, char *local_name,
387 +              int f_name)
388  {
389         int fd1,fd2;
390         STRUCT_STAT st;
391         char *fname, fbuf[MAXPATHLEN];
392         char template[MAXPATHLEN];
393         char fnametmp[MAXPATHLEN];
394 -       char *fnamecmp;
395 +       char *fnamecmp, *cp;
396         char fnamecmpbuf[MAXPATHLEN];
397         struct map_struct *mapbuf;
398         struct file_struct *file;
399 @@ -364,19 +364,19 @@ int recv_files(int f_in,struct file_list
400                 if (verbose > 2)
401                         rprintf(FINFO,"recv_files(%s)\n",fname);
402  
403 -               fnamecmp = fname;
404 +               for (cp = fnamecmpbuf; ; cp++) {
405 +                       if (read(f_name, cp, 1) <= 0) {
406 +                               rsyserr(FERROR, errno, "fname-pipe read failed");
407 +                               exit_cleanup(RERR_PROTOCOL);
408 +                       }
409 +                       if (!*cp)
410 +                               break;
411 +               }
412 +               fnamecmp = *fnamecmpbuf ? fnamecmpbuf : fname;
413  
414                 /* open the file */
415                 fd1 = do_open(fnamecmp, O_RDONLY, 0);
416  
417 -               if (fd1 == -1 && compare_dest != NULL) {
418 -                       /* try the file at compare_dest instead */
419 -                       pathjoin(fnamecmpbuf, sizeof fnamecmpbuf,
420 -                                compare_dest, fname);
421 -                       fnamecmp = fnamecmpbuf;
422 -                       fd1 = do_open(fnamecmp, O_RDONLY, 0);
423 -               }
424 -
425                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
426                         rsyserr(FERROR, errno, "fstat %s failed",
427                                 full_fname(fnamecmp));
428 @@ -385,7 +385,7 @@ int recv_files(int f_in,struct file_list
429                         continue;
430                 }
431  
432 -               if (fd1 != -1 && S_ISDIR(st.st_mode) && fnamecmp == fname) {
433 +               if (fd1 != -1 && S_ISDIR(st.st_mode) && !*fnamecmpbuf) {
434                         /* this special handling for directories
435                          * wouldn't be necessary if robust_rename()
436                          * and the underlying robust_unlink could cope
437 --- rsync.yo    5 Jun 2004 16:16:30 -0000       1.171
438 +++ rsync.yo    30 Jun 2004 07:43:48 -0000
439 @@ -325,6 +325,7 @@ verb(
440   -T  --temp-dir=DIR          create temporary files in directory DIR
441       --compare-dest=DIR      also compare received files relative to DIR
442       --link-dest=DIR         create hardlinks to DIR for unchanged files
443 +     --fuzzy                 use similar file as basis if basis doesn't exist
444   -P                          equivalent to --partial --progress
445   -z, --compress              compress file data
446   -C, --cvs-exclude           auto ignore files in the same way CVS does