Incorporated the g2r-basis-filename.diff changes so that we don't
[rsync/rsync-patches.git] / fuzzy.diff
1 Wayne Davison wrote:
2
3 I greatly simplified the changes to generator.c, making the patch
4 easier to maintain and fixing the failing test in the testsuite.
5 Very lightly tested.
6
7 --- generator.c 29 Jun 2004 19:19:00 -0000      1.92
8 +++ generator.c 30 Jun 2004 07:35:56 -0000
9 @@ -41,6 +41,7 @@ extern int ignore_times;
10  extern int size_only;
11  extern int io_timeout;
12  extern int protocol_version;
13 +extern int fuzzy;
14  extern int always_checksum;
15  extern char *compare_dest;
16  extern int link_dest;
17 @@ -257,6 +258,94 @@ static void generate_and_send_sums(struc
18  }
19  
20  
21 +static void split_names(char *fname, char **dirname, char **basename)
22 +{ /* Split fname at its last slash into *dirname and *basename. */
23 +       char *slash = strrchr(fname, '/');
24 +       if (slash) {
25 +               *dirname = fname;
26 +               *slash = '\0'; /* modifies fname in place so *dirname ends at the slash */
27 +               *basename = slash+1;
28 +       } else {
29 +               *basename = fname;
30 +               *dirname = "."; /* no slash in fname: directory is a string literal, not part of fname */
31 +       }
32 +}
33 +
34 +/* Return 0 unless name ends in ext and is longer than it; else the length of the prefix shared with basename. */
35 +static unsigned int measure_name(const char *name, const char *basename,
36 +                                const char *ext)
37 +{
38 +       int namelen = strlen(name);
39 +       int extlen = strlen(ext);
40 +       unsigned int score = 0;
41 +
42 +       /* Extensions must match, and name must be longer than the extension alone */
43 +       if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0)
44 +               return 0;
45 +
46 +       /* Score is the count of leading characters name has in common with basename */
47 +       for (; *name == *basename && *name; name++, basename++)
48 +               score++;
49 +       return score;
50 +}
51 +
52 +/* Find the best-scoring name in the directory of *fname_ptr; on a match, put its path in buf and repoint *fname_ptr. */
53 +static int find_fuzzy(char **fname_ptr, char *buf, STRUCT_STAT *st_ptr)
54 +{
55 +       DIR *d;
56 +       struct dirent *di;
57 +       char *basename, *dirname;
58 +       char mangled_name[MAXPATHLEN];
59 +       char bestname[MAXPATHLEN];
60 +       unsigned int bestscore = 0;
61 +       const char *ext;
62 +
63 +       strlcpy(mangled_name, *fname_ptr, sizeof mangled_name); /* work on a copy: split_names() writes into its argument */
64 +
65 +       split_names(mangled_name, &dirname, &basename);
66 +       if (!(d = opendir(dirname))) {
67 +               rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
68 +               return -1;
69 +       }
70 +
71 +       /* Get final extension, eg. .gz; the search starts at basename + 1 so a leading dot never makes the full basename the extension. */
72 +       ext = strrchr(basename + 1, '.');
73 +       if (!ext)
74 +               ext = basename + strlen(basename); /* ext = "" */
75 +
76 +       while ((di = readdir(d)) != NULL) {
77 +               const char *dname = d_name(di);
78 +               unsigned int score;
79 +
80 +               if (dname[0] == '.' && (dname[1] == '\0'
81 +                   || (dname[1] == '.' && dname[2] == '\0')))
82 +                       continue; /* skip the . and .. entries */
83 +
84 +               score = measure_name(dname, basename, ext);
85 +               if (verbose > 4) {
86 +                       rprintf(FINFO, "[%s] fuzzy score for %s = %u\n",
87 +                               who_am_i(), dname, score);
88 +               }
89 +               if (score > bestscore) { /* strict comparison: the first entry read wins a tie */
90 +                       strlcpy(bestname, dname, sizeof bestname);
91 +                       bestscore = score;
92 +               }
93 +       }
94 +       closedir(d);
95 +
96 +       /* Found a candidate. */
97 +       if (bestscore != 0) {
98 +               pathjoin(buf, MAXPATHLEN, dirname, bestname); /* buf is treated as MAXPATHLEN bytes long */
99 +               if (verbose > 2) {
100 +                       rprintf(FINFO, "[%s] fuzzy match %s->%s\n",
101 +                               who_am_i(), *fname_ptr, buf);
102 +               }
103 +               *fname_ptr = buf;
104 +               return link_stat(buf, st_ptr, 0); /* result of link_stat() on the chosen file */
105 +       }
106 +       return -1; /* no entry scored above zero; *fname_ptr and *st_ptr are left untouched */
107 +}
108 +
109  
110  /*
111   * Acts on file number @p i from @p flist, whose name is @p fname.
112 @@ -267,12 +356,12 @@ static void generate_and_send_sums(struc
113   * out.  It might be wrong.
114   */
115  static void recv_generator(char *fname, struct file_struct *file, int i,
116 -                          int f_out)
117 +                          int f_out, int f_nameout)
118  {
119         int fd;
120         STRUCT_STAT st;
121         struct map_struct *mapbuf;
122 -       int statret;
123 +       int statret, fuzzy_file = 0;
124         char *fnamecmp;
125         char fnamecmpbuf[MAXPATHLEN];
126  
127 @@ -431,8 +520,10 @@ static void recv_generator(char *fname, 
128                 statret = link_stat(fnamecmpbuf, &st, 0);
129                 if (!S_ISREG(st.st_mode))
130                         statret = -1;
131 -               if (statret == -1)
132 +               if (statret < 0) {
133                         errno = saveerrno;
134 +                       *fnamecmpbuf = '\0';
135 +               }
136  #if HAVE_LINK
137                 else if (link_dest && !dry_run) {
138                         if (do_link(fnamecmpbuf, fname) != 0) {
139 @@ -440,18 +531,30 @@ static void recv_generator(char *fname, 
140                                         rsyserr(FINFO, errno, "link %s => %s",
141                                                 fnamecmpbuf, fname);
142                                 }
143 -                       }
144 -                       fnamecmp = fnamecmpbuf;
145 +                               fnamecmp = fnamecmpbuf;
146 +                       } else
147 +                               *fnamecmpbuf = '\0';
148                 }
149  #endif
150                 else
151                         fnamecmp = fnamecmpbuf;
152 +       } else
153 +               *fnamecmpbuf = '\0';
154 +
155 +       if (statret == -1 && fuzzy) {
156 +               statret = find_fuzzy(&fnamecmp, fnamecmpbuf, &st);
157 +               if (!S_ISREG(st.st_mode))
158 +                       statret = -1;
159 +               else
160 +                       fuzzy_file = 1;
161         }
162  
163         if (statret == -1) {
164                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
165                         return;
166                 if (errno == ENOENT) {
167 +                       if (f_nameout >= 0)
168 +                               write(f_nameout, "", 1);
169                         write_int(f_out,i);
170                         if (!dry_run)
171                                 write_sum_head(f_out, NULL);
172 @@ -471,37 +574,43 @@ static void recv_generator(char *fname, 
173                 /* now pretend the file didn't exist */
174                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
175                         return;
176 +               if (f_nameout >= 0)
177 +                       write(f_nameout, "", 1);
178                 write_int(f_out,i);
179                 if (!dry_run)
180                         write_sum_head(f_out, NULL);
181                 return;
182         }
183  
184 -       if (opt_ignore_existing && fnamecmp == fname) {
185 +       if (opt_ignore_existing && !*fnamecmpbuf) {
186                 if (verbose > 1)
187                         rprintf(FINFO,"%s exists\n",fname);
188                 return;
189         }
190  
191 -       if (update_only && fnamecmp == fname
192 +       if (update_only && !*fnamecmpbuf
193             && cmp_modtime(st.st_mtime, file->modtime) > 0) {
194                 if (verbose > 1)
195                         rprintf(FINFO,"%s is newer\n",fname);
196                 return;
197         }
198  
199 -       if (skip_file(fname, file, &st)) {
200 -               if (fnamecmp == fname)
201 +       if (!fuzzy_file && skip_file(fname, file, &st)) {
202 +               if (!*fnamecmpbuf)
203                         set_perms(fname, file, &st, PERMS_REPORT);
204                 return;
205         }
206  
207         if (dry_run) {
208 +               if (f_nameout >= 0)
209 +                       write(f_nameout, "", 1);
210                 write_int(f_out,i);
211                 return;
212         }
213  
214         if (disable_deltas_p()) {
215 +               if (f_nameout >= 0)
216 +                       write(f_nameout, "", 1);
217                 write_int(f_out,i);
218                 write_sum_head(f_out, NULL);
219                 return;
220 @@ -516,6 +625,8 @@ static void recv_generator(char *fname, 
221                 /* pretend the file didn't exist */
222                 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
223                         return;
224 +               if (f_nameout >= 0)
225 +                       write(f_nameout, "", 1);
226                 write_int(f_out,i);
227                 write_sum_head(f_out, NULL);
228                 return;
229 @@ -534,6 +645,8 @@ static void recv_generator(char *fname, 
230         if (verbose > 2)
231                 rprintf(FINFO, "generating and sending sums for %d\n", i);
232  
233 +       if (f_nameout >= 0)
234 +               write(f_nameout, fnamecmpbuf, strlen(fnamecmpbuf) + 1);
235         write_int(f_out,i);
236         generate_and_send_sums(mapbuf, st.st_size, f_out);
237  
238 @@ -543,10 +656,11 @@ static void recv_generator(char *fname, 
239  }
240  
241  
242 -void generate_files(int f, struct file_list *flist, char *local_name)
243 +void generate_files(int f, struct file_list *flist, char *local_name,
244 +                   int f_nameout)
245  {
246         int i;
247 -       int phase=0;
248 +       int phase = 0;
249         char fbuf[MAXPATHLEN];
250  
251         if (verbose > 2) {
252 @@ -584,7 +698,7 @@ void generate_files(int f, struct file_l
253                 }
254  
255                 recv_generator(local_name ? local_name : f_name_to(file, fbuf),
256 -                              file, i, f);
257 +                              file, i, f, f_nameout);
258         }
259  
260         phase++;
261 @@ -601,7 +715,7 @@ void generate_files(int f, struct file_l
262         while ((i = get_redo_num()) != -1) {
263                 struct file_struct *file = flist->files[i];
264                 recv_generator(local_name ? local_name : f_name_to(file, fbuf),
265 -                              file, i, f);
266 +                              file, i, f, f_nameout);
267         }
268  
269         phase++;
270 @@ -620,7 +734,7 @@ void generate_files(int f, struct file_l
271                 if (!file->basename || !S_ISDIR(file->mode))
272                         continue;
273                 recv_generator(local_name ? local_name : f_name(file),
274 -                              file, i, -1);
275 +                              file, i, -1, -1);
276         }
277  
278         if (verbose > 2)
279 --- main.c      28 Jun 2004 17:45:40 -0000      1.201
280 +++ main.c      30 Jun 2004 07:35:57 -0000
281 @@ -428,8 +428,8 @@ static void do_server_sender(int f_in, i
282  static int do_recv(int f_in,int f_out,struct file_list *flist,char *local_name)
283  {
284         int pid;
285 -       int status=0;
286 -       int error_pipe[2];
287 +       int status = 0;
288 +       int error_pipe[2], name_pipe[2];
289  
290         if (preserve_hard_links)
291                 init_hard_links(flist);
292 @@ -441,17 +441,19 @@ static int do_recv(int f_in,int f_out,st
293                 }
294         }
295  
296 -       if (fd_pair(error_pipe) < 0) {
297 -               rprintf(FERROR,"error pipe failed in do_recv\n");
298 +       if (fd_pair(error_pipe) < 0 || fd_pair(name_pipe) < 0) {
299 +               rprintf(FERROR, "fd_pair() failed in do_recv\n");
300                 exit_cleanup(RERR_SOCKETIO);
301         }
302  
303         io_flush(NORMAL_FLUSH);
304  
305 -       if ((pid=do_fork()) == 0) {
306 +       if ((pid = do_fork()) == 0) {
307                 close(error_pipe[0]);
308 +               close(name_pipe[1]);
309                 if (f_in != f_out)
310                         close(f_out);
311 +               set_blocking(name_pipe[0]);
312  
313                 /* we can't let two processes write to the socket at one time */
314                 io_multiplexing_close();
315 @@ -459,7 +461,7 @@ static int do_recv(int f_in,int f_out,st
316                 /* set place to send errors */
317                 set_msg_fd_out(error_pipe[1]);
318  
319 -               recv_files(f_in,flist,local_name);
320 +               recv_files(f_in, flist, local_name, name_pipe[0]);
321                 io_flush(FULL_FLUSH);
322                 report(f_in);
323  
324 @@ -475,14 +477,16 @@ static int do_recv(int f_in,int f_out,st
325         am_generator = 1;
326  
327         close(error_pipe[1]);
328 +       close(name_pipe[0]);
329         if (f_in != f_out)
330                 close(f_in);
331 +       set_blocking(name_pipe[1]);
332  
333         io_start_buffering_out(f_out);
334  
335         set_msg_fd_in(error_pipe[0]);
336  
337 -       generate_files(f_out, flist, local_name);
338 +       generate_files(f_out, flist, local_name, name_pipe[1]);
339  
340         get_redo_num(); /* Read final MSG_DONE and any prior messages. */
341         report(-1);
342 --- options.c   20 Jun 2004 19:47:05 -0000      1.157
343 +++ options.c   30 Jun 2004 07:35:57 -0000
344 @@ -94,6 +94,7 @@ int ignore_errors = 0;
345  int modify_window = 0;
346  int blocking_io = -1;
347  int checksum_seed = 0;
348 +int fuzzy = 0;
349  unsigned int block_size = 0;
350  
351  
352 @@ -270,6 +271,7 @@ void usage(enum logcode F)
353    rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
354    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
355    rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
356 +  rprintf(F,"     --fuzzy                 use similar file as basis if basis doesn't exist\n");
357    rprintf(F," -P                          equivalent to --partial --progress\n");
358    rprintf(F," -z, --compress              compress file data\n");
359    rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
360 @@ -368,6 +370,7 @@ static struct poptOption long_options[] 
361    {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
362    {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
363    {"link-dest",        0,  POPT_ARG_STRING, &compare_dest,  OPT_LINK_DEST, 0, 0 },
364 +  {"fuzzy",            0,  POPT_ARG_NONE,   &fuzzy, 0, 0, 0 },
365    /* TODO: Should this take an optional int giving the compression level? */
366    {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
367    {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
368 @@ -989,6 +992,9 @@ void server_options(char **args,int *arg
369                 }
370         }
371  
372 +       if (fuzzy && am_sender)
373 +               args[ac++] = "--fuzzy";
374 +
375         *argc = ac;
376         return;
377  
378 --- receiver.c  29 Jun 2004 15:12:01 -0000      1.83
379 +++ receiver.c  30 Jun 2004 07:35:57 -0000
380 @@ -36,7 +36,6 @@ extern int preserve_perms;
381  extern int cvs_exclude;
382  extern int io_error;
383  extern char *tmpdir;
384 -extern char *compare_dest;
385  extern int make_backups;
386  extern int do_progress;
387  extern char *backup_dir;
388 @@ -293,14 +292,15 @@ static int receive_data(int f_in,struct 
389   * main routine for receiver process.
390   *
391   * Receiver process runs on the same host as the generator process. */
392 -int recv_files(int f_in,struct file_list *flist,char *local_name)
393 +int recv_files(int f_in, struct file_list *flist, char *local_name,
394 +              int f_name)
395  {
396         int fd1,fd2;
397         STRUCT_STAT st;
398         char *fname, fbuf[MAXPATHLEN];
399         char template[MAXPATHLEN];
400         char fnametmp[MAXPATHLEN];
401 -       char *fnamecmp;
402 +       char *fnamecmp, *cp;
403         char fnamecmpbuf[MAXPATHLEN];
404         struct map_struct *mapbuf;
405         struct file_struct *file;
406 @@ -364,19 +364,19 @@ int recv_files(int f_in,struct file_list
407                 if (verbose > 2)
408                         rprintf(FINFO,"recv_files(%s)\n",fname);
409  
410 -               fnamecmp = fname;
411 +               for (cp = fnamecmpbuf; ; cp++) {
412 +                       if (read(f_name, cp, 1) <= 0) {
413 +                               rsyserr(FERROR, errno, "fname-pipe read failed");
414 +                               exit_cleanup(RERR_PROTOCOL);
415 +                       }
416 +                       if (!*cp)
417 +                               break;
418 +               }
419 +               fnamecmp = *fnamecmpbuf ? fnamecmpbuf : fname;
420  
421                 /* open the file */
422                 fd1 = do_open(fnamecmp, O_RDONLY, 0);
423  
424 -               if (fd1 == -1 && compare_dest != NULL) {
425 -                       /* try the file at compare_dest instead */
426 -                       pathjoin(fnamecmpbuf, sizeof fnamecmpbuf,
427 -                                compare_dest, fname);
428 -                       fnamecmp = fnamecmpbuf;
429 -                       fd1 = do_open(fnamecmp, O_RDONLY, 0);
430 -               }
431 -
432                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
433                         rsyserr(FERROR, errno, "fstat %s failed",
434                                 full_fname(fnamecmp));
435 @@ -385,7 +385,7 @@ int recv_files(int f_in,struct file_list
436                         continue;
437                 }
438  
439 -               if (fd1 != -1 && S_ISDIR(st.st_mode) && fnamecmp == fname) {
440 +               if (fd1 != -1 && S_ISDIR(st.st_mode) && !*fnamecmpbuf) {
441                         /* this special handling for directories
442                          * wouldn't be necessary if robust_rename()
443                          * and the underlying robust_unlink could cope
444 --- rsync.yo    5 Jun 2004 16:16:30 -0000       1.171
445 +++ rsync.yo    30 Jun 2004 07:35:58 -0000
446 @@ -325,6 +325,7 @@ verb(
447   -T  --temp-dir=DIR          create temporary files in directory DIR
448       --compare-dest=DIR      also compare received files relative to DIR
449       --link-dest=DIR         create hardlinks to DIR for unchanged files
450 +     --fuzzy                 use similar file as basis if basis is gone
451   -P                          equivalent to --partial --progress
452   -z, --compress              compress file data
453   -C, --cvs-exclude           auto ignore files in the same way CVS does