Applied to trunk.
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
9cf86680
WD
7--- orig/generator.c 2005-01-15 21:18:09
8+++ generator.c 2005-01-15 21:20:10
0808daa5
WD
9@@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
dc800efa 17@@ -239,6 +240,90 @@ static void generate_and_send_sums(int f
58118c25
WD
18 }
19
20
47dd7a31
WD
21+static void split_names(char *fname, char **dirname, char **basename)
22+{
23+ char *slash = strrchr(fname, '/');
24+ if (slash) {
25+ *dirname = fname;
26+ *slash = '\0';
27+ *basename = slash+1;
28+ } else {
29+ *basename = fname;
30+ *dirname = ".";
31+ }
32+}
33+
58118c25 34+
47dd7a31
WD
35+static unsigned int measure_name(const char *name, const char *basename,
36+ const char *ext)
37+{
38+ int namelen = strlen(name);
39+ int extlen = strlen(ext);
40+ unsigned int score = 0;
41+
42+ /* Extensions must match */
43+ if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0)
44+ return 0;
45+
46+ /* Now score depends on similarity of prefix */
47+ for (; *name == *basename && *name; name++, basename++)
48+ score++;
49+ return score;
50+}
51+
58118c25 52+
09fb2223 53+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
54+{
55+ DIR *d;
56+ struct dirent *di;
57+ char *basename, *dirname;
58+ char mangled_name[MAXPATHLEN];
59+ char bestname[MAXPATHLEN];
60+ unsigned int bestscore = 0;
61+ const char *ext;
62+
09fb2223 63+ strlcpy(mangled_name, fname, sizeof mangled_name);
47dd7a31
WD
64+
65+ split_names(mangled_name, &dirname, &basename);
66+ if (!(d = opendir(dirname))) {
67+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
68+ return -1;
69+ }
70+
70+ /* Get the final extension, e.g. ".gz", but never the full basename. */
dd39e657 72+ if (!(ext = strrchr(basename + 1, '.')))
47dd7a31
WD
73+ ext = basename + strlen(basename); /* ext = "" */
74+
75+ while ((di = readdir(d)) != NULL) {
76+ const char *dname = d_name(di);
77+ unsigned int score;
78+
79+ if (dname[0] == '.' && (dname[1] == '\0'
80+ || (dname[1] == '.' && dname[2] == '\0')))
81+ continue;
82+
83+ score = measure_name(dname, basename, ext);
84+ if (verbose > 4) {
dc800efa
WD
85+ rprintf(FINFO, "fuzzy score for %s = %u\n",
86+ dname, score);
47dd7a31
WD
87+ }
88+ if (score > bestscore) {
8c5b8235 89+ strlcpy(bestname, dname, sizeof bestname);
47dd7a31
WD
90+ bestscore = score;
91+ }
92+ }
93+ closedir(d);
94+
95+ /* Found a candidate. */
96+ if (bestscore != 0) {
8c5b8235 97+ pathjoin(buf, MAXPATHLEN, dirname, bestname);
dc800efa
WD
98+ if (verbose > 2)
99+ rprintf(FINFO, "fuzzy match %s->%s\n", fname, buf);
58118c25 100+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
101+ }
102+ return -1;
103+}
58118c25
WD
104+
105
106 /*
107 * Acts on file number @p i from @p flist, whose name is @p fname.
dc800efa 108@@ -493,6 +578,15 @@ static void recv_generator(char *fname,
f48a237e
WD
109 } else
110 partialptr = NULL;
824abc86 111
09fb2223
WD
112+ if (statret == -1 && fuzzy_basis) {
113+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
114+ && S_ISREG(st.st_mode)) {
115+ statret = 0;
116+ fnamecmp = fnamecmpbuf;
0edc7d7f 117+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 118+ }
824abc86
WD
119+ }
120+
09fb2223
WD
121 if (statret == -1) {
122 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 123 return;
dc800efa 124@@ -521,6 +615,8 @@ static void recv_generator(char *fname,
241013b4 125
9cf86680 126 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
127 ;
128+ else if (fnamecmp_type == FNAMECMP_FUZZY)
129+ ;
130 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 131 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 132 set_perms(fname, file, &st, PERMS_REPORT);
dc800efa 133@@ -593,8 +689,24 @@ prepare_to_open:
0edc7d7f 134
09fb2223 135 notify_others:
0edc7d7f
WD
136 write_int(f_out, i);
137- if (f_out_name >= 0)
138+ if (f_out_name >= 0) {
09fb2223 139 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 140+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
141+ uchar lenbuf[3], *lb = lenbuf;
142+ int len = strlen(fnamecmpbuf);
143+ if (len > 0x7F) {
144+#if MAXPATHLEN > 0x7FFF
145+ *lb++ = len / 0x10000 + 0x80;
146+ *lb++ = len / 0x100;
147+#else
148+ *lb++ = len / 0x100 + 0x80;
149+#endif
150+ }
151+ *lb = len;
152+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
153+ write_buf(f_out_name, fnamecmpbuf, len);
154+ }
0edc7d7f 155+ }
09fb2223 156
0edc7d7f
WD
157 if (dry_run || read_batch)
158 return;
3eabe3a3
WD
159--- orig/main.c 2005-01-14 18:30:18
160+++ main.c 2005-01-14 18:33:15
d5753a22 161@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
162 extern int preserve_hard_links;
163 extern int protocol_version;
164 extern int recurse;
09fb2223 165+extern int fuzzy_basis;
495f1899
WD
166 extern int relative_paths;
167 extern int rsync_port;
168 extern int whole_file;
3eabe3a3 169@@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
170 int pid;
171 int status = 0;
172 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
173- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
174+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
175+ && !dry_run;
495f1899 176
d5753a22
WD
177 /* The receiving side mustn't obey this, or an existing symlink that
178 * points to an identical file won't be replaced by the referent. */
9cf86680
WD
179--- orig/options.c 2005-01-15 21:23:15
180+++ options.c 2005-01-15 21:08:13
0808daa5 181@@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
f6c3b300 182 int size_only = 0;
0808daa5 183 int daemon_bwlimit = 0;
f6c3b300 184 int bwlimit = 0;
09fb2223 185+int fuzzy_basis = 0;
f6c3b300
WD
186 size_t bwlimit_writemax = 0;
187 int delete_after = 0;
188 int only_existing = 0;
0808daa5 189@@ -288,6 +289,7 @@ void usage(enum logcode F)
f0533c4c 190 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
191 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
192 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 193+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
194 rprintf(F," -P equivalent to --partial --progress\n");
195 rprintf(F," -z, --compress compress file data\n");
196 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
37da98ae 197@@ -384,6 +386,7 @@ static struct poptOption long_options[]
0808daa5
WD
198 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
199 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
200 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 201+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
202 /* TODO: Should this take an optional int giving the compression level? */
203 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 204 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
49839562
WD
205@@ -952,10 +955,10 @@ int parse_arguments(int *argc, const cha
206 return 0;
207 }
208 keep_partial = 0;
dc800efa 209- if (dest_option) {
89072d7e
WD
210+ if (dest_option || fuzzy_basis) {
211 snprintf(err_buf, sizeof err_buf,
212 "--inplace does not yet work with %s\n",
213- dest_option);
214+ dest_option ? dest_option : "--fuzzy");
dd39e657
WD
215 return 0;
216 }
49839562 217 #else
37da98ae
WD
218@@ -1240,6 +1243,9 @@ void server_options(char **args,int *arg
219 args[ac++] = "--no-relative";
241013b4 220 }
7b675ff5 221
09fb2223 222+ if (fuzzy_basis && am_sender)
241013b4 223+ args[ac++] = "--fuzzy";
7b675ff5 224+
241013b4 225 *argc = ac;
f74d2272 226 return;
7b675ff5 227
9cf86680
WD
228--- orig/receiver.c 2005-01-15 21:18:09
229+++ receiver.c 2005-01-15 21:21:02
230@@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
231 }
232
233
234+static void read_gen_name(int fd, char *buf)
235+{
236+ int len = read_byte(fd);
237+ if (len & 0x80) {
238+#if MAXPATHLEN > 32767
239+ uchar lenbuf[2];
240+ read_buf(fd, (char *)lenbuf, 2);
241+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
242+#else
243+ len = (len & ~0x80) * 0x100 + read_byte(fd);
244+#endif
245+ }
246+ if (len >= MAXPATHLEN) {
247+ rprintf(FERROR, "bogus data on generator name pipe\n");
248+ exit_cleanup(RERR_PROTOCOL);
249+ }
250+
251+ read_sbuf(fd, buf, len);
252+}
253+
254+
255 static void discard_receive_data(int f_in, OFF_T length)
256 {
257 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
9cf86680 258@@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 259 case FNAMECMP_BACKUP:
f48a237e 260 fnamecmp = get_backup_name(fname);
09fb2223 261 break;
0edc7d7f 262+ case FNAMECMP_FUZZY:
09fb2223
WD
263+ read_gen_name(f_in_name, fnamecmpbuf);
264+ fnamecmp = fnamecmpbuf;
265+ break;
266 default:
9cf86680
WD
267 if (j >= basis_dir_cnt) {
268 rprintf(FERROR,
269--- orig/rsync.h 2005-01-15 21:18:09
270+++ rsync.h 2005-01-15 21:24:09
271@@ -127,6 +127,7 @@
0edc7d7f
WD
272 #define FNAMECMP_FNAME 0x80
273 #define FNAMECMP_PARTIAL_DIR 0x81
274 #define FNAMECMP_BACKUP 0x82
275+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
276
277
278 /* Log-message categories. FLOG is only used on the daemon side to
dc800efa 279--- orig/rsync.yo 2005-01-15 04:36:32
637c560e 280+++ rsync.yo 2005-01-15 21:48:52
0808daa5 281@@ -358,6 +358,7 @@ verb(
f0533c4c 282 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
283 --copy-dest=DIR ... and include copies of unchanged files
284 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 285+ --fuzzy find similar file for basis when no dest file
f0533c4c
WD
286 -P equivalent to --partial --progress
287 -z, --compress compress file data
288 -C, --cvs-exclude auto ignore files in the same way CVS does
637c560e
WD
289@@ -876,6 +877,11 @@ Note that rsync versions prior to 2.6.1
290 (or implied by -a). If the receiving rsync is not new enough, you can work
291 around this bug by avoiding the -o option.
292
293+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
294+basis file for any destination file that is missing. The current algorithm
295+searches that file's directory for a name with the same extension and the
296+longest matching prefix and, if one is found, uses it to speed up the transfer.
297+
298 dit(bf(-z, --compress)) With this option, rsync compresses any data from
299 the files that it sends to the destination machine. This
300 option is useful on slow connections. The compression method used is the