Some preliminary changes got mixed in with the last commit, so
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
0f626034 7--- orig/generator.c 2005-01-17 23:11:45
e55625fb 8+++ generator.c 2005-01-17 23:38:46
0808daa5
WD
9@@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
e55625fb 17@@ -242,6 +243,85 @@ static void generate_and_send_sums(int f
58118c25
WD
18 }
19
20
47dd7a31
WD
21+static unsigned int measure_name(const char *name, const char *basename,
22+ const char *ext)
23+{
24+ int namelen = strlen(name);
25+ int extlen = strlen(ext);
26+ unsigned int score = 0;
27+
28+ /* Extensions must match */
29+ if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0)
30+ return 0;
31+
32+ /* Now score depends on similarity of prefix */
33+ for (; *name == *basename && *name; name++, basename++)
34+ score++;
35+ return score;
36+}
37+
58118c25 38+
09fb2223 39+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
40+{
41+ DIR *d;
42+ struct dirent *di;
0f626034 43+ char *basename, *dirname, *slash;
47dd7a31
WD
44+ char bestname[MAXPATHLEN];
45+ unsigned int bestscore = 0;
46+ const char *ext;
47+
0f626034
WD
48+ strlcpy(buf, fname, MAXPATHLEN);
49+ if ((slash = strrchr(buf, '/')) != NULL) {
50+ dirname = buf;
51+ *slash = '\0';
52+ basename = slash + 1;
53+ } else {
54+ basename = buf;
55+ dirname = ".";
56+ }
47dd7a31 57+
47dd7a31
WD
58+ if (!(d = opendir(dirname))) {
59+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
60+ return -1;
61+ }
e55625fb
WD
62+ if (slash)
63+ *slash = '/';
47dd7a31
WD
64+
65+ /* Get the final extension, e.g. .gz; never the full basename though. */
0f626034
WD
66+ for (ext = basename; *ext == '.'; ext++) {}
67+ if (!(ext = strrchr(ext, '.')))
47dd7a31
WD
68+ ext = basename + strlen(basename); /* ext = "" */
69+
70+ while ((di = readdir(d)) != NULL) {
71+ const char *dname = d_name(di);
72+ unsigned int score;
73+
74+ if (dname[0] == '.' && (dname[1] == '\0'
75+ || (dname[1] == '.' && dname[2] == '\0')))
76+ continue;
77+
78+ score = measure_name(dname, basename, ext);
79+ if (verbose > 4) {
dc800efa
WD
80+ rprintf(FINFO, "fuzzy score for %s = %u\n",
81+ dname, score);
47dd7a31
WD
82+ }
83+ if (score > bestscore) {
8c5b8235 84+ strlcpy(bestname, dname, sizeof bestname);
47dd7a31
WD
85+ bestscore = score;
86+ }
87+ }
88+ closedir(d);
89+
90+ /* Found a candidate. */
91+ if (bestscore != 0) {
e55625fb 92+ strlcpy(basename, bestname, MAXPATHLEN - (basename - buf));
dc800efa
WD
93+ if (verbose > 2)
94+ rprintf(FINFO, "fuzzy match %s->%s\n", fname, buf);
58118c25 95+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
96+ }
97+ return -1;
98+}
58118c25
WD
99+
100
101 /*
102 * Acts on file number @p i from @p flist, whose name is @p fname.
e55625fb 103@@ -496,6 +576,15 @@ static void recv_generator(char *fname,
f48a237e
WD
104 } else
105 partialptr = NULL;
824abc86 106
09fb2223
WD
107+ if (statret == -1 && fuzzy_basis) {
108+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
109+ && S_ISREG(st.st_mode)) {
110+ statret = 0;
111+ fnamecmp = fnamecmpbuf;
0edc7d7f 112+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 113+ }
824abc86
WD
114+ }
115+
09fb2223
WD
116 if (statret == -1) {
117 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 118 return;
e55625fb 119@@ -524,6 +613,8 @@ static void recv_generator(char *fname,
241013b4 120
9cf86680 121 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
122 ;
123+ else if (fnamecmp_type == FNAMECMP_FUZZY)
124+ ;
125 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 126 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 127 set_perms(fname, file, &st, PERMS_REPORT);
e55625fb 128@@ -598,8 +689,24 @@ notify_others:
0edc7d7f 129 write_int(f_out, i);
0f626034
WD
130 if (protocol_version >= 29 && inplace && !read_batch)
131 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
132- if (f_out_name >= 0)
133+ if (f_out_name >= 0) {
09fb2223 134 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 135+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
136+ uchar lenbuf[3], *lb = lenbuf;
137+ int len = strlen(fnamecmpbuf);
138+ if (len > 0x7F) {
139+#if MAXPATHLEN > 0x7FFF
140+ *lb++ = len / 0x10000 + 0x80;
141+ *lb++ = len / 0x100;
142+#else
143+ *lb++ = len / 0x100 + 0x80;
144+#endif
145+ }
146+ *lb = len;
147+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
148+ write_buf(f_out_name, fnamecmpbuf, len);
149+ }
0edc7d7f 150+ }
09fb2223 151
0edc7d7f
WD
152 if (dry_run || read_batch)
153 return;
0f626034 154--- orig/main.c 2005-01-17 23:11:45
3eabe3a3 155+++ main.c 2005-01-14 18:33:15
d5753a22 156@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
157 extern int preserve_hard_links;
158 extern int protocol_version;
159 extern int recurse;
09fb2223 160+extern int fuzzy_basis;
495f1899
WD
161 extern int relative_paths;
162 extern int rsync_port;
163 extern int whole_file;
3eabe3a3 164@@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
165 int pid;
166 int status = 0;
167 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
168- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
169+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
170+ && !dry_run;
495f1899 171
d5753a22
WD
172 /* The receiving side mustn't obey this, or an existing symlink that
173 * points to an identical file won't be replaced by the referent. */
0f626034 174--- orig/options.c 2005-01-17 23:11:45
9cf86680 175+++ options.c 2005-01-15 21:08:13
0808daa5 176@@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
f6c3b300 177 int size_only = 0;
0808daa5 178 int daemon_bwlimit = 0;
f6c3b300 179 int bwlimit = 0;
09fb2223 180+int fuzzy_basis = 0;
f6c3b300
WD
181 size_t bwlimit_writemax = 0;
182 int delete_after = 0;
183 int only_existing = 0;
0808daa5 184@@ -288,6 +289,7 @@ void usage(enum logcode F)
f0533c4c 185 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
186 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
187 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 188+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
189 rprintf(F," -P equivalent to --partial --progress\n");
190 rprintf(F," -z, --compress compress file data\n");
191 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
37da98ae 192@@ -384,6 +386,7 @@ static struct poptOption long_options[]
0808daa5
WD
193 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
194 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
195 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 196+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
197 /* TODO: Should this take an optional int giving the compression level? */
198 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 199 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
0f626034 200@@ -1234,6 +1237,9 @@ void server_options(char **args,int *arg
37da98ae 201 args[ac++] = "--no-relative";
241013b4 202 }
7b675ff5 203
09fb2223 204+ if (fuzzy_basis && am_sender)
241013b4 205+ args[ac++] = "--fuzzy";
7b675ff5 206+
241013b4 207 *argc = ac;
f74d2272 208 return;
7b675ff5 209
0f626034 210--- orig/receiver.c 2005-01-17 23:11:45
9cf86680
WD
211+++ receiver.c 2005-01-15 21:21:02
212@@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
213 }
214
215
216+static void read_gen_name(int fd, char *buf)
217+{
218+ int len = read_byte(fd);
219+ if (len & 0x80) {
220+#if MAXPATHLEN > 32767
221+ uchar lenbuf[2];
222+ read_buf(fd, (char *)lenbuf, 2);
223+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
224+#else
225+ len = (len & ~0x80) * 0x100 + read_byte(fd);
226+#endif
227+ }
228+ if (len >= MAXPATHLEN) {
229+ rprintf(FERROR, "bogus data on generator name pipe\n");
230+ exit_cleanup(RERR_PROTOCOL);
231+ }
232+
233+ read_sbuf(fd, buf, len);
234+}
235+
236+
237 static void discard_receive_data(int f_in, OFF_T length)
238 {
239 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
9cf86680 240@@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 241 case FNAMECMP_BACKUP:
f48a237e 242 fnamecmp = get_backup_name(fname);
09fb2223 243 break;
0edc7d7f 244+ case FNAMECMP_FUZZY:
09fb2223
WD
245+ read_gen_name(f_in_name, fnamecmpbuf);
246+ fnamecmp = fnamecmpbuf;
247+ break;
248 default:
9cf86680
WD
249 if (j >= basis_dir_cnt) {
250 rprintf(FERROR,
0f626034 251--- orig/rsync.h 2005-01-17 23:11:45
9cf86680 252+++ rsync.h 2005-01-15 21:24:09
0f626034 253@@ -128,6 +128,7 @@
0edc7d7f
WD
254 #define FNAMECMP_FNAME 0x80
255 #define FNAMECMP_PARTIAL_DIR 0x81
256 #define FNAMECMP_BACKUP 0x82
257+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
258
259
260 /* Log-message categories. FLOG is only used on the daemon side to
0f626034 261--- orig/rsync.yo 2005-01-17 23:11:46
637c560e 262+++ rsync.yo 2005-01-15 21:48:52
0808daa5 263@@ -358,6 +358,7 @@ verb(
f0533c4c 264 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
265 --copy-dest=DIR ... and include copies of unchanged files
266 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 267+ --fuzzy find similar file for basis when no dest file
f0533c4c
WD
268 -P equivalent to --partial --progress
269 -z, --compress compress file data
270 -C, --cvs-exclude auto ignore files in the same way CVS does
0f626034
WD
271@@ -878,6 +879,11 @@ Note that rsync versions prior to 2.6.1
272 (or implied by -a). You can work-around this bug by avoiding the -o option
273 when sending to an old rsync.
637c560e
WD
274
275+dit(bf(--fuzzy)) This option tells rsync to look around for a basis file for
276+any destination file that is missing. The current algorithm searches the
277+destination file's directory for the name sharing the longest prefix (and the
278+final extension, if any) and, if found, uses it to try to speed up the transfer.
279+
280 dit(bf(-z, --compress)) With this option, rsync compresses any data from
281 the files that it sends to the destination machine. This
282 option is useful on slow connections. The compression method used is the