Fixed a failing hunk.
[rsync/rsync-patches.git] / fuzzy.diff
CommitLineData
824abc86 1The changes to generator.c were greatly simplified, making the patch
8c5b8235 2easier to maintain and fixing the failing test in the testsuite.
0808daa5 3Lightly tested.
241013b4 4
824abc86
WD
5Be sure to run "make proto" before "make".
6
0f626034
WD
7--- orig/generator.c 2005-01-17 23:11:45
8+++ generator.c 2005-01-16 02:16:38
0808daa5
WD
9@@ -44,6 +44,7 @@ extern int size_only;
10 extern OFF_T max_size;
58118c25
WD
11 extern int io_timeout;
12 extern int protocol_version;
09fb2223 13+extern int fuzzy_basis;
58118c25 14 extern int always_checksum;
b952a177 15 extern char *partial_dir;
0808daa5 16 extern char *basis_dir[];
0f626034 17@@ -242,6 +243,83 @@ static void generate_and_send_sums(int f
58118c25
WD
18 }
19
20
47dd7a31
WD
/*
 * Score how closely the directory entry @p name resembles @p basename.
 *
 * @p ext is the extension a candidate must end with (it may be "").  A
 * name that does not end in @p ext, or that consists of nothing but
 * @p ext, scores 0.  Otherwise the score is the number of leading
 * characters that @p name and @p basename have in common, so a higher
 * score means a longer shared prefix.
 */
static unsigned int measure_name(const char *name, const char *basename,
				 const char *ext)
{
	/* size_t keeps the strlen() results and the offset below unsigned. */
	size_t namelen = strlen(name);
	size_t extlen = strlen(ext);
	unsigned int score = 0;

	/* Extensions must match */
	if (namelen <= extlen || strcmp(name + namelen - extlen, ext) != 0)
		return 0;

	/* Now score depends on similarity of prefix */
	while (*name != '\0' && *name == *basename) {
		score++;
		name++;
		basename++;
	}
	return score;
}
37+
58118c25 38+
09fb2223 39+static int find_fuzzy(const char *fname, char *buf, STRUCT_STAT *st_ptr)
47dd7a31
WD
40+{
41+ DIR *d;
42+ struct dirent *di;
0f626034 43+ char *basename, *dirname, *slash;
47dd7a31
WD
44+ char bestname[MAXPATHLEN];
45+ unsigned int bestscore = 0;
46+ const char *ext;
47+
0f626034
WD
48+ strlcpy(buf, fname, MAXPATHLEN);
49+ if ((slash = strrchr(buf, '/')) != NULL) {
50+ dirname = buf;
51+ *slash = '\0';
52+ basename = slash + 1;
53+ } else {
54+ basename = buf;
55+ dirname = ".";
56+ }
47dd7a31 57+
47dd7a31
WD
58+ if (!(d = opendir(dirname))) {
59+ rsyserr(FERROR, errno, "recv_generator opendir(%s)", dirname);
60+ return -1;
61+ }
62+
63+ /* Get final extension, eg. .gz; never full basename though. */
0f626034
WD
64+ for (ext = basename; *ext == '.'; ext++) {}
65+ if (!(ext = strrchr(ext, '.')))
47dd7a31
WD
66+ ext = basename + strlen(basename); /* ext = "" */
67+
68+ while ((di = readdir(d)) != NULL) {
69+ const char *dname = d_name(di);
70+ unsigned int score;
71+
72+ if (dname[0] == '.' && (dname[1] == '\0'
73+ || (dname[1] == '.' && dname[2] == '\0')))
74+ continue;
75+
76+ score = measure_name(dname, basename, ext);
77+ if (verbose > 4) {
dc800efa
WD
78+ rprintf(FINFO, "fuzzy score for %s = %u\n",
79+ dname, score);
47dd7a31
WD
80+ }
81+ if (score > bestscore) {
8c5b8235 82+ strlcpy(bestname, dname, sizeof bestname);
47dd7a31
WD
83+ bestscore = score;
84+ }
85+ }
86+ closedir(d);
87+
88+ /* Found a candidate. */
89+ if (bestscore != 0) {
0f626034 90+ strlcpy(basename, MAXPATHLEN - (basename - buf), bestname);
dc800efa
WD
91+ if (verbose > 2)
92+ rprintf(FINFO, "fuzzy match %s->%s\n", fname, buf);
58118c25 93+ return link_stat(buf, st_ptr, 0);
47dd7a31
WD
94+ }
95+ return -1;
96+}
58118c25
WD
97+
98
99 /*
100 * Acts on file number @p i from @p flist, whose name is @p fname.
0f626034 101@@ -496,6 +574,15 @@ static void recv_generator(char *fname,
f48a237e
WD
102 } else
103 partialptr = NULL;
824abc86 104
09fb2223
WD
105+ if (statret == -1 && fuzzy_basis) {
106+ if (find_fuzzy(fname, fnamecmpbuf, &st) == 0
107+ && S_ISREG(st.st_mode)) {
108+ statret = 0;
109+ fnamecmp = fnamecmpbuf;
0edc7d7f 110+ fnamecmp_type = FNAMECMP_FUZZY;
09fb2223 111+ }
824abc86
WD
112+ }
113+
09fb2223
WD
114 if (statret == -1) {
115 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
7628f156 116 return;
0f626034 117@@ -524,6 +611,8 @@ static void recv_generator(char *fname,
241013b4 118
9cf86680 119 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
0808daa5
WD
120 ;
121+ else if (fnamecmp_type == FNAMECMP_FUZZY)
122+ ;
123 else if (unchanged_file(fnamecmp, file, &st)) {
0edc7d7f 124 if (fnamecmp_type == FNAMECMP_FNAME)
8c5b8235 125 set_perms(fname, file, &st, PERMS_REPORT);
0f626034 126@@ -598,8 +687,24 @@ notify_others:
0edc7d7f 127 write_int(f_out, i);
0f626034
WD
128 if (protocol_version >= 29 && inplace && !read_batch)
129 write_byte(f_out, fnamecmp_type);
0edc7d7f
WD
130- if (f_out_name >= 0)
131+ if (f_out_name >= 0) {
09fb2223 132 write_byte(f_out_name, fnamecmp_type);
0edc7d7f 133+ if (fnamecmp_type == FNAMECMP_FUZZY) {
09fb2223
WD
134+ uchar lenbuf[3], *lb = lenbuf;
135+ int len = strlen(fnamecmpbuf);
136+ if (len > 0x7F) {
137+#if MAXPATHLEN > 0x7FFF
138+ *lb++ = len / 0x10000 + 0x80;
139+ *lb++ = len / 0x100;
140+#else
141+ *lb++ = len / 0x100 + 0x80;
142+#endif
143+ }
144+ *lb = len;
145+ write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
146+ write_buf(f_out_name, fnamecmpbuf, len);
147+ }
0edc7d7f 148+ }
09fb2223 149
0edc7d7f
WD
150 if (dry_run || read_batch)
151 return;
0f626034 152--- orig/main.c 2005-01-17 23:11:45
3eabe3a3 153+++ main.c 2005-01-14 18:33:15
d5753a22 154@@ -48,6 +48,7 @@ extern int keep_dirlinks;
495f1899
WD
155 extern int preserve_hard_links;
156 extern int protocol_version;
157 extern int recurse;
09fb2223 158+extern int fuzzy_basis;
495f1899
WD
159 extern int relative_paths;
160 extern int rsync_port;
161 extern int whole_file;
3eabe3a3 162@@ -464,7 +465,8 @@ static int do_recv(int f_in,int f_out,st
495f1899
WD
163 int pid;
164 int status = 0;
165 int error_pipe[2], name_pipe[2];
3eabe3a3
WD
166- BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
167+ BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
168+ && !dry_run;
495f1899 169
d5753a22
WD
170 /* The receiving side mustn't obey this, or an existing symlink that
171 * points to an identical file won't be replaced by the referent. */
0f626034 172--- orig/options.c 2005-01-17 23:11:45
9cf86680 173+++ options.c 2005-01-15 21:08:13
0808daa5 174@@ -86,6 +86,7 @@ int copy_unsafe_links = 0;
f6c3b300 175 int size_only = 0;
0808daa5 176 int daemon_bwlimit = 0;
f6c3b300 177 int bwlimit = 0;
09fb2223 178+int fuzzy_basis = 0;
f6c3b300
WD
179 size_t bwlimit_writemax = 0;
180 int delete_after = 0;
181 int only_existing = 0;
0808daa5 182@@ -288,6 +289,7 @@ void usage(enum logcode F)
f0533c4c 183 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
0808daa5
WD
184 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
185 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
dc800efa 186+ rprintf(F," --fuzzy find similar file for basis when no dest file\n");
f0533c4c
WD
187 rprintf(F," -P equivalent to --partial --progress\n");
188 rprintf(F," -z, --compress compress file data\n");
189 rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
37da98ae 190@@ -384,6 +386,7 @@ static struct poptOption long_options[]
0808daa5
WD
191 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
192 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
193 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
09fb2223 194+ {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
f0533c4c
WD
195 /* TODO: Should this take an optional int giving the compression level? */
196 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
5388f859 197 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
0f626034 198@@ -1234,6 +1237,9 @@ void server_options(char **args,int *arg
37da98ae 199 args[ac++] = "--no-relative";
241013b4 200 }
7b675ff5 201
09fb2223 202+ if (fuzzy_basis && am_sender)
241013b4 203+ args[ac++] = "--fuzzy";
7b675ff5 204+
241013b4 205 *argc = ac;
f74d2272 206 return;
7b675ff5 207
0f626034 208--- orig/receiver.c 2005-01-17 23:11:45
9cf86680
WD
209+++ receiver.c 2005-01-15 21:21:02
210@@ -324,6 +324,27 @@ static int receive_data(int f_in, char *
09fb2223
WD
211 }
212
213
214+static void read_gen_name(int fd, char *buf)
215+{
216+ int len = read_byte(fd);
217+ if (len & 0x80) {
218+#if MAXPATHLEN > 32767
219+ uchar lenbuf[2];
220+ read_buf(fd, (char *)lenbuf, 2);
221+ len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
222+#else
223+ len = (len & ~0x80) * 0x100 + read_byte(fd);
224+#endif
225+ }
226+ if (len >= MAXPATHLEN) {
227+ rprintf(FERROR, "bogus data on generator name pipe\n");
228+ exit_cleanup(RERR_PROTOCOL);
229+ }
230+
231+ read_sbuf(fd, buf, len);
232+}
233+
234+
235 static void discard_receive_data(int f_in, OFF_T length)
236 {
237 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
9cf86680 238@@ -454,6 +475,10 @@ int recv_files(int f_in, struct file_lis
0edc7d7f 239 case FNAMECMP_BACKUP:
f48a237e 240 fnamecmp = get_backup_name(fname);
09fb2223 241 break;
0edc7d7f 242+ case FNAMECMP_FUZZY:
09fb2223
WD
243+ read_gen_name(f_in_name, fnamecmpbuf);
244+ fnamecmp = fnamecmpbuf;
245+ break;
246 default:
9cf86680
WD
247 if (j >= basis_dir_cnt) {
248 rprintf(FERROR,
0f626034 249--- orig/rsync.h 2005-01-17 23:11:45
9cf86680 250+++ rsync.h 2005-01-15 21:24:09
0f626034 251@@ -128,6 +128,7 @@
0edc7d7f
WD
252 #define FNAMECMP_FNAME 0x80
253 #define FNAMECMP_PARTIAL_DIR 0x81
254 #define FNAMECMP_BACKUP 0x82
255+#define FNAMECMP_FUZZY 0x83
09fb2223
WD
256
257
258 /* Log-message categories. FLOG is only used on the daemon side to
0f626034 259--- orig/rsync.yo 2005-01-17 23:11:46
637c560e 260+++ rsync.yo 2005-01-15 21:48:52
0808daa5 261@@ -358,6 +358,7 @@ verb(
f0533c4c 262 --compare-dest=DIR also compare received files relative to DIR
0808daa5
WD
263 --copy-dest=DIR ... and include copies of unchanged files
264 --link-dest=DIR hardlink to files in DIR when unchanged
dc800efa 265+ --fuzzy find similar file for basis when no dest
f0533c4c
WD
266 -P equivalent to --partial --progress
267 -z, --compress compress file data
268 -C, --cvs-exclude auto ignore files in the same way CVS does
0f626034
WD
269@@ -878,6 +879,11 @@ Note that rsync versions prior to 2.6.1
270 (or implied by -a). You can work-around this bug by avoiding the -o option
271 when sending to an old rsync.
637c560e
WD
272
273+dit(bf(--fuzzy)) This option tells rsync that it should look around for a
274+basis file for any destination file that is missing. The current algorithm
275+looks for a similarly-named file in the same directory as the destination
276+file, and, if found, uses that to try to speed up the transfer.
277+
278 dit(bf(-z, --compress)) With this option, rsync compresses any data from
279 the files that it sends to the destination machine. This
280 option is useful on slow connections. The compression method used is the