Commit | Line | Data |
---|---|---|
4535c2b6 WD |
1 | This patch makes rsync try to find a basis file for a file that |
2 | doesn't already have one. | |
241013b4 | 3 | |
824abc86 WD |
4 | Be sure to run "make proto" before "make". |
5 | ||
4535c2b6 | 6 | --- orig/flist.c 2005-02-13 21:17:16 |
36366d5c WD |
7 | +++ flist.c 2005-02-13 09:49:22 |
8 | @@ -330,7 +330,7 @@ void send_file_entry(struct file_struct | |
9 | char fname[MAXPATHLEN]; | |
10 | int l1, l2; | |
11 | ||
12 | - if (f == -1) | |
13 | + if (f < 0) | |
14 | return; | |
15 | ||
16 | if (!file) { | |
17 | @@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l | |
18 | struct file_struct *file; | |
19 | char fbuf[MAXPATHLEN]; | |
20 | ||
21 | - if (!(file = make_file(fname, flist, ALL_FILTERS))) | |
22 | + file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS); | |
23 | + if (!file) | |
24 | return; | |
25 | ||
26 | maybe_emit_filelist_progress(flist); | |
4535c2b6 | 27 | @@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f) |
36366d5c WD |
28 | |
29 | clean_flist(flist, relative_paths, 1); | |
30 | ||
31 | - if (f != -1) { | |
32 | + if (f >= 0) { | |
33 | /* Now send the uid/gid list. This was introduced in | |
34 | * protocol version 15 */ | |
35 | recv_uid_list(f, flist); | |
4535c2b6 | 36 | @@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn) |
36366d5c WD |
37 | return k > 0 && strcmp(fn+k, backup_suffix) == 0; |
38 | } | |
39 | ||
40 | +struct file_list *get_dirlist(const char *dirname, int ignore_excludes) | |
41 | +{ | |
42 | + struct file_list *dirlist; | |
43 | + char dirbuf[MAXPATHLEN]; | |
44 | + int dlen; | |
45 | + int save_recurse = recurse; | |
46 | + | |
47 | + dlen = strlcpy(dirbuf, dirname, MAXPATHLEN); | |
48 | + if (dlen >= MAXPATHLEN) | |
49 | + return NULL; | |
50 | + | |
51 | + dirlist = flist_new(WITHOUT_HLINK, "get_dirlist"); | |
52 | + recurse = 0; | |
53 | + send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen); | |
54 | + recurse = save_recurse; | |
55 | + | |
56 | + return dirlist; | |
57 | +} | |
58 | + | |
59 | ||
60 | /* This function is used to implement per-directory deletion, and | |
61 | * is used by all the --delete-WHEN options. Note that the fbuf | |
b78a6aba | 62 | --- orig/generator.c 2005-02-13 05:50:28 |
d7d8b822 | 63 | +++ generator.c 2005-02-13 21:47:28 |
79f132a1 | 64 | @@ -47,6 +47,7 @@ extern int size_only; |
0808daa5 | 65 | extern OFF_T max_size; |
58118c25 WD |
66 | extern int io_timeout; |
67 | extern int protocol_version; | |
09fb2223 | 68 | +extern int fuzzy_basis; |
58118c25 | 69 | extern int always_checksum; |
b952a177 | 70 | extern char *partial_dir; |
0808daa5 | 71 | extern char *basis_dir[]; |
d7d8b822 | 72 | @@ -227,6 +228,59 @@ static void generate_and_send_sums(int f |
4370504a | 73 | unmap_file(mapbuf); |
58118c25 WD |
74 | } |
75 | ||
36366d5c WD |
76 | +/* Try to find a filename in the same dir as "fname" with a similar name. */ |
77 | +static int find_fuzzy(struct file_struct *file, struct file_list *dirlist) | |
47dd7a31 | 78 | +{ |
36366d5c WD |
79 | + int fname_len, fname_suf_len; |
80 | + const char *fname_suf, *fname = file->basename; | |
054f3f90 | 81 | + uint32 lowest_dist = 0x7FFFFFFF; |
36366d5c | 82 | + int j, lowest_j = -1; |
47dd7a31 | 83 | + |
36366d5c WD |
84 | + fname_len = strlen(fname); |
85 | + fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len); | |
47dd7a31 | 86 | + |
36366d5c WD |
87 | + for (j = 0; j < dirlist->count; j++) { |
88 | + struct file_struct *fp = dirlist->files[j]; | |
89 | + const char *suf, *name; | |
90 | + int len, suf_len; | |
054f3f90 | 91 | + uint32 dist; |
47dd7a31 | 92 | + |
4535c2b6 WD |
93 | + if (!S_ISREG(fp->mode) || !fp->length |
94 | + || fp->flags & FLAG_NO_FUZZY) | |
47dd7a31 WD |
95 | + continue; |
96 | + | |
36366d5c | 97 | + name = fp->basename; |
d7d8b822 WD |
98 | + |
99 | + if (fp->length == file->length | |
100 | + && fp->modtime == file->modtime) { | |
101 | + if (verbose > 4) { | |
102 | + rprintf(FINFO, | |
103 | + "fuzzy size/modtime match for %s\n", | |
104 | + name); | |
105 | + } | |
106 | + return j; | |
107 | + } | |
108 | + | |
36366d5c WD |
109 | + len = strlen(name); |
110 | + suf = find_filename_suffix(name, len, &suf_len); | |
054f3f90 | 111 | + |
36366d5c | 112 | + dist = fuzzy_distance(name, len, fname, fname_len); |
fc82f579 | 113 | + /* Add some extra weight to how well the suffixes match. */ |
36366d5c WD |
114 | + dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) |
115 | + * 10; | |
4cd1daea | 116 | + if (verbose > 4) { |
4535c2b6 | 117 | + rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n", |
36366d5c | 118 | + name, (int)(dist>>16), (int)(dist&0xFFFF)); |
47dd7a31 | 119 | + } |
fc82f579 | 120 | + if (dist <= lowest_dist) { |
054f3f90 | 121 | + lowest_dist = dist; |
36366d5c | 122 | + lowest_j = j; |
47dd7a31 WD |
123 | + } |
124 | + } | |
47dd7a31 | 125 | + |
36366d5c | 126 | + return lowest_j; |
47dd7a31 | 127 | +} |
58118c25 WD |
128 | + |
129 | ||
a7219d20 WD |
130 | /* Acts on flist->file's ndx'th item, whose name is fname. If a directory, |
131 | * make sure it exists, and has the right permissions/timestamp info. For | |
d7d8b822 | 132 | @@ -241,6 +295,8 @@ static void recv_generator(char *fname, |
36366d5c WD |
133 | int f_out, int f_out_name) |
134 | { | |
135 | static int missing_below = -1; | |
136 | + static char *fuzzy_dirname = NULL; | |
137 | + static struct file_list *fuzzy_dirlist = NULL; | |
138 | int fd = -1, f_copy = -1; | |
139 | STRUCT_STAT st, partial_st; | |
140 | struct file_struct *back_file = NULL; | |
d7d8b822 | 141 | @@ -275,6 +331,16 @@ static void recv_generator(char *fname, |
36366d5c WD |
142 | statret = -1; |
143 | stat_errno = ENOENT; | |
144 | } else { | |
145 | + if (fuzzy_basis && S_ISREG(file->mode)) { | |
146 | + char *dn = file->dirname ? file->dirname : "."; | |
147 | + if (fuzzy_dirname != dn) { | |
148 | + if (fuzzy_dirlist) | |
149 | + flist_free(fuzzy_dirlist); | |
150 | + fuzzy_dirname = dn; | |
151 | + fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1); | |
152 | + } | |
153 | + } | |
154 | + | |
155 | statret = link_stat(fname, &st, | |
156 | keep_dirlinks && S_ISDIR(file->mode)); | |
157 | stat_errno = errno; | |
d7d8b822 | 158 | @@ -492,6 +558,24 @@ static void recv_generator(char *fname, |
f48a237e WD |
159 | } else |
160 | partialptr = NULL; | |
824abc86 | 161 | |
36366d5c WD |
162 | + if (statret == -1 && fuzzy_basis && dry_run <= 1) { |
163 | + int j = find_fuzzy(file, fuzzy_dirlist); | |
164 | + if (j >= 0) { | |
165 | + struct file_struct *fp = fuzzy_dirlist->files[j]; | |
166 | + f_name_to(fp, fnamecmpbuf); | |
167 | + if (verbose > 2) { | |
d7d8b822 | 168 | + rprintf(FINFO, "fuzzy basis selected for %s: %s\n", |
36366d5c WD |
169 | + safe_fname(fname), safe_fname(fnamecmpbuf)); |
170 | + } | |
171 | + st.st_mode = fp->mode; | |
172 | + st.st_size = fp->length; | |
173 | + st.st_mtime = fp->modtime; | |
09fb2223 WD |
174 | + statret = 0; |
175 | + fnamecmp = fnamecmpbuf; | |
0edc7d7f | 176 | + fnamecmp_type = FNAMECMP_FUZZY; |
09fb2223 | 177 | + } |
824abc86 WD |
178 | + } |
179 | + | |
09fb2223 WD |
180 | if (statret == -1) { |
181 | if (preserve_hard_links && hard_link_check(file, HL_SKIP)) | |
7628f156 | 182 | return; |
d7d8b822 | 183 | @@ -520,6 +604,8 @@ static void recv_generator(char *fname, |
241013b4 | 184 | |
9cf86680 | 185 | if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH) |
0808daa5 WD |
186 | ; |
187 | + else if (fnamecmp_type == FNAMECMP_FUZZY) | |
188 | + ; | |
189 | else if (unchanged_file(fnamecmp, file, &st)) { | |
0edc7d7f | 190 | if (fnamecmp_type == FNAMECMP_FNAME) |
8c5b8235 | 191 | set_perms(fname, file, &st, PERMS_REPORT); |
d7d8b822 | 192 | @@ -540,6 +626,11 @@ prepare_to_open: |
36366d5c WD |
193 | statret = -1; |
194 | goto notify_others; | |
195 | } | |
196 | + if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) { | |
197 | + int j = flist_find(fuzzy_dirlist, file); | |
4535c2b6 WD |
198 | + if (j >= 0) /* don't use updating file as future fuzzy basis */ |
199 | + fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY; | |
36366d5c WD |
200 | + } |
201 | ||
202 | /* open the file */ | |
203 | fd = do_open(fnamecmp, O_RDONLY, 0); | |
d7d8b822 | 204 | @@ -594,8 +685,24 @@ notify_others: |
a7219d20 | 205 | write_int(f_out, ndx); |
0f626034 WD |
206 | if (protocol_version >= 29 && inplace && !read_batch) |
207 | write_byte(f_out, fnamecmp_type); | |
0edc7d7f WD |
208 | - if (f_out_name >= 0) |
209 | + if (f_out_name >= 0) { | |
09fb2223 | 210 | write_byte(f_out_name, fnamecmp_type); |
0edc7d7f | 211 | + if (fnamecmp_type == FNAMECMP_FUZZY) { |
09fb2223 WD |
212 | + uchar lenbuf[3], *lb = lenbuf; |
213 | + int len = strlen(fnamecmpbuf); | |
214 | + if (len > 0x7F) { | |
215 | +#if MAXPATHLEN > 0x7FFF | |
216 | + *lb++ = len / 0x10000 + 0x80; | |
217 | + *lb++ = len / 0x100; | |
218 | +#else | |
219 | + *lb++ = len / 0x100 + 0x80; | |
220 | +#endif | |
221 | + } | |
222 | + *lb = len; | |
223 | + write_buf(f_out_name, lenbuf, lb - lenbuf + 1); | |
224 | + write_buf(f_out_name, fnamecmpbuf, len); | |
225 | + } | |
0edc7d7f | 226 | + } |
09fb2223 | 227 | |
0edc7d7f WD |
228 | if (dry_run || read_batch) |
229 | return; | |
b78a6aba | 230 | --- orig/main.c 2005-02-07 20:41:56 |
3eabe3a3 | 231 | +++ main.c 2005-01-14 18:33:15 |
b78a6aba | 232 | @@ -44,6 +44,7 @@ extern int keep_dirlinks; |
495f1899 WD |
233 | extern int preserve_hard_links; |
234 | extern int protocol_version; | |
235 | extern int recurse; | |
09fb2223 | 236 | +extern int fuzzy_basis; |
495f1899 WD |
237 | extern int relative_paths; |
238 | extern int rsync_port; | |
239 | extern int whole_file; | |
b78a6aba | 240 | @@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st |
495f1899 WD |
241 | int pid; |
242 | int status = 0; | |
243 | int error_pipe[2], name_pipe[2]; | |
3eabe3a3 WD |
244 | - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run; |
245 | + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis) | |
246 | + && !dry_run; | |
495f1899 | 247 | |
d5753a22 WD |
248 | /* The receiving side mustn't obey this, or an existing symlink that |
249 | * points to an identical file won't be replaced by the referent. */ | |
b78a6aba | 250 | --- orig/options.c 2005-02-13 05:50:28 |
d7d8b822 | 251 | +++ options.c 2005-02-13 21:41:41 |
b78a6aba | 252 | @@ -89,6 +89,7 @@ int copy_unsafe_links = 0; |
f6c3b300 | 253 | int size_only = 0; |
0808daa5 | 254 | int daemon_bwlimit = 0; |
f6c3b300 | 255 | int bwlimit = 0; |
09fb2223 | 256 | +int fuzzy_basis = 0; |
f6c3b300 | 257 | size_t bwlimit_writemax = 0; |
f6c3b300 | 258 | int only_existing = 0; |
be73a66e | 259 | int opt_ignore_existing = 0; |
b78a6aba WD |
260 | @@ -302,6 +303,7 @@ void usage(enum logcode F) |
261 | rprintf(F," --size-only skip files that match in size\n"); | |
262 | rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n"); | |
263 | rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n"); | |
d7d8b822 | 264 | + rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n"); |
f0533c4c | 265 | rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); |
0808daa5 WD |
266 | rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); |
267 | rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); | |
b78a6aba | 268 | @@ -411,6 +413,7 @@ static struct poptOption long_options[] |
0808daa5 WD |
269 | {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, |
270 | {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, | |
271 | {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, | |
d7d8b822 | 272 | + {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, |
f0533c4c WD |
273 | /* TODO: Should this take an optional int giving the compression level? */ |
274 | {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, | |
5388f859 | 275 | {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, |
b78a6aba | 276 | @@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg |
4370504a WD |
277 | if (!implied_dirs && !am_sender) |
278 | args[ac++] = "--no-implied-dirs"; | |
7b675ff5 | 279 | |
09fb2223 | 280 | + if (fuzzy_basis && am_sender) |
241013b4 | 281 | + args[ac++] = "--fuzzy"; |
7b675ff5 | 282 | + |
241013b4 | 283 | *argc = ac; |
f74d2272 | 284 | return; |
7b675ff5 | 285 | |
b78a6aba | 286 | --- orig/receiver.c 2005-02-11 10:53:14 |
9cf86680 | 287 | +++ receiver.c 2005-01-15 21:21:02 |
b78a6aba | 288 | @@ -257,6 +257,27 @@ static int receive_data(int f_in, char * |
09fb2223 WD |
289 | } |
290 | ||
291 | ||
292 | +static void read_gen_name(int fd, char *buf) | |
293 | +{ | |
294 | + int len = read_byte(fd); | |
295 | + if (len & 0x80) { | |
296 | +#if MAXPATHLEN > 32767 | |
297 | + uchar lenbuf[2]; | |
298 | + read_buf(fd, (char *)lenbuf, 2); | |
299 | + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1]; | |
300 | +#else | |
301 | + len = (len & ~0x80) * 0x100 + read_byte(fd); | |
302 | +#endif | |
303 | + } | |
304 | + if (len >= MAXPATHLEN) { | |
305 | + rprintf(FERROR, "bogus data on generator name pipe\n"); | |
306 | + exit_cleanup(RERR_PROTOCOL); | |
307 | + } | |
308 | + | |
309 | + read_sbuf(fd, buf, len); | |
310 | +} | |
311 | + | |
312 | + | |
313 | static void discard_receive_data(int f_in, OFF_T length) | |
314 | { | |
315 | receive_data(f_in, NULL, -1, 0, NULL, -1, length); | |
b78a6aba | 316 | @@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis |
0edc7d7f | 317 | case FNAMECMP_BACKUP: |
f48a237e | 318 | fnamecmp = get_backup_name(fname); |
09fb2223 | 319 | break; |
0edc7d7f | 320 | + case FNAMECMP_FUZZY: |
09fb2223 WD |
321 | + read_gen_name(f_in_name, fnamecmpbuf); |
322 | + fnamecmp = fnamecmpbuf; | |
323 | + break; | |
324 | default: | |
9cf86680 WD |
325 | if (j >= basis_dir_cnt) { |
326 | rprintf(FERROR, | |
b78a6aba | 327 | --- orig/rsync.h 2005-02-12 19:54:27 |
4535c2b6 WD |
328 | +++ rsync.h 2005-02-13 21:19:16 |
329 | @@ -60,6 +60,7 @@ | |
330 | #define FLAG_TOP_DIR (1<<0) | |
331 | #define FLAG_HLINK_EOL (1<<1) /* generator only */ | |
332 | #define FLAG_MOUNT_POINT (1<<2) /* sender only */ | |
333 | +#define FLAG_NO_FUZZY (1<<2) /* generator only */ | |
334 | #define FLAG_DEL_HERE (1<<3) /* receiver/generator */ | |
335 | ||
336 | /* update this if you make incompatible changes */ | |
337 | @@ -127,6 +128,7 @@ | |
0edc7d7f WD |
338 | #define FNAMECMP_FNAME 0x80 |
339 | #define FNAMECMP_PARTIAL_DIR 0x81 | |
340 | #define FNAMECMP_BACKUP 0x82 | |
341 | +#define FNAMECMP_FUZZY 0x83 | |
09fb2223 | 342 | |
a27b0830 | 343 | /* For calling delete_file() */ |
be73a66e | 344 | #define DEL_DIR (1<<0) |
d7d8b822 WD |
345 | --- orig/rsync.yo 2005-02-13 21:51:10 |
346 | +++ rsync.yo 2005-02-13 21:41:52 | |
b78a6aba WD |
347 | @@ -351,6 +351,7 @@ to the detailed description below for a |
348 | --size-only skip files that match in size | |
349 | --modify-window=NUM compare mod-times with reduced accuracy | |
d7d8b822 WD |
350 | -T, --temp-dir=DIR create temporary files in directory DIR |
351 | + -y, --fuzzy find similar file for basis if no dest file | |
f0533c4c | 352 | --compare-dest=DIR also compare received files relative to DIR |
0808daa5 WD |
353 | --copy-dest=DIR ... and include copies of unchanged files |
354 | --link-dest=DIR hardlink to files in DIR when unchanged | |
d7d8b822 | 355 | @@ -909,6 +910,16 @@ scratch directory when creating temporar |
b78a6aba WD |
356 | transferred on the receiving side. The default behavior is to create |
357 | the temporary files in the receiving directory. | |
637c560e | 358 | |
d7d8b822 | 359 | +dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a |
637c560e | 360 | +basis file for any destination file that is missing. The current algorithm |
d7d8b822 WD |
361 | +looks in the same directory as the destination file for either a file that |
362 | +has an identical size and modified-time, or a similarly-named file. If | |
363 | +found, rsync uses the fuzzy basis file to try to speed up the transfer. | |
364 | + | |
365 | +Note that the use of the bf(--delete) option might get rid of any potential | |
366 | +fuzzy-match files, so either use bf(--delete-after) or specify some | |
367 | +filename exclusions if you need to prevent this. | |
637c560e | 368 | + |
b78a6aba WD |
369 | dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on |
370 | the destination machine as an additional hierarchy to compare destination | |
371 | files against doing transfers (if the files are missing in the destination | |
372 | --- orig/util.c 2005-02-11 10:53:15 | |
36366d5c WD |
373 | +++ util.c 2005-02-13 09:44:25 |
374 | @@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned | |
054f3f90 WD |
375 | return malloc(size * num); |
376 | return realloc(ptr, size * num); | |
377 | } | |
378 | + | |
4cd1daea WD |
379 | +/* Take a filename and filename length and return the most significant |
380 | + * filename suffix we can find. This ignores suffixes such as "~", | |
381 | + * ".bak", ".orig", ".~1~", etc. */ | |
a27b0830 | 382 | +const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr) |
4cd1daea WD |
383 | +{ |
384 | + const char *suf, *s; | |
4cd1daea | 385 | + BOOL had_tilde; |
a27b0830 | 386 | + int s_len; |
4cd1daea WD |
387 | + |
388 | + /* One or more dots at the start aren't a suffix. */ | |
389 | + while (fn_len && *fn == '.') fn++, fn_len--; | |
390 | + | |
391 | + /* Ignore the ~ in a "foo~" filename. */ | |
392 | + if (fn_len > 1 && fn[fn_len-1] == '~') | |
393 | + fn_len--, had_tilde = True; | |
394 | + else | |
395 | + had_tilde = False; | |
396 | + | |
397 | + /* Assume we don't find a suffix. */ | |
398 | + suf = ""; | |
399 | + *len_ptr = 0; | |
400 | + | |
401 | + /* Find the last significant suffix. */ | |
36366d5c WD |
402 | + for (s = fn + fn_len; fn_len > 1; ) { |
403 | + while (*--s != '.' && s != fn) {} | |
4cd1daea WD |
404 | + if (s == fn) |
405 | + break; | |
406 | + s_len = fn_len - (s - fn); | |
407 | + fn_len = s - fn; | |
408 | + if (s_len == 3) { | |
409 | + if (strcmp(s+1, "bak") == 0 | |
410 | + || strcmp(s+1, "old") == 0) | |
411 | + continue; | |
412 | + } else if (s_len == 4) { | |
413 | + if (strcmp(s+1, "orig") == 0) | |
414 | + continue; | |
415 | + } else if (s_len > 2 && had_tilde | |
416 | + && s[1] == '~' && isdigit(s[2])) | |
417 | + continue; | |
418 | + *len_ptr = s_len; | |
419 | + suf = s; | |
36366d5c WD |
420 | + if (s_len == 1) |
421 | + break; | |
4cd1daea WD |
422 | + /* Determine if the suffix is all digits. */ |
423 | + for (s++, s_len--; s_len > 0; s++, s_len--) { | |
424 | + if (!isdigit(*s)) | |
425 | + return suf; | |
426 | + } | |
427 | + /* An all-digit suffix may not be that significant. */ | |
36366d5c | 428 | + s = suf; |
4cd1daea WD |
429 | + } |
430 | + | |
431 | + return suf; | |
432 | +} | |
433 | + | |
054f3f90 WD |
434 | +/* This is an implementation of the Levenshtein distance algorithm. It |
435 | + * was implemented to avoid needing a two-dimensional matrix (to save | |
436 | + * memory). It was also tweaked to try to factor in the ASCII distance | |
437 | + * between changed characters as a minor distance quantity. The normal | |
438 | + * Levenshtein units of distance (each signifying a single change between | |
439 | + * the two strings) are defined as a "UNIT". */ | |
440 | + | |
441 | +#define UNIT (1 << 16) | |
442 | + | |
443 | +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2) | |
444 | +{ | |
445 | + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc; | |
446 | + int32 cost; | |
447 | + int i1, i2; | |
448 | + | |
4cd1daea WD |
449 | + if (!len1 || !len2) { |
450 | + if (!len1) { | |
451 | + s1 = s2; | |
452 | + len1 = len2; | |
453 | + } | |
454 | + for (i1 = 0, cost = 0; i1 < len1; i1++) | |
455 | + cost += s1[i1]; | |
456 | + return (int32)len1 * UNIT + cost; | |
457 | + } | |
054f3f90 WD |
458 | + |
459 | + for (i2 = 0; i2 < len2; i2++) | |
460 | + a[i2] = (i2+1) * UNIT; | |
461 | + | |
462 | + for (i1 = 0; i1 < len1; i1++) { | |
463 | + diag = i1 * UNIT; | |
464 | + above = (i1+1) * UNIT; | |
465 | + for (i2 = 0; i2 < len2; i2++) { | |
466 | + left = a[i2]; | |
467 | + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) { | |
468 | + if (cost < 0) | |
469 | + cost = UNIT - cost; | |
470 | + else | |
471 | + cost = UNIT + cost; | |
472 | + } | |
473 | + diag_inc = diag + cost; | |
474 | + left_inc = left + UNIT + *((uchar*)s1+i1); | |
475 | + above_inc = above + UNIT + *((uchar*)s2+i2); | |
476 | + a[i2] = above = left < above | |
477 | + ? (left_inc < diag_inc ? left_inc : diag_inc) | |
478 | + : (above_inc < diag_inc ? above_inc : diag_inc); | |
479 | + diag = left; | |
480 | + } | |
481 | + } | |
482 | + | |
483 | + return a[len2-1]; | |
484 | +} |