Commit | Line | Data |
---|---|---|
c966e8e8 WD |
1 | From: Tobi Oetiker tobi{at}oetiker.ch |
2 | Date: 2007-04-23 | |
3 | ||
4 | I am using rsync for hard-link backup. I found that there is a | |
5 | major problem with frequent backup filling up the file system cache | |
6 | with all the data from the files being backed up. The effect is | |
7 | that all the other 'sensible' data in the cache gets thrown out in | |
8 | the process. This is rather unfortunate as the performance of the | |
9 | system becomes very bad after running rsync. | |
10 | ||
11 | Some research showed that | |
12 | ||
13 | posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED); | |
14 | ||
15 | would tell the OS that it should not keep the file in cache. I | |
16 | have written a patch for rsync that adds the | |
17 | ||
18 | --drop-cache | |
19 | ||
20 | option which activates posix_fadvise64. | |
21 | ||
22 | There are some caveats though: | |
23 | ||
24 | * When calling posix_fadvise64 while writing a file, only the | |
25 | part of the cache that has already been written to disk will | |
26 | be released. This means we have to call fdatasync before | |
27 | calling posix_fadvise64, and this will unfortunately slow down | |
28 | operations considerably. On my test system I get 240 KByte/s. | |
29 | ||
30 | The patch has been optimized, so that the impact on large files | |
31 | will be considerably lowered by calling posix_fadvise64 only | |
32 | after a few megabytes have been written. | |
33 | ||
34 | * When reading a file which was already cached *before* rsync read | |
35 | it, the content of the file will nevertheless be released from | |
36 | cache, which may not be intended. I have unfortunately not | |
37 | found a method for determining if a file is in cache or not | |
38 | (ideas?) | |
39 | ||
40 | I found that running rsync on an LVM snapshot is a good way | |
41 | around this problem, since the snapshot data is cached | |
42 | separately from the original. It has the additional benefit of | |
43 | making the backups more consistent. | |
44 | ||
45 | * I don't really know the rsync code, so it may be that the patch | |
46 | is calling fadvise for files where this would not be necessary. | |
47 | ||
48 | * The patch is tested only on Linux 2.6.18 | |
49 | ||
50 | If you have any input on this, please let me know. | |
51 | ||
52 | You can get the latest edition of the patch from | |
53 | ||
54 | http://tobi.oetiker.ch/patches/ | |
55 | ||
56 | cheers | |
57 | tobi | |
58 | ||
59 | Changes: | |
60 | ||
61 | 2007-04-23 | |
62 | ||
63 | * pass --drop-cache on to the remote server | |
64 | * make test works now | |
65 | ||
66 | --- old/checksum.c | |
67 | +++ new/checksum.c | |
68 | @@ -148,7 +148,7 @@ void file_checksum(char *fname, char *su | |
69 | mdfour_result(&m, (uchar *)sum); | |
70 | } | |
71 | ||
72 | - close(fd); | |
73 | + fadv_close(fd); | |
74 | unmap_file(buf); | |
75 | } | |
76 | ||
77 | --- old/fileio.c | |
78 | +++ new/fileio.c | |
79 | @@ -26,15 +26,18 @@ | |
80 | #endif | |
81 | ||
82 | extern int sparse_files; | |
83 | - | |
84 | static char last_byte; | |
85 | static int last_sparse; | |
86 | ||
87 | +extern int drop_cache; | |
88 | + | |
89 | + | |
90 | + | |
91 | int sparse_end(int f) | |
92 | { | |
93 | if (last_sparse) { | |
94 | do_lseek(f,-1,SEEK_CUR); | |
95 | - return (write(f,&last_byte,1) == 1 ? 0 : -1); | |
96 | + return (fadv_write(f,&last_byte,1) == 1 ? 0 : -1); | |
97 | } | |
98 | last_sparse = 0; | |
99 | return 0; | |
100 | @@ -61,7 +64,7 @@ static int write_sparse(int f,char *buf, | |
101 | if (l1 == len) | |
102 | return len; | |
103 | ||
104 | - ret = write(f, buf + l1, len - (l1+l2)); | |
105 | + ret = fadv_write(f, buf + l1, len - (l1+l2)); | |
106 | if (ret == -1 || ret == 0) | |
107 | return ret; | |
108 | else if (ret != (int) (len - (l1+l2))) | |
109 | @@ -84,7 +87,7 @@ int flush_write_file(int f) | |
110 | char *bp = wf_writeBuf; | |
111 | ||
112 | while (wf_writeBufCnt > 0) { | |
113 | - if ((ret = write(f, bp, wf_writeBufCnt)) < 0) { | |
114 | + if ((ret = fadv_write(f, bp, wf_writeBufCnt)) < 0) { | |
115 | if (errno == EINTR) | |
116 | continue; | |
117 | return ret; | |
118 | @@ -235,7 +238,7 @@ char *map_ptr(struct map_struct *map, OF | |
119 | map->p_len = window_size; | |
120 | ||
121 | while (read_size > 0) { | |
122 | - nread = read(map->fd, map->p + read_offset, read_size); | |
123 | + nread = fadv_read(map->fd, map->p + read_offset, read_size); | |
124 | if (nread <= 0) { | |
125 | if (!map->status) | |
126 | map->status = nread ? errno : ENODATA; | |
127 | --- old/generator.c | |
128 | +++ new/generator.c | |
129 | @@ -1614,18 +1614,18 @@ static void recv_generator(char *fname, | |
130 | ||
131 | if (inplace && make_backups > 0 && fnamecmp_type == FNAMECMP_FNAME) { | |
132 | if (!(backupptr = get_backup_name(fname))) { | |
133 | - close(fd); | |
134 | + fadv_close(fd); | |
135 | goto cleanup; | |
136 | } | |
137 | if (!(back_file = make_file(fname, NULL, NULL, 0, NO_FILTERS))) { | |
138 | - close(fd); | |
139 | + fadv_close(fd); | |
140 | goto pretend_missing; | |
141 | } | |
142 | if (robust_unlink(backupptr) && errno != ENOENT) { | |
143 | rsyserr(FERROR, errno, "unlink %s", | |
144 | full_fname(backupptr)); | |
145 | unmake_file(back_file); | |
146 | - close(fd); | |
147 | + fadv_close(fd); | |
148 | goto cleanup; | |
149 | } | |
150 | if ((f_copy = do_open(backupptr, | |
151 | @@ -1633,7 +1633,7 @@ static void recv_generator(char *fname, | |
152 | rsyserr(FERROR, errno, "open %s", | |
153 | full_fname(backupptr)); | |
154 | unmake_file(back_file); | |
155 | - close(fd); | |
156 | + fadv_close(fd); | |
157 | goto cleanup; | |
158 | } | |
159 | fnamecmp_type = FNAMECMP_BACKUP; | |
160 | @@ -1695,7 +1695,7 @@ static void recv_generator(char *fname, | |
161 | generate_and_send_sums(fd, sx.st.st_size, f_out, f_copy); | |
162 | ||
163 | if (f_copy >= 0) { | |
164 | - close(f_copy); | |
165 | + fadv_close(f_copy); | |
166 | set_file_attrs(backupptr, back_file, NULL, NULL, 0); | |
167 | if (verbose > 1) { | |
168 | rprintf(FINFO, "backed up %s to %s\n", | |
169 | @@ -1704,7 +1704,7 @@ static void recv_generator(char *fname, | |
170 | unmake_file(back_file); | |
171 | } | |
172 | ||
173 | - close(fd); | |
174 | + fadv_close(fd); | |
175 | ||
176 | cleanup: | |
177 | #ifdef SUPPORT_ACLS | |
178 | --- old/options.c | |
179 | +++ new/options.c | |
180 | @@ -57,6 +57,7 @@ int preserve_gid = 0; | |
181 | int preserve_times = 0; | |
182 | int omit_dir_times = 0; | |
183 | int update_only = 0; | |
184 | +int drop_cache = 0; | |
185 | int cvs_exclude = 0; | |
186 | int dry_run = 0; | |
187 | int do_xfers = 1; | |
188 | @@ -310,6 +311,7 @@ void usage(enum logcode F) | |
189 | rprintf(F," --backup-dir=DIR make backups into hierarchy based in DIR\n"); | |
190 | rprintf(F," --suffix=SUFFIX set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX); | |
191 | rprintf(F," -u, --update skip files that are newer on the receiver\n"); | |
192 | + rprintf(F," --drop-cache tell OS to drop caching of file data\n"); | |
193 | rprintf(F," --inplace update destination files in-place (SEE MAN PAGE)\n"); | |
194 | rprintf(F," --append append data onto shorter files\n"); | |
195 | rprintf(F," -d, --dirs transfer directories without recursing\n"); | |
196 | @@ -506,6 +508,7 @@ static struct poptOption long_options[] | |
197 | {"size-only", 0, POPT_ARG_NONE, &size_only, 0, 0, 0 }, | |
198 | {"one-file-system", 'x', POPT_ARG_NONE, 0, 'x', 0, 0 }, | |
199 | {"update", 'u', POPT_ARG_NONE, &update_only, 0, 0, 0 }, | |
200 | + {"drop-cache", 0, POPT_ARG_NONE, &drop_cache, 0, 0, 0 }, | |
201 | {"existing", 0, POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, | |
202 | {"ignore-non-existing",0,POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, | |
203 | {"ignore-existing", 0, POPT_ARG_NONE, &ignore_existing, 0, 0, 0 }, | |
204 | @@ -1603,6 +1606,9 @@ void server_options(char **args,int *arg | |
205 | if (!am_sender) | |
206 | args[ac++] = "--sender"; | |
207 | ||
208 | + if (drop_cache) | |
209 | + args[ac++] = "--drop-cache"; | |
210 | + | |
211 | x = 1; | |
212 | argstr[0] = '-'; | |
213 | for (i = 0; i < verbose; i++) | |
214 | --- old/receiver.c | |
215 | +++ new/receiver.c | |
216 | @@ -554,7 +554,7 @@ int recv_files(int f_in, char *local_nam | |
217 | rsyserr(FERROR, errno, "fstat %s failed", | |
218 | full_fname(fnamecmp)); | |
219 | discard_receive_data(f_in, F_LENGTH(file)); | |
220 | - close(fd1); | |
221 | + fadv_close(fd1); | |
222 | if (inc_recurse) | |
223 | send_msg_int(MSG_NO_SEND, ndx); | |
224 | continue; | |
225 | @@ -569,14 +569,14 @@ int recv_files(int f_in, char *local_nam | |
226 | rprintf(FERROR,"recv_files: %s is a directory\n", | |
227 | full_fname(fnamecmp)); | |
228 | discard_receive_data(f_in, F_LENGTH(file)); | |
229 | - close(fd1); | |
230 | + fadv_close(fd1); | |
231 | if (inc_recurse) | |
232 | send_msg_int(MSG_NO_SEND, ndx); | |
233 | continue; | |
234 | } | |
235 | ||
236 | if (fd1 != -1 && !S_ISREG(st.st_mode)) { | |
237 | - close(fd1); | |
238 | + fadv_close(fd1); | |
239 | fd1 = -1; | |
240 | } | |
241 | ||
242 | @@ -604,7 +604,7 @@ int recv_files(int f_in, char *local_nam | |
243 | full_fname(fname)); | |
244 | discard_receive_data(f_in, F_LENGTH(file)); | |
245 | if (fd1 != -1) | |
246 | - close(fd1); | |
247 | + fadv_close(fd1); | |
248 | if (inc_recurse) | |
249 | send_msg_int(MSG_NO_SEND, ndx); | |
250 | continue; | |
251 | @@ -613,7 +613,7 @@ int recv_files(int f_in, char *local_nam | |
252 | if (!get_tmpname(fnametmp,fname)) { | |
253 | discard_receive_data(f_in, F_LENGTH(file)); | |
254 | if (fd1 != -1) | |
255 | - close(fd1); | |
256 | + fadv_close(fd1); | |
257 | if (inc_recurse) | |
258 | send_msg_int(MSG_NO_SEND, ndx); | |
259 | continue; | |
260 | @@ -641,7 +641,7 @@ int recv_files(int f_in, char *local_nam | |
261 | full_fname(fnametmp)); | |
262 | discard_receive_data(f_in, F_LENGTH(file)); | |
263 | if (fd1 != -1) | |
264 | - close(fd1); | |
265 | + fadv_close(fd1); | |
266 | if (inc_recurse) | |
267 | send_msg_int(MSG_NO_SEND, ndx); | |
268 | continue; | |
269 | @@ -663,8 +663,8 @@ int recv_files(int f_in, char *local_nam | |
270 | log_item(log_code, file, &initial_stats, iflags, NULL); | |
271 | ||
272 | if (fd1 != -1) | |
273 | - close(fd1); | |
274 | - if (close(fd2) < 0) { | |
275 | + fadv_close(fd1); | |
276 | + if (fadv_close(fd2) < 0) { | |
277 | rsyserr(FERROR, errno, "close failed on %s", | |
278 | full_fname(fnametmp)); | |
279 | exit_cleanup(RERR_FILEIO); | |
280 | --- old/rsync.yo | |
281 | +++ new/rsync.yo | |
282 | @@ -335,6 +335,7 @@ to the detailed description below for a | |
283 | --super receiver attempts super-user activities | |
284 | --fake-super store/recover privileged attrs using xattrs | |
285 | -S, --sparse handle sparse files efficiently | |
286 | + --drop-cache tell OS to drop caching of file data | |
287 | -n, --dry-run show what would have been transferred | |
288 | -W, --whole-file copy files whole (without rsync algorithm) | |
289 | -x, --one-file-system don't cross filesystem boundaries | |
290 | @@ -956,6 +957,10 @@ NOTE: Don't use this option when the des | |
291 | filesystem. It doesn't seem to handle seeks over null regions | |
292 | correctly and ends up corrupting the files. | |
293 | ||
294 | +dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data. This | |
295 | +prevents rsync from filling up the filesystem cache. This can sometimes help | |
296 | +to make a system perform better by keeping non-rsync files in the disk cache. | |
297 | + | |
298 | dit(bf(-n, --dry-run)) This tells rsync to not do any file transfers, | |
299 | instead it will just report the actions it would have taken. | |
300 | ||
301 | --- old/sender.c | |
302 | +++ new/sender.c | |
303 | @@ -307,7 +307,7 @@ void send_files(int f_in, int f_out) | |
304 | io_error |= IOERR_GENERAL; | |
305 | rsyserr(FERROR, errno, "fstat failed"); | |
306 | free_sums(s); | |
307 | - close(fd); | |
308 | + fadv_close(fd); | |
309 | exit_cleanup(RERR_PROTOCOL); | |
310 | } | |
311 | ||
312 | @@ -351,7 +351,7 @@ void send_files(int f_in, int f_out) | |
313 | full_fname(fname)); | |
314 | } | |
315 | } | |
316 | - close(fd); | |
317 | + fadv_close(fd); | |
318 | ||
319 | free_sums(s); | |
320 | ||
321 | --- old/t_unsafe.c | |
322 | +++ new/t_unsafe.c | |
323 | @@ -28,6 +28,7 @@ int am_root = 0; | |
324 | int read_only = 0; | |
325 | int list_only = 0; | |
326 | int verbose = 0; | |
327 | +int drop_cache = 0; | |
328 | int preserve_perms = 0; | |
329 | ||
330 | int | |
331 | --- old/util.c | |
332 | +++ new/util.c | |
333 | @@ -24,6 +24,7 @@ | |
334 | ||
335 | extern int verbose; | |
336 | extern int dry_run; | |
337 | +extern int drop_cache; | |
338 | extern int module_id; | |
339 | extern int modify_window; | |
340 | extern int relative_paths; | |
341 | @@ -39,6 +40,88 @@ char curr_dir[MAXPATHLEN]; | |
342 | unsigned int curr_dir_len; | |
343 | int curr_dir_depth; /* This is only set for a sanitizing daemon. */ | |
344 | ||
345 | +extern int drop_cache; | |
346 | + | |
347 | +static struct stat fadv_fd_stat[255]; | |
348 | +static off_t fadv_fd_pos[255]; | |
349 | +static int fadv_fd_init = 0; | |
350 | + | |
351 | +/* One-time reset of the per-fd tracking tables (offset, device, inode). */ | |
352 | +static void fadv_fd_init_func(void) | |
353 | +{ | |
354 | +	int slot; | |
355 | +	if (fadv_fd_init != 0) | |
356 | +		return; | |
357 | +	for (fadv_fd_init = 1, slot = 0; slot < 255; slot++) { | |
358 | +		fadv_fd_pos[slot] = 0; | |
359 | +		fadv_fd_stat[slot].st_dev = fadv_fd_stat[slot].st_ino = 0; | |
360 | +	} | |
361 | +} | |
362 | + | |
363 | +/* Ask the kernel to evict fd's cached pages up to 1 MB behind the current | |
364 | + * offset, at most once per 16 MB of progress.  If sync is set, fdatasync() | |
365 | + * first, because unwritten pages are not evicted.  The caller's errno is kept. */ | |
366 | +static void fadv_drop(int fd, int sync) | |
367 | +{ | |
368 | +	struct stat st; | |
369 | +	off_t pos; | |
370 | +	int saved_errno = errno; | |
371 | +	/* The tracking tables have 255 slots; ignore descriptors outside them. */ | |
372 | +	if (fd < 0 || fd >= 255) | |
373 | +		return; | |
374 | +	fadv_fd_init_func(); | |
375 | +	pos = lseek(fd, 0, SEEK_CUR); | |
376 | +	if (pos < 0 || fstat(fd, &st) < 0) { | |
377 | +		errno = saved_errno; /* unseekable or bad fd: nothing to drop */ | |
378 | +		return; | |
379 | +	} | |
380 | +	pos -= 1024*1024; /* trail behind so a block or stripe is not written twice */ | |
381 | +	if (fadv_fd_stat[fd].st_dev != st.st_dev || fadv_fd_stat[fd].st_ino != st.st_ino) { | |
382 | +		/* The slot last tracked a different file: start over for this one. */ | |
383 | +		fadv_fd_stat[fd].st_dev = st.st_dev; | |
384 | +		fadv_fd_stat[fd].st_ino = st.st_ino; | |
385 | +		fadv_fd_pos[fd] = 0; | |
386 | +	} else if (fadv_fd_pos[fd] < pos - 16*1024*1024) { | |
387 | +		if (sync) | |
388 | +			fdatasync(fd); | |
389 | +		posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED); | |
390 | +		fadv_fd_pos[fd] = pos; | |
391 | +	} | |
392 | +	errno = saved_errno; | |
393 | +} | |
394 | + | |
395 | +/* write() that, under --drop-cache, also lets written data leave the page cache. */ | |
396 | +ssize_t fadv_write(int fd, const void *buf, size_t count) | |
397 | +{ | |
398 | +	ssize_t ret = write(fd, buf, count); | |
399 | +	if (drop_cache && ret > 0) /* skip on failure so write()'s errno survives */ | |
400 | +		fadv_drop(fd, 1); | |
401 | +	return ret; | |
402 | +} | |
403 | + | |
404 | +/* read() that, under --drop-cache, also lets data already read leave the page cache. */ | |
405 | +ssize_t fadv_read(int fd, void *buf, size_t count) | |
406 | +{ | |
407 | +	ssize_t ret = read(fd, buf, count); | |
408 | +	if (drop_cache && ret > 0) /* skip on EOF/failure so read()'s errno survives */ | |
409 | +		fadv_drop(fd, 0); | |
410 | +	return ret; | |
411 | +} | |
412 | + | |
413 | +/* Close fd; under --drop-cache, first ask the kernel to drop the file's | |
414 | + * cached data (offset 0, length 0 is passed to posix_fadvise64). */ | |
415 | +int fadv_close(int fd) | |
416 | +{ | |
417 | +	if (!drop_cache) | |
418 | +		return close(fd); | |
419 | +	/* Data not yet flushed to disk is not freed and the advice is ignored, | |
420 | +	 * so flush first.  This hurts performance badly; there seems to be no | |
421 | +	 * way to mark cached data for release once it has been written out. */ | |
422 | +	fdatasync(fd); | |
423 | +	posix_fadvise64(fd, 0, 0, POSIX_FADV_DONTNEED); | |
424 | +	return close(fd); | |
425 | +} | |
426 | + | |
427 | /* Set a fd into nonblocking mode. */ | |
428 | void set_nonblocking(int fd) | |
429 | { | |
430 | @@ -221,7 +304,7 @@ int full_write(int desc, const char *ptr | |
431 | ||
432 | total_written = 0; | |
433 | while (len > 0) { | |
434 | - int written = write(desc, ptr, len); | |
435 | + int written = fadv_write(desc, ptr, len); | |
436 | if (written < 0) { | |
437 | if (errno == EINTR) | |
438 | continue; | |
439 | @@ -253,7 +336,7 @@ static int safe_read(int desc, char *ptr | |
440 | return len; | |
441 | ||
442 | do { | |
443 | - n_chars = read(desc, ptr, len); | |
444 | + n_chars = fadv_read(desc, ptr, len); | |
445 | } while (n_chars < 0 && errno == EINTR); | |
446 | ||
447 | return n_chars; | |
448 | @@ -284,32 +367,32 @@ int copy_file(const char *source, const | |
449 | ofd = do_open(dest, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, mode); | |
450 | if (ofd == -1) { | |
451 | rsyserr(FERROR, errno, "open %s", full_fname(dest)); | |
452 | - close(ifd); | |
453 | + fadv_close(ifd); | |
454 | return -1; | |
455 | } | |
456 | ||
457 | while ((len = safe_read(ifd, buf, sizeof buf)) > 0) { | |
458 | if (full_write(ofd, buf, len) < 0) { | |
459 | rsyserr(FERROR, errno, "write %s", full_fname(dest)); | |
460 | - close(ifd); | |
461 | - close(ofd); | |
462 | + fadv_close(ifd); | |
463 | + fadv_close(ofd); | |
464 | return -1; | |
465 | } | |
466 | } | |
467 | ||
468 | if (len < 0) { | |
469 | rsyserr(FERROR, errno, "read %s", full_fname(source)); | |
470 | - close(ifd); | |
471 | - close(ofd); | |
472 | + fadv_close(ifd); | |
473 | + fadv_close(ofd); | |
474 | return -1; | |
475 | } | |
476 | ||
477 | - if (close(ifd) < 0) { | |
478 | + if (fadv_close(ifd) < 0) { | |
479 | rsyserr(FINFO, errno, "close failed on %s", | |
480 | full_fname(source)); | |
481 | } | |
482 | ||
483 | - if (close(ofd) < 0) { | |
484 | + if (fadv_close(ofd) < 0) { | |
485 | rsyserr(FERROR, errno, "close failed on %s", | |
486 | full_fname(dest)); | |
487 | return -1; |