Commit | Line | Data |
---|---|---|
c966e8e8 WD |
1 | From: Tobi Oetiker tobi{at}oetiker.ch |
2 | Date: 2007-04-23 | |
3 | ||
4 | I am using rsync for hard-link backups. I found that there is a | |
5 | major problem with frequent backups filling up the file system cache | |
6 | with all the data from the files being backed up. The effect is | |
7 | that all the other 'useful' data in the cache gets thrown out in | |
8 | the process. This is rather unfortunate as the performance of the | |
9 | system becomes very bad after running rsync. | |
10 | ||
11 | Some research showed that | |
12 | ||
13 | posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED); | |
14 | ||
15 | would tell the OS that it should not keep the file in cache. I | |
16 | have written a patch for rsync that adds the | |
17 | ||
18 | --drop-cache | |
19 | ||
20 | option which activates posix_fadvise64. | |
21 | ||
22 | There are some caveats though: | |
23 | ||
24 | * When calling posix_fadvise64 while writing a file, only the | |
25 | part of the cache that has already been written to disk will | |
26 | be released. This means we have to call fdatasync before | |
27 | calling posix_fadvise64, and this will unfortunately slow down | |
28 | operations considerably. On my test system I get 240 KByte/s. | |
29 | ||
30 | The patch has been optimized, so that the impact on large files | |
31 | will be considerably lowered by calling posix_fadvise64 only | |
32 | after a few megabytes have been written. | |
33 | ||
34 | * When reading a file which had been cached *before* rsync read | |
35 | it, the content of the file will be released from cache | |
36 | nevertheless, which may not be intended. I have unfortunately not | |
37 | found a method for determining if a file is in cache or not | |
38 | (ideas?) | |
39 | ||
40 | I found that running rsync on an LVM snapshot is a good way | |
41 | around this problem, since the snapshot data is cached | |
42 | separately from the original. It has the additional benefit of | |
43 | making the backups more consistent. | |
44 | ||
45 | * I don't really know the rsync code, so it may be that the patch | |
46 | is calling fadvise for files where this would not be necessary. | |
47 | ||
48 | * The patch is tested only on Linux 2.6.18 | |
49 | ||
50 | If you have any input on this, please let me know. | |
51 | ||
52 | You can get the latest edition of the patch from | |
53 | ||
54 | http://tobi.oetiker.ch/patches/ | |
55 | ||
56 | cheers | |
57 | tobi | |
58 | ||
e2e42a01 | 59 | Changes: |
c966e8e8 WD |
60 | |
61 | 2007-04-23 | |
62 | ||
63 | * pass --drop-cache on to the remote server | |
64 | * make test works now | |
65 | ||
fc557362 WD |
66 | To use this patch, run these commands for a successful build: |
67 | ||
68 | patch -p1 <patches/drop-cache.diff | |
69 | ./configure (optional if already run) | |
70 | make | |
71 | ||
cc3e685d | 72 | diff --git a/checksum.c b/checksum.c |
fc557362 | 73 | index 811b5b6..f1f51cb 100644 |
cc3e685d WD |
74 | --- a/checksum.c |
75 | +++ b/checksum.c | |
fc557362 WD |
76 | @@ -24,6 +24,10 @@ |
77 | extern int checksum_seed; | |
78 | extern int protocol_version; | |
c966e8e8 | 79 | |
14317044 WD |
80 | +#ifdef HAVE_POSIX_FADVISE64 |
81 | +#define close(fd) fadv_close(fd) | |
82 | +#endif | |
83 | + | |
84 | /* | |
85 | a simple 32 bit checksum that can be upadted from either end | |
86 | (inspired by Mark Adler's Adler-32 checksum) | |
cc3e685d | 87 | diff --git a/cleanup.c b/cleanup.c |
fc557362 | 88 | index 19ef072..84a6cf3 100644 |
cc3e685d WD |
89 | --- a/cleanup.c |
90 | +++ b/cleanup.c | |
fc557362 | 91 | @@ -47,7 +47,13 @@ void close_all(void) |
14317044 WD |
92 | int fd; |
93 | int ret; | |
94 | STRUCT_STAT st; | |
95 | +#endif | |
96 | + | |
97 | +#ifdef HAVE_POSIX_FADVISE64 | |
98 | + fadv_close_all(); | |
99 | +#endif | |
100 | ||
101 | +#ifdef SHUTDOWN_ALL_SOCKETS | |
102 | max_fd = sysconf(_SC_OPEN_MAX) - 1; | |
103 | for (fd = max_fd; fd >= 0; fd--) { | |
104 | if ((ret = do_fstat(fd, &st)) == 0) { | |
cc3e685d | 105 | diff --git a/configure.in b/configure.in |
fc557362 | 106 | index bc7d4a7..e9ff45a 100644 |
cc3e685d WD |
107 | --- a/configure.in |
108 | +++ b/configure.in | |
fc557362 | 109 | @@ -553,7 +553,7 @@ AC_CHECK_FUNCS(waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \ |
14317044 WD |
110 | setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \ |
111 | strerror putenv iconv_open locale_charset nl_langinfo getxattr \ | |
fc557362 WD |
112 | extattr_get_link sigaction sigprocmask setattrlist getgrouplist \ |
113 | - initgroups) | |
114 | + initgroups posix_fadvise64) | |
14317044 | 115 | |
4c15e800 WD |
116 | dnl cygwin iconv.h defines iconv_open as libiconv_open |
117 | if test x"$ac_cv_func_iconv_open" != x"yes"; then | |
cc3e685d | 118 | diff --git a/fileio.c b/fileio.c |
fc557362 | 119 | index 0faa619..0326fe8 100644 |
cc3e685d WD |
120 | --- a/fileio.c |
121 | +++ b/fileio.c | |
fc557362 | 122 | @@ -31,6 +31,12 @@ extern int sparse_files; |
c966e8e8 | 123 | static char last_byte; |
abd3adb8 | 124 | static OFF_T sparse_seek = 0; |
c966e8e8 | 125 | |
14317044 WD |
126 | +#ifdef HAVE_POSIX_FADVISE64 |
127 | +#define close(fd) fadv_close(fd) | |
128 | +#define read(fd,buf,len) fadv_read(fd,buf,len) | |
129 | +#define write(fd,buf,len) fadv_write(fd,buf,len) | |
130 | +#endif | |
c966e8e8 WD |
131 | + |
132 | int sparse_end(int f) | |
133 | { | |
c8a8b4a7 | 134 | int ret; |
cc3e685d | 135 | diff --git a/generator.c b/generator.c |
fc557362 | 136 | index 12007a1..f0c7a91 100644 |
cc3e685d WD |
137 | --- a/generator.c |
138 | +++ b/generator.c | |
fc557362 | 139 | @@ -112,6 +112,10 @@ static int need_retouch_dir_times; |
c8a8b4a7 | 140 | static int need_retouch_dir_perms; |
14317044 | 141 | static const char *solo_file = NULL; |
c966e8e8 | 142 | |
14317044 WD |
143 | +#ifdef HAVE_POSIX_FADVISE64 |
144 | +#define close(fd) fadv_close(fd) | |
145 | +#endif | |
146 | + | |
fc557362 WD |
147 | enum nonregtype { |
148 | TYPE_DIR, TYPE_SPECIAL, TYPE_DEVICE, TYPE_SYMLINK | |
149 | }; | |
cc3e685d | 150 | diff --git a/options.c b/options.c |
fc557362 | 151 | index e7c6c61..ce55d8e 100644 |
cc3e685d WD |
152 | --- a/options.c |
153 | +++ b/options.c | |
c0c7984e | 154 | @@ -60,6 +60,7 @@ int preserve_uid = 0; |
c8a8b4a7 | 155 | int preserve_gid = 0; |
c966e8e8 | 156 | int preserve_times = 0; |
c966e8e8 WD |
157 | int update_only = 0; |
158 | +int drop_cache = 0; | |
159 | int cvs_exclude = 0; | |
160 | int dry_run = 0; | |
161 | int do_xfers = 1; | |
fc557362 | 162 | @@ -670,6 +671,9 @@ void usage(enum logcode F) |
c966e8e8 WD |
163 | rprintf(F," --backup-dir=DIR make backups into hierarchy based in DIR\n"); |
164 | rprintf(F," --suffix=SUFFIX set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX); | |
165 | rprintf(F," -u, --update skip files that are newer on the receiver\n"); | |
14317044 | 166 | +#ifdef HAVE_POSIX_FADVISE64 |
c966e8e8 | 167 | + rprintf(F," --drop-cache tell OS to drop caching of file data\n"); |
14317044 | 168 | +#endif |
c966e8e8 WD |
169 | rprintf(F," --inplace update destination files in-place (SEE MAN PAGE)\n"); |
170 | rprintf(F," --append append data onto shorter files\n"); | |
c8a8b4a7 | 171 | rprintf(F," --append-verify like --append, but with old data in file checksum\n"); |
fc557362 | 172 | @@ -891,6 +895,9 @@ static struct poptOption long_options[] = { |
85096e5e WD |
173 | {"no-one-file-system",'x',POPT_ARG_VAL, &one_file_system, 0, 0, 0 }, |
174 | {"no-x", 'x', POPT_ARG_VAL, &one_file_system, 0, 0, 0 }, | |
c966e8e8 | 175 | {"update", 'u', POPT_ARG_NONE, &update_only, 0, 0, 0 }, |
14317044 | 176 | +#ifdef HAVE_POSIX_FADVISE64 |
c966e8e8 | 177 | + {"drop-cache", 0, POPT_ARG_NONE, &drop_cache, 0, 0, 0 }, |
14317044 | 178 | +#endif |
c966e8e8 WD |
179 | {"existing", 0, POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, |
180 | {"ignore-non-existing",0,POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, | |
181 | {"ignore-existing", 0, POPT_ARG_NONE, &ignore_existing, 0, 0, 0 }, | |
fc557362 | 182 | @@ -2235,6 +2242,11 @@ void server_options(char **args, int *argc_p) |
c966e8e8 WD |
183 | if (!am_sender) |
184 | args[ac++] = "--sender"; | |
185 | ||
14317044 | 186 | +#ifdef HAVE_POSIX_FADVISE64 |
c966e8e8 WD |
187 | + if (drop_cache) |
188 | + args[ac++] = "--drop-cache"; | |
14317044 | 189 | +#endif |
c966e8e8 WD |
190 | + |
191 | x = 1; | |
192 | argstr[0] = '-'; | |
c8a8b4a7 | 193 | |
cc3e685d | 194 | diff --git a/receiver.c b/receiver.c |
fc557362 | 195 | index 4325e30..a3da64e 100644 |
cc3e685d WD |
196 | --- a/receiver.c |
197 | +++ b/receiver.c | |
fc557362 | 198 | @@ -64,6 +64,10 @@ static flist_ndx_list batch_redo_list; |
14317044 | 199 | /* We're either updating the basis file or an identical copy: */ |
e2b0842a | 200 | static int updating_basis_or_equiv; |
c966e8e8 | 201 | |
14317044 WD |
202 | +#ifdef HAVE_POSIX_FADVISE64 |
203 | +#define close(fd) fadv_close(fd) | |
204 | +#endif | |
205 | + | |
206 | /* | |
207 | * get_tmpname() - create a tmp filename for a given filename | |
208 | * | |
cc3e685d | 209 | diff --git a/rsync.yo b/rsync.yo |
fc557362 | 210 | index 941f7a5..512aa6b 100644 |
cc3e685d WD |
211 | --- a/rsync.yo |
212 | +++ b/rsync.yo | |
fc557362 | 213 | @@ -359,6 +359,7 @@ to the detailed description below for a complete description. verb( |
c966e8e8 WD |
214 | --super receiver attempts super-user activities |
215 | --fake-super store/recover privileged attrs using xattrs | |
216 | -S, --sparse handle sparse files efficiently | |
217 | + --drop-cache tell OS to drop caching of file data | |
e2b0842a | 218 | -n, --dry-run perform a trial run with no changes made |
f2863bc0 | 219 | -W, --whole-file copy files whole (w/o delta-xfer algorithm) |
c966e8e8 | 220 | -x, --one-file-system don't cross filesystem boundaries |
fc557362 | 221 | @@ -1120,6 +1121,10 @@ NOTE: Don't use this option when the destination is a Solaris "tmpfs" |
c966e8e8 WD |
222 | filesystem. It doesn't seem to handle seeks over null regions |
223 | correctly and ends up corrupting the files. | |
224 | ||
225 | +dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data. This | |
226 | +prevents rsync from filling up the filesystem cache. This can sometimes help | |
227 | +to make a system perform better by keeping non-rsync files in the disk cache. | |
228 | + | |
e2b0842a WD |
229 | dit(bf(-n, --dry-run)) This makes rsync perform a trial run that doesn't |
230 | make any changes (and produces mostly the same output as a real run). It | |
231 | is most commonly used in combination with the bf(-v, --verbose) and/or | |
cc3e685d | 232 | diff --git a/sender.c b/sender.c |
fc557362 | 233 | index bf8221d..9e23dbb 100644 |
cc3e685d WD |
234 | --- a/sender.c |
235 | +++ b/sender.c | |
fc557362 | 236 | @@ -45,6 +45,10 @@ extern int write_batch; |
14317044 | 237 | extern struct stats stats; |
c8a8b4a7 | 238 | extern struct file_list *cur_flist, *first_flist, *dir_flist; |
c966e8e8 | 239 | |
14317044 WD |
240 | +#ifdef HAVE_POSIX_FADVISE64 |
241 | +#define close(fd) fadv_close(fd) | |
242 | +#endif | |
243 | + | |
244 | /** | |
245 | * @file | |
246 | * | |
cc3e685d | 247 | diff --git a/t_unsafe.c b/t_unsafe.c |
fc557362 | 248 | index 9ba0aaa..3cb55e9 100644 |
cc3e685d WD |
249 | --- a/t_unsafe.c |
250 | +++ b/t_unsafe.c | |
fc557362 WD |
251 | @@ -27,6 +27,7 @@ int dry_run = 0; |
252 | int am_root = 0; | |
c966e8e8 WD |
253 | int read_only = 0; |
254 | int list_only = 0; | |
c966e8e8 | 255 | +int drop_cache = 0; |
fc557362 | 256 | int human_readable = 0; |
c966e8e8 | 257 | int preserve_perms = 0; |
a5e6228a | 258 | int preserve_executability = 0; |
cc3e685d | 259 | diff --git a/util.c b/util.c |
fc557362 | 260 | index 0cafed6..06d8770 100644 |
cc3e685d WD |
261 | --- a/util.c |
262 | +++ b/util.c | |
fc557362 WD |
263 | @@ -27,6 +27,7 @@ |
264 | ||
c966e8e8 | 265 | extern int dry_run; |
c966e8e8 | 266 | extern int module_id; |
14317044 | 267 | +extern int drop_cache; |
c966e8e8 WD |
268 | extern int modify_window; |
269 | extern int relative_paths; | |
fc557362 | 270 | extern int preserve_xattrs; |
91270139 | 271 | @@ -42,6 +43,131 @@ char curr_dir[MAXPATHLEN]; |
c966e8e8 WD |
272 | unsigned int curr_dir_len; |
273 | int curr_dir_depth; /* This is only set for a sanitizing daemon. */ | |
274 | ||
14317044 WD |
275 | +#ifdef HAVE_POSIX_FADVISE64 |
276 | +#define FADV_BUFFER_SIZE 1024*1024*16 | |
c966e8e8 | 277 | + |
14317044 WD |
278 | +static struct stat fadv_fd_stat[1024]; |
279 | +static off_t fadv_fd_pos[1024]; | |
280 | +static int fadv_fd_init = 0; | |
281 | +static int fadv_max_fd = 0; | |
282 | +static int fadv_close_ring_tail = 0; | |
283 | +static int fadv_close_ring_head = 0; | |
284 | +static int fadv_close_ring_size = 0; | |
285 | +static int fadv_close_ring[1024]; | |
286 | +static int fadv_close_buffer_size = 0; | |
c966e8e8 | 287 | + |
14317044 WD |
288 | +static void fadv_fd_init_func(void) |
289 | +{ | |
290 | + if (fadv_fd_init == 0) { | |
291 | + int i; | |
292 | + fadv_fd_init = 1; | |
293 | + if (fadv_max_fd == 0){ | |
294 | + fadv_max_fd = sysconf(_SC_OPEN_MAX) - 20; | |
295 | + if (fadv_max_fd < 0) | |
296 | + fadv_max_fd = 1; | |
297 | + if (fadv_max_fd > 1000) | |
298 | + fadv_max_fd = 1000; | |
299 | + } | |
300 | + for (i = 0; i < fadv_max_fd; i++) { | |
301 | + fadv_fd_pos[i] = 0; | |
302 | + fadv_fd_stat[i].st_dev = 0; | |
303 | + fadv_fd_stat[i].st_ino = 0; | |
304 | + } | |
305 | + } | |
c966e8e8 | 306 | +} |
14317044 WD |
307 | + |
308 | +static void fadv_drop(int fd, int sync) | |
309 | +{ | |
310 | + struct stat sb; | |
311 | + int pos; | |
312 | + | |
313 | + /* Trail 1 MB behind in dropping. We do this to make | |
314 | + * sure that the same block or stripe does not have | |
315 | + * to be written twice. */ | |
316 | + if (fd > fadv_max_fd) | |
317 | + return; | |
318 | + pos = lseek(fd, 0, SEEK_CUR) - 1024*1024; | |
c966e8e8 | 319 | + fadv_fd_init_func(); |
14317044 WD |
320 | + fstat(fd, &sb); |
321 | + if (fadv_fd_stat[fd].st_dev == sb.st_dev | |
322 | + && fadv_fd_stat[fd].st_ino == sb.st_ino) { | |
323 | + if (fadv_fd_pos[fd] < pos - FADV_BUFFER_SIZE) { | |
324 | + if (sync) { | |
325 | + /* If the file is not flushed to disk before calling fadvise, | |
326 | + * then the cache will not be freed and the advice gets ignored. | |
327 | + * Syncing does give a severe hit on performance. If only there | |
328 | + * was a way to mark cache so that it gets released once the data | |
329 | + * is written to disk. */ | |
330 | + fdatasync(fd); | |
331 | + } | |
332 | + posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED); | |
333 | + fadv_fd_pos[fd] = pos; | |
334 | + } | |
335 | + } else { | |
336 | + fadv_fd_stat[fd].st_dev = sb.st_dev; | |
337 | + fadv_fd_stat[fd].st_ino = sb.st_ino; | |
338 | + fadv_fd_pos[fd] = 0; | |
339 | + } | |
c966e8e8 | 340 | +} |
14317044 | 341 | + |
c966e8e8 WD |
342 | +ssize_t fadv_write(int fd, const void *buf, size_t count) |
343 | +{ | |
14317044 WD |
344 | + int ret = write(fd, buf, count); |
345 | + if (drop_cache) | |
346 | + fadv_drop(fd, 1); | |
347 | + return ret; | |
c966e8e8 WD |
348 | +} |
349 | + | |
350 | +ssize_t fadv_read(int fd, void *buf, size_t count) | |
351 | +{ | |
14317044 WD |
352 | + int ret = read(fd, buf, count); |
353 | + if (drop_cache) | |
354 | + fadv_drop(fd, 0); | |
355 | + return ret; | |
356 | +} | |
357 | + | |
358 | +void fadv_close_all(void) | |
359 | +{ | |
360 | + while (fadv_close_ring_size > 0){ | |
361 | + fdatasync(fadv_close_ring[fadv_close_ring_tail]); | |
362 | + posix_fadvise64(fadv_close_ring[fadv_close_ring_tail], 0, 0,POSIX_FADV_DONTNEED); | |
363 | + fadv_close_ring_size--; | |
364 | + close(fadv_close_ring[fadv_close_ring_tail]); | |
365 | + fadv_close_ring_tail = (fadv_close_ring_tail + 1) % fadv_max_fd; | |
366 | + fadv_close_buffer_size = 0; | |
367 | + } | |
c966e8e8 WD |
368 | +} |
369 | + | |
14317044 WD |
370 | +int fadv_close(int fd) |
371 | +{ | |
372 | + if (drop_cache) { | |
373 | + /* If the file is not flushed to disk before calling fadvise, | |
374 | + * then the cache will not be freed and the advice gets ignored. | |
375 | + * Syncing does give a severe hit on performance, so instead of doing | |
376 | + * it right away, we keep a copy of the file handle and do it | |
377 | + * some time before we are out of file handles. This speeds | |
378 | + * up operation for small files massively. The speedup is directly | |
379 | + * related to the number of spare file handles you have. */ | |
380 | + int newfd = dup(fd); | |
381 | + int pos = lseek(fd, 0, SEEK_CUR); | |
382 | + fadv_fd_init_func(); | |
383 | + fadv_close_buffer_size += pos - fadv_fd_pos[fd]; | |
384 | + fadv_close_ring[fadv_close_ring_head] = newfd; | |
e2e42a01 | 385 | + fadv_close_ring_head = (fadv_close_ring_head + 1) % fadv_max_fd; |
14317044 WD |
386 | + fadv_close_ring_size ++; |
387 | + if (fadv_close_ring_size == fadv_max_fd || fadv_close_buffer_size > 1024*1024 ){ | |
388 | + /* it seems fastest to drop things 'in groups' */ | |
389 | + fadv_close_all(); | |
e2e42a01 | 390 | + } |
14317044 WD |
391 | + } |
392 | + return close(fd); | |
c966e8e8 | 393 | +} |
14317044 WD |
394 | + |
395 | +#define close(fd) fadv_close(fd) | |
396 | +#define read(fd,buf,len) fadv_read(fd,buf,len) | |
397 | +#define write(fd,buf,len) fadv_write(fd,buf,len) | |
398 | +#endif | |
c966e8e8 WD |
399 | + |
400 | /* Set a fd into nonblocking mode. */ | |
401 | void set_nonblocking(int fd) | |
402 | { |