| 1 | From: Tobi Oetiker tobi{at}oetiker.ch |
| 2 | Date: 2007-04-23 |
| 3 | |
| 4 | I am using rsync for hard-link backup. I found that there is a |
| 5 | major problem with frequent backup filling up the file system cache |
| 6 | with all the data from the files being backed up. The effect is |
| 7 | that all the other 'sensible' data in the cache gets thrown out in |
| 8 | the process. This is rather unfortunate as the performance of the |
| 9 | system becomes very bad after running rsync. |
| 10 | |
| 11 | Some research showed that |
| 12 | |
| 13 | posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED); |
| 14 | |
| 15 | would tell the OS that it should not keep the file in cache. I |
| 16 | have written a patch for rsync that adds the |
| 17 | |
| 18 | --drop-cache |
| 19 | |
| 20 | option which activates posix_fadvise64. |
| 21 | |
| 22 | There are some caveats though: |
| 23 | |
| 24 | * When calling posix_fadvise64 while writing a file, only the |
| 25 | part of the cache that has already been written to disk will |
| 26 | be released. This means we have to call fdatasync before |
| 27 | calling posix_fadvise64 and this will unfortunately slow down |
| 28 | operations considerably. On my test system I get 240 KByte/s. |
| 29 | |
| 30 | The patch has been optimized, so that the impact on large files |
| 31 | will be considerably lowered by calling posix_fadvise64 only |
| 32 | after a few megabytes have been written. |
| 33 | |
| 34 | * When reading a file which had been cached *before* rsync read |
| 35 | it, the content of the file will be released from the cache |
| 36 | nevertheless, which may not be intended. I have unfortunately not |
| 37 | found a method for determining if a file is in cache or not |
| 38 | (ideas?) |
| 39 | |
| 40 | I found that running rsync on an LVM snapshot is a good way |
| 41 | around this problem, since the snapshot data is cached |
| 42 | separately from the original. It has the additional benefit of |
| 43 | making the backups more consistent. |
| 44 | |
| 45 | * I don't really know the rsync code, so it may be that the patch |
| 46 | is calling fadvise for files where this would not be necessary. |
| 47 | |
| 48 | * The patch is tested only on Linux 2.6.18 |
| 49 | |
| 50 | If you have any input on this, please let me know. |
| 51 | |
| 52 | You can get the latest edition of the patch from |
| 53 | |
| 54 | http://tobi.oetiker.ch/patches/ |
| 55 | |
| 56 | cheers |
| 57 | tobi |
| 58 | |
| 59 | Changes: |
| 60 | |
| 61 | 2007-04-23 |
| 62 | |
| 63 | * pass --drop-cache on to the remote server |
| 64 | * make test works now |
| 65 | |
| 66 | To use this patch, run these commands for a successful build: |
| 67 | |
| 68 | patch -p1 <patches/drop-cache.diff |
| 69 | ./configure (optional if already run) |
| 70 | make |
| 71 | |
| 72 | based-on: a01e3b490eb36ccf9e704840e1b6683dab867550 |
| 73 | diff --git a/checksum.c b/checksum.c |
| 74 | --- a/checksum.c |
| 75 | +++ b/checksum.c |
| 76 | @@ -24,6 +24,10 @@ |
| 77 | extern int checksum_seed; |
| 78 | extern int protocol_version; |
| 79 | |
| 80 | +#ifdef HAVE_POSIX_FADVISE64 |
| 81 | +#define close(fd) fadv_close(fd) |
| 82 | +#endif |
| 83 | + |
| 84 | /* |
| 85 | a simple 32 bit checksum that can be upadted from either end |
| 86 | (inspired by Mark Adler's Adler-32 checksum) |
| 87 | diff --git a/cleanup.c b/cleanup.c |
| 88 | --- a/cleanup.c |
| 89 | +++ b/cleanup.c |
| 90 | @@ -51,7 +51,13 @@ void close_all(void) |
| 91 | int fd; |
| 92 | int ret; |
| 93 | STRUCT_STAT st; |
| 94 | +#endif |
| 95 | + |
| 96 | +#ifdef HAVE_POSIX_FADVISE64 |
| 97 | + fadv_close_all(); |
| 98 | +#endif |
| 99 | |
| 100 | +#ifdef SHUTDOWN_ALL_SOCKETS |
| 101 | max_fd = sysconf(_SC_OPEN_MAX) - 1; |
| 102 | for (fd = max_fd; fd >= 0; fd--) { |
| 103 | if ((ret = do_fstat(fd, &st)) == 0) { |
| 104 | diff --git a/configure.in b/configure.in |
| 105 | --- a/configure.in |
| 106 | +++ b/configure.in |
| 107 | @@ -589,7 +589,7 @@ AC_CHECK_FUNCS(waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \ |
| 108 | setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \ |
| 109 | seteuid strerror putenv iconv_open locale_charset nl_langinfo getxattr \ |
| 110 | extattr_get_link sigaction sigprocmask setattrlist getgrouplist \ |
| 111 | - initgroups utimensat) |
| 112 | + initgroups utimensat posix_fadvise64) |
| 113 | |
| 114 | dnl cygwin iconv.h defines iconv_open as libiconv_open |
| 115 | if test x"$ac_cv_func_iconv_open" != x"yes"; then |
| 116 | diff --git a/fileio.c b/fileio.c |
| 117 | --- a/fileio.c |
| 118 | +++ b/fileio.c |
| 119 | @@ -31,6 +31,12 @@ extern int sparse_files; |
| 120 | static char last_byte; |
| 121 | static OFF_T sparse_seek = 0; |
| 122 | |
| 123 | +#ifdef HAVE_POSIX_FADVISE64 |
| 124 | +#define close(fd) fadv_close(fd) |
| 125 | +#define read(fd,buf,len) fadv_read(fd,buf,len) |
| 126 | +#define write(fd,buf,len) fadv_write(fd,buf,len) |
| 127 | +#endif |
| 128 | + |
| 129 | int sparse_end(int f) |
| 130 | { |
| 131 | int ret; |
| 132 | diff --git a/generator.c b/generator.c |
| 133 | --- a/generator.c |
| 134 | +++ b/generator.c |
| 135 | @@ -111,6 +111,10 @@ static int need_retouch_dir_times; |
| 136 | static int need_retouch_dir_perms; |
| 137 | static const char *solo_file = NULL; |
| 138 | |
| 139 | +#ifdef HAVE_POSIX_FADVISE64 |
| 140 | +#define close(fd) fadv_close(fd) |
| 141 | +#endif |
| 142 | + |
| 143 | enum nonregtype { |
| 144 | TYPE_DIR, TYPE_SPECIAL, TYPE_DEVICE, TYPE_SYMLINK |
| 145 | }; |
| 146 | diff --git a/options.c b/options.c |
| 147 | --- a/options.c |
| 148 | +++ b/options.c |
| 149 | @@ -60,6 +60,7 @@ int preserve_uid = 0; |
| 150 | int preserve_gid = 0; |
| 151 | int preserve_times = 0; |
| 152 | int update_only = 0; |
| 153 | +int drop_cache = 0; |
| 154 | int cvs_exclude = 0; |
| 155 | int dry_run = 0; |
| 156 | int do_xfers = 1; |
| 157 | @@ -671,6 +672,9 @@ void usage(enum logcode F) |
| 158 | rprintf(F," --backup-dir=DIR make backups into hierarchy based in DIR\n"); |
| 159 | rprintf(F," --suffix=SUFFIX set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX); |
| 160 | rprintf(F," -u, --update skip files that are newer on the receiver\n"); |
| 161 | +#ifdef HAVE_POSIX_FADVISE64 |
| 162 | + rprintf(F," --drop-cache tell OS to drop caching of file data\n"); |
| 163 | +#endif |
| 164 | rprintf(F," --inplace update destination files in-place (SEE MAN PAGE)\n"); |
| 165 | rprintf(F," --append append data onto shorter files\n"); |
| 166 | rprintf(F," --append-verify like --append, but with old data in file checksum\n"); |
| 167 | @@ -892,6 +896,9 @@ static struct poptOption long_options[] = { |
| 168 | {"no-one-file-system",'x',POPT_ARG_VAL, &one_file_system, 0, 0, 0 }, |
| 169 | {"no-x", 'x', POPT_ARG_VAL, &one_file_system, 0, 0, 0 }, |
| 170 | {"update", 'u', POPT_ARG_NONE, &update_only, 0, 0, 0 }, |
| 171 | +#ifdef HAVE_POSIX_FADVISE64 |
| 172 | + {"drop-cache", 0, POPT_ARG_NONE, &drop_cache, 0, 0, 0 }, |
| 173 | +#endif |
| 174 | {"existing", 0, POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, |
| 175 | {"ignore-non-existing",0,POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 }, |
| 176 | {"ignore-existing", 0, POPT_ARG_NONE, &ignore_existing, 0, 0, 0 }, |
| 177 | @@ -2287,6 +2294,11 @@ void server_options(char **args, int *argc_p) |
| 178 | if (!am_sender) |
| 179 | args[ac++] = "--sender"; |
| 180 | |
| 181 | +#ifdef HAVE_POSIX_FADVISE64 |
| 182 | + if (drop_cache) |
| 183 | + args[ac++] = "--drop-cache"; |
| 184 | +#endif |
| 185 | + |
| 186 | x = 1; |
| 187 | argstr[0] = '-'; |
| 188 | |
| 189 | diff --git a/receiver.c b/receiver.c |
| 190 | --- a/receiver.c |
| 191 | +++ b/receiver.c |
| 192 | @@ -64,6 +64,10 @@ static flist_ndx_list batch_redo_list; |
| 193 | /* We're either updating the basis file or an identical copy: */ |
| 194 | static int updating_basis_or_equiv; |
| 195 | |
| 196 | +#ifdef HAVE_POSIX_FADVISE64 |
| 197 | +#define close(fd) fadv_close(fd) |
| 198 | +#endif |
| 199 | + |
| 200 | #define TMPNAME_SUFFIX ".XXXXXX" |
| 201 | #define TMPNAME_SUFFIX_LEN ((int)sizeof TMPNAME_SUFFIX - 1) |
| 202 | #define MAX_UNIQUE_NUMBER 999999 |
| 203 | diff --git a/rsync.yo b/rsync.yo |
| 204 | --- a/rsync.yo |
| 205 | +++ b/rsync.yo |
| 206 | @@ -359,6 +359,7 @@ to the detailed description below for a complete description. verb( |
| 207 | --super receiver attempts super-user activities |
| 208 | --fake-super store/recover privileged attrs using xattrs |
| 209 | -S, --sparse handle sparse files efficiently |
| 210 | + --drop-cache tell OS to drop caching of file data |
| 211 | -n, --dry-run perform a trial run with no changes made |
| 212 | -W, --whole-file copy files whole (w/o delta-xfer algorithm) |
| 213 | -x, --one-file-system don't cross filesystem boundaries |
| 214 | @@ -1127,6 +1128,10 @@ NOTE: Don't use this option when the destination is a Solaris "tmpfs" |
| 215 | filesystem. It seems to have problems seeking over null regions, |
| 216 | and ends up corrupting the files. |
| 217 | |
| 218 | +dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data. This |
| 219 | +prevents rsync from filling up the filesystem cache. This can sometimes help |
| 220 | +to make a system perform better by keeping non-rsync files in the disk cache. |
| 221 | + |
| 222 | dit(bf(-n, --dry-run)) This makes rsync perform a trial run that doesn't |
| 223 | make any changes (and produces mostly the same output as a real run). It |
| 224 | is most commonly used in combination with the bf(-v, --verbose) and/or |
| 225 | diff --git a/sender.c b/sender.c |
| 226 | --- a/sender.c |
| 227 | +++ b/sender.c |
| 228 | @@ -49,6 +49,10 @@ extern struct file_list *cur_flist, *first_flist, *dir_flist; |
| 229 | |
| 230 | BOOL extra_flist_sending_enabled; |
| 231 | |
| 232 | +#ifdef HAVE_POSIX_FADVISE64 |
| 233 | +#define close(fd) fadv_close(fd) |
| 234 | +#endif |
| 235 | + |
| 236 | /** |
| 237 | * @file |
| 238 | * |
| 239 | diff --git a/t_unsafe.c b/t_unsafe.c |
| 240 | --- a/t_unsafe.c |
| 241 | +++ b/t_unsafe.c |
| 242 | @@ -28,6 +28,7 @@ int am_root = 0; |
| 243 | int am_sender = 1; |
| 244 | int read_only = 0; |
| 245 | int list_only = 0; |
| 246 | +int drop_cache = 0; |
| 247 | int human_readable = 0; |
| 248 | int preserve_perms = 0; |
| 249 | int preserve_executability = 0; |
| 250 | diff --git a/util.c b/util.c |
| 251 | --- a/util.c |
| 252 | +++ b/util.c |
| 253 | @@ -27,6 +27,7 @@ |
| 254 | |
| 255 | extern int dry_run; |
| 256 | extern int module_id; |
| 257 | +extern int drop_cache; |
| 258 | extern int modify_window; |
| 259 | extern int relative_paths; |
| 260 | extern int preserve_xattrs; |
| 261 | @@ -42,6 +43,131 @@ char curr_dir[MAXPATHLEN]; |
| 262 | unsigned int curr_dir_len; |
| 263 | int curr_dir_depth; /* This is only set for a sanitizing daemon. */ |
| 264 | |
| 265 | +#ifdef HAVE_POSIX_FADVISE64 |
| 266 | +#define FADV_BUFFER_SIZE (1024*1024*16) /* bytes of progress between two drops; parenthesized so it is safe in any expression */ |
| 267 | + |
| 268 | +static struct stat fadv_fd_stat[1024]; /* device/inode last seen on each fd number */ |
| 269 | +static off_t fadv_fd_pos[1024]; /* offset up to which each fd was last dropped */ |
| 270 | +static int fadv_fd_init = 0; /* set once the tables are initialized */ |
| 271 | +static int fadv_max_fd = 0; /* number of usable table/ring slots (at most 1000) */ |
| 272 | +static int fadv_close_ring_tail = 0; /* index of the oldest parked descriptor */ |
| 273 | +static int fadv_close_ring_head = 0; /* index where the next descriptor is parked */ |
| 274 | +static int fadv_close_ring_size = 0; /* number of descriptors currently parked */ |
| 275 | +static int fadv_close_ring[1024]; /* dup()ed descriptors awaiting sync + drop */ |
| 276 | +static off_t fadv_close_buffer_size = 0; /* bytes accumulated since the last group drop; off_t because it sums file offsets */ |
| 277 | + |
| 278 | +/* One-time setup: derive fadv_max_fd from the open-file limit and clear the per-fd tables. */ |
| 279 | +static void fadv_fd_init_func(void) |
| 280 | +{ |
| 281 | + int i; |
| 282 | + if (fadv_fd_init) |
| 283 | + return; |
| 284 | + fadv_fd_init = 1; |
| 285 | + /* Subtract 20 from the limit (presumably to keep some descriptors spare). */ |
| 286 | + fadv_max_fd = sysconf(_SC_OPEN_MAX) - 20; |
| 287 | + if (fadv_max_fd < 1) /* never 0: the value is used as a modulo divisor for the ring */ |
| 288 | + fadv_max_fd = 1; |
| 289 | + if (fadv_max_fd > 1000) /* stay inside the 1024-entry tables */ |
| 290 | + fadv_max_fd = 1000; |
| 291 | + for (i = 0; i < fadv_max_fd; i++) { |
| 292 | + fadv_fd_pos[i] = 0; |
| 293 | + fadv_fd_stat[i].st_dev = 0; |
| 294 | + fadv_fd_stat[i].st_ino = 0; |
| 295 | + } |
| 296 | +} |
| 297 | + |
| 298 | +/* Advise the OS to drop fd's cached pages up to 1 MB behind the current offset; fdatasync first if sync is set. */ |
| 299 | +static void fadv_drop(int fd, int sync) |
| 300 | +{ |
| 301 | + struct stat sb; |
| 302 | + off_t pos; /* off_t, not int: offsets past 2 GB must not be truncated */ |
| 303 | + |
| 304 | + /* Initialize first: the range check below depends on fadv_max_fd. */ |
| 305 | + fadv_fd_init_func(); |
| 306 | + if (fd < 0 || fd >= fadv_max_fd || fstat(fd, &sb) != 0) |
| 307 | + return; |
| 308 | + /* Trail 1 MB behind in dropping. We do this to make |
| 309 | + * sure that the same block or stripe does not have |
| 310 | + * to be written twice. */ |
| 311 | + pos = lseek(fd, 0, SEEK_CUR) - 1024*1024; |
| 312 | + if (fadv_fd_stat[fd].st_dev == sb.st_dev |
| 313 | + && fadv_fd_stat[fd].st_ino == sb.st_ino) { |
| 314 | + if (fadv_fd_pos[fd] < pos - FADV_BUFFER_SIZE) { |
| 315 | + if (sync) { |
| 316 | + /* Data that is not yet flushed to disk is not freed by |
| 317 | + * the advice, so flush first. This is a severe hit on |
| 318 | + * performance, hence the FADV_BUFFER_SIZE threshold |
| 319 | + * on how often it happens. */ |
| 320 | + fdatasync(fd); |
| 321 | + } |
| 322 | + posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED); |
| 323 | + fadv_fd_pos[fd] = pos; |
| 324 | + } |
| 325 | + } else { |
| 326 | + fadv_fd_stat[fd].st_dev = sb.st_dev; |
| 327 | + fadv_fd_stat[fd].st_ino = sb.st_ino; |
| 328 | + fadv_fd_pos[fd] = 0; |
| 329 | + } |
| 330 | +} |
| 331 | + |
| 332 | +ssize_t fadv_write(int fd, const void *buf, size_t count) |
| 333 | +{ /* write(), then trim the page cache behind the offset when --drop-cache is active */ |
| 334 | + ssize_t ret = write(fd, buf, count); /* ssize_t: keep write()'s full result */ |
| 335 | + if (drop_cache && ret >= 0) /* on failure, leave errno untouched for the caller */ |
| 336 | + fadv_drop(fd, 1); |
| 337 | + return ret; |
| 338 | +} |
| 339 | + |
| 340 | +ssize_t fadv_read(int fd, void *buf, size_t count) |
| 341 | +{ /* read(), then trim the page cache behind the offset when --drop-cache is active */ |
| 342 | + ssize_t ret = read(fd, buf, count); /* ssize_t: keep read()'s full result */ |
| 343 | + if (drop_cache && ret >= 0) /* on failure, leave errno untouched for the caller */ |
| 344 | + fadv_drop(fd, 0); |
| 345 | + return ret; |
| 346 | +} |
| 347 | + |
| 348 | +void fadv_close_all(void) |
| 349 | +{ /* Sync, un-cache and close every descriptor parked in the ring by fadv_close(). */ |
| 350 | + while (fadv_close_ring_size > 0){ |
| 351 | + fdatasync(fadv_close_ring[fadv_close_ring_tail]); /* flush first so the advice can take effect */ |
| 352 | + posix_fadvise64(fadv_close_ring[fadv_close_ring_tail], 0, 0,POSIX_FADV_DONTNEED); /* offset 0, len 0 */ |
| 353 | + fadv_close_ring_size--; |
| 354 | + close(fadv_close_ring[fadv_close_ring_tail]); /* plain close(): the fadv_close macro is only defined further down */ |
| 355 | + fadv_close_ring_tail = (fadv_close_ring_tail + 1) % fadv_max_fd; /* advance to the next-oldest entry */ |
| 356 | + fadv_close_buffer_size = 0; |
| 357 | + } |
| 358 | +} |
| 359 | + |
| 360 | +int fadv_close(int fd) |
| 361 | +{ |
| 362 | + if (drop_cache) { |
| 363 | + /* Data not yet flushed to disk is not freed by fadvise, and |
| 364 | + * flushing every file right away is very slow. So we keep a |
| 365 | + * duplicate of the descriptor and sync/drop in groups, some |
| 366 | + * time before we run out of descriptors. This speeds up small |
| 367 | + * files massively, in proportion to the spare descriptors. */ |
| 368 | + int newfd = dup(fd); |
| 369 | + off_t pos = lseek(fd, 0, SEEK_CUR); /* off_t: no truncation past 2 GB */ |
| 370 | + fadv_fd_init_func(); |
| 371 | + if (pos >= 0 && fd >= 0 && fd < fadv_max_fd) /* lseek worked and fd is inside the tables */ |
| 372 | + fadv_close_buffer_size += pos - fadv_fd_pos[fd]; |
| 373 | + if (newfd >= 0) { /* a failed dup() leaves nothing to park */ |
| 374 | + fadv_close_ring[fadv_close_ring_head] = newfd; |
| 375 | + fadv_close_ring_head = (fadv_close_ring_head + 1) % fadv_max_fd; |
| 376 | + fadv_close_ring_size++; |
| 377 | + } |
| 378 | + /* It seems fastest to drop things 'in groups'. */ |
| 379 | + if (fadv_close_ring_size == fadv_max_fd || fadv_close_buffer_size > 1024*1024) |
| 380 | + fadv_close_all(); |
| 381 | + } |
| 382 | + return close(fd); |
| 383 | +} |
| 384 | + |
| 385 | +#define close(fd) fadv_close(fd) /* from here on, close/read/write calls in this file go through the wrappers */ |
| 386 | +#define read(fd,buf,len) fadv_read(fd,buf,len) |
| 387 | +#define write(fd,buf,len) fadv_write(fd,buf,len) |
| 388 | +#endif /* HAVE_POSIX_FADVISE64 */ |
| 389 | + |
| 390 | /* Set a fd into nonblocking mode. */ |
| 391 | void set_nonblocking(int fd) |
| 392 | { |