Updated patches to work with the current trunk.
[rsync/rsync-patches.git] / drop-cache.diff
CommitLineData
c966e8e8
WD
1From: Tobi Oetiker tobi{at}oetiker.ch
2Date: 2007-04-23
3
4I am using rsync for hard-link backups. I found that there is a
5major problem with frequent backups filling up the file system cache
6with all the data from the files being backed up. The effect is
7that all the other 'sensible' data in the cache gets thrown out in
8the process. This is rather unfortunate, as the performance of the
9system becomes very bad after running rsync.
10
11Some research showed that
12
13 posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED);
14
15would tell the OS that it should not keep the file in cache. I
16have written a patch for rsync that adds the
17
18 --drop-cache
19
20option which activates posix_fadvise64.
21
22There are some caveats though:
23
24 * When calling posix_fadvise64 while writing a file, only the
25 part of the cache that has already been written to disk will
26 be released. This means we have to call fdatasync before
27 calling posix_fadvise64, and this will unfortunately slow down
28 operations considerably. On my test system I get 240 KByte/s.
29
30 The patch has been optimized, so that the impact on large files
31 will be considerably lowered by calling posix_fadvise64 only
32 after a few megabytes have been written.
33
34 * When reading a file which was already cached *before* rsync
35 read it, the content of the file will be released from the cache
36 nevertheless, which may not be intended. I have unfortunately not
37 found a method for determining whether a file is in the cache or
38 not (ideas?)
39
40 I found that running rsync on an LVM snapshot is a good way
41 around this problem, since the snapshot data is cached
42 separately from the original. It has the additional benefit of
43 making the backups more consistent.
44
45 * I don't really know the rsync code, so it may be that the patch
46 is calling fadvise for files where this would not be necessary.
47
48 * The patch is tested only on Linux 2.6.18
49
50If you have any input on this, please let me know.
51
52You can get the latest edition of the patch from
53
54 http://tobi.oetiker.ch/patches/
55
56cheers
57tobi
58
e2e42a01 59Changes:
c966e8e8
WD
60
61 2007-04-23
62
63* pass --drop-cache on to the remote server
64* "make test" works now
65
fc557362
WD
66To use this patch, run these commands for a successful build:
67
68 patch -p1 <patches/drop-cache.diff
69 ./configure (optional if already run)
70 make
71
cc3e685d 72diff --git a/checksum.c b/checksum.c
fc557362 73index 811b5b6..f1f51cb 100644
cc3e685d
WD
74--- a/checksum.c
75+++ b/checksum.c
fc557362
WD
76@@ -24,6 +24,10 @@
77 extern int checksum_seed;
78 extern int protocol_version;
c966e8e8 79
14317044
WD
80+#ifdef HAVE_POSIX_FADVISE64
81+#define close(fd) fadv_close(fd)
82+#endif
83+
84 /*
85 a simple 32 bit checksum that can be upadted from either end
86 (inspired by Mark Adler's Adler-32 checksum)
cc3e685d 87diff --git a/cleanup.c b/cleanup.c
fc557362 88index 19ef072..84a6cf3 100644
cc3e685d
WD
89--- a/cleanup.c
90+++ b/cleanup.c
fc557362 91@@ -47,7 +47,13 @@ void close_all(void)
14317044
WD
92 int fd;
93 int ret;
94 STRUCT_STAT st;
95+#endif
96+
97+#ifdef HAVE_POSIX_FADVISE64
98+ fadv_close_all();
99+#endif
100
101+#ifdef SHUTDOWN_ALL_SOCKETS
102 max_fd = sysconf(_SC_OPEN_MAX) - 1;
103 for (fd = max_fd; fd >= 0; fd--) {
104 if ((ret = do_fstat(fd, &st)) == 0) {
cc3e685d 105diff --git a/configure.in b/configure.in
fc557362 106index bc7d4a7..e9ff45a 100644
cc3e685d
WD
107--- a/configure.in
108+++ b/configure.in
fc557362 109@@ -553,7 +553,7 @@ AC_CHECK_FUNCS(waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \
14317044
WD
110 setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \
111 strerror putenv iconv_open locale_charset nl_langinfo getxattr \
fc557362
WD
112 extattr_get_link sigaction sigprocmask setattrlist getgrouplist \
113- initgroups)
114+ initgroups posix_fadvise64)
14317044 115
4c15e800
WD
116 dnl cygwin iconv.h defines iconv_open as libiconv_open
117 if test x"$ac_cv_func_iconv_open" != x"yes"; then
cc3e685d 118diff --git a/fileio.c b/fileio.c
fc557362 119index 0faa619..0326fe8 100644
cc3e685d
WD
120--- a/fileio.c
121+++ b/fileio.c
fc557362 122@@ -31,6 +31,12 @@ extern int sparse_files;
c966e8e8 123 static char last_byte;
abd3adb8 124 static OFF_T sparse_seek = 0;
c966e8e8 125
14317044
WD
126+#ifdef HAVE_POSIX_FADVISE64
127+#define close(fd) fadv_close(fd)
128+#define read(fd,buf,len) fadv_read(fd,buf,len)
129+#define write(fd,buf,len) fadv_write(fd,buf,len)
130+#endif
c966e8e8
WD
131+
132 int sparse_end(int f)
133 {
c8a8b4a7 134 int ret;
cc3e685d 135diff --git a/generator.c b/generator.c
fc557362 136index 12007a1..f0c7a91 100644
cc3e685d
WD
137--- a/generator.c
138+++ b/generator.c
fc557362 139@@ -112,6 +112,10 @@ static int need_retouch_dir_times;
c8a8b4a7 140 static int need_retouch_dir_perms;
14317044 141 static const char *solo_file = NULL;
c966e8e8 142
14317044
WD
143+#ifdef HAVE_POSIX_FADVISE64
144+#define close(fd) fadv_close(fd)
145+#endif
146+
fc557362
WD
147 enum nonregtype {
148 TYPE_DIR, TYPE_SPECIAL, TYPE_DEVICE, TYPE_SYMLINK
149 };
cc3e685d 150diff --git a/options.c b/options.c
fc557362 151index e7c6c61..ce55d8e 100644
cc3e685d
WD
152--- a/options.c
153+++ b/options.c
c0c7984e 154@@ -60,6 +60,7 @@ int preserve_uid = 0;
c8a8b4a7 155 int preserve_gid = 0;
c966e8e8 156 int preserve_times = 0;
c966e8e8
WD
157 int update_only = 0;
158+int drop_cache = 0;
159 int cvs_exclude = 0;
160 int dry_run = 0;
161 int do_xfers = 1;
fc557362 162@@ -670,6 +671,9 @@ void usage(enum logcode F)
c966e8e8
WD
163 rprintf(F," --backup-dir=DIR make backups into hierarchy based in DIR\n");
164 rprintf(F," --suffix=SUFFIX set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX);
165 rprintf(F," -u, --update skip files that are newer on the receiver\n");
14317044 166+#ifdef HAVE_POSIX_FADVISE64
c966e8e8 167+ rprintf(F," --drop-cache tell OS to drop caching of file data\n");
14317044 168+#endif
c966e8e8
WD
169 rprintf(F," --inplace update destination files in-place (SEE MAN PAGE)\n");
170 rprintf(F," --append append data onto shorter files\n");
c8a8b4a7 171 rprintf(F," --append-verify like --append, but with old data in file checksum\n");
fc557362 172@@ -891,6 +895,9 @@ static struct poptOption long_options[] = {
85096e5e
WD
173 {"no-one-file-system",'x',POPT_ARG_VAL, &one_file_system, 0, 0, 0 },
174 {"no-x", 'x', POPT_ARG_VAL, &one_file_system, 0, 0, 0 },
c966e8e8 175 {"update", 'u', POPT_ARG_NONE, &update_only, 0, 0, 0 },
14317044 176+#ifdef HAVE_POSIX_FADVISE64
c966e8e8 177+ {"drop-cache", 0, POPT_ARG_NONE, &drop_cache, 0, 0, 0 },
14317044 178+#endif
c966e8e8
WD
179 {"existing", 0, POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 },
180 {"ignore-non-existing",0,POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 },
181 {"ignore-existing", 0, POPT_ARG_NONE, &ignore_existing, 0, 0, 0 },
fc557362 182@@ -2235,6 +2242,11 @@ void server_options(char **args, int *argc_p)
c966e8e8
WD
183 if (!am_sender)
184 args[ac++] = "--sender";
185
14317044 186+#ifdef HAVE_POSIX_FADVISE64
c966e8e8
WD
187+ if (drop_cache)
188+ args[ac++] = "--drop-cache";
14317044 189+#endif
c966e8e8
WD
190+
191 x = 1;
192 argstr[0] = '-';
c8a8b4a7 193
cc3e685d 194diff --git a/receiver.c b/receiver.c
fc557362 195index 4325e30..a3da64e 100644
cc3e685d
WD
196--- a/receiver.c
197+++ b/receiver.c
fc557362 198@@ -64,6 +64,10 @@ static flist_ndx_list batch_redo_list;
14317044 199 /* We're either updating the basis file or an identical copy: */
e2b0842a 200 static int updating_basis_or_equiv;
c966e8e8 201
14317044
WD
202+#ifdef HAVE_POSIX_FADVISE64
203+#define close(fd) fadv_close(fd)
204+#endif
205+
206 /*
207 * get_tmpname() - create a tmp filename for a given filename
208 *
cc3e685d 209diff --git a/rsync.yo b/rsync.yo
fc557362 210index 941f7a5..512aa6b 100644
cc3e685d
WD
211--- a/rsync.yo
212+++ b/rsync.yo
fc557362 213@@ -359,6 +359,7 @@ to the detailed description below for a complete description. verb(
c966e8e8
WD
214 --super receiver attempts super-user activities
215 --fake-super store/recover privileged attrs using xattrs
216 -S, --sparse handle sparse files efficiently
217+ --drop-cache tell OS to drop caching of file data
e2b0842a 218 -n, --dry-run perform a trial run with no changes made
f2863bc0 219 -W, --whole-file copy files whole (w/o delta-xfer algorithm)
c966e8e8 220 -x, --one-file-system don't cross filesystem boundaries
fc557362 221@@ -1120,6 +1121,10 @@ NOTE: Don't use this option when the destination is a Solaris "tmpfs"
c966e8e8
WD
222 filesystem. It doesn't seem to handle seeks over null regions
223 correctly and ends up corrupting the files.
224
225+dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data. This
226+prevents rsync from filling up the filesystem cache. This can sometimes help
227+to make a system perform better by keeping non-rsync files in the disk cache.
228+
e2b0842a
WD
229 dit(bf(-n, --dry-run)) This makes rsync perform a trial run that doesn't
230 make any changes (and produces mostly the same output as a real run). It
231 is most commonly used in combination with the bf(-v, --verbose) and/or
cc3e685d 232diff --git a/sender.c b/sender.c
fc557362 233index bf8221d..9e23dbb 100644
cc3e685d
WD
234--- a/sender.c
235+++ b/sender.c
fc557362 236@@ -45,6 +45,10 @@ extern int write_batch;
14317044 237 extern struct stats stats;
c8a8b4a7 238 extern struct file_list *cur_flist, *first_flist, *dir_flist;
c966e8e8 239
14317044
WD
240+#ifdef HAVE_POSIX_FADVISE64
241+#define close(fd) fadv_close(fd)
242+#endif
243+
244 /**
245 * @file
246 *
cc3e685d 247diff --git a/t_unsafe.c b/t_unsafe.c
fc557362 248index 9ba0aaa..3cb55e9 100644
cc3e685d
WD
249--- a/t_unsafe.c
250+++ b/t_unsafe.c
fc557362
WD
251@@ -27,6 +27,7 @@ int dry_run = 0;
252 int am_root = 0;
c966e8e8
WD
253 int read_only = 0;
254 int list_only = 0;
c966e8e8 255+int drop_cache = 0;
fc557362 256 int human_readable = 0;
c966e8e8 257 int preserve_perms = 0;
a5e6228a 258 int preserve_executability = 0;
cc3e685d 259diff --git a/util.c b/util.c
fc557362 260index 0cafed6..06d8770 100644
cc3e685d
WD
261--- a/util.c
262+++ b/util.c
fc557362
WD
263@@ -27,6 +27,7 @@
264
c966e8e8 265 extern int dry_run;
c966e8e8 266 extern int module_id;
14317044 267+extern int drop_cache;
c966e8e8
WD
268 extern int modify_window;
269 extern int relative_paths;
fc557362 270 extern int preserve_xattrs;
91270139 271@@ -42,6 +43,131 @@ char curr_dir[MAXPATHLEN];
c966e8e8
WD
272 unsigned int curr_dir_len;
273 int curr_dir_depth; /* This is only set for a sanitizing daemon. */
274
14317044
WD
275+#ifdef HAVE_POSIX_FADVISE64
276+#define FADV_BUFFER_SIZE 1024*1024*16
c966e8e8 277+
14317044
WD
278+static struct stat fadv_fd_stat[1024];
279+static off_t fadv_fd_pos[1024];
280+static int fadv_fd_init = 0;
281+static int fadv_max_fd = 0;
282+static int fadv_close_ring_tail = 0;
283+static int fadv_close_ring_head = 0;
284+static int fadv_close_ring_size = 0;
285+static int fadv_close_ring[1024];
286+static int fadv_close_buffer_size = 0;
c966e8e8 287+
14317044
WD
288+static void fadv_fd_init_func(void)
289+{
290+ if (fadv_fd_init == 0) {
291+ int i;
292+ fadv_fd_init = 1;
293+ if (fadv_max_fd == 0){
294+ fadv_max_fd = sysconf(_SC_OPEN_MAX) - 20;
295+ if (fadv_max_fd < 0)
296+ fadv_max_fd = 1;
297+ if (fadv_max_fd > 1000)
298+ fadv_max_fd = 1000;
299+ }
300+ for (i = 0; i < fadv_max_fd; i++) {
301+ fadv_fd_pos[i] = 0;
302+ fadv_fd_stat[i].st_dev = 0;
303+ fadv_fd_stat[i].st_ino = 0;
304+ }
305+ }
c966e8e8 306+}
14317044
WD
307+
308+static void fadv_drop(int fd, int sync)
309+{
310+ struct stat sb;
311+ int pos;
312+
313+ /* Trail 1 MB behind in dropping. we do this to make
314+ * sure that the same block or stripe does not have
315+ * to be written twice. */
316+ if (fd > fadv_max_fd)
317+ return;
318+ pos = lseek(fd, 0, SEEK_CUR) - 1024*1024;
c966e8e8 319+ fadv_fd_init_func();
14317044
WD
320+ fstat(fd, &sb);
321+ if (fadv_fd_stat[fd].st_dev == sb.st_dev
322+ && fadv_fd_stat[fd].st_ino == sb.st_ino) {
323+ if (fadv_fd_pos[fd] < pos - FADV_BUFFER_SIZE) {
324+ if (sync) {
325+ /* If the file is not flushed to disk before calling fadvise,
326+ * then the Cache will not be freed and the advise gets ignored
327+ * this does give a severe hit on performance. If only there
328+ * was a way to mark cache so that it gets release once the data
329+ * is written to disk. */
330+ fdatasync(fd);
331+ }
332+ posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED);
333+ fadv_fd_pos[fd] = pos;
334+ }
335+ } else {
336+ fadv_fd_stat[fd].st_dev = sb.st_dev;
337+ fadv_fd_stat[fd].st_ino = sb.st_ino;
338+ fadv_fd_pos[fd] = 0;
339+ }
c966e8e8 340+}
14317044 341+
c966e8e8
WD
342+ssize_t fadv_write(int fd, const void *buf, size_t count)
343+{
14317044
WD
344+ int ret = write(fd, buf, count);
345+ if (drop_cache)
346+ fadv_drop(fd, 1);
347+ return ret;
c966e8e8
WD
348+}
349+
350+ssize_t fadv_read(int fd, void *buf, size_t count)
351+{
14317044
WD
352+ int ret = read(fd, buf, count);
353+ if (drop_cache)
354+ fadv_drop(fd, 0);
355+ return ret;
356+}
357+
358+void fadv_close_all(void)
359+{
360+ while (fadv_close_ring_size > 0){
361+ fdatasync(fadv_close_ring[fadv_close_ring_tail]);
362+ posix_fadvise64(fadv_close_ring[fadv_close_ring_tail], 0, 0,POSIX_FADV_DONTNEED);
363+ fadv_close_ring_size--;
364+ close(fadv_close_ring[fadv_close_ring_tail]);
365+ fadv_close_ring_tail = (fadv_close_ring_tail + 1) % fadv_max_fd;
366+ fadv_close_buffer_size = 0;
367+ }
c966e8e8
WD
368+}
369+
14317044
WD
370+int fadv_close(int fd)
371+{
372+ if (drop_cache) {
373+ /* If the file is not flushed to disk before calling fadvise,
374+ * then the Cache will not be freed and the advise gets ignored
375+ * this does give a severe hit on performance. So instead of doing
376+ * it right away, we save us a copy of the filehandle and do it
377+ * some time before we are out of filehandles. This speeds
378+ * up operation for small files massively. It is directly
379+ * related to the number of spare file handles you have. */
380+ int newfd = dup(fd);
381+ int pos = lseek(fd, 0, SEEK_CUR);
382+ fadv_fd_init_func();
383+ fadv_close_buffer_size += pos - fadv_fd_pos[fd];
384+ fadv_close_ring[fadv_close_ring_head] = newfd;
e2e42a01 385+ fadv_close_ring_head = (fadv_close_ring_head + 1) % fadv_max_fd;
14317044
WD
386+ fadv_close_ring_size ++;
387+ if (fadv_close_ring_size == fadv_max_fd || fadv_close_buffer_size > 1024*1024 ){
388+ /* it seems fastest to drop things 'in groups' */
389+ fadv_close_all();
e2e42a01 390+ }
14317044
WD
391+ }
392+ return close(fd);
c966e8e8 393+}
14317044
WD
394+
395+#define close(fd) fadv_close(fd)
396+#define read(fd,buf,len) fadv_read(fd,buf,len)
397+#define write(fd,buf,len) fadv_write(fd,buf,len)
398+#endif
c966e8e8
WD
399+
400 /* Set a fd into nonblocking mode. */
401 void set_nonblocking(int fd)
402 {