Fix alignment issue on 64-bit. Solution from Steve Ortiz.
[rsync/rsync-patches.git] / drop-cache.diff
CommitLineData
c966e8e8
WD
1From: Tobi Oetiker tobi{at}oetiker.ch
2Date: 2007-04-23
3
4I am using rsync for hard-link backup. I found that there is a
5major problem with frequent backups filling up the file system cache
6with all the data from the files being backed up. The effect is
7that all the other 'sensible' data in the cache gets thrown out in
8the process. This is rather unfortunate as the performance of the
9system becomes very bad after running rsync.
10
11Some research showed that
12
13 posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED);
14
15would tell the OS that it should not keep the file in cache. I
16have written a patch for rsync that adds the
17
18 --drop-cache
19
20option which activates posix_fadvise64.
21
22There are some caveats though:
23
24 * When calling posix_fadvise64 while writing a file, only the
25 part of the cache will be released which has already been
26 written to disk. This means we have to call fdatasync before
27 calling posix_fadvise64 and this will unfortunately slow down
28 operations considerably. On my test system I get 240 KByte/s.
29
30 The patch has been optimized, so that the impact on large files
31 will be considerably lowered by calling posix_fadvise64 only
32 after a few megabytes have been written.
33
34 * When reading a file which has been cached *before* rsync read
35 it, the content of the file will be released from cache
36 nevertheless, which may not be intended. I have unfortunately not
37 found a method for determining if a file is in cache or not
38 (ideas?)
39
40 I found that running rsync on an LVM snapshot is a good way
41 around this problem, since the snapshot data is cached
42 separately from the original. It has the additional benefit of
43 making the backups more consistent.
44
45 * I don't really know the rsync code, so it may be that the patch
46 is calling fadvise for files where this would not be necessary.
47
48 * The patch is tested only on Linux 2.6.18
49
50If you have any input on this, please let me know.
51
52You can get the latest edition of the patch from
53
54 http://tobi.oetiker.ch/patches/
55
56cheers
57tobi
58
e2e42a01 59Changes:
c966e8e8
WD
60
61 2007-04-23
62
63* pass --drop-cache on to the remote server
64* make test works now
65
fc557362
WD
66To use this patch, run these commands for a successful build:
67
68 patch -p1 <patches/drop-cache.diff
69 ./configure (optional if already run)
70 make
71
c1ff70aa 72based-on: a01e3b490eb36ccf9e704840e1b6683dab867550
cc3e685d
WD
73diff --git a/checksum.c b/checksum.c
74--- a/checksum.c
75+++ b/checksum.c
fc557362
WD
76@@ -24,6 +24,10 @@
77 extern int checksum_seed;
78 extern int protocol_version;
c966e8e8 79
14317044
WD
80+#ifdef HAVE_POSIX_FADVISE64
81+#define close(fd) fadv_close(fd)
82+#endif
83+
84 /*
85 a simple 32 bit checksum that can be upadted from either end
86 (inspired by Mark Adler's Adler-32 checksum)
cc3e685d
WD
87diff --git a/cleanup.c b/cleanup.c
88--- a/cleanup.c
89+++ b/cleanup.c
c1ff70aa 90@@ -51,7 +51,13 @@ void close_all(void)
14317044
WD
91 int fd;
92 int ret;
93 STRUCT_STAT st;
94+#endif
95+
96+#ifdef HAVE_POSIX_FADVISE64
97+ fadv_close_all();
98+#endif
99
100+#ifdef SHUTDOWN_ALL_SOCKETS
101 max_fd = sysconf(_SC_OPEN_MAX) - 1;
102 for (fd = max_fd; fd >= 0; fd--) {
103 if ((ret = do_fstat(fd, &st)) == 0) {
cc3e685d
WD
104diff --git a/configure.in b/configure.in
105--- a/configure.in
106+++ b/configure.in
c1ff70aa 107@@ -589,7 +589,7 @@ AC_CHECK_FUNCS(waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \
14317044 108 setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \
5214a41b 109 seteuid strerror putenv iconv_open locale_charset nl_langinfo getxattr \
fc557362 110 extattr_get_link sigaction sigprocmask setattrlist getgrouplist \
72e5645e
WD
111- initgroups utimensat)
112+ initgroups utimensat posix_fadvise64)
14317044 113
4c15e800
WD
114 dnl cygwin iconv.h defines iconv_open as libiconv_open
115 if test x"$ac_cv_func_iconv_open" != x"yes"; then
cc3e685d
WD
116diff --git a/fileio.c b/fileio.c
117--- a/fileio.c
118+++ b/fileio.c
fc557362 119@@ -31,6 +31,12 @@ extern int sparse_files;
c966e8e8 120 static char last_byte;
abd3adb8 121 static OFF_T sparse_seek = 0;
c966e8e8 122
14317044
WD
123+#ifdef HAVE_POSIX_FADVISE64
124+#define close(fd) fadv_close(fd)
125+#define read(fd,buf,len) fadv_read(fd,buf,len)
126+#define write(fd,buf,len) fadv_write(fd,buf,len)
127+#endif
c966e8e8
WD
128+
129 int sparse_end(int f)
130 {
c8a8b4a7 131 int ret;
cc3e685d
WD
132diff --git a/generator.c b/generator.c
133--- a/generator.c
134+++ b/generator.c
c1ff70aa 135@@ -111,6 +111,10 @@ static int need_retouch_dir_times;
c8a8b4a7 136 static int need_retouch_dir_perms;
14317044 137 static const char *solo_file = NULL;
c966e8e8 138
14317044
WD
139+#ifdef HAVE_POSIX_FADVISE64
140+#define close(fd) fadv_close(fd)
141+#endif
142+
fc557362
WD
143 enum nonregtype {
144 TYPE_DIR, TYPE_SPECIAL, TYPE_DEVICE, TYPE_SYMLINK
145 };
cc3e685d
WD
146diff --git a/options.c b/options.c
147--- a/options.c
148+++ b/options.c
c0c7984e 149@@ -60,6 +60,7 @@ int preserve_uid = 0;
c8a8b4a7 150 int preserve_gid = 0;
c966e8e8 151 int preserve_times = 0;
c966e8e8
WD
152 int update_only = 0;
153+int drop_cache = 0;
154 int cvs_exclude = 0;
155 int dry_run = 0;
156 int do_xfers = 1;
72e5645e 157@@ -671,6 +672,9 @@ void usage(enum logcode F)
c966e8e8
WD
158 rprintf(F," --backup-dir=DIR make backups into hierarchy based in DIR\n");
159 rprintf(F," --suffix=SUFFIX set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX);
160 rprintf(F," -u, --update skip files that are newer on the receiver\n");
14317044 161+#ifdef HAVE_POSIX_FADVISE64
c966e8e8 162+ rprintf(F," --drop-cache tell OS to drop caching of file data\n");
14317044 163+#endif
c966e8e8
WD
164 rprintf(F," --inplace update destination files in-place (SEE MAN PAGE)\n");
165 rprintf(F," --append append data onto shorter files\n");
c8a8b4a7 166 rprintf(F," --append-verify like --append, but with old data in file checksum\n");
72e5645e 167@@ -892,6 +896,9 @@ static struct poptOption long_options[] = {
85096e5e
WD
168 {"no-one-file-system",'x',POPT_ARG_VAL, &one_file_system, 0, 0, 0 },
169 {"no-x", 'x', POPT_ARG_VAL, &one_file_system, 0, 0, 0 },
c966e8e8 170 {"update", 'u', POPT_ARG_NONE, &update_only, 0, 0, 0 },
14317044 171+#ifdef HAVE_POSIX_FADVISE64
c966e8e8 172+ {"drop-cache", 0, POPT_ARG_NONE, &drop_cache, 0, 0, 0 },
14317044 173+#endif
c966e8e8
WD
174 {"existing", 0, POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 },
175 {"ignore-non-existing",0,POPT_ARG_NONE, &ignore_non_existing, 0, 0, 0 },
176 {"ignore-existing", 0, POPT_ARG_NONE, &ignore_existing, 0, 0, 0 },
c1ff70aa 177@@ -2287,6 +2294,11 @@ void server_options(char **args, int *argc_p)
c966e8e8
WD
178 if (!am_sender)
179 args[ac++] = "--sender";
180
14317044 181+#ifdef HAVE_POSIX_FADVISE64
c966e8e8
WD
182+ if (drop_cache)
183+ args[ac++] = "--drop-cache";
14317044 184+#endif
c966e8e8
WD
185+
186 x = 1;
187 argstr[0] = '-';
c8a8b4a7 188
cc3e685d
WD
189diff --git a/receiver.c b/receiver.c
190--- a/receiver.c
191+++ b/receiver.c
fc557362 192@@ -64,6 +64,10 @@ static flist_ndx_list batch_redo_list;
14317044 193 /* We're either updating the basis file or an identical copy: */
e2b0842a 194 static int updating_basis_or_equiv;
c966e8e8 195
14317044
WD
196+#ifdef HAVE_POSIX_FADVISE64
197+#define close(fd) fadv_close(fd)
198+#endif
199+
72e5645e
WD
200 #define TMPNAME_SUFFIX ".XXXXXX"
201 #define TMPNAME_SUFFIX_LEN ((int)sizeof TMPNAME_SUFFIX - 1)
202 #define MAX_UNIQUE_NUMBER 999999
cc3e685d
WD
203diff --git a/rsync.yo b/rsync.yo
204--- a/rsync.yo
205+++ b/rsync.yo
fc557362 206@@ -359,6 +359,7 @@ to the detailed description below for a complete description. verb(
c966e8e8
WD
207 --super receiver attempts super-user activities
208 --fake-super store/recover privileged attrs using xattrs
209 -S, --sparse handle sparse files efficiently
210+ --drop-cache tell OS to drop caching of file data
e2b0842a 211 -n, --dry-run perform a trial run with no changes made
f2863bc0 212 -W, --whole-file copy files whole (w/o delta-xfer algorithm)
c966e8e8 213 -x, --one-file-system don't cross filesystem boundaries
7170ca8d 214@@ -1127,6 +1128,10 @@ NOTE: Don't use this option when the destination is a Solaris "tmpfs"
72e5645e
WD
215 filesystem. It seems to have problems seeking over null regions,
216 and ends up corrupting the files.
c966e8e8
WD
217
218+dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data. This
219+prevents rsync from filling up the filesystem cache. This can sometimes help
220+to make a system perform better by keeping non-rsync files in the disk cache.
221+
e2b0842a
WD
222 dit(bf(-n, --dry-run)) This makes rsync perform a trial run that doesn't
223 make any changes (and produces mostly the same output as a real run). It
224 is most commonly used in combination with the bf(-v, --verbose) and/or
cc3e685d
WD
225diff --git a/sender.c b/sender.c
226--- a/sender.c
227+++ b/sender.c
c1ff70aa 228@@ -49,6 +49,10 @@ extern struct file_list *cur_flist, *first_flist, *dir_flist;
72e5645e
WD
229
230 BOOL extra_flist_sending_enabled;
c966e8e8 231
14317044
WD
232+#ifdef HAVE_POSIX_FADVISE64
233+#define close(fd) fadv_close(fd)
234+#endif
235+
236 /**
237 * @file
238 *
cc3e685d
WD
239diff --git a/t_unsafe.c b/t_unsafe.c
240--- a/t_unsafe.c
241+++ b/t_unsafe.c
72e5645e
WD
242@@ -28,6 +28,7 @@ int am_root = 0;
243 int am_sender = 1;
c966e8e8
WD
244 int read_only = 0;
245 int list_only = 0;
c966e8e8 246+int drop_cache = 0;
fc557362 247 int human_readable = 0;
c966e8e8 248 int preserve_perms = 0;
a5e6228a 249 int preserve_executability = 0;
cc3e685d
WD
250diff --git a/util.c b/util.c
251--- a/util.c
252+++ b/util.c
fc557362
WD
253@@ -27,6 +27,7 @@
254
c966e8e8 255 extern int dry_run;
c966e8e8 256 extern int module_id;
14317044 257+extern int drop_cache;
c966e8e8
WD
258 extern int modify_window;
259 extern int relative_paths;
fc557362 260 extern int preserve_xattrs;
91270139 261@@ -42,6 +43,131 @@ char curr_dir[MAXPATHLEN];
c966e8e8
WD
262 unsigned int curr_dir_len;
263 int curr_dir_depth; /* This is only set for a sanitizing daemon. */
264
14317044
WD
265+#ifdef HAVE_POSIX_FADVISE64
266+#define FADV_BUFFER_SIZE 1024*1024*16
c966e8e8 267+
14317044
WD
268+static struct stat fadv_fd_stat[1024];
269+static off_t fadv_fd_pos[1024];
270+static int fadv_fd_init = 0;
271+static int fadv_max_fd = 0;
272+static int fadv_close_ring_tail = 0;
273+static int fadv_close_ring_head = 0;
274+static int fadv_close_ring_size = 0;
275+static int fadv_close_ring[1024];
276+static int fadv_close_buffer_size = 0;
c966e8e8 277+
14317044
WD
278+static void fadv_fd_init_func(void)
279+{
280+ if (fadv_fd_init == 0) {
281+ int i;
282+ fadv_fd_init = 1;
283+ if (fadv_max_fd == 0){
284+ fadv_max_fd = sysconf(_SC_OPEN_MAX) - 20;
285+ if (fadv_max_fd < 0)
286+ fadv_max_fd = 1;
287+ if (fadv_max_fd > 1000)
288+ fadv_max_fd = 1000;
289+ }
290+ for (i = 0; i < fadv_max_fd; i++) {
291+ fadv_fd_pos[i] = 0;
292+ fadv_fd_stat[i].st_dev = 0;
293+ fadv_fd_stat[i].st_ino = 0;
294+ }
295+ }
c966e8e8 296+}
14317044
WD
297+
298+static void fadv_drop(int fd, int sync)
299+{
300+ struct stat sb;
301+ int pos;
302+
303+ /* Trail 1 MB behind in dropping. we do this to make
304+ * sure that the same block or stripe does not have
305+ * to be written twice. */
306+ if (fd > fadv_max_fd)
307+ return;
308+ pos = lseek(fd, 0, SEEK_CUR) - 1024*1024;
c966e8e8 309+ fadv_fd_init_func();
14317044
WD
310+ fstat(fd, &sb);
311+ if (fadv_fd_stat[fd].st_dev == sb.st_dev
312+ && fadv_fd_stat[fd].st_ino == sb.st_ino) {
313+ if (fadv_fd_pos[fd] < pos - FADV_BUFFER_SIZE) {
314+ if (sync) {
315+ /* If the file is not flushed to disk before calling fadvise,
316+ * then the Cache will not be freed and the advise gets ignored
317+ * this does give a severe hit on performance. If only there
318+ * was a way to mark cache so that it gets release once the data
319+ * is written to disk. */
320+ fdatasync(fd);
321+ }
322+ posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED);
323+ fadv_fd_pos[fd] = pos;
324+ }
325+ } else {
326+ fadv_fd_stat[fd].st_dev = sb.st_dev;
327+ fadv_fd_stat[fd].st_ino = sb.st_ino;
328+ fadv_fd_pos[fd] = 0;
329+ }
c966e8e8 330+}
14317044 331+
c966e8e8
WD
332+ssize_t fadv_write(int fd, const void *buf, size_t count)
333+{
14317044
WD
334+ int ret = write(fd, buf, count);
335+ if (drop_cache)
336+ fadv_drop(fd, 1);
337+ return ret;
c966e8e8
WD
338+}
339+
340+ssize_t fadv_read(int fd, void *buf, size_t count)
341+{
14317044
WD
342+ int ret = read(fd, buf, count);
343+ if (drop_cache)
344+ fadv_drop(fd, 0);
345+ return ret;
346+}
347+
348+void fadv_close_all(void)
349+{
350+ while (fadv_close_ring_size > 0){
351+ fdatasync(fadv_close_ring[fadv_close_ring_tail]);
352+ posix_fadvise64(fadv_close_ring[fadv_close_ring_tail], 0, 0,POSIX_FADV_DONTNEED);
353+ fadv_close_ring_size--;
354+ close(fadv_close_ring[fadv_close_ring_tail]);
355+ fadv_close_ring_tail = (fadv_close_ring_tail + 1) % fadv_max_fd;
356+ fadv_close_buffer_size = 0;
357+ }
c966e8e8
WD
358+}
359+
14317044
WD
360+int fadv_close(int fd)
361+{
362+ if (drop_cache) {
363+ /* If the file is not flushed to disk before calling fadvise,
364+ * then the Cache will not be freed and the advise gets ignored
365+ * this does give a severe hit on performance. So instead of doing
366+ * it right away, we save us a copy of the filehandle and do it
367+ * some time before we are out of filehandles. This speeds
368+ * up operation for small files massively. It is directly
369+ * related to the number of spare file handles you have. */
370+ int newfd = dup(fd);
371+ int pos = lseek(fd, 0, SEEK_CUR);
372+ fadv_fd_init_func();
373+ fadv_close_buffer_size += pos - fadv_fd_pos[fd];
374+ fadv_close_ring[fadv_close_ring_head] = newfd;
e2e42a01 375+ fadv_close_ring_head = (fadv_close_ring_head + 1) % fadv_max_fd;
14317044
WD
376+ fadv_close_ring_size ++;
377+ if (fadv_close_ring_size == fadv_max_fd || fadv_close_buffer_size > 1024*1024 ){
378+ /* it seems fastest to drop things 'in groups' */
379+ fadv_close_all();
e2e42a01 380+ }
14317044
WD
381+ }
382+ return close(fd);
c966e8e8 383+}
14317044
WD
384+
385+#define close(fd) fadv_close(fd)
386+#define read(fd,buf,len) fadv_read(fd,buf,len)
387+#define write(fd,buf,len) fadv_write(fd,buf,len)
388+#endif
c966e8e8
WD
389+
390 /* Set a fd into nonblocking mode. */
391 void set_nonblocking(int fd)
392 {