+/**
+ * Sleep after writing to limit I/O bandwidth usage.
+ * @param bytes_written  bytes just written; added to a running total.
+ * @todo Rather than sleeping after each write, it might be better to
+ * use some kind of averaging. The current algorithm seems to always
+ * use a bit less bandwidth than specified, because it doesn't make up
+ * for slow periods. But arguably this is a feature. In addition, we
+ * ought to take the time used to write the data into account.
+ *
+ * During some phases of big transfers (file FOO is up to date) this is
+ * called with a small bytes_written every time. As the kernel has to
+ * round small waits up to guarantee that we actually wait at least the
+ * requested number of microseconds, this can become grossly inaccurate.
+ * We therefore keep track of the bytes we've written over time and only
+ * sleep when the accumulated delay is at least one tenth of a second.
+ **/
+static void sleep_for_bwlimit(int bytes_written)
+{
+ static struct timeval prior_tv; /* timestamp kept from the previous call; all zero before the first one */
+ static long total_written = 0; /* bytes written that neither elapsed time nor sleeping has covered yet */
+ struct timeval tv, start_tv;
+ long elapsed_usec, sleep_usec;
+
+#define ONE_SEC 1000000L /* # of microseconds in a second */
+
+ if (!bwlimit) /* bwlimit of 0: throttling is off */
+ return;
+
+ total_written += bytes_written;
+
+ gettimeofday(&start_tv, NULL);
+ if (prior_tv.tv_sec) { /* skipped while prior_tv.tv_sec is still zero, i.e. on the first call */
+ elapsed_usec = (start_tv.tv_sec - prior_tv.tv_sec) * ONE_SEC
+ + (start_tv.tv_usec - prior_tv.tv_usec);
+ total_written -= elapsed_usec * bwlimit / (ONE_SEC/1024); /* deduct what the time since prior_tv allows; NOTE(review): long arithmetic, may overflow after a long gap where long is 32 bits */
+ if (total_written < 0)
+ total_written = 0; /* unused allowance is dropped, not banked */
+ }
+
+ sleep_usec = total_written * (ONE_SEC/1024) / bwlimit; /* usec needed for the outstanding bytes; the 1024 factor suggests bwlimit is in KiB/s (confirm) */
+ if (sleep_usec < ONE_SEC / 10) { /* under 0.1 s: skip the sleep, keep accumulating */
+ prior_tv = start_tv;
+ return;
+ }
+
+ tv.tv_sec = sleep_usec / ONE_SEC;
+ tv.tv_usec = sleep_usec % ONE_SEC;
+ select(0, NULL, NULL, NULL, &tv); /* no fds are watched, so only the timeout matters */
+
+ gettimeofday(&prior_tv, NULL);
+ elapsed_usec = (prior_tv.tv_sec - start_tv.tv_sec) * ONE_SEC
+ + (prior_tv.tv_usec - start_tv.tv_usec);
+ total_written = (sleep_usec - elapsed_usec) * bwlimit / (ONE_SEC/1024); /* requested-minus-actual sleep, converted back to bytes (negative after oversleeping) */
+}
+
+
+/**
+ * Write all @p len bytes of @p buf to the file descriptor @p fd,
+ * looping until done; unrecoverable errors call exit_cleanup().
+ * This function underlies the multiplexing system. The body of the
+ * application never calls this function directly.
+ **/
+static void writefd_unbuffered(int fd,char *buf,size_t len)
+{
+ size_t total = 0; /* bytes of buf written so far */
+ fd_set w_fds, r_fds;
+ int fd_count, count;
+ struct timeval tv;
+
+ if (fd == msg_fd_out) { /* writing to msg_fd_out through this function is treated as an internal error */
+ rprintf(FERROR, "Internal error: wrong write used in receiver.\n");
+ exit_cleanup(RERR_PROTOCOL);
+ }
+
+ no_flush++; /* held raised until the matching decrement at the end; presumably blocks flushing meanwhile (confirm) */
+
+ while (total < len) {
+ FD_ZERO(&w_fds);
+ FD_SET(fd,&w_fds);
+ fd_count = fd; /* tracks the highest fd handed to select() */
+
+ if (msg_fd_in >= 0) { /* also wait for input on msg_fd_in when it is non-negative */
+ FD_ZERO(&r_fds);
+ FD_SET(msg_fd_in,&r_fds);
+ if (msg_fd_in > fd_count)
+ fd_count = msg_fd_in;
+ }
+
+ tv.tv_sec = select_timeout; /* tv is refilled on every pass before select() */
+ tv.tv_usec = 0;
+
+ errno = 0; /* cleared so the EBADF test below reflects this select() */
+ count = select(fd_count+1, msg_fd_in >= 0 ? &r_fds : NULL,
+ &w_fds, NULL, &tv);
+
+ if (count <= 0) { /* timed out or failed: nothing is ready */
+ check_timeout();
+ if (errno == EBADF) /* NOTE(review): check_timeout() runs first; confirm it cannot change errno */
+ exit_cleanup(RERR_SOCKETIO);
+ continue;
+ }
+
+ if (msg_fd_in >= 0 && FD_ISSET(msg_fd_in, &r_fds))
+ read_msg_fd(); /* handle pending input on msg_fd_in before (possibly) writing */
+
+ if (FD_ISSET(fd, &w_fds)) {
+ int ret;
+ size_t n = len-total; /* bytes still to write */
+ if (bwlimit && n > bwlimit_writemax) /* with a bandwidth limit, cap the size of each write() */
+ n = bwlimit_writemax;
+ ret = write(fd,buf+total,n);
+
+ if (ret < 0) {
+ if (errno == EINTR) /* interrupted: go round the select loop again */
+ continue;
+ if (errno == EWOULDBLOCK || errno == EAGAIN) {
+ msleep(1); /* would block: short pause, then retry */
+ continue;
+ }
+ }
+
+ if (ret <= 0) { /* any other error, or a zero-byte write, is fatal */
+ /* Don't try to write errors back
+ * across the stream */
+ io_multiplexing_close();
+ rsyserr(FERROR, errno,
+ "writefd_unbuffered failed to write %ld bytes: phase \"%s\"",
+ (long)len, io_write_phase);
+ exit_cleanup(RERR_STREAMIO);
+ }
+
+ sleep_for_bwlimit(ret); /* throttle according to what was actually written */
+
+ total += ret;
+
+ if (io_timeout)
+ last_io = time(NULL); /* stamp the time of this successful write */
+ }
+ }
+
+ no_flush--;
+}
+
+
+static char *io_buffer; /* output buffer; NULL until io_start_buffering_out() allocates it */
+static int io_buffer_count; /* set to 0 when the buffer is allocated; presumably the bytes currently buffered (confirm) */
+
+void io_start_buffering_out(int fd) /* start buffering output for fd; does nothing if a buffer already exists */
+{
+ if (io_buffer) /* already set up: the earlier fd and buffer are kept */
+ return;
+ multiplex_out_fd = fd;
+ io_buffer = new_array(char, IO_BUFFER_SIZE); /* room for IO_BUFFER_SIZE chars */
+ if (!io_buffer)
+ out_of_memory("writefd"); /* allocation failed; "writefd" is the label passed to out_of_memory() */
+ io_buffer_count = 0; /* the new buffer starts empty */
+}
+
+void io_start_buffering_in(int fd) /* only records fd as multiplex_in_fd; no buffer is allocated here */
+{
+ multiplex_in_fd = fd;
+}
+
+/**
+ * Write a message to a multiplexed stream. If this fails then rsync
+ * exits.
+ **/
+static void mplex_write(int fd, enum msgcode code, char *buf, size_t len)
+{
+ char buffer[4096];
+ size_t n = len;
+
+ SIVAL(buffer, 0, ((MPLEX_BASE + (int)code)<<24) + len);
+
+ if (n > (sizeof buffer - 4)) {
+ n = sizeof buffer - 4;
+ }
+
+ memcpy(&buffer[4], buf, n);
+ writefd_unbuffered(fd, buffer, n+4);
+
+ len -= n;
+ buf += n;
+
+ if (len) {
+ writefd_unbuffered(fd, buf, len);
+ }