Eran Tromer writes: One feature missing from rsync, and requested on this list before, is on-the-fly conversion of filename character encoding. For example, I often need to sync files having Hebrew filenames from a UTF-8 system (Linux) to an ISO8859-8 system (Cygwin on Windows 2000 using the non-Unicode Win32 interface). Other circumstances surely abound. Attached is a patch against rsync 2.6.2 that adds an "--fname-convert" option. When the argument "--fname-convert CONV" is given, rsync pipes every filename through the program CONV, and filename presented to the server will be CONV's output instead of the raw filename. Artificial example: $ touch /tmp/xyz $ rsync -fname-convert 'tr y Y' /tmp/xyz /tmp/ $ ls /tmp/x?z /tmp/xyz /tmp/xYz Perhaps the most useful case is using iconv: $ rsync --fname-convert 'iconv -f utf8 -t iso8859-8' ... I chose to allow invocation of arbitrary programs instead of using libiconv (or equivalent) in order to avoid external dependencies, and to offer more flexibility. The price is that some heuristics were needed to avoid the deadlock problems that tend to occur when filtering data through a program that uses buffered I/O -- see the comments at the top of the new file fnameconv.c. The delay you may have noticed in the above artificial example using "tr" is due to these heuristics; it occurs just once per rsync invocation, not for every file. I believe there are no server-side security implications, since all conversion is done at the client and the server is oblivious to it. On the client, conversion is done before sanitize_path() and besides, providing a sane converter program is the client's responsibility anyway. In verbose mode the updating of non-regular files is reported via rprintf() by the server, so the client will see the converted filename instead the raw filename -- see my comment in recv_generator(). Fixing this requires some delicate changes so I left it as is, but it seems like a minor concern. Most of the new code is in the new file fnameconv.c. The patch lightly touches some other files, mostly flist.c and the addition/extension of some utility functions. Note that you'll need to run 'make proto' after applying this patch. --- orig/Makefile.in 2004-11-03 11:56:03 +++ Makefile.in 2004-07-03 20:18:02 @@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle main.o checksum.o match.o syscall.o log.o backup.o OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \ fileio.o batch.o clientname.o -OBJS3=progress.o pipe.o +OBJS3=progress.o pipe.o fnameconv.o DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \ popt/popthelp.o popt/poptparse.o --- orig/cleanup.c 2005-02-07 20:41:56 +++ cleanup.c 2005-01-10 10:40:51 @@ -25,6 +25,7 @@ extern int io_error; extern int keep_partial; extern int log_got_error; extern char *partial_dir; +extern char *fname_convert_cmd; /** * Close all open sockets and files, allowing a (somewhat) graceful @@ -125,6 +126,8 @@ void _exit_cleanup(int code, const char !partial_dir); } io_flush(FULL_FLUSH); + if (fname_convert_cmd) + cleanup_fname_convert(); if (cleanup_fname) do_unlink(cleanup_fname); if (code) --- orig/errcode.h 2003-12-15 08:04:14 +++ errcode.h 2004-07-02 21:38:59 @@ -34,6 +34,7 @@ #define RERR_STREAMIO 12 /* error in rsync protocol data stream */ #define RERR_MESSAGEIO 13 /* errors with program diagnostics */ #define RERR_IPC 14 /* error in IPC code */ +#define RERR_FNAMECONV 15 /* error in filename conversion */ #define RERR_SIGNAL 20 /* status returned when sent SIGUSR1, SIGINT */ #define RERR_WAITCHILD 21 /* some error returned by waitpid() */ --- orig/flist.c 2005-02-20 00:16:35 +++ flist.c 2005-02-19 09:31:41 @@ -62,6 +62,7 @@ extern int orig_umask; extern int list_only; extern unsigned int curr_dir_len; extern char *log_format; +extern char *fname_convert_cmd; extern char curr_dir[MAXPATHLEN]; @@ -341,7 +342,10 @@ void send_file_entry(struct file_struct io_write_phase = "send_file_entry"; - f_name_to(file, fname); + if (fname_convert_cmd && !am_server) /* fname conversion always done on client */ + convert_fname(fname, f_name(file), MAXPATHLEN); + else + f_name_to(file, fname); flags = base_flags; @@ -557,6 +561,9 @@ static struct file_struct *receive_file_ strlcpy(lastname, thisname, MAXPATHLEN); + if (fname_convert_cmd && !am_server) /* fname conversion always done on client */ + convert_fname(thisname, lastname, MAXPATHLEN); + clean_fname(thisname, 0); if (sanitize_paths) @@ -1070,6 +1077,9 @@ struct file_list *send_file_list(int f, start_write = stats.total_written; gettimeofday(&start_tv, NULL); + if (!am_server) + init_fname_convert(); + flist = flist_new(WITH_HLINK, "send_file_list"); io_start_buffering_out(); @@ -1252,6 +1262,9 @@ struct file_list *send_file_list(int f, stats.flist_size = stats.total_written - start_write; stats.num_files = flist->count; + if (fname_convert_cmd && !am_server) + cleanup_fname_convert(); + if (verbose > 3) output_flist(flist, who_am_i()); @@ -1273,6 +1286,9 @@ struct file_list *recv_file_list(int f) start_read = stats.total_read; + if (fname_convert_cmd && !am_server) + init_fname_convert(); + flist = flist_new(WITH_HLINK, "recv_file_list"); received_flist = flist; @@ -1326,6 +1342,9 @@ struct file_list *recv_file_list(int f) io_error |= read_int(f); } + if (fname_convert_cmd && !am_server) + cleanup_fname_convert(); + if (verbose > 3) output_flist(flist, who_am_i()); --- orig/fnameconv.c 2004-07-02 21:38:59 +++ fnameconv.c 2004-07-02 21:38:59 @@ -0,0 +1,220 @@ +/* -*- c-file-style: "linux" -*- + * + * Copyright (C) 2004 by Eran Tromer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* Handles filename conversion through an external process. Implements + * two modes of operation: + * In persistent mode, a single filename converter is kept running; + * for each query we feed it a single line and read back a single + * line. This will fail for programs that used buffered I/O, and will + * get into a deadlock. + * In non-persistent mode, a converter is invoked and killed for each + * query. This has a very high overhead, but will work for any + * program. + * We start in persistence mode, and if we suspect a deadlock (i.e., + * nothing happens for FNAME_CONV_PERSISTENCE_TIMEOUT milliseconds) + * then we smoothly fall back to non-persistent mode. + * + * Filename conversion errors are always considered fatal, since an + * incorrectly named file could cause unpredictable damage. + */ + +#include + +#define FNAME_CONV_PERSISTENCE_TIMEOUT 3000 /* milliseconds */ + +static int conv_persistent = 1; +static pid_t conv_pid = -1; +static int conv_write_fd = -1, conv_read_fd; +extern char *fname_convert_cmd; +extern int blocking_io; + +/** + * Splits cmd on spaces. + */ +static void split_on_spaces(char *cmd, char **parts) { + int nparts = 0; + char *tok; + char *cmd2 = strdup(cmd); + if (!cmd2) { + rprintf(FERROR, "Out of memory while parsing filename filter %s\n", cmd); + exit_cleanup(RERR_MALLOC); + } + + for (tok = strtok(cmd2, " "); tok; tok = strtok(NULL, " ")) { + if (nparts >= MAX_ARGS) { + rprintf(FERROR, "Filename conversion command is too long: %s\n", cmd); + exit_cleanup(RERR_SYNTAX); + } + parts[nparts++] = tok; + } + parts[nparts] = NULL; +} + + +/** + * Runs the filename converter process. Should be called before filename + * conversion begins (actually it's not necessarh, but it keeps the proress report + * nice and clean. + **/ +void init_fname_convert() +{ + if (fname_convert_cmd && conv_pid < 0) { + char *args[MAX_ARGS]; + + if (verbose > 2) + rprintf(FINFO, "Running filename converter: %s\n", fname_convert_cmd); + split_on_spaces(fname_convert_cmd, args); + /* Invoke child pipe with non-blocking IO and without registering it for + * autocleanup (the latter may blow up the all_pids table, and is not needed + * since we have our own cleanup handler. */ + conv_pid = piped_child(args, &conv_read_fd, &conv_write_fd, 0, 0); + set_nonblocking(conv_write_fd); + set_nonblocking(conv_read_fd); + } +} + +/** + * Kills the filename converter process. Should be called when the file + * list creation is done. We assume that the converter will terminate + * soon after its standard input is closed. + **/ +void cleanup_fname_convert() +{ + if (conv_pid >= 0) { + int status; + if (conv_write_fd >= 0) { + close(conv_write_fd); + conv_write_fd = -1; + } + close(conv_read_fd); + waitpid(conv_pid, &status, 0); + conv_pid = -1; + } +} + +/** + * Converts the filename from src into dest, using at most maxlen + * characters of dest. + **/ +void convert_fname(char *dest, const char *src, unsigned int maxlen) +{ + int res; + const char *srcp; + char *destp; + unsigned int srcrem, dstrem; + + init_fname_convert(); + + /* Send and receive strings simultaneously to avoid deadlock: */ + srcrem = strlen(src)+1; /* chars left to send (incl. terminating LF) */ + dstrem = maxlen-1; /* free chars left in dest */ + srcp = src; + destp = dest; + while(1) { + /* Write as much as possible: */ + if (srcrem > 1) { + res = write(conv_write_fd, srcp, srcrem-1); + if (res < 0 && errno != EAGAIN) { + rprintf(FERROR, "Error writing to fname converter (filename: %s): %s\n", strerror(errno), src); + exit_cleanup(RERR_FNAMECONV); + } + if (res > 0) { /* wrote something */ + srcp += res; + srcrem -= res; + } + } + if (srcrem == 1) { /* final LF */ + res = write(conv_write_fd, "\n", 1); + if (res < 0 && errno != EAGAIN) { + rprintf(FERROR, "Error writing to fname converter (filename: %s): %s\n", strerror(errno), src); + exit_cleanup(RERR_FNAMECONV); + } + if (res > 0) { /* wrote final LF */ + srcrem = 0; + if (!conv_persistent) { + close(conv_write_fd); + conv_write_fd = -1; + } + } + } + + /* Read as much as possible: */ + res = read(conv_read_fd, destp, dstrem); + if (res < 0 && errno != EAGAIN) { + rprintf(FERROR, "Error reading from filename converter (filename: %s):%s \n", strerror(errno), src); + exit_cleanup(RERR_FNAMECONV); + } + if (res == 0) { /* EOF */ + rprintf(FERROR, "EOF from filename converter (filename: %s)\n", src); + exit_cleanup(RERR_FNAMECONV); + } + if (res > 0) { + destp += res; + dstrem -= res; + if (destp[-1] == '\n' || destp[-1] == '\r') + break; /* End of line. Yippy! */ + if (dstrem == 0) { + rprintf(FINFO, "Name converter output too long (filename: %s)\n", src); + exit_cleanup(RERR_FNAMECONV); + } + } + + /* Await activity */ + if (!await_fds(conv_read_fd, !srcrem ? -1 : conv_write_fd, FNAME_CONV_PERSISTENCE_TIMEOUT)) { + if (srcrem == 0 && conv_persistent) { + /* We finished writing but nothing happens. It looks like the converter program + * is using buffered I/O and thus wait to read more input, but we can't give it + * the next filename yet. Fall back to non-persistent mode. */ + if (verbose > 0) + rprintf(FINFO, "Filename converter blocked, disabling persistence to recover.\n"); + + conv_persistent = 0; + close(conv_write_fd); + conv_write_fd = -1; + } + } + } + + /* Cleanup and sanity check */ + if (!conv_persistent) + cleanup_fname_convert(); + if (srcrem > 0) { + close(conv_write_fd); + rprintf(FERROR, "Name converter produced output before reading all its input for file: %s\n", src); + exit_cleanup(RERR_FNAMECONV); + } + + /* Chop newline chars */ + destp--; + if (destp > dest && *destp == '\n') + --destp; + if (destp > dest && *destp == '\r') + --destp; + if (++destp == dest) { + rprintf(FERROR, "Name converter output is empty (filename: %s)\n", src); + exit_cleanup(RERR_FNAMECONV); + } + *destp = 0; + /* Also, we may have a leading CR left over from a CRLF of the previous line */ + if (*dest == '\n') + memmove(dest, dest+1, destp-dest-1); + + if (verbose > 2) + rprintf(FINFO, "Converted filename: %s -> %s\n", src, dest); +} --- orig/generator.c 2005-02-20 22:13:19 +++ generator.c 2005-02-03 02:07:33 @@ -340,7 +340,13 @@ static int find_fuzzy(struct file_struct * start sending checksums. * * Note that f_out is set to -1 when doing final directory-permission and - * modification-time repair. */ + * modification-time repair. + * + * TODO: The filename seen in recv_generator is after filename + * conversion. In verbose mode, directories, symlinks and device + * files are printf()ed here but regular files are rprintf()ed on the + * sender (unconverted). To solve the above, move all progress + * reporting to the sender. */ static void recv_generator(char *fname, struct file_list *flist, struct file_struct *file, int ndx, int f_out, int f_out_name) --- orig/log.c 2005-02-20 00:02:23 +++ log.c 2004-07-03 20:18:02 @@ -62,6 +62,7 @@ struct { { RERR_STREAMIO , "error in rsync protocol data stream" }, { RERR_MESSAGEIO , "errors with program diagnostics" }, { RERR_IPC , "error in IPC code" }, + { RERR_FNAMECONV , "error in filename conversion" }, { RERR_SIGNAL , "received SIGUSR1 or SIGINT" }, { RERR_WAITCHILD , "some error returned by waitpid()" }, { RERR_MALLOC , "error allocating core memory buffers" }, --- orig/main.c 2005-02-20 01:12:42 +++ main.c 2004-07-22 00:31:47 @@ -361,7 +361,7 @@ static pid_t do_cmd(char *cmd, char *mac whole_file = 1; ret = local_child(argc, args, f_in, f_out, child_main); } else - ret = piped_child(args,f_in,f_out); + ret = piped_child(args, f_in, f_out, blocking_io, 1); if (dir) free(dir); --- orig/options.c 2005-02-20 01:12:42 +++ options.c 2005-02-14 02:50:32 @@ -131,6 +131,7 @@ char *basis_dir[MAX_BASIS_DIRS+1]; char *config_file = NULL; char *shell_cmd = NULL; char *log_format = NULL; +char *fname_convert_cmd = NULL; char *password_file = NULL; char *rsync_path = RSYNC_PATH; char *backup_dir = NULL; @@ -314,6 +315,7 @@ void usage(enum logcode F) rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); + rprintf(F," --fname-convert=CMD invoke CMD for filename conversion\n"); rprintf(F," -z, --compress compress file data during the transfer\n"); rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n"); rprintf(F," -f, --filter=RULE add a file-filtering RULE\n"); @@ -423,6 +425,7 @@ static struct poptOption long_options[] {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, + {"fname-convert", 0, POPT_ARG_STRING, &fname_convert_cmd, 0, 0, 0 }, /* TODO: Should this take an optional int giving the compression level? */ {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 }, {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 }, --- orig/pipe.c 2005-02-07 20:41:56 +++ pipe.c 2004-07-03 20:18:02 @@ -23,7 +23,6 @@ extern int am_sender; extern int am_server; -extern int blocking_io; extern int orig_umask; extern int write_batch; extern int filesfrom_fd; @@ -40,8 +39,10 @@ extern int filesfrom_fd; * If blocking_io is set then use blocking io on both fds. That can be * used to cope with badly broken rsh implementations like the one on * Solaris. + * + * If register_child is nonzero then the child is registered for autocleanup. **/ -pid_t piped_child(char **command, int *f_in, int *f_out) +pid_t piped_child(char **command, int *f_in, int *f_out, int blocking_io, int register_child) { pid_t pid; int to_child_pipe[2]; @@ -56,7 +57,7 @@ pid_t piped_child(char **command, int *f exit_cleanup(RERR_IPC); } - pid = do_fork(); + pid = register_child ? do_fork() : fork(); if (pid == -1) { rsyserr(FERROR, errno, "fork"); exit_cleanup(RERR_IPC); --- orig/syscall.c 2005-02-14 02:45:11 +++ syscall.c 2004-07-02 21:39:00 @@ -259,3 +259,34 @@ char *d_name(struct dirent *di) return di->d_name; #endif } + +/** + * A wrapper around select(2) that guarantees Linux-like updating of + * the timeout argument to contain the time left, so we can simply + * re-invoke in case of EINTR or EAGAIN. On BSD, select(2) doesn't + * change the timeout argument by itself. + **/ +int do_select(int n, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) +{ + struct timeval intended, before, after; + int result; + + if (timeout == NULL) + return select(n, readfds, writefds, exceptfds, timeout); + + intended = *timeout; + gettimeofday(&before, NULL); + result = select(n, readfds, writefds, exceptfds, timeout); + gettimeofday(&after, NULL); + timeout->tv_sec = intended.tv_sec - (after.tv_sec - before.tv_sec); + timeout->tv_usec = intended.tv_usec - (after.tv_usec - before.tv_usec); + if (timeout->tv_usec >= 1000000) { + ++timeout->tv_sec; + timeout->tv_usec -= 1000000; + } else if (timeout->tv_usec < 0) { + --(timeout)->tv_sec; + timeout->tv_usec += 1000000; + } + + return result; +} --- orig/util.c 2005-02-20 19:17:49 +++ util.c 2004-07-03 20:18:02 @@ -1331,3 +1331,55 @@ uint32 fuzzy_distance(const char *s1, in return a[len2-1]; } + +/** + * Blocks until one of the following happens: + * - read_fd is nonnegative and has data to read + * - write_fd is nonnegative and can be written to + * - something terrible happened to either + * - the timeout (in milliseconds) has elapsed + * Return value is zero iff the timeout occured. + */ +char await_fds(int read_fd, int write_fd, int timeout_ms) +{ + fd_set read_fds, write_fds, except_fds; + struct timeval tv; + int res; + + tv.tv_sec = timeout_ms / 1000; + tv.tv_usec = (timeout_ms % 1000) * 1000; + + while (1) { + int maxfd = 0; + FD_ZERO(&read_fds); + FD_ZERO(&write_fds); + FD_ZERO(&except_fds); + if (write_fd >= 0) { + FD_SET(write_fd, &write_fds); + FD_SET(write_fd, &except_fds); + if (write_fd > maxfd) + maxfd = write_fd; + } + if (read_fd >= 0) { + FD_SET(read_fd, &read_fds); + FD_SET(read_fd, &except_fds); + if (read_fd > maxfd) + maxfd = read_fd; + } + + res = do_select(maxfd+1, &read_fds, &write_fds, &except_fds, &tv); + if (res > 0) + return 1; + if (res < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; /* Retry */ + rprintf(FERROR, "Error awaiting fname converter: %s\n", strerror(errno)); + exit_cleanup(RERR_FNAMECONV); + } + if (read_fd >= 0 && (FD_ISSET(read_fd, &read_fds) || FD_ISSET(read_fd, &except_fds))) + return 1; + if (write_fd >= 0 && (FD_ISSET(write_fd, &write_fds) || FD_ISSET(write_fd, &except_fds))) + return 1; + return 0; /* res == 0 and no FDs set, hence a timeout. */ + } +}