From 2d2150f62c4127bf0a97b68a5c14f8656b8b4591 Mon Sep 17 00:00:00 2001 From: Wayne Davison Date: Thu, 3 Nov 2005 16:55:20 +0000 Subject: [PATCH] A parallel-checksumming idea. --- early-checksum.diff | 170 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 early-checksum.diff diff --git a/early-checksum.diff b/early-checksum.diff new file mode 100644 index 0000000..d0ca0f8 --- /dev/null +++ b/early-checksum.diff @@ -0,0 +1,170 @@ +This patch changes the way the --checksum option works by having the +receiving side perform a checksum-read of every file in the file list +(if the sizes are equal) as the list is received, marking non-matching +items with a flag. The idea is that the checksum pass on the sender and +the receiver can then happen in parallel instead of having the receiver +do its checksum pass during its normal find-the-different-files pass. + +I have benchmarked this a little, and it appears to slow things down. + +--- orig/flist.c 2005-11-01 20:09:55 ++++ flist.c 2005-09-16 16:41:25 +@@ -36,6 +36,7 @@ extern int am_daemon; + extern int am_sender; + extern int do_progress; + extern int always_checksum; ++extern int pre_checksum; + extern int module_id; + extern int ignore_errors; + extern int numeric_ids; +@@ -705,6 +706,16 @@ static struct file_struct *receive_file_ + sum = empty_sum; + } + read_buf(f, sum, slen); ++ if (pre_checksum) { ++ char sum2[MD4_SUM_LENGTH]; ++ STRUCT_STAT st; ++ char *fname = f_name(file); ++ if (stat(fname, &st) == 0 && st.st_size == file_length) { ++ file_checksum(fname, sum2, st.st_size); ++ if (memcmp(sum, sum2, slen) != 0) ++ file->flags |= FLAG_SUM_DIFFERS; ++ } ++ } + } + + if (!preserve_perms) { +--- orig/generator.c 2005-10-30 22:30:28 ++++ generator.c 2005-09-16 16:41:25 +@@ -69,6 +69,7 @@ extern int ignore_timeout; + extern int protocol_version; + extern int fuzzy_basis; + extern int always_checksum; ++extern int pre_checksum; + extern char *partial_dir; + extern char 
*basis_dir[]; + extern int compare_dest; +@@ -359,7 +360,8 @@ void itemize(struct file_struct *file, i + + + /* Perform our quick-check heuristic for determining if a file is unchanged. */ +-static int unchanged_file(char *fn, struct file_struct *file, STRUCT_STAT *st) ++static int unchanged_file(char *fn, int fnamecmp_type, struct file_struct *file, ++ STRUCT_STAT *st) + { + if (st->st_size != file->length) + return 0; +@@ -368,6 +370,8 @@ static int unchanged_file(char *fn, stru + of the file time to determine whether to sync */ + if (always_checksum && S_ISREG(st->st_mode)) { + char sum[MD4_SUM_LENGTH]; ++ if (pre_checksum && fnamecmp_type == FNAMECMP_FNAME) ++ return !(file->flags & FLAG_SUM_DIFFERS); + file_checksum(fn, sum, st->st_size); + return memcmp(sum, file->u.sum, protocol_version < 21 ? 2 + : MD4_SUM_LENGTH) == 0; +@@ -923,7 +927,7 @@ static void recv_generator(char *fname, + match_level = 1; + /* FALL THROUGH */ + case 1: +- if (!unchanged_file(fnamecmpbuf, file, &st)) ++ if (!unchanged_file(fnamecmpbuf, 0, file, &st)) + continue; + best_match = i; + match_level = 2; +@@ -1051,7 +1055,7 @@ static void recv_generator(char *fname, + ; + else if (fnamecmp_type == FNAMECMP_FUZZY) + ; +- else if (unchanged_file(fnamecmp, file, &st)) { ++ else if (unchanged_file(fnamecmp, fnamecmp_type, file, &st)) { + if (fnamecmp_type == FNAMECMP_FNAME) { + if (itemizing) { + itemize(file, ndx, real_ret, &real_st, +--- orig/main.c 2005-11-02 07:22:12 ++++ main.c 2005-09-16 16:41:25 +@@ -45,6 +45,7 @@ extern int copy_links; + extern int keep_dirlinks; + extern int preserve_hard_links; + extern int protocol_version; ++extern int always_checksum; + extern int recurse; + extern int relative_paths; + extern int rsync_port; +@@ -60,8 +61,10 @@ extern char *filesfrom_host; + extern char *rsync_path; + extern char *shell_cmd; + extern char *batch_name; ++extern char curr_dir[MAXPATHLEN]; + + int local_server = 0; ++int pre_checksum = 0; + struct file_list *the_file_list; + + /* 
There's probably never more than at most 2 outstanding child processes, +@@ -633,6 +636,7 @@ static void do_server_recv(int f_in, int + struct file_list *flist; + char *local_name = NULL; + char *dir = NULL; ++ char olddir[sizeof curr_dir]; + int save_verbose = verbose; + + if (filesfrom_fd >= 0) { +@@ -677,6 +681,10 @@ static void do_server_recv(int f_in, int + filesfrom_fd = -1; + } + ++ strlcpy(olddir, curr_dir, sizeof olddir); ++ if (always_checksum && argc > 0) ++ pre_checksum = push_dir(argv[0]); ++ + flist = recv_file_list(f_in); + verbose = save_verbose; + if (!flist) { +@@ -685,6 +693,9 @@ static void do_server_recv(int f_in, int + } + the_file_list = flist; + ++ if (pre_checksum) ++ pop_dir(olddir); ++ + if (argc > 0) + local_name = get_local_name(flist,argv[0]); + +@@ -733,6 +744,7 @@ int client_run(int f_in, int f_out, pid_ + { + struct file_list *flist = NULL; + int exit_code = 0, exit_code2 = 0; ++ char olddir[sizeof curr_dir]; + char *local_name = NULL; + + cleanup_child_pid = pid; +@@ -804,11 +816,18 @@ int client_run(int f_in, int f_out, pid_ + filesfrom_fd = -1; + } + ++ strlcpy(olddir, curr_dir, sizeof olddir); ++ if (always_checksum) ++ pre_checksum = push_dir(argv[0]); ++ + if (write_batch && !am_server) + start_write_batch(f_in); + flist = recv_file_list(f_in); + the_file_list = flist; + ++ if (pre_checksum) ++ pop_dir(olddir); ++ + if (flist && flist->count > 0) { + local_name = get_local_name(flist, argv[0]); + +--- orig/rsync.h 2005-10-14 18:45:50 ++++ rsync.h 2005-09-16 16:41:26 +@@ -64,6 +64,7 @@ + #define FLAG_DEL_HERE (1<<3) /* receiver/generator */ + #define FLAG_SENT (1<<3) /* sender */ + #define FLAG_HLINK_TOL (1<<4) /* receiver/generator */ ++#define FLAG_SUM_DIFFERS (1<<5) /* receiver/generator */ + + /* update this if you make incompatible changes */ + #define PROTOCOL_VERSION 29 -- 2.34.1