A parallel-checksumming idea.
authorWayne Davison <wayned@samba.org>
Thu, 3 Nov 2005 16:55:20 +0000 (16:55 +0000)
committerWayne Davison <wayned@samba.org>
Thu, 3 Nov 2005 16:55:20 +0000 (16:55 +0000)
early-checksum.diff [new file with mode: 0644]

diff --git a/early-checksum.diff b/early-checksum.diff
new file mode 100644 (file)
index 0000000..d0ca0f8
--- /dev/null
@@ -0,0 +1,170 @@
+This patch changes the way the --checksum option works by having the
+receiving side perform a checksum-read of every file in the file list
+(if the sizes are equal) as the list is received, marking non-matching
+items with a flag.  The idea is that the checksum pass on the sender and
+the receiver can then happen in parallel instead of having the receiver
+do its checksum pass during its normal find-the-different-files pass.
+
+I have benchmarked this a little, and it appears to slow things down.
+
+--- orig/flist.c       2005-11-01 20:09:55
++++ flist.c    2005-09-16 16:41:25
+@@ -36,6 +36,7 @@ extern int am_daemon;
+ extern int am_sender;
+ extern int do_progress;
+ extern int always_checksum;
++extern int pre_checksum;
+ extern int module_id;
+ extern int ignore_errors;
+ extern int numeric_ids;
+@@ -705,6 +706,16 @@ static struct file_struct *receive_file_
+                       sum = empty_sum;
+               }
+               read_buf(f, sum, slen);
++              if (pre_checksum) {
++                      char sum2[MD4_SUM_LENGTH];
++                      STRUCT_STAT st;
++                      char *fname = f_name(file);
++                      if (stat(fname, &st) == 0 && st.st_size == file_length) {
++                              file_checksum(fname, sum2, st.st_size);
++                              if (memcmp(sum, sum2, slen) != 0)
++                                      file->flags |= FLAG_SUM_DIFFERS;
++                      }
++              }
+       }
+       if (!preserve_perms) {
+--- orig/generator.c   2005-10-30 22:30:28
++++ generator.c        2005-09-16 16:41:25
+@@ -69,6 +69,7 @@ extern int ignore_timeout;
+ extern int protocol_version;
+ extern int fuzzy_basis;
+ extern int always_checksum;
++extern int pre_checksum;
+ extern char *partial_dir;
+ extern char *basis_dir[];
+ extern int compare_dest;
+@@ -359,7 +360,8 @@ void itemize(struct file_struct *file, i
+ /* Perform our quick-check heuristic for determining if a file is unchanged. */
+-static int unchanged_file(char *fn, struct file_struct *file, STRUCT_STAT *st)
++static int unchanged_file(char *fn, int fnamecmp_type, struct file_struct *file,
++                        STRUCT_STAT *st)
+ {
+       if (st->st_size != file->length)
+               return 0;
+@@ -368,6 +370,8 @@ static int unchanged_file(char *fn, stru
+          of the file time to determine whether to sync */
+       if (always_checksum && S_ISREG(st->st_mode)) {
+               char sum[MD4_SUM_LENGTH];
++              if (pre_checksum && fnamecmp_type == FNAMECMP_FNAME)
++                      return !(file->flags & FLAG_SUM_DIFFERS);
+               file_checksum(fn, sum, st->st_size);
+               return memcmp(sum, file->u.sum, protocol_version < 21 ? 2
+                                                       : MD4_SUM_LENGTH) == 0;
+@@ -923,7 +927,7 @@ static void recv_generator(char *fname, 
+                               match_level = 1;
+                               /* FALL THROUGH */
+                       case 1:
+-                              if (!unchanged_file(fnamecmpbuf, file, &st))
++                              if (!unchanged_file(fnamecmpbuf, 0, file, &st))
+                                       continue;
+                               best_match = i;
+                               match_level = 2;
+@@ -1051,7 +1055,7 @@ static void recv_generator(char *fname, 
+               ;
+       else if (fnamecmp_type == FNAMECMP_FUZZY)
+               ;
+-      else if (unchanged_file(fnamecmp, file, &st)) {
++      else if (unchanged_file(fnamecmp, fnamecmp_type, file, &st)) {
+               if (fnamecmp_type == FNAMECMP_FNAME) {
+                       if (itemizing) {
+                               itemize(file, ndx, real_ret, &real_st,
+--- orig/main.c        2005-11-02 07:22:12
++++ main.c     2005-09-16 16:41:25
+@@ -45,6 +45,7 @@ extern int copy_links;
+ extern int keep_dirlinks;
+ extern int preserve_hard_links;
+ extern int protocol_version;
++extern int always_checksum;
+ extern int recurse;
+ extern int relative_paths;
+ extern int rsync_port;
+@@ -60,8 +61,10 @@ extern char *filesfrom_host;
+ extern char *rsync_path;
+ extern char *shell_cmd;
+ extern char *batch_name;
++extern char curr_dir[MAXPATHLEN];
+ int local_server = 0;
++int pre_checksum = 0;
+ struct file_list *the_file_list;
+ /* There's probably never more than at most 2 outstanding child processes,
+@@ -633,6 +636,7 @@ static void do_server_recv(int f_in, int
+       struct file_list *flist;
+       char *local_name = NULL;
+       char *dir = NULL;
++      char olddir[sizeof curr_dir];
+       int save_verbose = verbose;
+       if (filesfrom_fd >= 0) {
+@@ -677,6 +681,10 @@ static void do_server_recv(int f_in, int
+               filesfrom_fd = -1;
+       }
++      strlcpy(olddir, curr_dir, sizeof olddir);
++      if (always_checksum && argc > 0)
++              pre_checksum = push_dir(argv[0]);
++
+       flist = recv_file_list(f_in);
+       verbose = save_verbose;
+       if (!flist) {
+@@ -685,6 +693,9 @@ static void do_server_recv(int f_in, int
+       }
+       the_file_list = flist;
++      if (pre_checksum)
++              pop_dir(olddir);
++
+       if (argc > 0)
+               local_name = get_local_name(flist,argv[0]);
+@@ -733,6 +744,7 @@ int client_run(int f_in, int f_out, pid_
+ {
+       struct file_list *flist = NULL;
+       int exit_code = 0, exit_code2 = 0;
++      char olddir[sizeof curr_dir];
+       char *local_name = NULL;
+       cleanup_child_pid = pid;
+@@ -804,11 +816,18 @@ int client_run(int f_in, int f_out, pid_
+               filesfrom_fd = -1;
+       }
++      strlcpy(olddir, curr_dir, sizeof olddir);
++      if (always_checksum)
++              pre_checksum = push_dir(argv[0]);
++
+       if (write_batch && !am_server)
+               start_write_batch(f_in);
+       flist = recv_file_list(f_in);
+       the_file_list = flist;
++      if (pre_checksum)
++              pop_dir(olddir);
++
+       if (flist && flist->count > 0) {
+               local_name = get_local_name(flist, argv[0]);
+--- orig/rsync.h       2005-10-14 18:45:50
++++ rsync.h    2005-09-16 16:41:26
+@@ -64,6 +64,7 @@
+ #define FLAG_DEL_HERE (1<<3)  /* receiver/generator */
+ #define FLAG_SENT (1<<3)      /* sender */
+ #define FLAG_HLINK_TOL (1<<4) /* receiver/generator */
++#define FLAG_SUM_DIFFERS (1<<5)       /* receiver/generator */
+ /* update this if you make incompatible changes */
+ #define PROTOCOL_VERSION 29