Matt's transliterate patch.
[rsync/rsync-patches.git] / dynamic_hash.diff
index 2a6026f..4294ae3 100644 (file)
 This patch makes the processing of really large files more efficient
 by making sure that the sender's hash table is large enough to hold all the
-checksum entries without being overloaded.  It also makes the hashing of
-normal sized files use slightly less memory than before.
+checksum entries without being overloaded.
 
-An extended version of a patch by Shachar Shemesh.
+Updated to use the current hashtable method when possible, and the new
+hashtable method (which requires a modulus calculation for up to every byte
+of the source file) only on large files that need a larger hashtable size.
+This avoids slowing down the processing of files that don't need the
+extra-large hashtable.
+
+This was updated for the latest codebase from a patch written by Shachar
+Shemesh.
+
+To use this patch, run these commands for a successful build:
+
+    patch -p1 <patches/dynamic_hash.diff
+    ./configure                                 (optional if already run)
+    make
 
 --- old/match.c
 +++ new/match.c
-@@ -26,11 +26,6 @@ extern int append_mode;
+@@ -39,40 +39,51 @@ static int total_matches;
  
int updating_basis_file;
extern struct stats stats;
  
--typedef unsigned short tag;
--
 -#define TABLESIZE (1<<16)
--#define NULL_TAG (-1)
--
- static int false_alarms;
- static int tag_hits;
- static int matches;
-@@ -42,47 +37,36 @@ static int total_matches;
- extern struct stats stats;
++#define TRADITIONAL_TABLESIZE (1<<16)
  
--struct target {
--      tag t;
--      int32 i;
--};
--
--static struct target *targets;
--
--static int32 *tag_table;
--
--#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
--#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
--
--static int compare_targets(struct target *t1,struct target *t2)
--{
--      return (int)t1->t - (int)t2->t;
--}
-+static int32 tablesize;
-+static int32 *sum_table;
++static uint32 tablesize;
+ static int32 *hash_table;
  
-+#define gettag2(s1,s2) gettag((s1) + ((s2)<<16))
-+#define gettag(sum) ((sum)%tablesize)
+ #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF)
+ #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16)
  
- static void build_hash_table(struct sum_struct *s)
+-static int32 build_hash_table(struct sum_struct *s, int32 start)
++#define BIG_SUM2HASH(sum) ((sum)%tablesize)
++
++static void build_hash_table(struct sum_struct *s)
  {
--      int32 i;
-+      int32 i, prior_size = tablesize;
+-      int32 i, end = s->count;
++      static uint32 alloc_size;
++      int32 i;
  
--      if (!tag_table)
--              tag_table = new_array(int32, TABLESIZE);
+-      if (!hash_table) {
+-              hash_table = new_array(int32, TABLESIZE);
 +      /* Dynamically calculate the hash table size so that the hash load
-+       * for big files is about 80%.  This number must be odd or s2 will
-+       * not be able to span the entire set. */
-+      tablesize = (s->count/8) * 10 + 11;
-+      if (tablesize < 65537)
-+              tablesize = 65537; /* a prime number */
-+      if (tablesize != prior_size) {
-+              free(sum_table);
-+              sum_table = new_array(int32, tablesize);
-+              if (!sum_table)
-+                      out_of_memory("build_hash_table");
-+      }
--      targets = new_array(struct target, s->count);
--      if (!tag_table || !targets)
--              out_of_memory("build_hash_table");
-+      memset(sum_table, 0xFF, tablesize * sizeof (sum_table[0]));
-       for (i = 0; i < s->count; i++) {
--              targets[i].i = i;
--              targets[i].t = gettag(s->sums[i].sum1);
-+              int32 t = gettag(s->sums[i].sum1);
-+              s->sums[i].chain = sum_table[t];
-+              sum_table[t] = i;
++       * for big files is about 80%.  A number greater than the traditional
++       * size must be odd or s2 will not be able to span the entire set. */
++      tablesize = (uint32)(s->count/8) * 10 + 11;
++      if (tablesize < TRADITIONAL_TABLESIZE)
++              tablesize = TRADITIONAL_TABLESIZE;
++      if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) {
++              if (hash_table)
++                      free(hash_table);
++              hash_table = new_array(int32, tablesize);
+               if (!hash_table)
+                       out_of_memory("build_hash_table");
++              alloc_size = tablesize;
        }
+-      memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]);
 -
--      qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets);
+-      if (end - start > TABLESIZE*8/10)
+-              end = start + TABLESIZE*8/10;
 -
--      for (i = 0; i < TABLESIZE; i++)
--              tag_table[i] = NULL_TAG;
+-      for (i = start; i < end; i++) {
+-              uint32 t = SUM2HASH(s->sums[i].sum1);
+-              s->sums[i].chain = hash_table[t];
+-              hash_table[t] = i;
+-      }
++      memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]);
+-      if (verbose > 2) {
+-              rprintf(FINFO, "built hash table for entries %ld - %ld\n",
+-                      (long)start, (long)end - 1);
++      if (tablesize == TRADITIONAL_TABLESIZE) {
++              for (i = 0; i < s->count; i++) {
++                      uint32 t = SUM2HASH(s->sums[i].sum1);
++                      s->sums[i].chain = hash_table[t];
++                      hash_table[t] = i;
++              }
++      } else {
++              for (i = 0; i < s->count; i++) {
++                      uint32 t = BIG_SUM2HASH(s->sums[i].sum1);
++                      s->sums[i].chain = hash_table[t];
++                      hash_table[t] = i;
++              }
+       }
 -
--      for (i = s->count; i-- > 0; )
--              tag_table[targets[i].t] = i;
+-      return end;
  }
  
  
-@@ -176,20 +160,16 @@ static void hash_search(int f,struct sum
-       }
-       do {
--              tag t = gettag2(s1,s2);
-+              int32 i, t = gettag2(s1,s2);
+@@ -130,8 +141,8 @@ static void matched(int f, struct sum_st
+ static void hash_search(int f,struct sum_struct *s,
+                       struct map_struct *buf, OFF_T len)
+ {
+-      OFF_T offset, end, reset = 0;
+-      int32 k, want_i, backup, sum_pos = 0;
++      OFF_T offset, end;
++      int32 k, want_i, backup;
+       char sum2[SUM_LENGTH];
+       uint32 s1, s2, sum;
+       int more;
+@@ -169,24 +180,21 @@ static void hash_search(int f,struct sum
                int done_csum2 = 0;
--              int32 j = tag_table[t];
-               if (verbose > 4)
-                       rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum);
--              if (j == NULL_TAG)
--                      goto null_tag;
+               int32 i;
+-              if (offset >= reset) {
+-                      sum_pos = build_hash_table(s, sum_pos);
+-                      if (sum_pos == s->count)
+-                              reset = len;
+-                      else
+-                              reset = sum_pos * s->blength;
+-              }
 -
-               sum = (s1 & 0xffff) | (s2 << 16);
-               tag_hits++;
--              do {
--                      int32 l, i = targets[j].i;
-+              for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) {
-+                      int32 l;
-                       if (sum != s->sums[i].sum1)
-                               continue;
-@@ -205,9 +185,10 @@ static void hash_search(int f,struct sum
-                           && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
-                               continue;
--                      if (verbose > 3)
--                              rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n",
--                                      (double)offset,(double)j,(double)i,sum);
-+                      if (verbose > 3) {
-+                              rprintf(FINFO,"potential match at %.0f i=%ld sum=%08x\n",
-+                                      (double)offset, (long)i, sum);
-+                      }
-                       if (!done_csum2) {
-                               map = (schar *)map_ptr(buf,offset,l);
-@@ -224,23 +205,23 @@ static void hash_search(int f,struct sum
-                        * one with an identical offset, so we prefer that over
-                        * the following want_i optimization. */
-                       if (updating_basis_file) {
--                              do {
--                                      int32 i2 = targets[j].i;
-+                              int32 i2;
-+                              for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
-                                       if (s->sums[i2].offset != offset)
-                                               continue;
-                                       if (i2 != i) {
-                                               if (sum != s->sums[i2].sum1)
--                                                      break;
-+                                                      continue;
-                                               if (memcmp(sum2, s->sums[i2].sum2,
-                                                          s->s2length) != 0)
--                                                      break;
-+                                                      continue;
-                                               i = i2;
-                                       }
-                                       /* This chunk was at the same offset on
-                                        * both the sender and the receiver. */
-                                       s->sums[i].flags |= SUMFLG_SAME_OFFSET;
-                                       goto set_want_i;
--                              } while (++j < s->count && targets[j].t == t);
-+                              }
-                       }
-                       /* we've found a match, but now check to see
-@@ -266,9 +247,8 @@ static void hash_search(int f,struct sum
-                       s2 = sum >> 16;
-                       matches++;
-                       break;
--              } while (++j < s->count && targets[j].t == t);
+               if (verbose > 4) {
+                       rprintf(FINFO, "offset=%.0f sum=%04x%04x\n",
+                               (double)offset, s2 & 0xFFFF, s1 & 0xFFFF);
+               }
+-              i = hash_table[SUM2HASH2(s1,s2)];
+-              if (i < 0)
+-                      goto null_hash;
++              if (tablesize == TRADITIONAL_TABLESIZE) {
++                      if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
++                              goto null_hash;
++                      sum = (s1 & 0xffff) | (s2 << 16);
++              } else {
++                      sum = (s1 & 0xffff) | (s2 << 16);
++                      if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
++                              goto null_hash;
 +              }
  
--      null_tag:
-               backup = offset - last_match;
-               /* We sometimes read 1 byte prior to last_match... */
-               if (backup < 0)
-@@ -375,11 +355,6 @@ void match_sums(int f, struct sum_struct
-               rprintf(FINFO,"sending file_sum\n");
-       write_buf(f,file_sum,MD4_SUM_LENGTH);
--      if (targets) {
--              free(targets);
--              targets=NULL;
--      }
--
-       if (verbose > 2)
-               rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n",
-                       false_alarms, tag_hits, matches);
---- old/rsync.h
-+++ new/rsync.h
-@@ -560,6 +560,7 @@ struct sum_buf {
-       OFF_T offset;           /**< offset in file of this chunk */
-       int32 len;              /**< length of chunk of file */
-       uint32 sum1;            /**< simple checksum */
-+      int32 chain;            /**< next hash-table collision */
-       short flags;            /**< flag bits */
-       char sum2[SUM_LENGTH];  /**< checksum  */
- };
---- old/sender.c
-+++ new/sender.c
-@@ -92,6 +92,7 @@ static struct sum_struct *receive_sums(i
+-              sum = (s1 & 0xffff) | (s2 << 16);
+               hash_hits++;
+               do {
+                       int32 l;
+@@ -354,6 +362,11 @@ void match_sums(int f, struct sum_struct
+       }
  
-               s->sums[i].offset = offset;
-               s->sums[i].flags = 0;
-+              s->sums[i].chain = -1;
+       if (len > 0 && s->count > 0) {
++              build_hash_table(s);
++
++              if (verbose > 2)
++                      rprintf(FINFO,"built hash table\n");
++
+               hash_search(f, s, buf, len);
  
-               if (i == s->count-1 && s->remainder != 0)
-                       s->sums[i].len = s->remainder;
+               if (verbose > 2)