+--- hashlink.c 1969-12-31 19:00:00.000000000 -0500
++++ hashlink.c 2004-02-23 10:30:45.000000000 -0500
+@@ -0,0 +1,351 @@
++/*
++ Copyright (C) Cronosys, LLC 2004
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++/* This file contains code used by the --link-by-hash option. */
++
++#include "rsync.h"
++
++extern char *link_by_hash_dir;
++
++#ifdef HAVE_LINK
++
++char* make_hash_name(struct file_struct *file)
++{
++ char hash[33], *dst;
++ unsigned char *src;
++ unsigned char c;
++ int i;
++
++ src = (unsigned char*)file->u.sum;
++ for (dst = hash, i = 0; i < 4; i++, src++) {
++ c = *src >> 4;
++ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
++ c = *src & 0x0f;
++ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
++ }
++ *dst++ = '/';
++ for (i = 0; i < 12; i++, src++) {
++ c = *src >> 4;
++ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
++ c = *src & 0x0f;
++ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
++ }
++ *dst = 0;
++
++ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
++ return dst;
++}
++
++
++void kill_hashfile(struct hashfile_struct *hashfile)
++{
++ if (!hashfile)
++ return;
++ free(hashfile->name);
++ close(hashfile->fd);
++ free(hashfile);
++}
++
++
++void kill_hashfiles(struct hashfile_struct *hashfiles)
++{
++ struct hashfile_struct *iter, *next;
++ if ((iter = hashfiles) != NULL) {
++ do {
++ next = iter->next;
++ kill_hashfile(iter);
++ iter = next;
++ } while (iter != hashfiles);
++ }
++}
++
++
++struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
++{
++ DIR *d;
++ struct dirent *di;
++ struct hashfile_struct *hashfiles = NULL, *hashfile;
++ STRUCT_STAT st;
++ long this_fnbr;
++
++ *fnbr = 0;
++
++ /* Build a list of potential candidates and open
++ * them. */
++ if ((d = opendir(hashname)) == NULL) {
++ rprintf(FERROR,"opendir \"%s\": %s\n",
++ hashname, strerror(errno));
++ free(hashname);
++ return NULL;
++ }
++ while ((di = readdir(d)) != NULL) {
++ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
++ continue;
++ }
++
++ /* We need to have the largest fnbr in case we need to store
++ * a new file. */
++ this_fnbr = atol(di->d_name);
++ if (this_fnbr > *fnbr)
++ *fnbr = this_fnbr;
++
++ hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
++ asprintf(&hashfile->name,"%s/%s",hashname,
++ di->d_name);
++ if (do_stat(hashfile->name,&st) == -1) {
++ rprintf(FERROR,"%s: %s", hashfile->name,
++ strerror(errno));
++ kill_hashfile(hashfile);
++ continue;
++ }
++ if (st.st_size != size) {
++ kill_hashfile(hashfile);
++ continue;
++ }
++ hashfile->nlink = st.st_nlink;
++ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
++ if (hashfile->fd == -1) {
++ rprintf(FERROR,"%s: %s\n", hashfile->name,
++ strerror(errno));
++ kill_hashfile(hashfile);
++ continue;
++ }
++ if (hashfiles == NULL)
++ hashfiles = hashfile->next = hashfile->prev = hashfile;
++ else {
++ hashfile->next = hashfiles;
++ hashfile->prev = hashfiles->prev;
++ hashfile->next->prev = hashfile;
++ hashfile->prev->next = hashfile;
++ }
++ }
++ closedir(d);
++
++ return hashfiles;
++}
++
++
++struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
++{
++ int amt, hamt;
++ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
++ struct hashfile_struct *iter, *next, *best;
++ uint32 nlink;
++
++ if (!files)
++ return NULL;
++
++ iter = files; /* in case files are 0 bytes */
++ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
++ iter = files;
++ do {
++ /* Icky bit to resync when we steal the first node. */
++ if (!files)
++ files = iter;
++
++ next = iter->next;
++
++ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
++ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
++ if (iter == files) {
++ files = files->prev;
++ }
++ if (iter->next == iter) {
++ files = next = NULL;
++ } else {
++ next = iter->next;
++ if (iter == files) {
++ /* So we know to resync */
++ files = NULL;
++ }
++ }
++ iter->next->prev = iter->prev;
++ iter->prev->next = iter->next;
++ kill_hashfile(iter);
++ }
++
++ iter = next;
++ } while (iter != files);
++
++ if (iter == NULL && files == NULL) {
++ /* There are no matches. */
++ return NULL;
++ }
++
++ }
++
++ if (amt == -1) {
++ rprintf(FERROR,"%s",strerror(errno));
++ kill_hashfiles(files);
++ return NULL;
++ }
++
++ /* If we only have one file left, use it. */
++ if (files == files->next) {
++ return files;
++ }
++
++ /* All files which remain in the list are identical and should have
++ * the same size. We pick the one with the lowest link count (we
++ * may have rolled over because we hit the maximum link count for
++ * the filesystem). */
++ best = iter = files;
++ nlink = iter->nlink;
++ do {
++ if (iter->nlink < nlink) {
++ nlink = iter->nlink;
++ best = iter;
++ }
++ iter = iter->next;
++ } while (iter != files);
++
++ best->next->prev = best->prev;
++ best->prev->next = best->next;
++ if (files == best)
++ files = files->next;
++ kill_hashfiles(files);
++ return best;
++}
++
++
++int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
++{
++ STRUCT_STAT st;
++ char *hashname = make_hash_name(file);
++ int first = 0, rc;
++ char *linkname;
++ long last_fnbr;
++
++ if (file->length == 0) {
++ return robust_rename(fnametmp,fname,0644);
++ }
++
++ if (do_stat(hashname, &st) == -1) {
++ char *dirname;
++
++ /* Directory does not exist. */
++ dirname = strdup(hashname);
++ *strrchr(dirname,'/') = 0;
++ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
++ rprintf(FERROR, "mkdir %s: %s\n", dirname,
++ strerror(errno));
++ free(hashname);
++ free(dirname);
++ return robust_rename(fnametmp,fname,0644);
++ }
++ free(dirname);
++
++ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
++ rprintf(FERROR, "mkdir %s: %s\n", hashname,
++ strerror(errno));
++ free(hashname);
++ return robust_rename(fnametmp,fname,0644);
++ }
++
++ first = 1;
++ asprintf(&linkname,"%s/0",hashname);
++ rprintf(FINFO, "(1) linkname = %s\n", linkname);
++
++ } else {
++ struct hashfile_struct *hashfiles, *hashfile;
++ int fd;
++
++ if (do_stat(fnametmp,&st) == -1) {
++ rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
++ return -1;
++ }
++ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
++
++ if (hashfiles == NULL) {
++ first = 1;
++ asprintf(&linkname,"%s/0",hashname);
++ rprintf(FINFO, "(2) linkname = %s\n", linkname);
++ } else {
++
++ /* Search for one identical to us. */
++ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
++ rprintf(FERROR,"%s: %s\n",fnametmp,
++ strerror(errno));
++ kill_hashfiles(hashfiles);
++ return -1;
++ }
++ hashfile = compare_hashfiles(fd, hashfiles);
++ hashfiles = NULL;
++
++ if (hashfile) {
++ first = 0;
++ linkname = strdup(hashfile->name);
++ rprintf(FINFO, "(3) linkname = %s\n", linkname);
++ kill_hashfile(hashfile);
++ } else {
++ first = 1;
++ asprintf(&linkname, "%s/%ld", hashname,
++ last_fnbr + 1);
++ rprintf(FINFO, "(4) linkname = %s\n", linkname);
++ }
++ }
++ }
++
++ if (!first) {
++ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
++ linkname, full_fname(fname));
++ rc = do_link(linkname, fname);
++ if (rc == -1) {
++ if (errno == EMLINK) {
++ first = 1;
++ free(linkname);
++ asprintf(&linkname,"%s/%ld",hashname,
++ last_fnbr + 1);
++ rprintf(FINFO, "(5) linkname = %s\n", linkname);
++ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
++ } else {
++ rprintf(FERROR,"link \"%s\" -> %s: %s\n",
++ linkname,full_fname(fname),
++ strerror(errno));
++ robust_unlink(fname);
++ rc = robust_rename(fnametmp,fname,0644);
++ }
++ } else {
++ do_unlink(fnametmp);
++ }
++ }
++
++ if (first) {
++ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
++ full_fname(fname),linkname);
++
++ rc = robust_rename(fnametmp,fname,0644);
++ if (rc != 0) {
++ rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
++ full_fname(fnametmp),full_fname(fname),
++ strerror(errno));
++ }
++ rc = do_link(fname,linkname);
++ if (rc != 0) {
++ rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
++ full_fname(fname),linkname,
++ strerror(errno));
++ }
++ }
++
++ free(linkname);
++ free(hashname);
++ return rc;
++}
++
++#endif
+--- Makefile.in 2004-02-23 10:22:51.000000000 -0500
++++ Makefile.in 2004-02-23 10:22:51.000000000 -0500
+@@ -35,7 +35,7 @@
+ main.o checksum.o match.o syscall.o log.o backup.o
+ OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
+ fileio.o batch.o clientname.o
+-OBJS3=progress.o pipe.o
++OBJS3=progress.o pipe.o hashlink.o
+ DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
+ popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
+ popt/popthelp.o popt/poptparse.o
+--- options.c 2004-02-23 10:22:51.000000000 -0500
++++ options.c 2004-02-23 10:29:14.000000000 -0500
+@@ -119,6 +119,7 @@
+ char *password_file = NULL;
+ char *rsync_path = RSYNC_PATH;
+ char *backup_dir = NULL;
++char *link_by_hash_dir = NULL;
+ char backup_dir_buf[MAXPATHLEN];
+ int rsync_port = RSYNC_PORT;
+ int link_dest = 0;
+@@ -264,6 +265,7 @@
+ rprintf(F," -T --temp-dir=DIR create temporary files in directory DIR\n");
+ rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
+ rprintf(F," --link-dest=DIR create hardlinks to DIR for unchanged files\n");
++ rprintf(F," --link-by-hash=DIR create hardlinks by hash to DIR for regular files\n");
+ rprintf(F," -P equivalent to --partial --progress\n");
+ rprintf(F," -z, --compress compress file data\n");
+ rprintf(F," -C, --cvs-exclude auto ignore files in the same way CVS does\n");
+@@ -303,7 +305,7 @@
+ enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
+ OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
+ OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
+- OPT_READ_BATCH, OPT_WRITE_BATCH,
++ OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH,
+ OPT_REFUSED_BASE = 9000};
+
+ static struct poptOption long_options[] = {
+@@ -360,6 +362,7 @@
+ {"temp-dir", 'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
+ {"compare-dest", 0, POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
+ {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
++ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
+ /* TODO: Should this take an optional int giving the compression level? */
+ {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
+ {"daemon", 0, POPT_ARG_NONE, &daemon_opt, 0, 0, 0 },
+@@ -577,6 +580,19 @@
+ return 0;
+ #endif
+
++ case OPT_LINK_BY_HASH:
++#if HAVE_LINK
++ link_by_hash_dir = (char *)poptGetOptArg(pc);
++ checksum_seed = FIXED_CHECKSUM_SEED;
++ break;
++#else
++ snprintf(err_buf, sizeof err_buf,
++ "hard links are not supported on this %s\n",
++ am_server ? "server" : "client");
++ rprintf(FERROR, "ERROR: %s", err_buf);
++ return 0;
++#endif
++
+ default:
+ /* A large opt value means that set_refuse_options()
+ * turned this option off (opt-BASE is its index). */
+@@ -934,6 +950,11 @@
+ args[ac++] = compare_dest;
+ }
+
++ if (link_by_hash_dir && am_sender) {
++ args[ac++] = "--link-by-hash";
++ args[ac++] = link_by_hash_dir;
++ }
++
+ if (files_from && (!am_sender || remote_filesfrom_file)) {
+ if (remote_filesfrom_file) {
+ args[ac++] = "--files-from";
+--- proto.h 2004-02-23 10:22:51.000000000 -0500
++++ proto.h 2004-02-23 10:22:51.000000000 -0500
+@@ -93,6 +93,12 @@
+ void write_sum_head(int f, struct sum_struct *sum);
+ void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
+ void generate_files(int f, struct file_list *flist, char *local_name);
++char* make_hash_name(struct file_struct *file);
++void kill_hashfile(struct hashfile_struct *hashfile);
++void kill_hashfiles(struct hashfile_struct *hashfiles);
++struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
++struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
++int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
+ void init_hard_links(struct file_list *flist);
+ int hard_link_check(struct file_struct *file, int skip);
+ void do_hard_links(void);
+--- receiver.c 2004-02-23 10:22:51.000000000 -0500
++++ receiver.c 2004-02-23 10:22:51.000000000 -0500
+@@ -186,10 +186,11 @@
+
+
+ static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
+- OFF_T total_size)
++ OFF_T total_size,char *md4)
+ {
+ int i;
+ struct sum_struct sum;
++ struct mdfour mdfour_data;
+ unsigned int len;
+ OFF_T offset = 0;
+ OFF_T offset2;
+@@ -199,7 +200,9 @@
+ char *map=NULL;
+
+ read_sum_head(f_in, &sum);
+-
++ if (md4)
++ mdfour_begin(&mdfour_data);
++
+ sum_init();
+
+ while ((i = recv_token(f_in, &data)) != 0) {
+@@ -216,6 +219,8 @@
+ cleanup_got_literal = 1;
+
+ sum_update(data,i);
++ if (md4)
++ mdfour_update(&mdfour_data,data,i);
+
+ if (fd != -1 && write_file(fd,data,i) != i) {
+ rprintf(FERROR, "write failed on %s: %s\n",
+@@ -243,6 +248,8 @@
+
+ see_token(map, len);
+ sum_update(map,len);
++ if (md4)
++ mdfour_update(&mdfour_data,map,len);
+ }
+
+ if (fd != -1 && write_file(fd,map,len) != (int) len) {
+@@ -265,6 +272,8 @@
+ }
+
+ sum_end(file_sum1);
++ if (md4)
++ mdfour_result(&mdfour_data, (unsigned char*)md4);
+
+ read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
+ if (verbose > 2) {
+@@ -299,6 +308,7 @@
+ extern int preserve_perms;
+ extern int delete_after;
+ extern int orig_umask;
++ extern char *link_by_hash_dir;
+ struct stats initial_stats;
+
+ if (verbose > 2) {
+@@ -372,7 +382,7 @@
+ if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
+ rprintf(FERROR, "fstat %s failed: %s\n",
+ full_fname(fnamecmp), strerror(errno));
+- receive_data(f_in,NULL,-1,NULL,file->length);
++ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
+ close(fd1);
+ continue;
+ }
+@@ -385,7 +395,7 @@
+ */
+ rprintf(FERROR,"recv_files: %s is a directory\n",
+ full_fname(fnamecmp));
+- receive_data(f_in, NULL, -1, NULL, file->length);
++ receive_data(f_in,NULL,-1,NULL,file->length,NULL);
+ close(fd1);
+ continue;
+ }
+@@ -437,7 +447,7 @@
+ if (fd2 == -1) {
+ rprintf(FERROR, "mkstemp %s failed: %s\n",
+ full_fname(fnametmp), strerror(errno));
+- receive_data(f_in,mapbuf,-1,NULL,file->length);
++ receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
+ if (mapbuf) unmap_file(mapbuf);
+ if (fd1 != -1) close(fd1);
+ continue;
+@@ -450,7 +460,12 @@
+ }
+
+ /* recv file data */
+- recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
++#ifdef HAVE_LINK
++ if (link_by_hash_dir) {
++ file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
++ }
++#endif
++ recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
+
+ log_recv(file, &initial_stats);
+
+--- rsync.c 2004-02-23 10:22:51.000000000 -0500
++++ rsync.c 2004-02-23 10:27:15.000000000 -0500
+@@ -33,6 +33,7 @@
+ extern int preserve_gid;
+ extern int preserve_perms;
+ extern int make_backups;
++extern char *link_by_hash_dir;
+
+
+ /*
+@@ -236,6 +237,12 @@
+ if (make_backups && !make_backup(fname))
+ return;
+
++#ifdef HAVE_LINK
++ if (link_by_hash_dir) {
++ if ((ret = link_by_hash(fnametmp,fname,file)) != 0)
++ return;
++ } else
++#endif
+ /* move tmp file over real file */
+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
+ if (ret != 0) {
+--- rsync.h 2004-02-23 10:22:51.000000000 -0500
++++ rsync.h 2004-02-23 10:22:51.000000000 -0500
+@@ -513,6 +513,14 @@
+ int current_file_index;
+ };
+
++struct hashfile_struct {
++ struct hashfile_struct *next;
++ struct hashfile_struct *prev;
++ char *name;
++ int fd;
++ uint32 nlink;
++};
++
+
+ /* we need this function because of the silly way in which duplicate
+ entries are handled in the file lists - we can't change this