-rw-r--r-- | rsync/delta.c | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/rsync/delta.c b/rsync/delta.c new file mode 100644 index 0000000..323c079 --- a/dev/null +++ b/rsync/delta.c @@ -0,0 +1,351 @@ +/*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*- + * + * librsync -- library for network deltas + * $Id$ + * + * Copyright (C) 2000, 2001 by Martin Pool <mbp@samba.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + /* + | Let's climb to the TOP of that + | MOUNTAIN and think about STRIP + | MINING!! + */ + + +/* + * delta.c -- Generate in streaming mode an rsync delta given a set of + * signatures, and a new file. + * + * The size of blocks for signature generation is determined by the + * block size in the incoming signature. + * + * To calculate a signature, we need to be able to see at least one + * block of the new file at a time. Once we have that, we calculate + * its weak signature, and see if there is any block in the signature + * hash table that has the same weak sum. If there is one, then we + * also compute the strong sum of the new block, and cross check that. + * If they're the same, then we can assume we have a match. + * + * The final block of the file has to be handled a little differently, + * because it may be a short match. Short blocks in the signature + * don't include their length -- we just allow for the final short + * block of the file to match any block in the signature, and if they + * have the same checksum we assume they must have the same length. + * Therefore, when we emit a COPY command, we have to send it with a + * length that is the same as the block matched, and not the block + * length from the signature. + */ + +/* + * Profiling results as of v1.26, 2001-03-18: + * + * If everything matches, then we spend almost all our time in + * rs_mdfour64 and rs_weak_sum, which is unavoidable and therefore a + * good profile. + * + * If nothing matches, it is not so good. + */ + + +#include <config_rsync.h> + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> + +#include "rsync.h" +#include "emit.h" +#include "stream.h" +#include "util.h" +#include "sumset.h" +#include "job.h" +#include "trace.h" +#include "checksum.h" +#include "search.h" +#include "types.h" + + +/** + * Turn this on to make all rolling checksums be checked from scratch. + */ +int rs_roll_paranoia = 0; + + +static rs_result rs_delta_scan(rs_job_t *, rs_long_t avail_len, void *); + +static rs_result rs_delta_s_deferred_copy(rs_job_t *job); + + + +static rs_result rs_delta_s_end(rs_job_t *job) +{ + rs_emit_end_cmd(job); + return RS_DONE; +} + + +/** + * \brief Get a block of data if possible, and see if it matches. + * + * On each call, we try to process all of the input data available on + * the scoop and input buffer. + */ +static rs_result +rs_delta_s_scan(rs_job_t *job) +{ + size_t this_len, avail_len; + int is_ending; + void *inptr; + rs_result result; + + rs_job_check(job); + + avail_len = rs_scoop_total_avail(job); + this_len = job->block_len; + is_ending = job->stream->eof_in; + + /* Now, we have avail_len bytes, and we need to scan through them + * looking for a match. We'll always end up emitting exactly one + * command, either a literal or a copy, and after discovering that + * we will skip over the appropriate number of bytes. */ + if (avail_len == 0) { + if (is_ending) { + /* no more delta to do */ + job->statefn = rs_delta_s_end; + } + return RS_BLOCKED; + } + + /* must read at least one block, or give up */ + if ((avail_len < job->block_len) && !is_ending) { + /* we know we won't get it, but we have to try for a whole + * block anyhow so that it gets into the scoop. */ + rs_scoop_input(job, job->block_len); + return RS_BLOCKED; + } + + result = rs_scoop_readahead(job, avail_len, &inptr); + if (result != RS_DONE) + return result; + + return rs_delta_scan(job, avail_len, inptr); +} + + + +/** + * Scan for a matching block in the next \p avail_len bytes of input. + * + * If nonmatching data is found, then a LITERAL command will be put in + * the tube immediately. If matching data is found, then its position + * will be saved in the job, and the job state set up to write out a + * COPY command after handling the literal. + */ +static rs_result +rs_delta_scan(rs_job_t *job, rs_long_t avail_len, void *p) +{ + rs_long_t match_where; + int search_pos, end_pos; + unsigned char *inptr = (unsigned char *) p; + uint32_t s1 = job->weak_sig & 0xFFFF; + uint32_t s2 = job->weak_sig >> 16; + + /* So, we have avail_len bytes of data, and we want to look + * through it for a match at some point. It's OK if it's not at + * the start of the available input data. If we're approaching + * the end and can't get a match, then we just block and get more + * later. */ + + /* FIXME: Perhaps we should be working in signed chars for the + * rolling sum? */ + + if (job->stream->eof_in) + end_pos = avail_len - 1; + else + end_pos = avail_len - job->block_len; + + for (search_pos = 0; search_pos <= end_pos; search_pos++) { + size_t this_len = job->block_len; + + if (search_pos + this_len > avail_len) { + this_len = avail_len - search_pos; + rs_trace("block reduced to %d", this_len); + } else if (job->have_weak_sig) { + unsigned char a = inptr[search_pos + this_len - 1]; + /* roll in the newly added byte, if any */ + s1 += a + RS_CHAR_OFFSET; + s2 += s1; + + job->weak_sig = (s1 & 0xffff) | (s2 << 16); + } + + if (!job->have_weak_sig) { + rs_trace("calculate weak sum from scratch"); + job->weak_sig = rs_calc_weak_sum(inptr + search_pos, this_len); + s1 = job->weak_sig & 0xFFFF; + s2 = job->weak_sig >> 16; + job->have_weak_sig = 1; + } + + if (rs_roll_paranoia) { + rs_weak_sum_t verify = rs_calc_weak_sum(inptr + search_pos, this_len); + if (verify != job->weak_sig) { + rs_fatal("mismatch between rolled sum %#x and check %#x", + job->weak_sig, verify); + } + } + + if (rs_search_for_block(job->weak_sig, inptr + search_pos, this_len, + job->signature, &job->stats, &match_where)) { + /* So, we got a match. Cool. However, there may be + * leading unmatched data that we need to flush. Thus we + * set our statefn to be rs_delta_s_deferred_copy so that + * we can write out the command later. */ + + rs_trace("matched %.0f bytes at %.0f!", + (double) this_len, (double) match_where); + job->basis_pos = match_where; + job->basis_len = this_len; + job->statefn = rs_delta_s_deferred_copy; + job->have_weak_sig = 0; + break; + } else { + /* advance by one; roll out the byte we just moved over. */ + unsigned char a = inptr[search_pos]; + unsigned char shift = a + RS_CHAR_OFFSET; + + s1 -= shift; + s2 -= this_len * shift; + job->weak_sig = (s1 & 0xffff) | (s2 << 16); + } + } + + if (search_pos > 0) { + /* We may or may not have found a block, but we know we found + * some literal data at the start of the buffer. Therefore, + * we have to flush that out before we can continue on and + * emit the copy command or keep searching. */ + + /* FIXME: At the moment, if you call with very short buffers, + * then you will get a series of very short LITERAL commands. + * Perhaps this is what you deserve, or perhaps we should try + * to get more readahead and avoid that. */ + + /* There's some literal data at the start of this window which + * we know is not in any block. */ + rs_trace("got %d bytes of literal data", search_pos); + rs_emit_literal_cmd(job, search_pos); + rs_tube_copy(job, search_pos); + } + + return RS_RUNNING; +} + + + +static rs_result rs_delta_s_deferred_copy(rs_job_t *job) +{ + if (!job->basis_len) { + rs_log(RS_LOG_ERR, "somehow got zero basis_len"); + return RS_INTERNAL_ERROR; + } + + rs_emit_copy_cmd(job, job->basis_pos, job->basis_len); + rs_scoop_advance(job, job->basis_len); + + job->statefn = rs_delta_s_scan; + + return RS_RUNNING; +} + + +/** + * \brief State function that does a slack delta containing only + * literal data to recreate the input. + */ +static rs_result rs_delta_s_slack(rs_job_t *job) +{ + rs_buffers_t * const stream = job->stream; + size_t avail = stream->avail_in; + + if (avail) { + rs_trace("emit slack delta for %.0f available bytes", (double) avail); + rs_emit_literal_cmd(job, avail); + rs_tube_copy(job, avail); + return RS_RUNNING; + } else { + if (rs_job_input_is_ending(job)) { + job->statefn = rs_delta_s_end; + return RS_RUNNING; + } else { + return RS_BLOCKED; + } + } +} + + +/** + * State function for writing out the header of the encoding job. + */ +static rs_result rs_delta_s_header(rs_job_t *job) +{ + rs_emit_delta_header(job); + + if (job->block_len) { + if (!job->signature) { + rs_error("no signature is loaded into the job"); + return RS_PARAM_ERROR; + } + job->statefn = rs_delta_s_scan; + } else { + rs_trace("block length is zero for this delta; " + "therefore using slack deltas"); + job->statefn = rs_delta_s_slack; + } + + return RS_RUNNING; +} + + +/** + * Prepare to compute a streaming delta. + */ +rs_job_t *rs_delta_begin(rs_signature_t *sig) +{ + rs_job_t *job; + + job = rs_job_new("delta", rs_delta_s_header); + job->signature = sig; + + if ((job->block_len = sig->block_len) < 0) { + rs_log(RS_LOG_ERR, "unreasonable block_len %d in signature", + job->block_len); + return NULL; + } + + job->strong_sum_len = sig->strong_sum_len; + if (job->strong_sum_len < 0 || job->strong_sum_len > RS_MD4_LENGTH) { + rs_log(RS_LOG_ERR, "unreasonable strong_sum_len %d in signature", + job->strong_sum_len); + return NULL; + } + + return job; +} + + |