summaryrefslogtreecommitdiff
path: root/rsync/delta.c
Unidiff
Diffstat (limited to 'rsync/delta.c') (more/less context) (ignore whitespace changes)
-rw-r--r--rsync/delta.c351
1 files changed, 351 insertions, 0 deletions
diff --git a/rsync/delta.c b/rsync/delta.c
new file mode 100644
index 0000000..323c079
--- a/dev/null
+++ b/rsync/delta.c
@@ -0,0 +1,351 @@
1/*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2 *
3 * librsync -- library for network deltas
4 * $Id$
5 *
6 * Copyright (C) 2000, 2001 by Martin Pool <mbp@samba.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published by
10 * the Free Software Foundation; either version 2.1 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23 /*
24 | Let's climb to the TOP of that
25 | MOUNTAIN and think about STRIP
26 | MINING!!
27 */
28
29
30/*
31 * delta.c -- Generate in streaming mode an rsync delta given a set of
32 * signatures, and a new file.
33 *
34 * The size of blocks for signature generation is determined by the
35 * block size in the incoming signature.
36 *
37 * To calculate a signature, we need to be able to see at least one
38 * block of the new file at a time. Once we have that, we calculate
39 * its weak signature, and see if there is any block in the signature
40 * hash table that has the same weak sum. If there is one, then we
41 * also compute the strong sum of the new block, and cross check that.
42 * If they're the same, then we can assume we have a match.
43 *
44 * The final block of the file has to be handled a little differently,
45 * because it may be a short match. Short blocks in the signature
46 * don't include their length -- we just allow for the final short
47 * block of the file to match any block in the signature, and if they
48 * have the same checksum we assume they must have the same length.
49 * Therefore, when we emit a COPY command, we have to send it with a
50 * length that is the same as the block matched, and not the block
51 * length from the signature.
52 */
53
54/*
55 * Profiling results as of v1.26, 2001-03-18:
56 *
57 * If everything matches, then we spend almost all our time in
58 * rs_mdfour64 and rs_weak_sum, which is unavoidable and therefore a
59 * good profile.
60 *
61 * If nothing matches, it is not so good.
62 */
63
64
65#include <config_rsync.h>
66
67#include <assert.h>
68#include <stdlib.h>
69#include <stdio.h>
70
71#include "rsync.h"
72#include "emit.h"
73#include "stream.h"
74#include "util.h"
75#include "sumset.h"
76#include "job.h"
77#include "trace.h"
78#include "checksum.h"
79#include "search.h"
80#include "types.h"
81
82
83/**
84 * Turn this on to make all rolling checksums be checked from scratch.
85 */
86int rs_roll_paranoia = 0;
87
88
89static rs_result rs_delta_scan(rs_job_t *, rs_long_t avail_len, void *);
90
91static rs_result rs_delta_s_deferred_copy(rs_job_t *job);
92
93
94
95static rs_result rs_delta_s_end(rs_job_t *job)
96{
97 rs_emit_end_cmd(job);
98 return RS_DONE;
99}
100
101
102/**
103 * \brief Get a block of data if possible, and see if it matches.
104 *
105 * On each call, we try to process all of the input data available on
106 * the scoop and input buffer.
107 */
108static rs_result
109rs_delta_s_scan(rs_job_t *job)
110{
111 size_t this_len, avail_len;
112 int is_ending;
113 void *inptr;
114 rs_result result;
115
116 rs_job_check(job);
117
118 avail_len = rs_scoop_total_avail(job);
119 this_len = job->block_len;
120 is_ending = job->stream->eof_in;
121
122 /* Now, we have avail_len bytes, and we need to scan through them
123 * looking for a match. We'll always end up emitting exactly one
124 * command, either a literal or a copy, and after discovering that
125 * we will skip over the appropriate number of bytes. */
126 if (avail_len == 0) {
127 if (is_ending) {
128 /* no more delta to do */
129 job->statefn = rs_delta_s_end;
130 }
131 return RS_BLOCKED;
132 }
133
134 /* must read at least one block, or give up */
135 if ((avail_len < job->block_len) && !is_ending) {
136 /* we know we won't get it, but we have to try for a whole
137 * block anyhow so that it gets into the scoop. */
138 rs_scoop_input(job, job->block_len);
139 return RS_BLOCKED;
140 }
141
142 result = rs_scoop_readahead(job, avail_len, &inptr);
143 if (result != RS_DONE)
144 return result;
145
146 return rs_delta_scan(job, avail_len, inptr);
147}
148
149
150
151/**
152 * Scan for a matching block in the next \p avail_len bytes of input.
153 *
154 * If nonmatching data is found, then a LITERAL command will be put in
155 * the tube immediately. If matching data is found, then its position
156 * will be saved in the job, and the job state set up to write out a
157 * COPY command after handling the literal.
158 */
159static rs_result
160rs_delta_scan(rs_job_t *job, rs_long_t avail_len, void *p)
161{
162 rs_long_t match_where;
163 int search_pos, end_pos;
164 unsigned char *inptr = (unsigned char *) p;
165 uint32_t s1 = job->weak_sig & 0xFFFF;
166 uint32_t s2 = job->weak_sig >> 16;
167
168 /* So, we have avail_len bytes of data, and we want to look
169 * through it for a match at some point. It's OK if it's not at
170 * the start of the available input data. If we're approaching
171 * the end and can't get a match, then we just block and get more
172 * later. */
173
174 /* FIXME: Perhaps we should be working in signed chars for the
175 * rolling sum? */
176
177 if (job->stream->eof_in)
178 end_pos = avail_len - 1;
179 else
180 end_pos = avail_len - job->block_len;
181
182 for (search_pos = 0; search_pos <= end_pos; search_pos++) {
183 size_t this_len = job->block_len;
184
185 if (search_pos + this_len > avail_len) {
186 this_len = avail_len - search_pos;
187 rs_trace("block reduced to %d", this_len);
188 } else if (job->have_weak_sig) {
189 unsigned char a = inptr[search_pos + this_len - 1];
190 /* roll in the newly added byte, if any */
191 s1 += a + RS_CHAR_OFFSET;
192 s2 += s1;
193
194 job->weak_sig = (s1 & 0xffff) | (s2 << 16);
195 }
196
197 if (!job->have_weak_sig) {
198 rs_trace("calculate weak sum from scratch");
199 job->weak_sig = rs_calc_weak_sum(inptr + search_pos, this_len);
200 s1 = job->weak_sig & 0xFFFF;
201 s2 = job->weak_sig >> 16;
202 job->have_weak_sig = 1;
203 }
204
205 if (rs_roll_paranoia) {
206 rs_weak_sum_t verify = rs_calc_weak_sum(inptr + search_pos, this_len);
207 if (verify != job->weak_sig) {
208 rs_fatal("mismatch between rolled sum %#x and check %#x",
209 job->weak_sig, verify);
210 }
211 }
212
213 if (rs_search_for_block(job->weak_sig, inptr + search_pos, this_len,
214 job->signature, &job->stats, &match_where)) {
215 /* So, we got a match. Cool. However, there may be
216 * leading unmatched data that we need to flush. Thus we
217 * set our statefn to be rs_delta_s_deferred_copy so that
218 * we can write out the command later. */
219
220 rs_trace("matched %.0f bytes at %.0f!",
221 (double) this_len, (double) match_where);
222 job->basis_pos = match_where;
223 job->basis_len = this_len;
224 job->statefn = rs_delta_s_deferred_copy;
225 job->have_weak_sig = 0;
226 break;
227 } else {
228 /* advance by one; roll out the byte we just moved over. */
229 unsigned char a = inptr[search_pos];
230 unsigned char shift = a + RS_CHAR_OFFSET;
231
232 s1 -= shift;
233 s2 -= this_len * shift;
234 job->weak_sig = (s1 & 0xffff) | (s2 << 16);
235 }
236 }
237
238 if (search_pos > 0) {
239 /* We may or may not have found a block, but we know we found
240 * some literal data at the start of the buffer. Therefore,
241 * we have to flush that out before we can continue on and
242 * emit the copy command or keep searching. */
243
244 /* FIXME: At the moment, if you call with very short buffers,
245 * then you will get a series of very short LITERAL commands.
246 * Perhaps this is what you deserve, or perhaps we should try
247 * to get more readahead and avoid that. */
248
249 /* There's some literal data at the start of this window which
250 * we know is not in any block. */
251 rs_trace("got %d bytes of literal data", search_pos);
252 rs_emit_literal_cmd(job, search_pos);
253 rs_tube_copy(job, search_pos);
254 }
255
256 return RS_RUNNING;
257}
258
259
260
261static rs_result rs_delta_s_deferred_copy(rs_job_t *job)
262{
263 if (!job->basis_len) {
264 rs_log(RS_LOG_ERR, "somehow got zero basis_len");
265 return RS_INTERNAL_ERROR;
266 }
267
268 rs_emit_copy_cmd(job, job->basis_pos, job->basis_len);
269 rs_scoop_advance(job, job->basis_len);
270
271 job->statefn = rs_delta_s_scan;
272
273 return RS_RUNNING;
274}
275
276
277/**
278 * \brief State function that does a slack delta containing only
279 * literal data to recreate the input.
280 */
281static rs_result rs_delta_s_slack(rs_job_t *job)
282{
283 rs_buffers_t * const stream = job->stream;
284 size_t avail = stream->avail_in;
285
286 if (avail) {
287 rs_trace("emit slack delta for %.0f available bytes", (double) avail);
288 rs_emit_literal_cmd(job, avail);
289 rs_tube_copy(job, avail);
290 return RS_RUNNING;
291 } else {
292 if (rs_job_input_is_ending(job)) {
293 job->statefn = rs_delta_s_end;
294 return RS_RUNNING;
295 } else {
296 return RS_BLOCKED;
297 }
298 }
299}
300
301
302/**
303 * State function for writing out the header of the encoding job.
304 */
305static rs_result rs_delta_s_header(rs_job_t *job)
306{
307 rs_emit_delta_header(job);
308
309 if (job->block_len) {
310 if (!job->signature) {
311 rs_error("no signature is loaded into the job");
312 return RS_PARAM_ERROR;
313 }
314 job->statefn = rs_delta_s_scan;
315 } else {
316 rs_trace("block length is zero for this delta; "
317 "therefore using slack deltas");
318 job->statefn = rs_delta_s_slack;
319 }
320
321 return RS_RUNNING;
322}
323
324
325/**
326 * Prepare to compute a streaming delta.
327 */
328rs_job_t *rs_delta_begin(rs_signature_t *sig)
329{
330 rs_job_t *job;
331
332 job = rs_job_new("delta", rs_delta_s_header);
333 job->signature = sig;
334
335 if ((job->block_len = sig->block_len) < 0) {
336 rs_log(RS_LOG_ERR, "unreasonable block_len %d in signature",
337 job->block_len);
338 return NULL;
339 }
340
341 job->strong_sum_len = sig->strong_sum_len;
342 if (job->strong_sum_len < 0 || job->strong_sum_len > RS_MD4_LENGTH) {
343 rs_log(RS_LOG_ERR, "unreasonable strong_sum_len %d in signature",
344 job->strong_sum_len);
345 return NULL;
346 }
347
348 return job;
349}
350
351