-rw-r--r-- | rsync/delta.c | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/rsync/delta.c b/rsync/delta.c new file mode 100644 index 0000000..323c079 --- a/dev/null +++ b/rsync/delta.c | |||
@@ -0,0 +1,351 @@ | |||
1 | /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*- | ||
2 | * | ||
3 | * librsync -- library for network deltas | ||
4 | * $Id$ | ||
5 | * | ||
6 | * Copyright (C) 2000, 2001 by Martin Pool <mbp@samba.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU Lesser General Public License as published by | ||
10 | * the Free Software Foundation; either version 2.1 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | | Let's climb to the TOP of that | ||
25 | | MOUNTAIN and think about STRIP | ||
26 | | MINING!! | ||
27 | */ | ||
28 | |||
29 | |||
30 | /* | ||
31 | * delta.c -- Generate in streaming mode an rsync delta given a set of | ||
32 | * signatures, and a new file. | ||
33 | * | ||
34 | * The size of blocks for signature generation is determined by the | ||
35 | * block size in the incoming signature. | ||
36 | * | ||
37 | * To calculate a signature, we need to be able to see at least one | ||
38 | * block of the new file at a time. Once we have that, we calculate | ||
39 | * its weak signature, and see if there is any block in the signature | ||
40 | * hash table that has the same weak sum. If there is one, then we | ||
41 | * also compute the strong sum of the new block, and cross check that. | ||
42 | * If they're the same, then we can assume we have a match. | ||
43 | * | ||
44 | * The final block of the file has to be handled a little differently, | ||
45 | * because it may be a short match. Short blocks in the signature | ||
46 | * don't include their length -- we just allow for the final short | ||
47 | * block of the file to match any block in the signature, and if they | ||
48 | * have the same checksum we assume they must have the same length. | ||
49 | * Therefore, when we emit a COPY command, we have to send it with a | ||
50 | * length that is the same as the block matched, and not the block | ||
51 | * length from the signature. | ||
52 | */ | ||
53 | |||
54 | /* | ||
55 | * Profiling results as of v1.26, 2001-03-18: | ||
56 | * | ||
57 | * If everything matches, then we spend almost all our time in | ||
58 | * rs_mdfour64 and rs_weak_sum, which is unavoidable and therefore a | ||
59 | * good profile. | ||
60 | * | ||
61 | * If nothing matches, it is not so good. | ||
62 | */ | ||
63 | |||
64 | |||
65 | #include <config_rsync.h> | ||
66 | |||
67 | #include <assert.h> | ||
68 | #include <stdlib.h> | ||
69 | #include <stdio.h> | ||
70 | |||
71 | #include "rsync.h" | ||
72 | #include "emit.h" | ||
73 | #include "stream.h" | ||
74 | #include "util.h" | ||
75 | #include "sumset.h" | ||
76 | #include "job.h" | ||
77 | #include "trace.h" | ||
78 | #include "checksum.h" | ||
79 | #include "search.h" | ||
80 | #include "types.h" | ||
81 | |||
82 | |||
83 | /** | ||
84 | * Turn this on to make all rolling checksums be checked from scratch. | ||
85 | */ | ||
86 | int rs_roll_paranoia = 0; | ||
87 | |||
88 | |||
89 | static rs_result rs_delta_scan(rs_job_t *, rs_long_t avail_len, void *); | ||
90 | |||
91 | static rs_result rs_delta_s_deferred_copy(rs_job_t *job); | ||
92 | |||
93 | |||
94 | |||
95 | static rs_result rs_delta_s_end(rs_job_t *job) | ||
96 | { | ||
97 | rs_emit_end_cmd(job); | ||
98 | return RS_DONE; | ||
99 | } | ||
100 | |||
101 | |||
102 | /** | ||
103 | * \brief Get a block of data if possible, and see if it matches. | ||
104 | * | ||
105 | * On each call, we try to process all of the input data available on | ||
106 | * the scoop and input buffer. | ||
107 | */ | ||
108 | static rs_result | ||
109 | rs_delta_s_scan(rs_job_t *job) | ||
110 | { | ||
111 | size_t this_len, avail_len; | ||
112 | int is_ending; | ||
113 | void *inptr; | ||
114 | rs_result result; | ||
115 | |||
116 | rs_job_check(job); | ||
117 | |||
118 | avail_len = rs_scoop_total_avail(job); | ||
119 | this_len = job->block_len; | ||
120 | is_ending = job->stream->eof_in; | ||
121 | |||
122 | /* Now, we have avail_len bytes, and we need to scan through them | ||
123 | * looking for a match. We'll always end up emitting exactly one | ||
124 | * command, either a literal or a copy, and after discovering that | ||
125 | * we will skip over the appropriate number of bytes. */ | ||
126 | if (avail_len == 0) { | ||
127 | if (is_ending) { | ||
128 | /* no more delta to do */ | ||
129 | job->statefn = rs_delta_s_end; | ||
130 | } | ||
131 | return RS_BLOCKED; | ||
132 | } | ||
133 | |||
134 | /* must read at least one block, or give up */ | ||
135 | if ((avail_len < job->block_len) && !is_ending) { | ||
136 | /* we know we won't get it, but we have to try for a whole | ||
137 | * block anyhow so that it gets into the scoop. */ | ||
138 | rs_scoop_input(job, job->block_len); | ||
139 | return RS_BLOCKED; | ||
140 | } | ||
141 | |||
142 | result = rs_scoop_readahead(job, avail_len, &inptr); | ||
143 | if (result != RS_DONE) | ||
144 | return result; | ||
145 | |||
146 | return rs_delta_scan(job, avail_len, inptr); | ||
147 | } | ||
148 | |||
149 | |||
150 | |||
151 | /** | ||
152 | * Scan for a matching block in the next \p avail_len bytes of input. | ||
153 | * | ||
154 | * If nonmatching data is found, then a LITERAL command will be put in | ||
155 | * the tube immediately. If matching data is found, then its position | ||
156 | * will be saved in the job, and the job state set up to write out a | ||
157 | * COPY command after handling the literal. | ||
158 | */ | ||
159 | static rs_result | ||
160 | rs_delta_scan(rs_job_t *job, rs_long_t avail_len, void *p) | ||
161 | { | ||
162 | rs_long_t match_where; | ||
163 | int search_pos, end_pos; | ||
164 | unsigned char *inptr = (unsigned char *) p; | ||
165 | uint32_t s1 = job->weak_sig & 0xFFFF; | ||
166 | uint32_t s2 = job->weak_sig >> 16; | ||
167 | |||
168 | /* So, we have avail_len bytes of data, and we want to look | ||
169 | * through it for a match at some point. It's OK if it's not at | ||
170 | * the start of the available input data. If we're approaching | ||
171 | * the end and can't get a match, then we just block and get more | ||
172 | * later. */ | ||
173 | |||
174 | /* FIXME: Perhaps we should be working in signed chars for the | ||
175 | * rolling sum? */ | ||
176 | |||
177 | if (job->stream->eof_in) | ||
178 | end_pos = avail_len - 1; | ||
179 | else | ||
180 | end_pos = avail_len - job->block_len; | ||
181 | |||
182 | for (search_pos = 0; search_pos <= end_pos; search_pos++) { | ||
183 | size_t this_len = job->block_len; | ||
184 | |||
185 | if (search_pos + this_len > avail_len) { | ||
186 | this_len = avail_len - search_pos; | ||
187 | rs_trace("block reduced to %d", this_len); | ||
188 | } else if (job->have_weak_sig) { | ||
189 | unsigned char a = inptr[search_pos + this_len - 1]; | ||
190 | /* roll in the newly added byte, if any */ | ||
191 | s1 += a + RS_CHAR_OFFSET; | ||
192 | s2 += s1; | ||
193 | |||
194 | job->weak_sig = (s1 & 0xffff) | (s2 << 16); | ||
195 | } | ||
196 | |||
197 | if (!job->have_weak_sig) { | ||
198 | rs_trace("calculate weak sum from scratch"); | ||
199 | job->weak_sig = rs_calc_weak_sum(inptr + search_pos, this_len); | ||
200 | s1 = job->weak_sig & 0xFFFF; | ||
201 | s2 = job->weak_sig >> 16; | ||
202 | job->have_weak_sig = 1; | ||
203 | } | ||
204 | |||
205 | if (rs_roll_paranoia) { | ||
206 | rs_weak_sum_t verify = rs_calc_weak_sum(inptr + search_pos, this_len); | ||
207 | if (verify != job->weak_sig) { | ||
208 | rs_fatal("mismatch between rolled sum %#x and check %#x", | ||
209 | job->weak_sig, verify); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | if (rs_search_for_block(job->weak_sig, inptr + search_pos, this_len, | ||
214 | job->signature, &job->stats, &match_where)) { | ||
215 | /* So, we got a match. Cool. However, there may be | ||
216 | * leading unmatched data that we need to flush. Thus we | ||
217 | * set our statefn to be rs_delta_s_deferred_copy so that | ||
218 | * we can write out the command later. */ | ||
219 | |||
220 | rs_trace("matched %.0f bytes at %.0f!", | ||
221 | (double) this_len, (double) match_where); | ||
222 | job->basis_pos = match_where; | ||
223 | job->basis_len = this_len; | ||
224 | job->statefn = rs_delta_s_deferred_copy; | ||
225 | job->have_weak_sig = 0; | ||
226 | break; | ||
227 | } else { | ||
228 | /* advance by one; roll out the byte we just moved over. */ | ||
229 | unsigned char a = inptr[search_pos]; | ||
230 | unsigned char shift = a + RS_CHAR_OFFSET; | ||
231 | |||
232 | s1 -= shift; | ||
233 | s2 -= this_len * shift; | ||
234 | job->weak_sig = (s1 & 0xffff) | (s2 << 16); | ||
235 | } | ||
236 | } | ||
237 | |||
238 | if (search_pos > 0) { | ||
239 | /* We may or may not have found a block, but we know we found | ||
240 | * some literal data at the start of the buffer. Therefore, | ||
241 | * we have to flush that out before we can continue on and | ||
242 | * emit the copy command or keep searching. */ | ||
243 | |||
244 | /* FIXME: At the moment, if you call with very short buffers, | ||
245 | * then you will get a series of very short LITERAL commands. | ||
246 | * Perhaps this is what you deserve, or perhaps we should try | ||
247 | * to get more readahead and avoid that. */ | ||
248 | |||
249 | /* There's some literal data at the start of this window which | ||
250 | * we know is not in any block. */ | ||
251 | rs_trace("got %d bytes of literal data", search_pos); | ||
252 | rs_emit_literal_cmd(job, search_pos); | ||
253 | rs_tube_copy(job, search_pos); | ||
254 | } | ||
255 | |||
256 | return RS_RUNNING; | ||
257 | } | ||
258 | |||
259 | |||
260 | |||
261 | static rs_result rs_delta_s_deferred_copy(rs_job_t *job) | ||
262 | { | ||
263 | if (!job->basis_len) { | ||
264 | rs_log(RS_LOG_ERR, "somehow got zero basis_len"); | ||
265 | return RS_INTERNAL_ERROR; | ||
266 | } | ||
267 | |||
268 | rs_emit_copy_cmd(job, job->basis_pos, job->basis_len); | ||
269 | rs_scoop_advance(job, job->basis_len); | ||
270 | |||
271 | job->statefn = rs_delta_s_scan; | ||
272 | |||
273 | return RS_RUNNING; | ||
274 | } | ||
275 | |||
276 | |||
277 | /** | ||
278 | * \brief State function that does a slack delta containing only | ||
279 | * literal data to recreate the input. | ||
280 | */ | ||
281 | static rs_result rs_delta_s_slack(rs_job_t *job) | ||
282 | { | ||
283 | rs_buffers_t * const stream = job->stream; | ||
284 | size_t avail = stream->avail_in; | ||
285 | |||
286 | if (avail) { | ||
287 | rs_trace("emit slack delta for %.0f available bytes", (double) avail); | ||
288 | rs_emit_literal_cmd(job, avail); | ||
289 | rs_tube_copy(job, avail); | ||
290 | return RS_RUNNING; | ||
291 | } else { | ||
292 | if (rs_job_input_is_ending(job)) { | ||
293 | job->statefn = rs_delta_s_end; | ||
294 | return RS_RUNNING; | ||
295 | } else { | ||
296 | return RS_BLOCKED; | ||
297 | } | ||
298 | } | ||
299 | } | ||
300 | |||
301 | |||
302 | /** | ||
303 | * State function for writing out the header of the encoding job. | ||
304 | */ | ||
305 | static rs_result rs_delta_s_header(rs_job_t *job) | ||
306 | { | ||
307 | rs_emit_delta_header(job); | ||
308 | |||
309 | if (job->block_len) { | ||
310 | if (!job->signature) { | ||
311 | rs_error("no signature is loaded into the job"); | ||
312 | return RS_PARAM_ERROR; | ||
313 | } | ||
314 | job->statefn = rs_delta_s_scan; | ||
315 | } else { | ||
316 | rs_trace("block length is zero for this delta; " | ||
317 | "therefore using slack deltas"); | ||
318 | job->statefn = rs_delta_s_slack; | ||
319 | } | ||
320 | |||
321 | return RS_RUNNING; | ||
322 | } | ||
323 | |||
324 | |||
325 | /** | ||
326 | * Prepare to compute a streaming delta. | ||
327 | */ | ||
328 | rs_job_t *rs_delta_begin(rs_signature_t *sig) | ||
329 | { | ||
330 | rs_job_t *job; | ||
331 | |||
332 | job = rs_job_new("delta", rs_delta_s_header); | ||
333 | job->signature = sig; | ||
334 | |||
335 | if ((job->block_len = sig->block_len) < 0) { | ||
336 | rs_log(RS_LOG_ERR, "unreasonable block_len %d in signature", | ||
337 | job->block_len); | ||
338 | return NULL; | ||
339 | } | ||
340 | |||
341 | job->strong_sum_len = sig->strong_sum_len; | ||
342 | if (job->strong_sum_len < 0 || job->strong_sum_len > RS_MD4_LENGTH) { | ||
343 | rs_log(RS_LOG_ERR, "unreasonable strong_sum_len %d in signature", | ||
344 | job->strong_sum_len); | ||
345 | return NULL; | ||
346 | } | ||
347 | |||
348 | return job; | ||
349 | } | ||
350 | |||
351 | |||