1 #============================================================= -*-perl-*-
3 # BackupPC::PoolWrite package
7 # This library defines a BackupPC::PoolWrite class for writing
8 # files to disk that are candidates for pooling. One instance
9 # of this class is used to write each file. The following steps
12 # - As the incoming data arrives, the first 1MB is buffered
13 # in memory so the MD5 digest can be computed.
15 # - A running comparison against all the candidate pool files
16 # (ie: those with the same MD5 digest, usually at most a single
17 # file) is done as new incoming data arrives. Up to $MaxFiles
18 # simultaneous files can be compared in parallel. This
19 # involves reading and uncompressing one or more pool files.
21 # - When a pool file no longer matches it is discarded from
22 # the search. If there are more than $MaxFiles candidates, one of
23 # the new candidates is added to the search, first checking
24 # that it matches up to the current point (this requires
25 # re-reading one of the other pool files).
27 # - When or if no pool files match then the new file is written
28 # to disk. This could occur many MB into the file. We don't
29 # need to buffer all this data in memory since we can copy it
30 # from the last matching pool file, up to the point where it
33 # - When all the new data is complete, if a pool file exactly
34 # matches then the file is simply created as a hardlink to
38 # Craig Barratt <cbarratt@users.sourceforge.net>
41 # Copyright (C) 2001 Craig Barratt
43 # This program is free software; you can redistribute it and/or modify
44 # it under the terms of the GNU General Public License as published by
45 # the Free Software Foundation; either version 2 of the License, or
46 # (at your option) any later version.
48 # This program is distributed in the hope that it will be useful,
49 # but WITHOUT ANY WARRANTY; without even the implied warranty of
50 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
51 # GNU General Public License for more details.
53 # You should have received a copy of the GNU General Public License
54 # along with this program; if not, write to the Free Software
55 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
57 #========================================================================
59 # Version 2.0.0_CVS, released 18 Jan 2003.
61 # See http://backuppc.sourceforge.net.
63 #========================================================================
65 package BackupPC::PoolWrite;
71 use BackupPC::FileZIO;
75 my($class, $bpc, $fileName, $fileSize, $compress) = @_;
78 fileName => $fileName,
79 fileSize => $fileSize,
81 compress => $compress,
93 # Always unlink any current file in case it is already linked
95 unlink($fileName) if ( -f $fileName );
99 my $BufSize = 1048576; # 1MB or 2^20
104 my($a, $dataRef) = @_;
106 return if ( $a->{eof} );
107 $a->{data} .= $$dataRef if ( defined($dataRef) );
108 return if ( length($a->{data}) < $BufSize && defined($dataRef) );
109 if ( !defined($a->{digest}) && $a->{fileSize} > 0 ) {
111 # build a list of all the candidate matching files
113 my $md5 = Digest::MD5->new;
114 $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
115 if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
117 push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
118 . " for $a->{fileName}\n");
120 while ( @{$a->{files}} < $MaxFiles ) {
122 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
123 : "$a->{base}_$a->{fileCnt}";
124 last if ( !-f $fileName );
125 if ( !defined($fh = BackupPC::FileZIO->open($fileName, 0,
130 push(@{$a->{files}}, {
138 # if there are no candidate files then we must write
139 # the new file to disk
141 if ( !@{$a->{files}} ) {
142 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
144 if ( !defined($a->{fhOut}) ) {
145 push(@{$a->{errors}}, "Unable to open $a->{fileName}"
150 my $dataLen = length($a->{data});
151 if ( !defined($a->{fhOut}) && $a->{fileSize} > 0 ) {
153 # See if the new chunk of data continues to match the
156 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
158 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
159 : "$a->{base}_$a->{fileCnt}";
160 if ( $dataLen > 0 ) {
161 # verify next $dataLen bytes from candidate file
162 my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
163 next if ( $n == $dataLen && $d eq $a->{data} );
165 # verify candidate file is at EOF
166 my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
169 #print(" File $a->{files}[$i]->{name} doesn't match\n");
171 # this candidate file didn't match. Replace it
172 # with a new candidate file. We have to qualify
173 # any new candidate file by making sure that its
174 # first $a->{nWrite} bytes match, plus the next $dataLen
175 # bytes match $a->{data}.
177 while ( -f $fileName ) {
179 if ( !defined($fh = BackupPC::FileZIO->open($fileName, 0,
182 #print(" Discarding $fileName (open failed)\n");
183 $fileName = "$a->{base}_$a->{fileCnt}";
186 if ( !$a->{files}[$i]->{fh}->rewind() ) {
187 push(@{$a->{errors}},
188 "Unable to rewind $a->{files}[$i]->{name}"
191 $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
192 $a->{nWrite}, $dataLen, \$a->{data});
194 $a->{files}[$i]->{fh}->close();
195 $a->{files}[$i]->{fh} = $fh,
196 $a->{files}[$i]->{name} = $fileName;
197 #print(" Found new candidate $fileName\n");
201 #print(" Discarding $fileName (no match)\n");
205 $fileName = "$a->{base}_$a->{fileCnt}";
209 # We couldn't find another candidate file
211 if ( @{$a->{files}} == 1 ) {
212 #print(" Exhausted matches, now writing\n");
213 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
215 if ( !defined($a->{fhOut}) ) {
216 push(@{$a->{errors}},
217 "Unable to open $a->{fileName}"
220 if ( !$a->{files}[$i]->{fh}->rewind() ) {
221 push(@{$a->{errors}},
223 . " $a->{files}[$i]->{name} for copy\n");
225 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
229 $a->{files}[$i]->{fh}->close();
230 splice(@{$a->{files}}, $i, 1);
235 if ( defined($a->{fhOut}) && $dataLen > 0 ) {
237 # if we are in writing mode then just write the data
239 my $n = $a->{fhOut}->write(\$a->{data});
240 if ( $n != $dataLen ) {
241 push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
242 . " $a->{fileName} (got $n)\n");
245 $a->{nWrite} += $dataLen;
247 return if ( defined($dataRef) );
250 # We are at EOF, so finish up
253 foreach my $f ( @{$a->{files}} ) {
256 if ( $a->{fileSize} == 0 ) {
258 # Simply create an empty file
261 if ( !open(OUT, ">", $a->{fileName}) ) {
262 push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
267 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
268 } elsif ( defined($a->{fhOut}) ) {
269 $a->{fhOut}->close();
270 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
272 if ( @{$a->{files}} == 0 ) {
273 push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
274 . " ($a->{digest})\n");
275 } elsif ( @{$a->{files}} > 1 ) {
276 my $str = "Unexpected multiple matches on"
277 . " $a->{fileName} ($a->{digest})\n";
278 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
279 $str .= " -> $a->{files}[$i]->{name}\n";
281 push(@{$a->{errors}}, $str);
283 #print(" Linking $a->{fileName} to $a->{files}[0]->{name}\n");
284 if ( @{$a->{files}} && !link($a->{files}[0]->{name}, $a->{fileName}) ) {
285 push(@{$a->{errors}}, "Can't link $a->{fileName} to"
286 . " $a->{files}[0]->{name}\n");
288 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
293 # Finish writing: pass undef dataRef to write so it can do all
294 # the work. Returns a 4 element array:
296 # (existingFlag, digestString, outputFileLength, errorList)
302 return $a->write(undef);
306 # Copy $nBytes from files $fhIn to $fhOut.
310 my($a, $fhIn, $fhOut, $nBytes) = @_;
313 while ( $nRead < $nBytes ) {
314 my $thisRead = $nBytes - $nRead < $BufSize
315 ? $nBytes - $nRead : $BufSize;
317 my $n = $fhIn->read(\$data, $thisRead);
318 if ( $n != $thisRead ) {
319 push(@{$a->{errors}},
320 "Unable to read $thisRead bytes from "
321 . $fhIn->name . " (got $n)\n");
324 $n = $fhOut->write(\$data, $thisRead);
325 if ( $n != $thisRead ) {
326 push(@{$a->{errors}},
327 "Unable to write $thisRead bytes to "
328 . $fhOut->name . " (got $n)\n");
336 # Compare $nBytes from files $fh0 and $fh1, and also compare additional
337 # $extra bytes from $fh1 to $$extraData.
339 sub filePartialCompare
341 my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
345 while ( $nRead < $nBytes ) {
346 my $thisRead = $nBytes - $nRead < $BufSize
347 ? $nBytes - $nRead : $BufSize;
348 $n = $fh0->read(\$data0, $thisRead);
349 if ( $n != $thisRead ) {
350 push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
351 . $fh0->name . " (got $n)\n");
354 $n = $fh1->read(\$data1, $thisRead);
355 return 0 if ( $n < $thisRead || $data0 ne $data1 );
359 # verify additional bytes
360 $n = $fh1->read(\$data1, $extra);
361 return 0 if ( $n != $extra || $data1 ne $$extraData );
364 $n = $fh1->read(\$data1, 100);
365 return 0 if ( $n != 0 );