1 #============================================================= -*-perl-*-
3 # BackupPC::PoolWrite package
7 # This library defines a BackupPC::PoolWrite class for writing
8 # files to disk that are candidates for pooling. One instance
9 # of this class is used to write each file. The following steps
12 # - As the incoming data arrives, the first 1MB is buffered
13 # in memory so the MD5 digest can be computed.
15 # - A running comparison against all the candidate pool files
16 # (ie: those with the same MD5 digest, usually at most a single
17 # file) is done as new incoming data arrives. Up to $MaxFiles
18 # simultaneous files can be compared in parallel. This
19 # involves reading and uncompressing one or more pool files.
21 # - When a pool file no longer matches it is discarded from
22 # the search. If there are more than $MaxFiles candidates, one of
23 # the new candidates is added to the search, first checking
24 # that it matches up to the current point (this requires
25 # re-reading one of the other pool files).
27 # - When or if no pool files match then the new file is written
28 # to disk. This could occur many MB into the file. We don't
29 # need to buffer all this data in memory since we can copy it
30 # from the last matching pool file, up to the point where it
33 # - When all the new data is complete, if a pool file exactly
34 # matches then the file is simply created as a hardlink to
38 # Craig Barratt <cbarratt@users.sourceforge.net>
41 # Copyright (C) 2001-2009 Craig Barratt
43 # This program is free software; you can redistribute it and/or modify
44 # it under the terms of the GNU General Public License as published by
45 # the Free Software Foundation; either version 2 of the License, or
46 # (at your option) any later version.
48 # This program is distributed in the hope that it will be useful,
49 # but WITHOUT ANY WARRANTY; without even the implied warranty of
50 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
51 # GNU General Public License for more details.
53 # You should have received a copy of the GNU General Public License
54 # along with this program; if not, write to the Free Software
55 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
57 #========================================================================
59 # Version 3.2.0, released 31 Jul 2010.
61 # See http://backuppc.sourceforge.net.
63 #========================================================================
65 package BackupPC::PoolWrite;
71 use BackupPC::FileZIO;
75 my($class, $bpc, $fileName, $fileSize, $compress) = @_;
78 fileName => $fileName,
79 fileSize => $fileSize,
81 compress => $compress,
92 $self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax");
95 # Always unlink any current file in case it is already linked
97 unlink($fileName) if ( -f $fileName );
98 if ( $fileName =~ m{(.*)/.+} && !-d $1 ) {
100 eval { mkpath($newDir, 0, 0777) };
102 push(@{$self->{errors}}, "Unable to create directory $newDir for $self->{fileName}");
108 my $BufSize = 1048576; # 1MB or 2^20
109 my $MaxFiles = 20; # max number of compare files open at one time
113 my($a, $dataRef) = @_;
115 return if ( $a->{eof} );
116 $a->{data} .= $$dataRef if ( defined($dataRef) );
117 return if ( length($a->{data}) < $BufSize && defined($dataRef) );
120 # Correct the fileSize if it is wrong (rsync might transfer
121 # a file whose length is different to the length sent with the
122 # file list if the file changes between the file list sending
123 # and the file sending). Here we only catch the case where
124 # we haven't computed the digest (ie: we have written no more
125 # than $BufSize). We catch the big file case below.
127 if ( !defined($dataRef) && !defined($a->{digest})
128 && $a->{fileSize} != length($a->{data}) ) {
129 #my $newSize = length($a->{data});
130 #print("Fixing file size from $a->{fileSize} to $newSize\n");
131 $a->{fileSize} = length($a->{data});
134 if ( !defined($a->{digest}) && length($a->{data}) > 0 ) {
136 # build a list of all the candidate matching files
138 my $md5 = Digest::MD5->new;
139 $a->{fileSize} = length($a->{data})
140 if ( $a->{fileSize} < length($a->{data}) );
141 $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
142 if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
144 push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
145 . " for $a->{fileName}");
147 while ( @{$a->{files}} < $MaxFiles ) {
149 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
150 : "$a->{base}_$a->{fileCnt}";
151 last if ( !-f $fileName );
153 # Don't attempt to match pool files that already
154 # have too many hardlinks. Also, don't match pool
155 # files with only one link since starting in
156 # BackupPC v3.0, BackupPC_nightly could be running
157 # in parallel (and removing those files). This doesn't
158 # eliminate all possible race conditions, but just
159 # reduces the odds. Other design steps eliminate
160 # the remaining race conditions of linking vs
163 if ( (stat(_))[3] >= $a->{hardLinkMax}
165 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
170 push(@{$a->{files}}, {
178 # if there are no candidate files then we must write
179 # the new file to disk
181 if ( !@{$a->{files}} ) {
182 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
184 if ( !defined($a->{fhOut}) ) {
185 push(@{$a->{errors}}, "Unable to open $a->{fileName}"
190 my $dataLen = length($a->{data});
191 if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) {
193 # See if the new chunk of data continues to match the
196 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
198 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
199 : "$a->{base}_$a->{fileCnt}";
200 if ( $dataLen > 0 ) {
201 # verify next $dataLen bytes from candidate file
202 my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
203 next if ( $n == $dataLen && $d eq $a->{data} );
205 # verify candidate file is at EOF
206 my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
209 #print(" File $a->{files}[$i]->{name} doesn't match\n");
211 # this candidate file didn't match. Replace it
212 # with a new candidate file. We have to qualify
213 # any new candidate file by making sure that its
214 # first $a->{nWrite} bytes match, plus the next $dataLen
215 # bytes match $a->{data}.
217 while ( -f $fileName ) {
219 if ( (stat(_))[3] >= $a->{hardLinkMax}
220 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
223 #print(" Discarding $fileName (open failed)\n");
224 $fileName = "$a->{base}_$a->{fileCnt}";
227 if ( !$a->{files}[$i]->{fh}->rewind() ) {
228 push(@{$a->{errors}},
229 "Unable to rewind $a->{files}[$i]->{name}"
232 $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
233 $a->{nWrite}, $dataLen, \$a->{data});
235 $a->{files}[$i]->{fh}->close();
236 $a->{files}[$i]->{fh} = $fh,
237 $a->{files}[$i]->{name} = $fileName;
238 #print(" Found new candidate $fileName\n");
242 #print(" Discarding $fileName (no match)\n");
246 $fileName = "$a->{base}_$a->{fileCnt}";
250 # We couldn't find another candidate file
252 if ( @{$a->{files}} == 1 ) {
253 #print(" Exhausted matches, now writing\n");
254 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
256 if ( !defined($a->{fhOut}) ) {
257 push(@{$a->{errors}},
258 "Unable to open $a->{fileName}"
261 if ( !$a->{files}[$i]->{fh}->rewind() ) {
262 push(@{$a->{errors}},
264 . " $a->{files}[$i]->{name} for copy");
266 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
270 $a->{files}[$i]->{fh}->close();
271 splice(@{$a->{files}}, $i, 1);
276 if ( defined($a->{fhOut}) && $dataLen > 0 ) {
278 # if we are in writing mode then just write the data
280 my $n = $a->{fhOut}->write(\$a->{data});
281 if ( $n != $dataLen ) {
282 push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
283 . " $a->{fileName} (got $n)");
286 $a->{nWrite} += $dataLen;
288 return if ( defined($dataRef) );
291 # We are at EOF, so finish up
296 # Make sure the fileSize was correct. See above for comments about
299 if ( $a->{nWrite} != $a->{fileSize} ) {
301 # Oops, fileSize was wrong, so our MD5 digest was wrong and our
302 # effort to match files likely failed. This is ugly, but our
303 # only choice at this point is to re-write the entire file with
304 # the correct length. We need to rename the file, open it for
305 # reading, and then re-write the file with the correct length.
308 #print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n");
311 $a->{fileSize} = $a->{nWrite};
313 if ( defined($a->{fhOut}) ) {
314 if ( $a->{fileName} =~ /(.*)\// ) {
320 # Find a unique target temporary file name
323 while ( -f "$fileName/t$$.$i" ) {
326 $fileName = "$fileName/t$$.$i";
327 $a->{fhOut}->close();
328 if ( !rename($a->{fileName}, $fileName)
329 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
331 push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName"
332 . " or open during size fixup");
334 #print("Using temporary name $fileName\n");
335 } elsif ( defined($a->{files}) && defined($a->{files}[0]) ) {
337 # We haven't written anything yet, so just use the
338 # compare file to copy from.
340 $fh = $a->{files}[0]->{fh};
342 #print("Using compare file $a->{files}[0]->{name}\n");
344 if ( defined($fh) ) {
345 my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName},
346 $a->{fileSize}, $a->{compress});
349 while ( $nRead < $a->{fileSize} ) {
350 my $thisRead = $a->{fileSize} - $nRead < $BufSize
351 ? $a->{fileSize} - $nRead : $BufSize;
353 my $n = $fh->read(\$data, $thisRead);
354 if ( $n != $thisRead ) {
355 push(@{$a->{errors}},
356 "Unable to read $thisRead bytes during resize"
357 . " from temp $fileName (got $n)");
360 $poolWrite->write(\$data);
364 unlink($fileName) if ( defined($fileName) );
365 if ( @{$a->{errors}} ) {
367 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
369 return $poolWrite->close;
374 if ( $a->{fileSize} == 0 ) {
376 # Simply create an empty file
379 if ( !open(OUT, ">", $a->{fileName}) ) {
380 push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
386 # Close the compare files
388 foreach my $f ( @{$a->{files}} ) {
391 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
392 } elsif ( defined($a->{fhOut}) ) {
393 $a->{fhOut}->close();
395 # Close the compare files
397 foreach my $f ( @{$a->{files}} ) {
400 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
402 if ( @{$a->{files}} == 0 ) {
403 push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
404 . " ($a->{digest})");
405 } elsif ( @{$a->{files}} > 1 ) {
407 # This is no longer a real error because $Conf{HardLinkMax}
408 # could be hit, thereby creating identical pool files
410 #my $str = "Unexpected multiple matches on"
411 # . " $a->{fileName} ($a->{digest})\n";
412 #for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
413 # $str .= " -> $a->{files}[$i]->{name}\n";
415 #push(@{$a->{errors}}, $str);
417 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
418 if ( link($a->{files}[$i]->{name}, $a->{fileName}) ) {
419 #print(" Linked $a->{fileName} to $a->{files}[$i]->{name}\n");
421 # Close the compare files
423 foreach my $f ( @{$a->{files}} ) {
426 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
430 # We were unable to link to the pool. Either we're at the
431 # hardlink max, or the pool file got deleted. Recover by
432 # writing the matching file, since we still have an open
435 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
436 if ( !$a->{files}[$i]->{fh}->rewind() ) {
437 push(@{$a->{errors}},
438 "Unable to rewind $a->{files}[$i]->{name}"
439 . " for copy after link fail");
442 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
444 if ( !defined($a->{fhOut}) ) {
445 push(@{$a->{errors}},
446 "Unable to open $a->{fileName}"
447 . " for writing after link fail");
449 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
456 # Close the compare files
458 foreach my $f ( @{$a->{files}} ) {
461 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
466 # Finish writing: pass undef dataRef to write so it can do all
467 # the work. Returns a 4 element array:
469 # (existingFlag, digestString, outputFileLength, errorList)
475 return $a->write(undef);
485 if ( defined($a->{fhOut}) ) {
486 $a->{fhOut}->close();
487 unlink($a->{fileName});
489 foreach my $f ( @{$a->{files}} ) {
496 # Copy $nBytes from files $fhIn to $fhOut.
500 my($a, $fhIn, $fhOut, $nBytes) = @_;
503 while ( $nRead < $nBytes ) {
504 my $thisRead = $nBytes - $nRead < $BufSize
505 ? $nBytes - $nRead : $BufSize;
507 my $n = $fhIn->read(\$data, $thisRead);
508 if ( $n != $thisRead ) {
509 push(@{$a->{errors}},
510 "Unable to read $thisRead bytes from "
511 . $fhIn->name . " (got $n)");
514 $n = $fhOut->write(\$data, $thisRead);
515 if ( $n != $thisRead ) {
516 push(@{$a->{errors}},
517 "Unable to write $thisRead bytes to "
518 . $fhOut->name . " (got $n)");
526 # Compare $nBytes from files $fh0 and $fh1, and also compare additional
527 # $extra bytes from $fh1 to $$extraData.
529 sub filePartialCompare
531 my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
535 while ( $nRead < $nBytes ) {
536 my $thisRead = $nBytes - $nRead < $BufSize
537 ? $nBytes - $nRead : $BufSize;
538 $n = $fh0->read(\$data0, $thisRead);
539 if ( $n != $thisRead ) {
540 push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
541 . $fh0->name . " (got $n)");
544 $n = $fh1->read(\$data1, $thisRead);
545 return 0 if ( $n < $thisRead || $data0 ne $data1 );
549 # verify additional bytes
550 $n = $fh1->read(\$data1, $extra);
551 return 0 if ( $n != $extra || $data1 ne $$extraData );
554 $n = $fh1->read(\$data1, 100);
555 return 0 if ( $n != 0 );
561 # LinkOrCopy() does a hardlink from oldFile to newFile.
563 # If that fails (because there are too many links on oldFile)
564 # then oldFile is copied to newFile, and the pool stats are
565 # returned to be added to the new file list. That allows
566 # BackupPC_link to try again, and to create a new pool file
571 my($bpc, $oldFile, $oldFileComp, $newFile, $newFileComp) = @_;
574 unlink($newFile) if ( -f $newFile );
576 # Try to link if hardlink limit is ok, and compression types
579 return (1, undef) if ( (stat($oldFile))[3] < $bpc->{Conf}{HardLinkMax}
580 && !$oldFileComp == !$newFileComp
581 && link($oldFile, $newFile) );
583 # There are too many links on oldFile, or compression
584 # type if different, so now we have to copy it.
586 # We need to compute the file size, which is expensive
587 # since we need to read the file twice. That's probably
588 # ok since the hardlink limit is rarely hit.
590 my $readFd = BackupPC::FileZIO->open($oldFile, 0, $oldFileComp);
591 if ( !defined($readFd) ) {
592 return (0, undef, undef, undef, ["LinkOrCopy: can't open $oldFile"]);
594 while ( $readFd->read(\$data, $BufSize) > 0 ) {
595 $nRead += length($data);
599 my $poolWrite = BackupPC::PoolWrite->new($bpc, $newFile,
600 $nRead, $newFileComp);
601 while ( $readFd->read(\$data, $BufSize) > 0 ) {
602 $poolWrite->write(\$data);
604 my($exists, $digest, $outSize, $errs) = $poolWrite->close;
606 return ($exists, $digest, $nRead, $outSize, $errs);