1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39 """
40 Provides filesystem-related objects.
41 @sort: FilesystemList, BackupFileList, PurgeItemList
42 @author: Kenneth J. Pronovici <pronovic@ieee.org>
43 """
44
45
46
47
48
49
50
51 import sys
52 import os
53 import re
54 import sha
55 import math
56 import logging
57 import tarfile
58
59
60 from CedarBackup2.knapsack import firstFit, bestFit, worstFit, alternateFit
61 from CedarBackup2.util import AbsolutePathList, ObjectTypeList, UnorderedList, RegexList
62 from CedarBackup2.util import removeKeys, displayBytes, calculateFileAge, encodePath
63
64
65
66
67
68
69 logger = logging.getLogger("CedarBackup2.log.filesystem")
70
71
72
73
74
75
77
78
79
80
81
82 """
83 Represents a list of filesystem items.
84
85 This is a generic class that represents a list of filesystem items. Callers
86 can add individual files or directories to the list, or can recursively add
87 the contents of a directory. The class also allows for up-front exclusions
88 in several forms (all files, all directories, all items matching a pattern,
89 all items whose basename matches a pattern, or all directories containing a
90 specific "ignore file"). Symbolic links are typically backed up
91 non-recursively, i.e. the link to a directory is backed up, but not the
92 contents of that link (we don't want to deal with recursive loops, etc.).
93
94 The custom methods such as L{addFile} will only add items if they exist on
95 the filesystem and do not match any exclusions that are already in place.
96 However, since a FilesystemList is a subclass of Python's standard list
97 class, callers can also add items to the list in the usual way, using
98 methods like C{append()} or C{insert()}. No validations apply to items
99 added to the list in this way; however, many list-manipulation methods deal
100 "gracefully" with items that don't exist in the filesystem, often by
101 ignoring them.
102
103 Once a list has been created, callers can remove individual items from the
104 list using standard methods like C{pop()} or C{remove()} or they can use
105 custom methods to remove specific types of entries or entries which match a
106 particular pattern.
107
108 @note: Regular expression patterns that apply to paths are assumed to be
109 bounded at front and back by the beginning and end of the string, i.e. they
110 are treated as if they begin with C{^} and end with C{$}. This is true
111 whether we are matching a complete path or a basename.
112
113 @note: Some platforms, like Windows, do not support soft links. On those
114 platforms, the ignore-soft-links flag can be set, but it won't do any good
115 because the operating system never reports a file as a soft link.
116
117 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
118 removeLinks, removeMatch, removeInvalid, normalize, validate,
119 excludeFiles, excludeDirs, excludeLinks, excludePaths,
120 excludePatterns, excludeBasenamePatterns, ignoreFile
121 """
122
123
124
125
126
127
145
146
147
148
149
150
152 """
153 Property target used to set the exclude files flag.
154 No validations, but we normalize the value to C{True} or C{False}.
155 """
156 if value:
157 self._excludeFiles = True
158 else:
159 self._excludeFiles = False
160
162 """
163 Property target used to get the exclude files flag.
164 """
165 return self._excludeFiles
166
168 """
169 Property target used to set the exclude directories flag.
170 No validations, but we normalize the value to C{True} or C{False}.
171 """
172 if value:
173 self._excludeDirs = True
174 else:
175 self._excludeDirs = False
176
178 """
179 Property target used to get the exclude directories flag.
180 """
181 return self._excludeDirs
182
184 """
185 Property target used to set the exclude soft links flag.
186 No validations, but we normalize the value to C{True} or C{False}.
187 """
188 if value:
189 self._excludeLinks = True
190 else:
191 self._excludeLinks = False
192
194 """
195 Property target used to get the exclude soft links flag.
196 """
197 return self._excludeLinks
198
200 """
201 Property target used to set the exclude paths list.
202 A C{None} value is converted to an empty list.
203 Elements do not have to exist on disk at the time of assignment.
204 @raise ValueError: If any list element is not an absolute path.
205 """
206 self._absoluteExcludePaths = AbsolutePathList()
207 if value is not None:
208 self._absoluteExcludePaths.extend(value)
209
211 """
212 Property target used to get the absolute exclude paths list.
213 """
214 return self._absoluteExcludePaths
215
217 """
218 Property target used to set the exclude patterns list.
219 A C{None} value is converted to an empty list.
220 """
221 self._excludePatterns = RegexList()
222 if value is not None:
223 self._excludePatterns.extend(value)
224
226 """
227 Property target used to get the exclude patterns list.
228 """
229 return self._excludePatterns
230
232 """
233 Property target used to set the exclude basename patterns list.
234 A C{None} value is converted to an empty list.
235 """
236 self._excludeBasenamePatterns = RegexList()
237 if value is not None:
238 self._excludeBasenamePatterns.extend(value)
239
241 """
242 Property target used to get the exclude basename patterns list.
243 """
244 return self._excludeBasenamePatterns
245
247 """
248 Property target used to set the ignore file.
249 The value must be a non-empty string if it is not C{None}.
250 @raise ValueError: If the value is an empty string.
251 """
252 if value is not None:
253 if len(value) < 1:
254 raise ValueError("The ignore file must be a non-empty string.")
255 self._ignoreFile = value
256
258 """
259 Property target used to get the ignore file.
260 """
261 return self._ignoreFile
262
263 excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
264 excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
265 excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
266 excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
267 excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
268 "List of regular expression patterns (matching complete path) to be excluded.")
269 excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
270 None, "List of regular expression patterns (matching basename) to be excluded.")
271 ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
272
273
274
275
276
277
279 """
280 Adds a file to the list.
281
282 The path must exist and must be a file or a link to an existing file. It
283 will be added to the list subject to any exclusions that are in place.
284
285 @param path: File path to be added to the list
286 @type path: String representing a path on disk
287
288 @return: Number of items added to the list.
289
290 @raise ValueError: If path is not a file or does not exist.
291 @raise ValueError: If the path could not be encoded properly.
292 """
293 path = encodePath(path)
294 if not os.path.exists(path) or not os.path.isfile(path):
295 logger.debug("Path [%s] is not a file or does not exist on disk." % path)
296 raise ValueError("Path is not a file or does not exist on disk.")
297 if self.excludeLinks and os.path.islink(path):
298 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
299 return 0
300 if self.excludeFiles:
301 logger.debug("Path [%s] is excluded based on excludeFiles." % path)
302 return 0
303 if path in self.excludePaths:
304 logger.debug("Path [%s] is excluded based on excludePaths." % path)
305 return 0
306 for pattern in self.excludePatterns:
307 if re.compile(r"^%s$" % pattern).match(path):
308 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
309 return 0
310 for pattern in self.excludeBasenamePatterns:
311 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
312 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
313 return 0
314 self.append(path)
315 logger.debug("Added file to list: [%s]" % path)
316 return 1
317
319 """
320 Adds a directory to the list.
321
322 The path must exist and must be a directory or a link to an existing
323 directory. It will be added to the list subject to any exclusions that
324 are in place. The L{ignoreFile} does not apply to this method, only to
325 L{addDirContents}.
326
327 @param path: Directory path to be added to the list
328 @type path: String representing a path on disk
329
330 @return: Number of items added to the list.
331
332 @raise ValueError: If path is not a directory or does not exist.
333 @raise ValueError: If the path could not be encoded properly.
334 """
335 path = encodePath(path)
336 path = normalizeDir(path)
337 if not os.path.exists(path) or not os.path.isdir(path):
338 logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
339 raise ValueError("Path is not a directory or does not exist on disk.")
340 if self.excludeLinks and os.path.islink(path):
341 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
342 return 0
343 if self.excludeDirs:
344 logger.debug("Path [%s] is excluded based on excludeDirs." % path)
345 return 0
346 if path in self.excludePaths:
347 logger.debug("Path [%s] is excluded based on excludePaths." % path)
348 return 0
349 for pattern in self.excludePatterns:
350 if re.compile(r"^%s$" % pattern).match(path):
351 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
352 return 0
353 for pattern in self.excludeBasenamePatterns:
354 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
355 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
356 return 0
357 self.append(path)
358 logger.debug("Added directory to list: [%s]" % path)
359 return 1
360
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0):
    """
    Adds a directory's contents to the list.

    The path must refer to an existing directory, or to a soft link that
    points at one.  The path is first encoded and normalized, and the real
    work is then handed off to C{_addDirContentsInternal}.  Unless
    C{addSelf} is C{False}, the directory itself is added along with its
    contents; unless C{recursive} is C{False}, subdirectories are descended
    into as well.  Every candidate item is subject to whatever exclusions
    are in place on the list.

    @note: A directory is excluded together with everything underneath it
    if its path matches an exclude path or exclude pattern, or if it
    contains the configured ignore file.

    @note: The passed-in directory is always traversed, even if it is
    itself a soft link.  Soft links to directories found I{within} it are
    only followed while the link depth allows it.  The link depth is the
    maximum depth of the tree at which soft links are followed: with a
    depth of 0 no soft links are followed, with a depth of 1 only links
    directly inside the passed-in directory are followed, with a depth of 2
    links one level further down are followed too, and so on.

    @note: Soft links that point at non-existent items are silently
    skipped.

    @note: The L{excludeDirs} flag only decides whether a discovered
    directory path is itself placed in the list.  It has I{no} effect on
    whether that directory is recursed into.

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Whether the contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Whether the directory itself should be added to the list.
    @type addSelf: Boolean value

    @param linkDepth: Maximum depth of the tree at which soft links are followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    # Encode first, then normalize; the internal method expects the cleaned-up path.
    normalized = normalizeDir(encodePath(path))
    return self._addDirContentsInternal(normalized, addSelf, recursive, linkDepth)
411
def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0):
    """
    Does the real work on behalf of C{addDirContents}.

    The public method is a thin wrapper around this one.  The split exists
    because some callers need to add the contents of a directory without
    adding the directory itself, which makes the first call in the
    recursive chain a special case (C{includePath}).  Keeping that switch
    here avoids exposing it on the public interface.

    The C{linkDepth} value decides whether a soft link to a directory is
    followed.  Each recursive call passes along the current value reduced
    by one.  When the value is zero or less, a soft link to a directory is
    still added (as a directory), but its contents are not traversed.

    Entries that are neither a file nor a directory are skipped without
    comment.

    @param path: Directory path whose contents should be added to the list.
    @param includePath: Indicates whether to include the path as well as contents.
    @param recursive: Indicates whether directory contents should be added recursively.
    @param linkDepth: Depth of soft links that should be followed

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    """
    if not (os.path.exists(path) and os.path.isdir(path)):
        logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
        raise ValueError("Path is not a directory or does not exist on disk.")

    # Any exclusion that applies to the directory prunes its whole subtree,
    # so nothing is added and we report zero.
    if path in self.excludePaths:
        logger.debug("Path [%s] is excluded based on excludePaths." % path)
        return 0
    for pattern in self.excludePatterns:
        # Patterns are implicitly anchored at both ends of the full path.
        if re.match(r"^%s$" % pattern, path):
            logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
            return 0
    for pattern in self.excludeBasenamePatterns:
        # Same anchoring, but applied to the last path component only.
        if re.match(r"^%s$" % pattern, os.path.basename(path)):
            logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
            return 0
    if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
        logger.debug("Path [%s] is excluded based on ignore file." % path)
        return 0

    count = 0
    if includePath:
        count += self.addDir(path)

    for name in os.listdir(path):
        child = os.path.join(path, name)
        if os.path.isfile(child):
            count += self.addFile(child)
            continue
        if not os.path.isdir(child):
            # Neither file nor directory (for instance a dangling soft link).
            continue
        if os.path.islink(child):
            # Linked directories are only traversed while link depth remains.
            descend = recursive and linkDepth > 0
        else:
            descend = recursive
        if descend:
            count += self._addDirContentsInternal(child, linkDepth=linkDepth - 1)
        else:
            count += self.addDir(child)
    return count
476
477
478
479
480
481
483 """
484 Removes file entries from the list.
485
486 If C{pattern} is not passed in or is C{None}, then all file entries will
487 be removed from the list. Otherwise, only those file entries matching
488 the pattern will be removed. Any entry which does not exist on disk
489 will be ignored (use L{removeInvalid} to purge those entries).
490
491 This method might be fairly slow for large lists, since it must check the
492 type of each item in the list. If you know ahead of time that you want
493 to exclude all files, then you will be better off setting L{excludeFiles}
494 to C{True} before adding items to the list.
495
496 @param pattern: Regular expression pattern representing entries to remove
497
498 @return: Number of entries removed
499 @raise ValueError: If the passed-in pattern is not a valid regular expression.
500 """
501 removed = 0
502 if pattern is None:
503 for entry in self[:]:
504 if os.path.exists(entry) and os.path.isfile(entry):
505 self.remove(entry)
506 logger.debug("Removed path [%s] from list." % entry)
507 removed += 1
508 else:
509 try:
510 compiled = re.compile(pattern)
511 except re.error:
512 raise ValueError("Pattern is not a valid regular expression.")
513 for entry in self[:]:
514 if os.path.exists(entry) and os.path.isfile(entry):
515 if compiled.match(entry):
516 self.remove(entry)
517 logger.debug("Removed path [%s] from list." % entry)
518 removed += 1
519 logger.debug("Removed a total of %d entries." % removed);
520 return removed
521
523 """
524 Removes directory entries from the list.
525
526 If C{pattern} is not passed in or is C{None}, then all directory entries
527 will be removed from the list. Otherwise, only those directory entries
528 matching the pattern will be removed. Any entry which does not exist on
529 disk will be ignored (use L{removeInvalid} to purge those entries).
530
531 This method might be fairly slow for large lists, since it must check the
532 type of each item in the list. If you know ahead of time that you want
533 to exclude all directories, then you will be better off setting
534 L{excludeDirs} to C{True} before adding items to the list (note that this
535 will not prevent you from recursively adding the I{contents} of
536 directories).
537
538 @param pattern: Regular expression pattern representing entries to remove
539
540 @return: Number of entries removed
541 @raise ValueError: If the passed-in pattern is not a valid regular expression.
542 """
543 removed = 0
544 if pattern is None:
545 for entry in self[:]:
546 if os.path.exists(entry) and os.path.isdir(entry):
547 self.remove(entry)
548 logger.debug("Removed path [%s] from list." % entry)
549 removed += 1
550 else:
551 try:
552 compiled = re.compile(pattern)
553 except re.error:
554 raise ValueError("Pattern is not a valid regular expression.")
555 for entry in self[:]:
556 if os.path.exists(entry) and os.path.isdir(entry):
557 if compiled.match(entry):
558 self.remove(entry)
559 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
560 removed += 1
561 logger.debug("Removed a total of %d entries." % removed);
562 return removed
563
565 """
566 Removes soft link entries from the list.
567
568 If C{pattern} is not passed in or is C{None}, then all soft link entries
569 will be removed from the list. Otherwise, only those soft link entries
570 matching the pattern will be removed. Any entry which does not exist on
571 disk will be ignored (use L{removeInvalid} to purge those entries).
572
573 This method might be fairly slow for large lists, since it must check the
574 type of each item in the list. If you know ahead of time that you want
575 to exclude all soft links, then you will be better off setting
576 L{excludeLinks} to C{True} before adding items to the list.
577
578 @param pattern: Regular expression pattern representing entries to remove
579
580 @return: Number of entries removed
581 @raise ValueError: If the passed-in pattern is not a valid regular expression.
582 """
583 removed = 0
584 if pattern is None:
585 for entry in self[:]:
586 if os.path.exists(entry) and os.path.islink(entry):
587 self.remove(entry)
588 logger.debug("Removed path [%s] from list." % entry)
589 removed += 1
590 else:
591 try:
592 compiled = re.compile(pattern)
593 except re.error:
594 raise ValueError("Pattern is not a valid regular expression.")
595 for entry in self[:]:
596 if os.path.exists(entry) and os.path.islink(entry):
597 if compiled.match(entry):
598 self.remove(entry)
599 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
600 removed += 1
601 logger.debug("Removed a total of %d entries." % removed);
602 return removed
603
605 """
606 Removes from the list all entries matching a pattern.
607
608 This method removes from the list all entries which match the passed in
609 C{pattern}. Since there is no need to check the type of each entry, it
610 is faster to call this method than to call the L{removeFiles},
611 L{removeDirs} or L{removeLinks} methods individually. If you know which
612 patterns you will want to remove ahead of time, you may be better off
613 setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
614 items to the list.
615
616 @note: Unlike when using the exclude lists, the pattern here is I{not}
617 bounded at the front and the back of the string. You can use any pattern
618 you want.
619
620 @param pattern: Regular expression pattern representing entries to remove
621
622 @return: Number of entries removed.
623 @raise ValueError: If the passed-in pattern is not a valid regular expression.
624 """
625 try:
626 compiled = re.compile(pattern)
627 except re.error:
628 raise ValueError("Pattern is not a valid regular expression.")
629 removed = 0
630 for entry in self[:]:
631 if compiled.match(entry):
632 self.remove(entry)
633 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
634 removed += 1
635 logger.debug("Removed a total of %d entries." % removed);
636 return removed
637
639 """
640 Removes from the list all entries that do not exist on disk.
641
642 This method removes from the list all entries which do not currently
643 exist on disk in some form. No attention is paid to whether the entries
644 are files or directories.
645
646 @return: Number of entries removed.
647 """
648 removed = 0
649 for entry in self[:]:
650 if not os.path.exists(entry):
651 self.remove(entry)
652 logger.debug("Removed path [%s] from list." % entry)
653 removed += 1
654 logger.debug("Removed a total of %d entries." % removed);
655 return removed
656
657
658
659
660
661
663 """Normalizes the list, ensuring that each entry is unique."""
664 orig = len(self)
665 self.sort()
666 dups = filter(lambda x, self=self: self[x] == self[x+1], range(0, len(self) - 1))
667 items = map(lambda x, self=self: self[x], dups)
668 map(self.remove, items)
669 new = len(self)
670 logger.debug("Completed normalizing list; removed %d items (%d originally, %d now)." % (new-orig, orig, new))
671
673 """
674 Verifies that all entries in the list exist on disk.
675 @return: C{True} if all entries exist, C{False} otherwise.
676 """
677 for entry in self:
678 if not os.path.exists(entry):
679 logger.debug("Path [%s] is invalid; list is not valid." % entry)
680 return False
681 logger.debug("All entries in list are valid.")
682 return True
683
684
685
686
687
688
690 """
691 Item returned by L{BackupFileList.generateSpan}.
692 """
def __init__(self, fileList, size, capacity, utilization):
    """
    Build a span item from its four pieces of data.

    The values are stored on the object exactly as passed in; nothing is
    validated or converted.

    @param fileList: List of files
    @param size: Size (in bytes) of files
    @param capacity: Capacity (in bytes) that the files were fitted into
    @param utilization: Utilization, as a percentage (0-100)
    """
    self.fileList, self.size = fileList, size
    self.capacity, self.utilization = capacity, utilization
704
705
706
707
708
709
711
712
713
714
715
716 """
717 List of files to be backed up.
718
719 A BackupFileList is a L{FilesystemList} containing a list of files to be
720 backed up. It only contains files, not directories (soft links are treated
721 like files). On top of the generic functionality provided by
722 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
723 for each file in the list, and it also provides a method to calculate the
724 total size of the files in the list and a way to export the list into tar
725 form.
726
727 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
728 generateFitted, generateTarfile, removeUnchanged
729 """
730
731
732
733
734
738
739
740
741
742
743
745 """
746 Adds a directory to the list.
747
748 Note that this class does not allow directories to be added by themselves
749 (a backup list contains only files). However, since links to directories
750 are technically files, we allow them to be added.
751
752 This method is implemented in terms of the superclass method, with one
753 additional validation: the superclass method is only called if the
754 passed-in path is both a directory and a link. All of the superclass's
755 existing validations and restrictions apply.
756
757 @param path: Directory path to be added to the list
758 @type path: String representing a path on disk
759
760 @return: Number of items added to the list.
761
762 @raise ValueError: If path is not a directory or does not exist.
763 @raise ValueError: If the path could not be encoded properly.
764 """
765 path = encodePath(path)
766 path = normalizeDir(path)
767 if os.path.isdir(path) and not os.path.islink(path):
768 return 0
769 else:
770 return FilesystemList.addDir(self, path)
771
772
773
774
775
776
778 """
779 Returns the total size among all files in the list.
780 Only files are counted.
781 Soft links that point at files are ignored.
782 Entries which do not exist on disk are ignored.
783 @return: Total size, in bytes
784 """
785 total = 0.0
786 for entry in self:
787 if os.path.isfile(entry) and not os.path.islink(entry):
788 total += float(os.stat(entry).st_size)
789 return total
790
792 """
793 Generates a mapping from file to file size in bytes.
794 The mapping does include soft links, which are listed with size zero.
795 Entries which do not exist on disk are ignored.
796 @return: Dictionary mapping file to file size
797 """
798 table = { }
799 for entry in self:
800 if os.path.islink(entry):
801 table[entry] = 0.0
802 elif os.path.isfile(entry):
803 table[entry] = float(os.stat(entry).st_size)
804 return table
805
807 """
808 Generates a mapping from file to file digest.
809
810 Currently, the digest is an SHA hash, which should be pretty secure. In
811 the future, this might be a different kind of hash, but we guarantee that
812 the type of the hash will not change unless the library major version
813 number is bumped.
814
815 Entries which do not exist on disk are ignored.
816
817 Soft links are ignored. We would end up generating a digest for the file
818 that the soft link points at, which doesn't make any sense.
819
820 If C{stripPrefix} is passed in, then that prefix will be stripped from
821 each key when the map is generated. This can be useful in generating two
822 "relative" digest maps to be compared to one another.
823
824 @param stripPrefix: Common prefix to be stripped from paths
825 @type stripPrefix: String with any contents
826
827 @return: Dictionary mapping file to digest value
828 @see: L{removeUnchanged}
829 """
830 table = { }
831 if stripPrefix is not None:
832 for entry in self:
833 if os.path.isfile(entry) and not os.path.islink(entry):
834 table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
835 else:
836 for entry in self:
837 if os.path.isfile(entry) and not os.path.islink(entry):
838 table[entry] = BackupFileList._generateDigest(entry)
839 return table
840
842 """
843 Generates an SHA digest for a given file on disk.
844
845 The original code for this function used this simplistic implementation,
846 which requires reading the entire file into memory at once in order to
847 generate a digest value::
848
849 sha.new(open(path).read()).hexdigest()
850
851 Not surprisingly, this isn't an optimal solution. The U{Simple file
852 hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
853 Python Cookbook recipe describes how to incrementally generate a hash
854 value by reading in chunks of data rather than reading the file all at
855 once. The recipe relies on the C{update()} method of the various
856 Python hashing algorithms.
857
858 In my tests using a 110 MB file on CD, the original implementation
859 requires 111 seconds. This implementation requires only 40-45 seconds,
860 which is a pretty substantial speed-up.
861
862 Practice shows that reading in around 4kB (4096 bytes) at a time yields
863 the best performance. Smaller reads are quite a bit slower, and larger
864 reads don't make much of a difference. The 4kB number makes me a little
865 suspicious, and I think it might be related to the size of a filesystem
866 read at the hardware level. However, I've decided to just hardcode 4096
867 until I have evidence that shows it's worthwhile making the read size
868 configurable.
869
870 @param path: Path to generate digest for.
871
872 @return: ASCII-safe SHA digest for the file.
873 @raise OSError: If the file cannot be opened.
874 """
875 s = sha.new()
876 f = open(path, mode="rb")
877 readBytes = 4096
878 while(readBytes > 0):
879 readString = f.read(readBytes)
880 s.update(readString)
881 readBytes = len(readString)
882 f.close()
883 digest = s.hexdigest()
884 logger.debug("Generated digest [%s] for file [%s]." % (digest, path))
885 return digest
886 _generateDigest = staticmethod(_generateDigest)
887
889 """
890 Generates a list of items that fit in the indicated capacity.
891
892 Sometimes, callers would like to include every item in a list, but are
893 unable to because not all of the items fit in the space available. This
894 method returns a copy of the list, containing only the items that fit in
895 a given capacity. A copy is returned so that we don't lose any
896 information if for some reason the fitted list is unsatisfactory.
897
898 The fitting is done using the functions in the knapsack module. By
899 default, the first fit algorithm is used, but you can also choose
900 from best fit, worst fit and alternate fit.
901
902 @param capacity: Maximum capacity among the files in the new list
903 @type capacity: Integer, in bytes
904
905 @param algorithm: Knapsack (fit) algorithm to use
906 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
907
908 @return: Copy of list with total size no larger than indicated capacity
909 @raise ValueError: If the algorithm is invalid.
910 """
911 table = self._getKnapsackTable()
912 function = BackupFileList._getKnapsackFunction(algorithm)
913 return function(table, capacity)[0]
914
916 """
917 Splits the list of items into sub-lists that fit in a given capacity.
918
919 Sometimes, callers need to split a backup file list into a set of smaller
920 lists. For instance, you could use this to "span" the files across a set
921 of discs.
922
923 The fitting is done using the functions in the knapsack module. By
924 default, the first fit algorithm is used, but you can also choose
925 from best fit, worst fit and alternate fit.
926
927 @note: If any of your items are larger than the capacity, then it won't
928 be possible to find a solution. In this case, a value error will be
929 raised.
930
931 @param capacity: Maximum capacity among the files in the new list
932 @type capacity: Integer, in bytes
933
934 @param algorithm: Knapsack (fit) algorithm to use
935 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
936
937 @return: List of L{SpanItem} objects.
938
939 @raise ValueError: If the algorithm is invalid.
940 @raise ValueError: If it's not possible to fit some items
941 """
942 spanItems = []
943 function = BackupFileList._getKnapsackFunction(algorithm)
944 table = self._getKnapsackTable(capacity)
945 iteration = 0
946 while len(table) > 0:
947 iteration += 1
948 fit = function(table, capacity)
949 if len(fit[0]) == 0:
950
951 raise ValueError("After iteration %d, unable to add any new items." % iteration)
952 removeKeys(table, fit[0])
953 utilization = (float(fit[1])/float(capacity))*100.0
954 item = SpanItem(fit[0], fit[1], capacity, utilization)
955 spanItems.append(item)
956 return spanItems
957
def _getKnapsackTable(self, capacity=None):
    """
    Builds the lookup table consumed by the knapsack (fit) functions.

    Soft links are recorded with a size of zero and regular files with their
    size on disk.  Entries that are neither a soft link nor a regular file
    are left out of the table.

    @param capacity: Optional upper bound on the size of any single file.
    @type capacity: Integer, in bytes, or C{None} for no limit

    @return: Dictionary mapping file name to tuple of (file path, file size).

    @raise ValueError: If capacity is given and some file is larger than it.
    """
    sizes = {}
    for path in self:
        # Links are tested first, so a link to a file counts as a link.
        if os.path.islink(path):
            sizes[path] = (path, 0.0)
            continue
        if not os.path.isfile(path):
            continue
        fileSize = float(os.stat(path).st_size)
        if capacity is not None and fileSize > capacity:
            raise ValueError("File [%s] cannot fit in capacity %s." % (path, displayBytes(capacity)))
        sizes[path] = (path, fileSize)
    return sizes
974
def _getKnapsackFunction(algorithm):
    """
    Looks up the knapsack function that goes with an algorithm name.

    The name must be one of "first_fit", "best_fit", "worst_fit" or
    "alternate_fit".

    @param algorithm: Name of the algorithm
    @return: Reference to knapsack function
    @raise ValueError: If the algorithm name is unknown.
    """
    knownAlgorithms = (
        ("first_fit", firstFit),
        ("best_fit", bestFit),
        ("worst_fit", worstFit),
        ("alternate_fit", alternateFit),
    )
    # Plain equality scan, so any value (even an unhashable one) that matches
    # no name ends up at the ValueError below.
    for name, function in knownAlgorithms:
        if algorithm == name:
            return function
    raise ValueError("Algorithm [%s] is invalid." % algorithm)
_getKnapsackFunction = staticmethod(_getKnapsackFunction)
994
def generateTarfile(self, path, mode='tar', ignore=False, flat=False):
    """
    Creates a tar file containing the files in the list.

    By default, this method will create uncompressed tar files.  If you pass
    in mode C{'targz'}, then it will create gzipped tar files, and if you
    pass in mode C{'tarbz2'}, then it will create bzipped tar files.

    The tar file is meant to be a GNU tar archive, which enables extended
    file name lengths, etc.  GNU tar is prevalent enough that the extra
    functionality outweighs the disadvantage of not being "standard".

    If you pass in C{flat=True}, then a "flat" archive will be created, and
    all of the files will be added to the root of the archive.  So, the file
    C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.

    By default, the whole method call fails if there are problems adding any
    of the files to the archive, resulting in an exception.  Under these
    circumstances, callers are advised that they might want to call
    L{removeInvalid()} and then attempt to create the tar file a second
    time, since the most common cause of failures is a missing file (a file
    that existed when the list was built, but is gone again by the time the
    tar file is built).

    If you want to, you can pass in C{ignore=True}, and the method will
    ignore errors encountered when adding individual files to the archive
    (but not errors opening and closing the archive itself).

    Whenever an exception is about to leave this method after the archive
    has been opened, the archive is closed and the partial tar file is
    removed from disk on a best-effort basis.

    @note: No validation is done as to whether the entries in the list are
    files, since only files or soft links should be in an object like this.
    However, to be safe, everything is explicitly added to the tar archive
    non-recursively so it's safe to include soft links to directories.

    @note: The Python C{tarfile} module, which is used internally here, is
    supposed to deal properly with long filenames and links.  The original
    author found that it can add really long filenames to archives but does
    not do a good job reading them back out.  Cedar Backup only adds files.

    @param path: Path of tar file to create on disk
    @type path: String representing a path on disk

    @param mode: Tar creation mode
    @type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}

    @param ignore: Indicates whether to ignore certain errors.
    @type ignore: Boolean

    @param flat: Creates "flat" archive by putting all items in root
    @type flat: Boolean

    @raise ValueError: If mode is not valid
    @raise ValueError: If list is empty
    @raise ValueError: If the path could not be encoded properly.
    @raise TarError: If there is a problem creating the tar file
    """
    path = encodePath(path)
    if len(self) == 0:
        raise ValueError("Empty list cannot be used to generate tarfile.")
    if mode == 'tar':
        tarmode = "w:"
    elif mode == 'targz':
        tarmode = "w:gz"
    elif mode == 'tarbz2':
        tarmode = "w:bz2"
    else:
        raise ValueError("Mode [%s] is not valid." % mode)

    def discardArchive(archive):
        """Best-effort close of the archive and removal of the partial file."""
        if archive is not None:
            try:
                archive.close()
            except Exception:
                pass  # already failing; a close error must not mask the cause
        if os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                pass  # nothing more can be done about a leftover file

    tar = None  # stays None when tarfile.open() itself fails
    try:
        tar = tarfile.open(path, tarmode)
        # Legacy attribute; on the Python 2 tarfile module False selects the
        # GNU format.
        tar.posix = False
        for entry in self:
            try:
                if flat:
                    tar.add(entry, arcname=os.path.basename(entry), recursive=False)
                else:
                    tar.add(entry, recursive=False)
            except tarfile.TarError:
                if not ignore:
                    raise
                logger.info("Unable to add file [%s]; going on anyway." % entry)
            except OSError as e:
                if not ignore:
                    raise tarfile.TarError(e)
                logger.info("Unable to add file [%s]; going on anyway." % entry)
        tar.close()
    except tarfile.ReadError:
        discardArchive(tar)
        raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
    except Exception:
        # Cleanup lives in a helper function so that the bare raise below
        # re-raises the original error with its traceback, not an error
        # swallowed during cleanup.
        discardArchive(tar)
        raise
1094
def removeUnchanged(self, digestMap, captureDigest=False):
    """
    Removes unchanged entries from the list.

    This method relies on a digest map as returned from L{generateDigestMap}.
    For each entry in C{digestMap}, if the entry also exists in the current
    list I{and} the entry in the current list has the same digest value as in
    the map, the entry in the current list will be removed.

    The idea is that a caller captures a digest map at some point in time,
    saves it off, and later uses this method to filter out any files that
    have not changed since then.

    If C{captureDigest} is C{True}, a digest is generated for every entry
    in the list that is a regular file (not a soft link), and those digests
    are returned alongside the removal count.

    If C{captureDigest} is C{False}, a digest is generated only for entries
    that appear in C{digestMap}, and any entry which isn't a regular file
    that currently exists on disk is left alone.

    @note: The list is rebuilt in place.  Surviving entries keep their
    original relative order, and an entry that appears more than once is
    kept only once (the earlier dictionary-based rebuild also collapsed
    duplicates, but did not preserve order).

    @param digestMap: Dictionary mapping file name to digest value.
    @type digestMap: Map as returned from L{generateDigestMap}.

    @param captureDigest: Indicates that digest information should be captured.
    @type captureDigest: Boolean

    @return: Number of entries removed if C{captureDigest} is C{False},
    otherwise a tuple of C{(entries removed, digest map)}.
    """
    removed = 0
    captured = {}
    kept = []
    seen = set()
    for entry in self:
        if entry in seen:
            continue  # duplicates collapse to a single entry
        seen.add(entry)
        known = entry in digestMap
        unchanged = False
        # Digests are only computed when they are needed: for every regular
        # file when capturing, otherwise just for entries in the map.
        if captureDigest or known:
            if os.path.isfile(entry) and not os.path.islink(entry):
                digest = BackupFileList._generateDigest(entry)
                if captureDigest:
                    captured[entry] = digest
                unchanged = known and digest == digestMap[entry]
        if unchanged:
            removed += 1
            logger.debug("Discarded unchanged file [%s]." % entry)
        else:
            kept.append(entry)
    self[:] = kept
    if captureDigest:
        return (removed, captured)
    return removed
1177
1178
1179
1180
1181
1182
1184
1185
1186
1187
1188
1189 """
1190 List of files and directories to be purged.
1191
1192 A PurgeItemList is a L{FilesystemList} containing a list of files and
1193 directories to be purged. On top of the generic functionality provided by
1194 L{FilesystemList}, this class adds functionality to remove items that are
1195 too young to be purged, and to actually remove each item in the list from
1196 the filesystem.
1197
1198 The other main difference is that when you add a directory's contents to a
1199 purge item list, the directory itself is not added to the list. This way,
1200 if someone asks to purge within C{/opt/backup/collect}, that directory
1201 doesn't get removed once all of the files within it are gone.
1202 """
1203
1204
1205
1206
1207
1211
1212
1213
1214
1215
1216
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0):
    """
    Adds the contents of a directory to the list.

    The path is encoded and normalized (trailing separator removed) and the
    work is then delegated to the parent class's
    C{_addDirContentsInternal}.  According to the class documentation, the
    directory path itself is I{not} added to a purge list, only what is
    inside it.  Pass C{recursive=False} if the contents should not be added
    recursively.

    @note: Exclusions, ignore files and soft link handling are implemented
    by the parent class and are not visible in this method.  C{linkDepth}
    is passed through unchanged; zero is documented as "do not follow any
    soft links".

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Indicates whether directory contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Ignored in this subclass.

    @param linkDepth: Depth of soft links that should be followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    directory = normalizeDir(encodePath(path))
    parent = super(PurgeItemList, self)
    # The second argument is always False here regardless of addSelf;
    # presumably it is the "add the directory itself" flag -- confirm
    # against _addDirContentsInternal.
    return parent._addDirContentsInternal(directory, False, recursive, linkDepth)
1270
1271
1272
1273
1274
1275
def removeYoungFiles(self, daysOld):
    """
    Removes from the list files younger than a certain age (in days).

    Any regular file whose age in whole days, as reported by
    C{calculateFileAge}, is less than (C{<}) C{daysOld} is removed from the
    list so that it will not be purged later when L{purgeItems} is called.
    Directories and soft links are never removed by this method, and a file
    whose age cannot be determined (C{OSError}) stays in the list.

    @note: This method removes items I{from the list}, not from the
    filesystem.  It drops the items that are too young to purge, so a
    C{daysOld} of zero (0) removes nothing from the list.

    @param daysOld: Minimum age of files that are to be kept in the list.
    @type daysOld: Integer value >= 0.

    @return: Number of entries removed

    @raise ValueError: If C{daysOld} is negative or cannot be converted to
    an integer.
    """
    daysOld = int(daysOld)
    if daysOld < 0:
        raise ValueError("Days old value must be an integer >= 0.")
    # Rebuild the list in one pass rather than calling remove() per young
    # file, which rescans the list each time.
    kept = []
    for entry in self:
        tooYoung = False
        if os.path.isfile(entry) and not os.path.islink(entry):
            try:
                tooYoung = math.floor(calculateFileAge(entry)) < daysOld
            except OSError:
                pass  # age unavailable: leave the entry in the list
        if not tooYoung:
            kept.append(entry)
    removed = len(self) - len(kept)
    self[:] = kept
    return removed
1317
def purgeItems(self):
    """
    Purges all items in the list.

    Every item in the list will be purged.  Directories in the list will
    I{not} be purged recursively, and hence will only be removed if they are
    empty.  Errors (C{OSError}) will be ignored.

    To facilitate easy removal of directories that will end up being empty,
    the delete process happens in two passes: files first (including soft
    links), then directories.

    @note: Soft links are removed even when they are dangling.  A link can
    become dangling during the first pass if its target was purged just
    before it.

    @return: Tuple containing count of (files, dirs) removed
    """
    files = 0
    dirs = 0
    for entry in self:
        # islink() is tested on its own: os.path.exists() follows links and
        # reports False for a dangling link.
        if os.path.islink(entry) or os.path.isfile(entry):
            try:
                os.remove(entry)
            except OSError:
                continue
            files += 1
            logger.debug("Purged file [%s]." % entry)
    for entry in self:
        if os.path.isdir(entry) and not os.path.islink(entry):
            try:
                os.rmdir(entry)  # fails, and is skipped, unless empty
            except OSError:
                continue
            dirs += 1
            logger.debug("Purged empty directory [%s]." % entry)
    return (files, dirs)
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
def normalizeDir(path):
    """
    Normalizes a directory name.

    For our purposes, a directory name is normalized by removing any
    trailing path separators.  This is important because we want directories
    to appear within lists in a consistent way, although from the user's
    perspective passing in C{/path/to/dir/} and C{/path/to/dir} are
    equivalent.  A path made up only of separators normalizes to a single
    separator, and an empty path is returned unchanged.

    @param path: Path to be normalized.
    @type path: String representing a path on disk

    @return: Normalized path, which should be equivalent to the original.
    """
    stripped = path.rstrip(os.sep)
    if path and not stripped:
        # Nothing but separators: this is the root directory.
        return os.sep
    return stripped
1378
1379
1380
1381
1382
1383
def compareContents(path1, path2, verbose=False):
    """
    Compares the contents of two directories to see if they are equivalent.

    For each directory, a L{BackupFileList} is filled via C{addDirContents}
    and turned into a digest map with the (normalized) directory path
    stripped off the front of each entry.  The two digest maps are then
    handed to L{compareDigestMaps}.

    If no exception is thrown, the two directories are considered identical.

    If the C{verbose} flag is C{True}, L{compareDigestMaps} uses an
    alternate method so that the thrown C{ValueError} distinguishes between
    the directories containing different files, and containing the same
    files with differing content.

    @note: The original documentation states that symlinks are I{not}
    followed for the purposes of this comparison.

    @param path1: First path to compare.
    @type path1: String representing a path on disk

    @param path2: Second path to compare.
    @type path2: String representing a path on disk

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If a directory doesn't exist or can't be read.
    @raise ValueError: If the two directories are not equivalent.
    @raise IOError: If there is an unusual problem reading the directories.
    """
    try:
        digests = []
        for path in (path1, path2):
            fileList = BackupFileList()
            fileList.addDirContents(path)
            digests.append(fileList.generateDigestMap(stripPrefix=normalizeDir(path)))
        compareDigestMaps(digests[0], digests[1], verbose)
    except IOError:
        logger.error("I/O error encountered during consistency check.")
        raise  # bare raise keeps the original traceback
1432
def compareDigestMaps(digest1, digest2, verbose=False):
    """
    Compares two digest maps and throws an exception if they differ.

    In non-verbose mode the two maps are compared in one step.  In verbose
    mode the sets of keys are compared first and then the digest for each
    key, so the exception message says which kind of difference was found.

    @param digest1: First digest to compare.
    @type digest1: Digest as returned from BackupFileList.generateDigestMap()

    @param digest2: Second digest to compare.
    @type digest2: Digest as returned from BackupFileList.generateDigestMap()

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If the two directories are not equivalent.
    """
    if verbose:
        names1 = UnorderedList(digest1.keys())
        names2 = UnorderedList(digest2.keys())
        if names1 != names2:
            raise ValueError("Directories contain a different set of files.")
        for name in names1:
            if digest1[name] != digest2[name]:
                raise ValueError("File contents for [%s] vary between directories." % name)
    elif digest1 != digest2:
        raise ValueError("Consistency check failed.")
1459