1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39 """
40 Provides filesystem-related objects.
41 @sort: FilesystemList, BackupFileList, PurgeItemList
42 @author: Kenneth J. Pronovici <pronovic@ieee.org>
43 """
44
45
46
47
48
49
50
51 import sys
52 import os
53 import re
54 import sha
55 import logging
56 import tarfile
57
58
59 from CedarBackup2.knapsack import firstFit, bestFit, worstFit, alternateFit
60 from CedarBackup2.util import AbsolutePathList, ObjectTypeList, UnorderedList, RegexList
61 from CedarBackup2.util import removeKeys, displayBytes, calculateFileAge, encodePath
62
63
64
65
66
67
68 logger = logging.getLogger("CedarBackup2.log.filesystem")
69
70
71
72
73
74
76
77
78
79
80
81 """
82 Represents a list of filesystem items.
83
84 This is a generic class that represents a list of filesystem items. Callers
85 can add individual files or directories to the list, or can recursively add
86 the contents of a directory. The class also allows for up-front exclusions
87 in several forms (all files, all directories, all items matching a pattern,
88 all items whose basename matches a pattern, or all directories containing a
89 specific "ignore file"). Symbolic links are typically backed up
90 non-recursively, i.e. the link to a directory is backed up, but not the
91 contents of that link (we don't want to deal with recursive loops, etc.).
92
93 The custom methods such as L{addFile} will only add items if they exist on
94 the filesystem and do not match any exclusions that are already in place.
95 However, since a FilesystemList is a subclass of Python's standard list
96 class, callers can also add items to the list in the usual way, using
97 methods like C{append()} or C{insert()}. No validations apply to items
98 added to the list in this way; however, many list-manipulation methods deal
99 "gracefully" with items that don't exist in the filesystem, often by
100 ignoring them.
101
102 Once a list has been created, callers can remove individual items from the
103 list using standard methods like C{pop()} or C{remove()} or they can use
104 custom methods to remove specific types of entries or entries which match a
105 particular pattern.
106
107 @note: Regular expression patterns that apply to paths are assumed to be
108 bounded at front and back by the beginning and end of the string, i.e. they
109 are treated as if they begin with C{^} and end with C{$}. This is true
110 whether we are matching a complete path or a basename.
111
112 @note: Some platforms, like Windows, do not support soft links. On those
113 platforms, the ignore-soft-links flag can be set, but it won't do any good
114 because the operating system never reports a file as a soft link.
115
116 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
117 removeLinks, removeMatch, removeInvalid, normalize, validate,
118 excludeFiles, excludeDirs, excludeLinks, excludePaths,
119 excludePatterns, excludeBasenamePatterns, ignoreFile
120 """
121
122
123
124
125
126
144
145
146
147
148
149
151 """
152 Property target used to set the exclude files flag.
153 No validations, but we normalize the value to C{True} or C{False}.
154 """
155 if value:
156 self._excludeFiles = True
157 else:
158 self._excludeFiles = False
159
161 """
162 Property target used to get the exclude files flag.
163 """
164 return self._excludeFiles
165
167 """
168 Property target used to set the exclude directories flag.
169 No validations, but we normalize the value to C{True} or C{False}.
170 """
171 if value:
172 self._excludeDirs = True
173 else:
174 self._excludeDirs = False
175
177 """
178 Property target used to get the exclude directories flag.
179 """
180 return self._excludeDirs
181
183 """
184 Property target used to set the exclude soft links flag.
185 No validations, but we normalize the value to C{True} or C{False}.
186 """
187 if value:
188 self._excludeLinks = True
189 else:
190 self._excludeLinks = False
191
193 """
194 Property target used to get the exclude soft links flag.
195 """
196 return self._excludeLinks
197
199 """
200 Property target used to set the exclude paths list.
201 A C{None} value is converted to an empty list.
202 Elements do not have to exist on disk at the time of assignment.
203 @raise ValueError: If any list element is not an absolute path.
204 """
205 self._absoluteExcludePaths = AbsolutePathList()
206 if value is not None:
207 self._absoluteExcludePaths.extend(value)
208
210 """
211 Property target used to get the absolute exclude paths list.
212 """
213 return self._absoluteExcludePaths
214
216 """
217 Property target used to set the exclude patterns list.
218 A C{None} value is converted to an empty list.
219 """
220 self._excludePatterns = RegexList()
221 if value is not None:
222 self._excludePatterns.extend(value)
223
225 """
226 Property target used to get the exclude patterns list.
227 """
228 return self._excludePatterns
229
231 """
232 Property target used to set the exclude basename patterns list.
233 A C{None} value is converted to an empty list.
234 """
235 self._excludeBasenamePatterns = RegexList()
236 if value is not None:
237 self._excludeBasenamePatterns.extend(value)
238
240 """
241 Property target used to get the exclude basename patterns list.
242 """
243 return self._excludeBasenamePatterns
244
246 """
247 Property target used to set the ignore file.
248 The value must be a non-empty string if it is not C{None}.
249 @raise ValueError: If the value is an empty string.
250 """
251 if value is not None:
252 if len(value) < 1:
253 raise ValueError("The ignore file must be a non-empty string.")
254 self._ignoreFile = value
255
257 """
258 Property target used to get the ignore file.
259 """
260 return self._ignoreFile
261
262 excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
263 excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
264 excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
265 excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
266 excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
267 "List of regular expression patterns (matching complete path) to be excluded.")
268 excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
269 None, "List of regular expression patterns (matching basename) to be excluded.")
270 ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
271
272
273
274
275
276
278 """
279 Adds a file to the list.
280
281 The path must exist and must be a file or a link to an existing file. It
282 will be added to the list subject to any exclusions that are in place.
283
284 @param path: File path to be added to the list
285 @type path: String representing a path on disk
286
287 @return: Number of items added to the list.
288
289 @raise ValueError: If path is not a file or does not exist.
290 @raise ValueError: If the path could not be encoded properly.
291 """
292 path = encodePath(path)
293 if not os.path.exists(path) or not os.path.isfile(path):
294 logger.debug("Path [%s] is not a file or does not exist on disk." % path)
295 raise ValueError("Path is not a file or does not exist on disk.")
296 if self.excludeLinks and os.path.islink(path):
297 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
298 return 0
299 if self.excludeFiles:
300 logger.debug("Path [%s] is excluded based on excludeFiles." % path)
301 return 0
302 if path in self.excludePaths:
303 logger.debug("Path [%s] is excluded based on excludePaths." % path)
304 return 0
305 for pattern in self.excludePatterns:
306 if re.compile(r"^%s$" % pattern).match(path):
307 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
308 return 0
309 for pattern in self.excludeBasenamePatterns:
310 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
311 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
312 return 0
313 self.append(path)
314 logger.debug("Added file to list: [%s]" % path)
315 return 1
316
318 """
319 Adds a directory to the list.
320
321 The path must exist and must be a directory or a link to an existing
322 directory. It will be added to the list subject to any exclusions that
323 are in place. The L{ignoreFile} does not apply to this method, only to
324 L{addDirContents}.
325
326 @param path: Directory path to be added to the list
327 @type path: String representing a path on disk
328
329 @return: Number of items added to the list.
330
331 @raise ValueError: If path is not a directory or does not exist.
332 @raise ValueError: If the path could not be encoded properly.
333 """
334 path = encodePath(path)
335 path = normalizeDir(path)
336 if not os.path.exists(path) or not os.path.isdir(path):
337 logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
338 raise ValueError("Path is not a directory or does not exist on disk.")
339 if self.excludeLinks and os.path.islink(path):
340 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
341 return 0
342 if self.excludeDirs:
343 logger.debug("Path [%s] is excluded based on excludeDirs." % path)
344 return 0
345 if path in self.excludePaths:
346 logger.debug("Path [%s] is excluded based on excludePaths." % path)
347 return 0
348 for pattern in self.excludePatterns:
349 if re.compile(r"^%s$" % pattern).match(path):
350 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
351 return 0
352 for pattern in self.excludeBasenamePatterns:
353 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
354 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
355 return 0
356 self.append(path)
357 logger.debug("Added directory to list: [%s]" % path)
358 return 1
359
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0):
    """
    Adds a directory's contents (and, by default, the directory) to the list.

    The path must exist and must be a directory or a link to a directory.
    Everything found beneath it is added to the list, subject to whatever
    exclusions are in place. Pass C{recursive=False} to add only the
    directory and its immediate contents, and pass C{addSelf=False} to leave
    the directory path itself out of the list.

    This method only encodes and normalizes the path; the traversal itself
    is carried out by L{_addDirContentsInternal}.

    @note: If a directory's absolute path matches an exclude pattern or path,
    or if the directory contains the configured ignore file, then the
    directory and all of its contents will be recursively excluded from the
    list.

    @note: If the passed-in directory happens to be a soft link, it will be
    recursed. However, the linkDepth parameter controls whether any soft
    links I{within} the directory will be recursed. The link depth is the
    maximum depth of the tree at which soft links should be followed. So, a
    depth of 0 does not follow any soft links, a depth of 1 follows only
    links within the passed-in directory, a depth of 2 follows the links at
    the next level down, etc.

    @note: Any invalid soft links (i.e. soft links that point to
    non-existent items) will be silently ignored.

    @note: The L{excludeDirs} flag only controls whether any given directory
    path itself is added to the list once it has been discovered. It does
    I{not} modify any behavior related to directory recursion.

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Indicates whether directory contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Indicates whether the directory itself should be added to the list.
    @type addSelf: Boolean value

    @param linkDepth: Maximum depth of the tree at which soft links should be followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    # Encode first, then normalize, exactly as the two-step original did.
    target = normalizeDir(encodePath(path))
    # Note the argument order: the internal method takes includePath
    # (our addSelf) ahead of recursive.
    return self._addDirContentsInternal(target, addSelf, recursive, linkDepth)
410
def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0):
    """
    Internal implementation of C{addDirContents}.

    This internal implementation exists due to some refactoring. Basically,
    some subclasses have a need to add the contents of a directory, but not
    the directory itself. This is different than the standard C{FilesystemList}
    behavior and actually ends up making a special case out of the first
    call in the recursive chain. Since I don't want to expose the modified
    interface, C{addDirContents} ends up being wholly implemented in terms
    of this method.

    Before anything is added, the directory is checked against the exclude
    paths, the exclude patterns, the exclude basename patterns and the
    ignore file, in that order. If any of them applies, nothing is added
    and zero is returned. Patterns are anchored with C{^} and C{$} before
    matching.

    The linkDepth parameter controls whether soft links are followed when we
    are adding the contents recursively. Any recursive calls reduce the
    value by one. If the value is zero or less, then soft links will just
    be added as directories, but will not be followed.

    @param path: Directory path whose contents should be added to the list.
    @param includePath: Indicates whether to include the path as well as contents.
    @param recursive: Indicates whether directory contents should be added recursively.
    @param linkDepth: Depth of soft links that should be followed

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    """
    added = 0
    # os.path.isdir() is already False for a path that does not exist, so a
    # separate os.path.exists() test adds nothing.
    if not os.path.isdir(path):
        logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
        raise ValueError("Path is not a directory or does not exist on disk.")
    if path in self.excludePaths:
        logger.debug("Path [%s] is excluded based on excludePaths." % path)
        return added
    for pattern in self.excludePatterns:
        if re.compile(r"^%s$" % pattern).match(path):
            logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
            return added
    for pattern in self.excludeBasenamePatterns:
        if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
            logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
            return added
    if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
        logger.debug("Path [%s] is excluded based on ignore file." % path)
        return added
    if includePath:
        added += self.addDir(path)
    for entry in os.listdir(path):
        entrypath = os.path.join(path, entry)
        if os.path.isfile(entrypath):
            added += self.addFile(entrypath)
        elif os.path.isdir(entrypath):
            # A real directory may always be descended into; a soft link to
            # a directory only while there is link depth left.
            followable = linkDepth > 0 or not os.path.islink(entrypath)
            if recursive and followable:
                # The nested call relies on the includePath=True and
                # recursive=True defaults; only the link depth changes.
                added += self._addDirContentsInternal(entrypath, linkDepth=linkDepth - 1)
            else:
                added += self.addDir(entrypath)
        # Entries that are neither files nor directories are skipped.
    return added
475
476
477
478
479
480
482 """
483 Removes file entries from the list.
484
485 If C{pattern} is not passed in or is C{None}, then all file entries will
486 be removed from the list. Otherwise, only those file entries matching
487 the pattern will be removed. Any entry which does not exist on disk
488 will be ignored (use L{removeInvalid} to purge those entries).
489
490 This method might be fairly slow for large lists, since it must check the
491 type of each item in the list. If you know ahead of time that you want
492 to exclude all files, then you will be better off setting L{excludeFiles}
493 to C{True} before adding items to the list.
494
495 @param pattern: Regular expression pattern representing entries to remove
496
497 @return: Number of entries removed
498 @raise ValueError: If the passed-in pattern is not a valid regular expression.
499 """
500 removed = 0
501 if pattern is None:
502 for entry in self[:]:
503 if os.path.exists(entry) and os.path.isfile(entry):
504 self.remove(entry)
505 logger.debug("Removed path [%s] from list." % entry)
506 removed += 1
507 else:
508 try:
509 compiled = re.compile(pattern)
510 except re.error:
511 raise ValueError("Pattern is not a valid regular expression.")
512 for entry in self[:]:
513 if os.path.exists(entry) and os.path.isfile(entry):
514 if compiled.match(entry):
515 self.remove(entry)
516 logger.debug("Removed path [%s] from list." % entry)
517 removed += 1
518 logger.debug("Removed a total of %d entries." % removed);
519 return removed
520
522 """
523 Removes directory entries from the list.
524
525 If C{pattern} is not passed in or is C{None}, then all directory entries
526 will be removed from the list. Otherwise, only those directory entries
527 matching the pattern will be removed. Any entry which does not exist on
528 disk will be ignored (use L{removeInvalid} to purge those entries).
529
530 This method might be fairly slow for large lists, since it must check the
531 type of each item in the list. If you know ahead of time that you want
532 to exclude all directories, then you will be better off setting
533 L{excludeDirs} to C{True} before adding items to the list (note that this
534 will not prevent you from recursively adding the I{contents} of
535 directories).
536
537 @param pattern: Regular expression pattern representing entries to remove
538
539 @return: Number of entries removed
540 @raise ValueError: If the passed-in pattern is not a valid regular expression.
541 """
542 removed = 0
543 if pattern is None:
544 for entry in self[:]:
545 if os.path.exists(entry) and os.path.isdir(entry):
546 self.remove(entry)
547 logger.debug("Removed path [%s] from list." % entry)
548 removed += 1
549 else:
550 try:
551 compiled = re.compile(pattern)
552 except re.error:
553 raise ValueError("Pattern is not a valid regular expression.")
554 for entry in self[:]:
555 if os.path.exists(entry) and os.path.isdir(entry):
556 if compiled.match(entry):
557 self.remove(entry)
558 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
559 removed += 1
560 logger.debug("Removed a total of %d entries." % removed);
561 return removed
562
564 """
565 Removes soft link entries from the list.
566
567 If C{pattern} is not passed in or is C{None}, then all soft link entries
568 will be removed from the list. Otherwise, only those soft link entries
569 matching the pattern will be removed. Any entry which does not exist on
570 disk will be ignored (use L{removeInvalid} to purge those entries).
571
572 This method might be fairly slow for large lists, since it must check the
573 type of each item in the list. If you know ahead of time that you want
574 to exclude all soft links, then you will be better off setting
575 L{excludeLinks} to C{True} before adding items to the list.
576
577 @param pattern: Regular expression pattern representing entries to remove
578
579 @return: Number of entries removed
580 @raise ValueError: If the passed-in pattern is not a valid regular expression.
581 """
582 removed = 0
583 if pattern is None:
584 for entry in self[:]:
585 if os.path.exists(entry) and os.path.islink(entry):
586 self.remove(entry)
587 logger.debug("Removed path [%s] from list." % entry)
588 removed += 1
589 else:
590 try:
591 compiled = re.compile(pattern)
592 except re.error:
593 raise ValueError("Pattern is not a valid regular expression.")
594 for entry in self[:]:
595 if os.path.exists(entry) and os.path.islink(entry):
596 if compiled.match(entry):
597 self.remove(entry)
598 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
599 removed += 1
600 logger.debug("Removed a total of %d entries." % removed);
601 return removed
602
604 """
605 Removes from the list all entries matching a pattern.
606
607 This method removes from the list all entries which match the passed in
608 C{pattern}. Since there is no need to check the type of each entry, it
609 is faster to call this method than to call the L{removeFiles},
610 L{removeDirs} or L{removeLinks} methods individually. If you know which
611 patterns you will want to remove ahead of time, you may be better off
612 setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
613 items to the list.
614
615 @note: Unlike when using the exclude lists, the pattern here is I{not}
616 bounded at the front and the back of the string. You can use any pattern
617 you want.
618
619 @param pattern: Regular expression pattern representing entries to remove
620
621 @return: Number of entries removed.
622 @raise ValueError: If the passed-in pattern is not a valid regular expression.
623 """
624 try:
625 compiled = re.compile(pattern)
626 except re.error:
627 raise ValueError("Pattern is not a valid regular expression.")
628 removed = 0
629 for entry in self[:]:
630 if compiled.match(entry):
631 self.remove(entry)
632 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
633 removed += 1
634 logger.debug("Removed a total of %d entries." % removed);
635 return removed
636
638 """
639 Removes from the list all entries that do not exist on disk.
640
641 This method removes from the list all entries which do not currently
642 exist on disk in some form. No attention is paid to whether the entries
643 are files or directories.
644
645 @return: Number of entries removed.
646 """
647 removed = 0
648 for entry in self[:]:
649 if not os.path.exists(entry):
650 self.remove(entry)
651 logger.debug("Removed path [%s] from list." % entry)
652 removed += 1
653 logger.debug("Removed a total of %d entries." % removed);
654 return removed
655
656
657
658
659
660
662 """Normalizes the list, ensuring that each entry is unique."""
663 orig = len(self)
664 self.sort()
665 dups = filter(lambda x, self=self: self[x] == self[x+1], range(0, len(self) - 1))
666 items = map(lambda x, self=self: self[x], dups)
667 map(self.remove, items)
668 new = len(self)
669 logger.debug("Completed normalizing list; removed %d items (%d originally, %d now)." % (new-orig, orig, new))
670
672 """
673 Verifies that all entries in the list exist on disk.
674 @return: C{True} if all entries exist, C{False} otherwise.
675 """
676 for entry in self:
677 if not os.path.exists(entry):
678 logger.debug("Path [%s] is invalid; list is not valid." % entry)
679 return False
680 logger.debug("All entries in list are valid.")
681 return True
682
683
684
685
686
687
689 """
690 Item returned by L{BackupFileList.generateSpan}.
691 """
692 - def __init__(self, fileList, size, capacity, utilization):
693 """
694 Create object.
695 @param fileList: List of files
696 @param size: Size (in bytes) of files
697 @param utilization: Utilization, as a percentage (0-100)
698 """
699 self.fileList = fileList
700 self.size = size
701 self.capacity = capacity
702 self.utilization = utilization
703
704
705
706
707
708
710
711
712
713
714
715 """
716 List of files to be backed up.
717
718 A BackupFileList is a L{FilesystemList} containing a list of files to be
719 backed up. It only contains files, not directories (soft links are treated
720 like files). On top of the generic functionality provided by
721 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
722 for each file in the list, and it also provides a method to calculate the
723 total size of the files in the list and a way to export the list into tar
724 form.
725
726 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
727 generateFitted, generateTarfile, removeUnchanged
728 """
729
730
731
732
733
737
738
739
740
741
742
744 """
745 Adds a directory to the list.
746
747 Note that this class does not allow directories to be added by themselves
748 (a backup list contains only files). However, since links to directories
749 are technically files, we allow them to be added.
750
751 This method is implemented in terms of the superclass method, with one
752 additional validation: the superclass method is only called if the
753 passed-in path is both a directory and a link. All of the superclass's
754 existing validations and restrictions apply.
755
756 @param path: Directory path to be added to the list
757 @type path: String representing a path on disk
758
759 @return: Number of items added to the list.
760
761 @raise ValueError: If path is not a directory or does not exist.
762 @raise ValueError: If the path could not be encoded properly.
763 """
764 path = encodePath(path)
765 path = normalizeDir(path)
766 if os.path.isdir(path) and not os.path.islink(path):
767 return 0
768 else:
769 return FilesystemList.addDir(self, path)
770
771
772
773
774
775
777 """
778 Returns the total size among all files in the list.
779 Only files are counted.
780 Soft links that point at files are ignored.
781 Entries which do not exist on disk are ignored.
782 @return: Total size, in bytes
783 """
784 total = 0.0
785 for entry in self:
786 if os.path.isfile(entry) and not os.path.islink(entry):
787 total += float(os.stat(entry).st_size)
788 return total
789
791 """
792 Generates a mapping from file to file size in bytes.
793 The mapping does include soft links, which are listed with size zero.
794 Entries which do not exist on disk are ignored.
795 @return: Dictionary mapping file to file size
796 """
797 table = { }
798 for entry in self:
799 if os.path.islink(entry):
800 table[entry] = 0.0
801 elif os.path.isfile(entry):
802 table[entry] = float(os.stat(entry).st_size)
803 return table
804
806 """
807 Generates a mapping from file to file digest.
808
809 Currently, the digest is an SHA hash, which should be pretty secure. In
810 the future, this might be a different kind of hash, but we guarantee that
811 the type of the hash will not change unless the library major version
812 number is bumped.
813
814 Entries which do not exist on disk are ignored.
815
816 Soft links are ignored. We would end up generating a digest for the file
817 that the soft link points at, which doesn't make any sense.
818
819 If C{stripPrefix} is passed in, then that prefix will be stripped from
820 each key when the map is generated. This can be useful in generating two
821 "relative" digest maps to be compared to one another.
822
823 @param stripPrefix: Common prefix to be stripped from paths
824 @type stripPrefix: String with any contents
825
826 @return: Dictionary mapping file to digest value
827 @see: L{removeUnchanged}
828 """
829 table = { }
830 if stripPrefix is not None:
831 for entry in self:
832 if os.path.isfile(entry) and not os.path.islink(entry):
833 table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
834 else:
835 for entry in self:
836 if os.path.isfile(entry) and not os.path.islink(entry):
837 table[entry] = BackupFileList._generateDigest(entry)
838 return table
839
841 """
842 Generates an SHA digest for a given file on disk.
843
844 The original code for this function used this simplistic implementation,
845 which requires reading the entire file into memory at once in order to
846 generate a digest value::
847
848 sha.new(open(path).read()).hexdigest()
849
850 Not surprisingly, this isn't an optimal solution. The U{Simple file
851 hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
852 Python Cookbook recipe describes how to incrementally generate a hash
853 value by reading in chunks of data rather than reading the file all at
854 once. The recipe relies on the C{update()} method of the various
855 Python hashing algorithms.
856
857 In my tests using a 110 MB file on CD, the original implementation
858 requires 111 seconds. This implementation requires only 40-45 seconds,
859 which is a pretty substantial speed-up.
860
861 Practice shows that reading in around 4kB (4096 bytes) at a time yields
862 the best performance. Smaller reads are quite a bit slower, and larger
863 reads don't make much of a difference. The 4kB number makes me a little
864 suspicious, and I think it might be related to the size of a filesystem
865 read at the hardware level. However, I've decided to just hardcode 4096
866 until I have evidence that shows it's worthwhile making the read size
867 configurable.
868
869 @param path: Path to generate digest for.
870
871 @return: ASCII-safe SHA digest for the file.
872 @raise OSError: If the file cannot be opened.
873 """
874 s = sha.new()
875 f = open(path, mode="rb")
876 readBytes = 4096
877 while(readBytes > 0):
878 readString = f.read(readBytes)
879 s.update(readString)
880 readBytes = len(readString)
881 f.close()
882 digest = s.hexdigest()
883 logger.debug("Generated digest [%s] for file [%s]." % (digest, path))
884 return digest
885 _generateDigest = staticmethod(_generateDigest)
886
888 """
889 Generates a list of items that fit in the indicated capacity.
890
891 Sometimes, callers would like to include every item in a list, but are
892 unable to because not all of the items fit in the space available. This
893 method returns a copy of the list, containing only the items that fit in
894 a given capacity. A copy is returned so that we don't lose any
895 information if for some reason the fitted list is unsatisfactory.
896
897 The fitting is done using the functions in the knapsack module. By
898 default, the first fit algorithm is used, but you can also choose
899 from best fit, worst fit and alternate fit.
900
901 @param capacity: Maximum capacity among the files in the new list
902 @type capacity: Integer, in bytes
903
904 @param algorithm: Knapsack (fit) algorithm to use
905 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
906
907 @return: Copy of list with total size no larger than indicated capacity
908 @raise ValueError: If the algorithm is invalid.
909 """
910 table = self._getKnapsackTable()
911 function = BackupFileList._getKnapsackFunction(algorithm)
912 return function(table, capacity)[0]
913
915 """
916 Splits the list of items into sub-lists that fit in a given capacity.
917
918 Sometimes, callers need to split a backup file list into a set of smaller
919 lists. For instance, you could use this to "span" the files across a set
920 of discs.
921
922 The fitting is done using the functions in the knapsack module. By
923 default, the first fit algorithm is used, but you can also choose
924 from best fit, worst fit and alternate fit.
925
926 @note: If any of your items are larger than the capacity, then it won't
927 be possible to find a solution. In this case, a value error will be
928 raised.
929
930 @param capacity: Maximum capacity among the files in the new list
931 @type capacity: Integer, in bytes
932
933 @param algorithm: Knapsack (fit) algorithm to use
934 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
935
936 @return: List of L{SpanItem} objects.
937
938 @raise ValueError: If the algorithm is invalid.
939 @raise ValueError: If it's not possible to fit some items
940 """
941 spanItems = []
942 function = BackupFileList._getKnapsackFunction(algorithm)
943 table = self._getKnapsackTable(capacity)
944 iteration = 0
945 while len(table) > 0:
946 iteration += 1
947 fit = function(table, capacity)
948 if len(fit[0]) == 0:
949
950 raise ValueError("After iteration %d, unable to add any new items." % iteration)
951 removeKeys(table, fit[0])
952 utilization = (float(fit[1])/float(capacity))*100.0
953 item = SpanItem(fit[0], fit[1], capacity, utilization)
954 spanItems.append(item)
955 return spanItems
956
def _getKnapsackTable(self, capacity=None):
    """
    Builds the lookup table that the knapsack (fit) functions operate on.

    Soft links are entered with a size of zero, regular files are entered
    with their size on disk, and any entry that is neither a link nor a
    regular file is left out of the table.

    @param capacity: Optional limit on the size of any single file
    @type capacity: Integer, in bytes, or C{None} for no limit

    @return: Dictionary mapping file name to tuple of (file path, file size).

    @raise ValueError: If a capacity is given and some file is larger than it.
    """
    sizes = {}
    for item in self:
        if os.path.islink(item):
            # Check links first: isfile() would follow the link to its target.
            sizes[item] = (item, 0.0)
            continue
        if not os.path.isfile(item):
            continue
        bytesOnDisk = float(os.stat(item).st_size)
        if capacity is not None and bytesOnDisk > capacity:
            raise ValueError("File [%s] cannot fit in capacity %s." % (item, displayBytes(capacity)))
        sizes[item] = (item, bytesOnDisk)
    return sizes
973
def _getKnapsackFunction(algorithm):
    """
    Returns a reference to the function associated with an algorithm name.
    Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
    @param algorithm: Name of the algorithm
    @return: Reference to knapsack function
    @raise ValueError: If the algorithm name is unknown.
    """
    if algorithm == "first_fit":
        return firstFit
    elif algorithm == "best_fit":
        return bestFit
    elif algorithm == "worst_fit":
        return worstFit
    elif algorithm == "alternate_fit":
        return alternateFit
    else:
        # Wrap the value in a one-element tuple so that a tuple argument is
        # reported through ValueError instead of breaking the formatting.
        raise ValueError("Algorithm [%s] is invalid." % (algorithm,))
_getKnapsackFunction = staticmethod(_getKnapsackFunction)
993
def generateTarfile(self, path, mode='tar', ignore=False, flat=False):
    """
    Creates a tar file containing the files in the list.

    By default, this method will create uncompressed tar files.  If you pass
    in mode C{'targz'}, then it will create gzipped tar files, and if you
    pass in mode C{'tarbz2'}, then it will create bzipped tar files.

    The tar file will be created as a GNU tar archive, which enables extended
    file name lengths, etc.  Since GNU tar is so prevalent, I've decided that
    the extra functionality out-weighs the disadvantage of not being
    "standard".

    If you pass in C{flat=True}, then a "flat" archive will be created, and
    all of the files will be added to the root of the archive.  So, the file
    C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.

    By default, the whole method call fails if there are problems adding any
    of the files to the archive, resulting in an exception.  Under these
    circumstances, callers are advised that they might want to call
    L{removeInvalid()} and then attempt to create the tar file a second
    time, since the most common cause of failures is a missing file (a file
    that existed when the list was built, but is gone again by the time the
    tar file is built).

    If you want to, you can pass in C{ignore=True}, and the method will
    ignore errors encountered when adding individual files to the archive
    (but not errors opening and closing the archive itself).

    We'll always attempt to remove the tarfile from disk if a tar error is
    going to be raised.

    @note: No validation is done as to whether the entries in the list are
    files, since only files or soft links should be in an object like this.
    However, to be safe, everything is explicitly added to the tar archive
    non-recursively so it's safe to include soft links to directories.

    @note: The Python C{tarfile} module, which is used internally here, is
    supposed to deal properly with long filenames and links.  In my testing,
    I have found that it appears to be able to add really long filenames
    to archives, but doesn't do a good job reading them back out, even out of
    an archive it created.  Fortunately, all Cedar Backup does is add files
    to archives.

    @param path: Path of tar file to create on disk
    @type path: String representing a path on disk

    @param mode: Tar creation mode
    @type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}

    @param ignore: Indicates whether to ignore certain errors.
    @type ignore: Boolean

    @param flat: Creates "flat" archive by putting all items in root
    @type flat: Boolean

    @raise ValueError: If mode is not valid
    @raise ValueError: If list is empty
    @raise ValueError: If the path could not be encoded properly.
    @raise TarError: If there is a problem creating the tar file
    """
    path = encodePath(path)
    if len(self) == 0:
        raise ValueError("Empty list cannot be used to generate tarfile.")
    if mode == 'tar':
        tarmode = "w:"
    elif mode == 'targz':
        tarmode = "w:gz"
    elif mode == 'tarbz2':
        tarmode = "w:bz2"
    else:
        raise ValueError("Mode [%s] is not valid." % (mode,))

    def discardPartial(archive):
        # Best-effort cleanup of a half-written archive.  This lives in its
        # own function so that exceptions swallowed here cannot disturb the
        # exception the caller is about to re-raise.
        if archive is not None:
            try:
                archive.close()
            except Exception:
                pass
        if os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                pass

    tar = None  # stays None if tarfile.open() itself fails
    try:
        tar = tarfile.open(path, tarmode)
        if hasattr(tarfile, "GNU_FORMAT"):
            # Newer tarfile modules select the archive format through the
            # format attribute; the old posix flag has no effect there.
            tar.format = tarfile.GNU_FORMAT
        else:
            tar.posix = False  # older tarfile modules: request a GNU-compatible archive
        for entry in self:
            try:
                if flat:
                    tar.add(entry, arcname=os.path.basename(entry), recursive=False)
                else:
                    tar.add(entry, recursive=False)
            except tarfile.TarError:
                if not ignore:
                    raise
                logger.info("Unable to add file [%s]; going on anyway." % entry)
            except OSError:
                if not ignore:
                    # sys.exc_info() is used rather than binding the exception
                    # in the except clause, because that binding syntax differs
                    # between Python versions.
                    raise tarfile.TarError(sys.exc_info()[1])
                logger.info("Unable to add file [%s]; going on anyway." % entry)
        tar.close()
    except tarfile.ReadError:
        discardPartial(tar)
        raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
    except tarfile.TarError:
        discardPartial(tar)
        raise  # bare raise keeps the original traceback
1093
def removeUnchanged(self, digestMap, captureDigest=False):
    """
    Removes unchanged entries from the list.

    This method relies on a digest map as returned from L{generateDigestMap}.
    For each entry in C{digestMap}, if the entry also exists in the current
    list I{and} the entry in the current list has the same digest value as in
    the map, the entry in the current list will be removed.

    This method offers a convenient way for callers to filter unneeded
    entries from a list.  The idea is that a caller will capture a digest map
    from C{generateDigestMap} at some point in time (perhaps the beginning of
    the week), and will save off that map using C{pickle} or some other
    method.  Then, the caller could use this method sometime in the future to
    filter out any unchanged files based on the saved-off map.

    If C{captureDigest} is passed-in as C{True}, then digest information will
    be captured for the entire list before the removal step occurs using the
    same rules as in L{generateDigestMap}.  The check will involve a lookup
    into the complete digest map.

    If C{captureDigest} is passed in as C{False}, we will only generate a
    digest value for files we actually need to check, and we'll ignore any
    entry in the list which isn't a file that currently exists on disk.

    The return value varies depending on C{captureDigest}, as well.  To
    preserve backwards compatibility, if C{captureDigest} is C{False}, then
    we'll just return a single value representing the number of entries
    removed.  Otherwise, we'll return a tuple of C{(entries removed, digest
    map)}.  The returned digest map will be in exactly the form returned by
    L{generateDigestMap}.

    @note: For performance reasons, this method actually ends up rebuilding
    the list from scratch.  First, we build a temporary dictionary containing
    all of the items from the original list.  Then, we remove items as needed
    from the dictionary (which is faster than the equivalent operation on a
    list).  Finally, we replace the contents of the current list based on the
    keys left in the dictionary.  This should be transparent to the caller,
    except that duplicate entries are collapsed into one.

    @param digestMap: Dictionary mapping file name to digest value.
    @type digestMap: Map as returned from L{generateDigestMap}.

    @param captureDigest: Indicates that digest information should be captured.
    @type captureDigest: Boolean

    @return: Number of entries removed if C{captureDigest} is C{False};
    otherwise a tuple of C{(entries removed, digest map)}.
    """
    removed = 0
    table = {}
    if captureDigest:
        captured = {}
        for entry in self:
            if os.path.isfile(entry) and not os.path.islink(entry):
                digest = BackupFileList._generateDigest(entry)
                table[entry] = digest
                captured[entry] = digest
            else:
                # Not a regular file: keep it in the list, but it has no
                # digest and so can never be judged "unchanged".
                table[entry] = None
        for entry in digestMap:
            if entry in table and table[entry] is not None:
                if table[entry] == digestMap[entry]:
                    removed += 1
                    del table[entry]
                    logger.debug("Discarded unchanged file [%s]." % entry)
        self[:] = list(table)
        return (removed, captured)
    else:
        for entry in self:
            table[entry] = None
        for entry in digestMap:
            if entry in table:
                # Only digest the files we actually have to compare.
                if os.path.isfile(entry) and not os.path.islink(entry):
                    digest = BackupFileList._generateDigest(entry)
                    if digest == digestMap[entry]:
                        removed += 1
                        del table[entry]
                        logger.debug("Discarded unchanged file [%s]." % entry)
        self[:] = list(table)
        return removed
1176
1177
1178
1179
1180
1181
1183
1184
1185
1186
1187
1188 """
1189 List of files and directories to be purged.
1190
1191 A PurgeItemList is a L{FilesystemList} containing a list of files and
1192 directories to be purged. On top of the generic functionality provided by
1193 L{FilesystemList}, this class adds functionality to remove items that are
1194 too young to be purged, and to actually remove each item in the list from
1195 the filesystem.
1196
1197 The other main difference is that when you add a directory's contents to a
1198 purge item list, the directory itself is not added to the list. This way,
1199 if someone asks to purge within C{/opt/backup/collect}, that directory
1200 doesn't get removed once all of the files within it are gone.
1201 """
1202
1203
1204
1205
1206
1210
1211
1212
1213
1214
1215
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0):
    """
    Adds the contents of a directory to the list.

    The path must exist and must be a directory or a link to a directory.
    The contents of the directory (but I{not} the directory path itself) will
    be recursively added to the list, subject to any exclusions that are in
    place.  If you do not want the contents to be added recursively, then
    pass in C{recursive=False}.

    @note: If a directory's absolute path matches an exclude pattern or path,
    or if the directory contains the configured ignore file, then the
    directory and all of its contents will be recursively excluded from the
    list.

    @note: If the passed-in directory happens to be a soft link, it will be
    recursed.  However, the linkDepth parameter controls whether any soft
    links I{within} the directory will be recursed.  The link depth is the
    maximum depth of the tree at which soft links should be followed.  So, a
    depth of 0 does not follow any soft links, a depth of 1 follows only
    links within the passed-in directory, a depth of 2 follows the links at
    the next level down, etc.

    @note: Any invalid soft links (i.e.  soft links that point to
    non-existent items) will be silently ignored.

    @note: The L{excludeDirs} flag only controls whether any given soft link
    path itself is added to the list once it has been discovered.  It does
    I{not} modify any behavior related to directory recursion.

    @note: The L{excludeDirs} flag only controls whether any given directory
    path itself is added to the list once it has been discovered.  It does
    I{not} modify any behavior related to directory recursion.

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Indicates whether directory contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Ignored in this subclass.

    @param linkDepth: Depth of soft links that should be followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    normalized = normalizeDir(encodePath(path))
    parent = super(PurgeItemList, self)
    # The literal False replaces the caller's addSelf value, which is ignored
    # here (presumably the internal helper's "add the directory itself" flag
    # -- confirm against the parent class).
    return parent._addDirContentsInternal(normalized, False, recursive, linkDepth)
1269
1270
1271
1272
1273
1274
def removeYoungFiles(self, daysOld):
    """
    Removes from the list files younger than a certain age (in days).

    Any file whose "age" in days is less than (C{<}) the value of the
    C{daysOld} parameter will be removed from the list so that it will not be
    purged later when L{purgeItems} is called.  Directories and soft links
    will be ignored.

    The "age" of a file is the amount of time since the file was last used,
    per the most recent of the file's C{st_atime} and C{st_mtime} values.

    @note: Some people find the "sense" of this method confusing or
    "backwards".  Keep in mind that this method is used to remove items
    I{from the list}, not from the filesystem!  It removes from the list
    those items that you would I{not} want to purge because they are too
    young.  As an example, passing in C{daysOld} of zero (0) would remove
    from the list no files, which would result in purging all of the files
    later.  I would be happy to make a synonym of this method with an
    easier-to-understand "sense", if someone can suggest one.

    @param daysOld: Minimum age of files that are to be kept in the list.
    @type daysOld: Integer value >= 0.

    @return: Number of entries removed

    @raise ValueError: If C{daysOld} is negative or cannot be converted to an integer.
    """
    daysOld = int(daysOld)
    if daysOld < 0:
        raise ValueError("Days old value must be an integer >= 0.")
    removed = 0
    kept = []
    # Build the surviving list in one pass; calling remove() per young file
    # would rescan the list every time.
    for entry in self:
        young = False
        if os.path.isfile(entry) and not os.path.islink(entry):
            try:
                young = calculateFileAge(entry) < daysOld
            except OSError:
                young = False  # age could not be determined; leave the entry in the list
        if young:
            removed += 1
        else:
            kept.append(entry)
    self[:] = kept
    return removed
1315
def purgeItems(self):
    """
    Purges all items in the list.

    Every item in the list will be purged.  Directories in the list will
    I{not} be purged recursively, and hence will only be removed if they are
    empty.  Errors will be ignored.

    To facilitate easy removal of directories that will end up being empty,
    the delete process happens in two passes: files first (including soft
    links), then directories.

    Soft links are removed whether or not their target still exists.

    @return: Tuple containing count of (files, dirs) removed
    """
    files = 0
    dirs = 0
    for entry in self:
        # islink() does not follow the link, so dangling soft links are
        # included; an os.path.exists() check would skip them.
        if os.path.islink(entry) or os.path.isfile(entry):
            try:
                os.remove(entry)
            except OSError:
                continue
            files += 1
            logger.debug("Purged file [%s]." % entry)
    for entry in self:
        if os.path.isdir(entry) and not os.path.islink(entry):
            try:
                os.rmdir(entry)  # only succeeds when the directory is empty
            except OSError:
                continue
            dirs += 1
            logger.debug("Purged empty directory [%s]." % entry)
    return (files, dirs)
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
def normalizeDir(path):
    """
    Normalizes a directory name.

    A normalized directory name has no trailing path separator.  Callers may
    pass in either C{/path/to/dir/} or C{/path/to/dir}, and both forms come
    back as C{/path/to/dir}, so directories appear within lists in a
    consistent way.  A path that consists solely of the separator is
    returned unchanged, and at most one trailing separator is removed.

    @param path: Path to be normalized.
    @type path: String representing a path on disk

    @return: Normalized path, which should be equivalent to the original.
    """
    # Nothing to strip for the bare separator or when there is no trailing one.
    if path == os.sep or path[-1:] != os.sep:
        return path
    return path[:-1]
1376
1377
1378
1379
1380
1381
def compareContents(path1, path2, verbose=False):
    """
    Compares the contents of two directories to see if they are equivalent.

    The two directories are recursively compared.  First, we check whether they
    contain exactly the same set of files.  Then, we check to see every given
    file has exactly the same contents in both directories.

    This is all relatively simple to implement through the magic of
    L{BackupFileList.generateDigestMap}, which knows how to strip a path prefix
    off the front of each entry in the mapping it generates.  This makes our
    comparison as simple as creating a list for each path, then generating a
    digest map for each path and comparing the two.

    If no exception is thrown, the two directories are considered identical.

    If the C{verbose} flag is C{True}, then an alternate (but slower) method is
    used so that any thrown exception can indicate exactly which file caused the
    comparison to fail.  The thrown C{ValueError} exception distinguishes
    between the directories containing different files, and containing the same
    files with differing content.

    @note: Symlinks are I{not} followed for the purposes of this comparison.

    @param path1: First path to compare.
    @type path1: String representing a path on disk

    @param path2: Second path to compare.
    @type path2: String representing a path on disk

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If a directory doesn't exist or can't be read.
    @raise ValueError: If the two directories are not equivalent.
    @raise IOError: If there is an unusual problem reading the directories.
    """
    try:
        path1List = BackupFileList()
        path1List.addDirContents(path1)
        # Strip each directory's own prefix so the two maps share keys.
        path1Digest = path1List.generateDigestMap(stripPrefix=normalizeDir(path1))
        path2List = BackupFileList()
        path2List.addDirContents(path2)
        path2Digest = path2List.generateDigestMap(stripPrefix=normalizeDir(path2))
        compareDigestMaps(path1Digest, path2Digest, verbose)
    except IOError:
        logger.error("I/O error encountered during consistency check.")
        raise  # bare raise keeps the original traceback
1430
def compareDigestMaps(digest1, digest2, verbose=False):
    """
    Compares two digest maps and throws an exception if they differ.

    In non-verbose mode the two maps are simply compared for equality.  In
    verbose mode the sets of keys are compared first, and then the digest
    values are compared key by key, so that the exception can say what kind
    of difference was found.

    @param digest1: First digest to compare.
    @type digest1: Digest as returned from BackupFileList.generateDigestMap()

    @param digest2: Second digest to compare.
    @type digest2: Digest as returned from BackupFileList.generateDigestMap()

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If the two directories are not equivalent.
    """
    if not verbose:
        if digest1 != digest2:
            raise ValueError("Consistency check failed.")
        return
    names1 = UnorderedList(digest1.keys())
    names2 = UnorderedList(digest2.keys())
    if names1 != names2:
        raise ValueError("Directories contain a different set of files.")
    # Both maps are now known to hold the same keys.
    for name in names1:
        if digest1[name] != digest2[name]:
            raise ValueError("File contents for [%s] vary between directories." % name)
1457