[diffoscope] 04/04: Refactor Container abstract method names to be more obvious in what they do

Ximin Luo infinity0 at debian.org
Fri May 26 16:55:55 CEST 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch experimental
in repository diffoscope.

commit 8a8cb3f681a1e6b32d35d6efe2ec5095fcaa4bd6
Author: Ximin Luo <infinity0 at debian.org>
Date:   Fri May 26 16:54:44 2017 +0200

    Refactor Container abstract method names to be more obvious in what they do
    
    Add logic to force-compare 1-element containers with each other, and remove
    several unnecessary get[_adjusted]_member overrides in subclasses.
---
 diffoscope/comparators/ar.py               | 11 +++---
 diffoscope/comparators/bzip2.py            |  3 --
 diffoscope/comparators/deb.py              |  6 ++--
 diffoscope/comparators/debian.py           |  8 ++---
 diffoscope/comparators/dex.py              |  3 --
 diffoscope/comparators/fsimage.py          |  3 --
 diffoscope/comparators/gzip.py             |  3 --
 diffoscope/comparators/rust.py             |  3 --
 diffoscope/comparators/utils/container.py  | 55 +++++++++++++++++-------------
 diffoscope/comparators/utils/libarchive.py | 12 ++++---
 diffoscope/comparators/xz.py               |  3 --
 11 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/diffoscope/comparators/ar.py b/diffoscope/comparators/ar.py
index 361f741..0de0bff 100644
--- a/diffoscope/comparators/ar.py
+++ b/diffoscope/comparators/ar.py
@@ -37,18 +37,17 @@ logger = logging.getLogger(__name__)
 # the __.PKGDEF member which is just a text file containing the Go interface.
 
 class ArContainer(LibarchiveContainer):
-    def get_members(self):
-        members = LibarchiveContainer.get_members(self)
-        cls = members.__class__
+    def get_adjusted_members(self):
+        members = list(super().get_adjusted_members())
         known_ignores = {
             "/" : "this is the symbol table, already accounted for in other output",
             "//" : "this is the table for GNU long names, already accounted for in the archive filelist",
         }
-        filtered_out = cls([p for p in members.items() if p[0] in known_ignores])
+        filtered_out = [p for p in members if p[0] in known_ignores]
         if filtered_out:
-            for k, v in filtered_out.items():
+            for k, v in filtered_out:
                 logger.debug("ignored ar member '%s' because %s", k, known_ignores[k])
-        return cls([p for p in members.items() if p[0] not in known_ignores])
+        return [p for p in members if p[0] not in known_ignores]
 
 class ArSymbolTableDumper(Command):
     @tool_required('nm')
diff --git a/diffoscope/comparators/bzip2.py b/diffoscope/comparators/bzip2.py
index baac954..3775c43 100644
--- a/diffoscope/comparators/bzip2.py
+++ b/diffoscope/comparators/bzip2.py
@@ -38,9 +38,6 @@ class Bzip2Container(Archive):
     def close_archive(self):
         pass
 
-    def get_members(self):
-        return collections.OrderedDict({'bzip2-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [self.get_compressed_content_name('.bz2')]
 
diff --git a/diffoscope/comparators/deb.py b/diffoscope/comparators/deb.py
index 9a0758e..66a6d05 100644
--- a/diffoscope/comparators/deb.py
+++ b/diffoscope/comparators/deb.py
@@ -40,7 +40,7 @@ logger = logging.getLogger(__name__)
 # given container
 def get_build_id_map(container):
     d = {}
-    for member_name, member in container.get_members().items():
+    for member_name, member in container.get_adjusted_members():
         # Let's assume the name will end with .deb to avoid looking at
         # too many irrelevant files
         if not member_name.endswith('.deb'):
@@ -59,7 +59,7 @@ class DebContainer(LibarchiveContainer):
 
     @property
     def data_tar(self):
-        for name, member in self.get_members().items():
+        for name, member in self.get_adjusted_members():
             if DebContainer.RE_DATA_TAR.match(name):
                 specialize(member)
                 if name.endswith('.tar'):
@@ -69,7 +69,7 @@ class DebContainer(LibarchiveContainer):
 
     @property
     def control_tar(self):
-        for name, member in self.get_members().items():
+        for name, member in self.get_adjusted_members():
             if DebContainer.RE_CONTROL_TAR.match(name):
                 specialize(member)
                 if name.endswith('.tar'):
diff --git a/diffoscope/comparators/debian.py b/diffoscope/comparators/debian.py
index 4658fc2..f8df2bf 100644
--- a/diffoscope/comparators/debian.py
+++ b/diffoscope/comparators/debian.py
@@ -87,11 +87,9 @@ class DebControlContainer(Container):
 
         return re.compile(re.escape(version))
 
-    def get_members(self):
-        return collections.OrderedDict([
-            (self._trim_version_number(name), self.get_member(name))
-            for name in self.get_member_names()
-        ])
+    def get_adjusted_members(self):
+        for name in self.get_member_names():
+            yield self._trim_version_number(name), self.get_member(name)
 
     def get_member_names(self):
         field = self.source.deb822.get('Files') or \
diff --git a/diffoscope/comparators/dex.py b/diffoscope/comparators/dex.py
index bd393cd..a26ab42 100644
--- a/diffoscope/comparators/dex.py
+++ b/diffoscope/comparators/dex.py
@@ -42,9 +42,6 @@ class DexContainer(Archive):
     def close_archive(self):
         pass
 
-    def get_members(self):
-        return collections.OrderedDict({'dex-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [self.get_compressed_content_name('.dex') + '.jar']
 
diff --git a/diffoscope/comparators/fsimage.py b/diffoscope/comparators/fsimage.py
index f30bbb7..5135ab7 100644
--- a/diffoscope/comparators/fsimage.py
+++ b/diffoscope/comparators/fsimage.py
@@ -59,9 +59,6 @@ class FsImageContainer(Archive):
         self.g.umount_all()
         self.g.close()
 
-    def get_members(self):
-        return collections.OrderedDict({'fsimage-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [os.path.basename(self.source.path) + '.tar']
 
diff --git a/diffoscope/comparators/gzip.py b/diffoscope/comparators/gzip.py
index 4eda027..73126bb 100644
--- a/diffoscope/comparators/gzip.py
+++ b/diffoscope/comparators/gzip.py
@@ -40,9 +40,6 @@ class GzipContainer(Archive):
     def close_archive(self):
         pass
 
-    def get_members(self):
-        return collections.OrderedDict({'gzip-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [self.get_compressed_content_name('.gz')]
 
diff --git a/diffoscope/comparators/rust.py b/diffoscope/comparators/rust.py
index 475fe1e..28d8dc2 100644
--- a/diffoscope/comparators/rust.py
+++ b/diffoscope/comparators/rust.py
@@ -41,9 +41,6 @@ class RustObjectContainer(Archive):
     def close_archive(self):
         pass
 
-    def get_members(self):
-        return collections.OrderedDict({'deflate-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [self.get_compressed_content_name('.deflate')]
 
diff --git a/diffoscope/comparators/utils/container.py b/diffoscope/comparators/utils/container.py
index b8107d9..0f9832f 100644
--- a/diffoscope/comparators/utils/container.py
+++ b/diffoscope/comparators/utils/container.py
@@ -58,12 +58,32 @@ class Container(object, metaclass=abc.ABCMeta):
     def source(self):
         return self._source
 
-    def get_members(self):
+    @abc.abstractmethod
+    def get_member_names(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_member(self, member_name):
+        raise NotImplementedError()
+
+    def get_filtered_members(self):
+        # If your get_member implementation is O(n) then this will be O(n^2)
+        # cost. In such cases it is HIGHLY RECOMMENDED to override this as well
+        for name in filter_excludes(self.get_member_names()):
+            yield name, self.get_member(name)
+
+    def get_adjusted_members(self):
         """
-        Returns a dictionary. The key is what is used to match when comparing
-        containers.
+        Returns an iterable of pairs. The key is what is used to match when
+        comparing containers. This may be used to e.g. strip off version
+        numbers, hashes, etc, efficiently for known file formats, so that we
+        don't need to use the expensive tlsh "fuzzy-hashing" logic.
+
+        Note that containers with 1 element are already force-compared against
+        other containers with 1 element, so you don't need to override this
+        method for those cases.
         """
-        return OrderedDict(self.get_all_members())
+        return self.get_filtered_members()
 
     def lookup_file(self, *names):
         """
@@ -89,32 +109,14 @@ class Container(object, metaclass=abc.ABCMeta):
 
         return container.lookup_file(*remainings)
 
-    @abc.abstractmethod
-    def get_member_names(self):
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def get_member(self, member_name):
-        raise NotImplementedError()
-
-    def get_filtered_member_names(self):
-        return filter_excludes(self.get_member_names())
-
     def get_filtered_members_sizes(self):
-        for name in self.get_filtered_member_names():
-            member = self.get_member(name)
+        for name, member in self.get_adjusted_members():
             if member.is_directory():
                 size = 4096 # default "size" of a directory
             else:
                 size = path_apparent_size(member.path)
             yield name, (member, size)
 
-    def get_all_members(self):
-        # If your get_member implementation is O(n) then this will be O(n^2)
-        # cost. In such cases it is HIGHLY RECOMMENDED to override this as well
-        for name in self.get_filtered_member_names():
-            yield name, self.get_member(name)
-
     def comparisons(self, other):
         my_members = OrderedDict(self.get_filtered_members_sizes())
         my_remainders = OrderedDict()
@@ -123,6 +125,13 @@ class Container(object, metaclass=abc.ABCMeta):
         # TODO: progress could be a bit more accurate here, give more weight to fuzzy-hashed files
 
         with Progress(total_size) as p:
+            if len(my_members) == 1 and len(other_members) == 1:
+                _, (my_member, my_size) = my_members.popitem()
+                _, (other_member, other_size) = other_members.popitem()
+                p.begin_step(my_size + other_size, msg=my_member.progress_name)
+                yield my_member, other_member, NO_COMMENT
+                return
+
             # keep it sorted like my members
             while my_members:
                 my_member_name, (my_member, my_size) = my_members.popitem(last=False)
diff --git a/diffoscope/comparators/utils/libarchive.py b/diffoscope/comparators/utils/libarchive.py
index 7ec2b0f..f427ede 100644
--- a/diffoscope/comparators/utils/libarchive.py
+++ b/diffoscope/comparators/utils/libarchive.py
@@ -173,10 +173,6 @@ class LibarchiveContainer(Archive):
         self.ensure_unpacked()
         return self._members.keys()
 
-    def extract(self, member_name, dest_dir):
-        self.ensure_unpacked()
-        return self._members[member_name]
-
     def get_member(self, member_name):
         with libarchive.file_reader(self.source.path) as archive:
             for entry in archive:
@@ -184,11 +180,17 @@ class LibarchiveContainer(Archive):
                     return self.get_subclass(entry)
         raise KeyError('%s not found in archive', member_name)
 
-    def get_all_members(self):
+    def get_filtered_members(self):
         with libarchive.file_reader(self.source.path) as archive:
             for entry in archive:
+                if any_excluded(entry.pathname):
+                    continue
                 yield entry.pathname, self.get_subclass(entry)
 
+    def extract(self, member_name, dest_dir):
+        self.ensure_unpacked()
+        return self._members[member_name]
+
     def get_subclass(self, entry):
         if entry.isdir:
             return LibarchiveDirectory(self, entry)
diff --git a/diffoscope/comparators/xz.py b/diffoscope/comparators/xz.py
index ab1d89c..fedc848 100644
--- a/diffoscope/comparators/xz.py
+++ b/diffoscope/comparators/xz.py
@@ -38,9 +38,6 @@ class XzContainer(Archive):
     def close_archive(self):
         pass
 
-    def get_members(self):
-        return collections.OrderedDict({'xz-content': self.get_member(self.get_member_names()[0])})
-
     def get_member_names(self):
         return [self.get_compressed_content_name('.xz')]
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list