[diffoscope] 03/03: comparators.squashfs: Extract archive in one go rather than per-file, speeding up ISO comparison by ~10x

Chris Lamb chris at chris-lamb.co.uk
Tue Mar 14 19:00:05 CET 2017


This is an automated email from the git hooks/post-receive script.

lamby pushed a commit to branch experimental
in repository diffoscope.

commit 52b70b269e4faa31dba92799f57cc135dcb60832
Author: Chris Lamb <lamby at debian.org>
Date:   Tue Mar 14 18:21:52 2017 +0100

    comparators.squashfs: Extract archive in one go rather than per-file, speeding up ISO comparison by ~10x
    
    Signed-off-by: Chris Lamb <lamby at debian.org>
---
 diffoscope/comparators/squashfs.py | 125 +++++++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 47 deletions(-)

diff --git a/diffoscope/comparators/squashfs.py b/diffoscope/comparators/squashfs.py
index 8350d4e..3561b3e 100644
--- a/diffoscope/comparators/squashfs.py
+++ b/diffoscope/comparators/squashfs.py
@@ -18,14 +18,17 @@
 # You should have received a copy of the GNU General Public License
 # along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
 
+import os
 import re
 import stat
 import logging
+import functools
 import subprocess
 import collections
 
 from diffoscope.tools import tool_required
 from diffoscope.difference import Difference
+from diffoscope.tempfiles import get_temporary_directory
 
 from .utils.file import File
 from .device import Device
@@ -71,6 +74,12 @@ class SquashfsMember(ArchiveMember):
     def is_device(self):
         return False
 
+    @property
+    def path(self):
+        # Use our extracted version and also avoid creating a temporary
+        # directory per-file in ArchiveMember.path.
+        return os.path.join(self.container._temp_dir, self._name)
+
 
 class SquashfsRegularFile(SquashfsMember):
     # Example line:
@@ -202,63 +211,85 @@ class SquashfsDevice(Device, SquashfsMember):
         return True
 
 
-SQUASHFS_LS_MAPPING = {
-    'd': SquashfsDirectory,
-    'l': SquashfsSymlink,
-    'c': SquashfsDevice,
-    'b': SquashfsDevice,
-    '-': SquashfsRegularFile
-}
-
-
 class SquashfsContainer(Archive):
-    @tool_required('unsquashfs')
-    def entries(self, path):
-        # We pass `-d ''` in order to get a listing with the names we actually
-        # need to use when extracting files
-        cmd = ['unsquashfs', '-d', '', '-lls', path]
-        output = subprocess.check_output(cmd, shell=False).decode('utf-8')
-        header = True
-
-        for line in output.rstrip('\n').split('\n'):
-            if header:
-                if line == '':
-                    header = False
-                continue
-
-            if len(line) > 0 and line[0] in SQUASHFS_LS_MAPPING:
-                try:
-                    cls = SQUASHFS_LS_MAPPING[line[0]]
-                    yield cls, cls.parse(line)
-                except SquashfsInvalidLineFormat:
-                    logger.warning("Invalid squashfs entry: %s", line)
-            else:
-                logger.warning("Unknown squashfs entry: %s", line)
+    MEMBER_CLASS = {
+        'd': SquashfsDirectory,
+        'l': SquashfsSymlink,
+        'c': SquashfsDevice,
+        'b': SquashfsDevice,
+        '-': SquashfsRegularFile
+    }
 
     def open_archive(self):
-        return collections.OrderedDict([
-            (kwargs['member_name'], (cls, kwargs))
-            for cls, kwargs in self.entries(self.source.path)
-        ])
+        return True
 
     def close_archive(self):
         pass
 
+    def get_member(self, member_name):
+        self.ensure_unpacked()
+        cls, kwargs = self._members[member_name]
+        return cls(self, member_name, **kwargs)
+
+    def extract(self, member_name, destdir):
+        # Ignore destdir argument and use our unpacked path
+        self.ensure_unpacked()
+        return member_name
+
     def get_member_names(self):
-        return self.archive.keys()
+        self.ensure_unpacked()
+        return self._members.keys()
 
-    @tool_required('unsquashfs')
-    def extract(self, member_name, dest_dir):
-        if '..' in member_name.split('/'):
-            raise ValueError("relative path in squashfs")
-        cmd = ['unsquashfs', '-n', '-f', '-d', dest_dir, self.source.path, member_name]
-        logger.debug("unsquashfs %s into %s", member_name, dest_dir)
-        subprocess.check_call(cmd, shell=False, stdout=subprocess.PIPE)
-        return '%s%s' % (dest_dir, member_name)
+    def ensure_unpacked(self):
+        if hasattr(self, '_members'):
+            return
 
-    def get_member(self, member_name):
-        cls, kwargs = self.archive[member_name]
-        return cls(self, **kwargs)
+        self._members = collections.OrderedDict()
+        self._temp_dir = get_temporary_directory().name
+
+        logger.debug("Extracting %s to %s", self.source.path, self._temp_dir)
+
+        output = subprocess.check_output((
+            'unsquashfs',
+            '-n',
+            '-f',
+            '-no',
+            '-li',
+            '-d', '.',
+            self.source.path,
+        ), stderr=subprocess.PIPE, cwd=self._temp_dir)
+
+        output = iter(output.decode('utf-8').rstrip('\n').split('\n'))
+
+        # Skip headers
+        for _ in iter(functools.partial(next, output), ''):
+            pass
+
+        for line in output:
+            if not line:
+                continue
+
+            try:
+                cls = self.MEMBER_CLASS[line[0]]
+            except KeyError:
+                logger.debug("Unknown squashfs entry: %s", line)
+                continue
+
+            try:
+                kwargs = cls.parse(line)
+            except SquashfsInvalidLineFormat:
+                continue
+
+            # Pop member_name so it is not passed twice (get_member supplies it
+            # separately) and strip the leading "./" for aesthetics
+            member_name = kwargs.pop('member_name')[2:]
+
+            self._members[member_name] = (cls, kwargs)
+
+        logger.debug(
+            "Extracted %d entries from %s to %s",
+            len(self._members), self.source.path, self._temp_dir,
+        )
 
 
 class SquashfsFile(File):

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list