[diffoscope] 02/02: comparators/libarchive: Avoid multiple iterations over archive by unpacking once for an ~8X runtime optimisation.

Chris Lamb chris at chris-lamb.co.uk
Mon Dec 26 14:14:01 CET 2016


This is an automated email from the git hooks/post-receive script.

lamby pushed a commit to branch master
in repository diffoscope.

commit 5fdfe91e71f1c520d902350b18f793b8c69d9118
Author: Chris Lamb <lamby at debian.org>
Date:   Mon Dec 26 10:53:37 2016 +0000

    comparators/libarchive: Avoid multiple iterations over archive by unpacking once for an ~8X runtime optimisation.
    
    On my machine this speeds up comparison of two FreeBSD base images from
    4 minutes to 30 seconds, but this optimisation will likely yield even
    greater speedups on larger archives.
    
    Signed-off-by: Chris Lamb <lamby at debian.org>
---
 diffoscope/comparators/libarchive.py | 56 +++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/diffoscope/comparators/libarchive.py b/diffoscope/comparators/libarchive.py
index d1755dc..d332b08 100644
--- a/diffoscope/comparators/libarchive.py
+++ b/diffoscope/comparators/libarchive.py
@@ -23,7 +23,7 @@ import os.path
 import ctypes
 import libarchive
 
-from diffoscope import logger
+from diffoscope import logger, get_temporary_directory
 from diffoscope.comparators.utils import Archive, ArchiveMember
 from diffoscope.comparators.device import Device
 from diffoscope.comparators.symlink import Symlink
@@ -162,27 +162,12 @@ class LibarchiveContainer(Archive):
         return
 
     def get_member_names(self):
-        with libarchive.file_reader(self.source.path) as archive:
-            member_names = [entry.pathname for entry in archive]
-        return member_names
+        self.ensure_unpacked()
+        return self._member_names
 
     def extract(self, member_name, dest_dir):
-        dest_name = os.path.basename(member_name.rstrip("/"+os.sep))
-        if not dest_name:
-            raise ValueError('could not make safe name to extract member_name to: %s' % member_name)
-        dest_path = os.path.join(dest_dir, dest_name)
-        logger.debug('libarchive extracting %s to %s', member_name, dest_path)
-        with libarchive.file_reader(self.source.path) as archive:
-            # FIXME: another O(n^2) lookup here, this will hit quite badly
-            # for large archives with a lot of small files.
-            for entry in archive:
-                if entry.pathname == member_name:
-                    logger.debug('entry found, writing %s', dest_path)
-                    with open(dest_path, 'wb') as f:
-                        for buf in entry.get_blocks():
-                            f.write(buf)
-                    return dest_path
-        raise KeyError('%s not found in archive', member_name)
+        self.ensure_unpacked()
+        return os.path.join(self._unpacked, member_name)
 
     def get_member(self, member_name):
         with libarchive.file_reader(self.source.path) as archive:
@@ -205,3 +190,34 @@ class LibarchiveContainer(Archive):
             return LibarchiveDevice(self, entry)
 
         return LibarchiveMember(self, entry)
+
+    def ensure_unpacked(self):
+        if hasattr(self, '_unpacked'):
+            return
+
+        self._unpacked = get_temporary_directory().name
+        self._member_names = []
+
+        logger.debug("Extracting %s to %s", self.source.path, self._unpacked)
+
+        with libarchive.file_reader(self.source.path) as archive:
+            for entry in archive:
+                self._member_names.append(entry.pathname)
+
+                if entry.isdir:
+                    continue
+
+                if not os.path.basename(entry.pathname.rstrip('/' + os.sep)):
+                    continue
+
+                dst = os.path.join(self._unpacked, entry.pathname)
+                os.makedirs(os.path.dirname(dst), exist_ok=True)
+
+                with open(dst, 'wb') as f:
+                    for block in entry.get_blocks():
+                        f.write(block)
+
+        logger.debug(
+            "Extracted %d entries from %s to %s",
+            len(self._member_names), self.source.path, self._unpacked,
+        )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list