[diffoscope] 01/01: comparators: add a fallback_recognizes to work around file(1) #876316. (Closes: #875282)

Ximin Luo <infinity0 at debian.org>
Thu Sep 21 13:50:00 CEST 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch master
in repository diffoscope.

commit 7b8b9ae712a4f129db03ced11d8eee3c714a22fe
Author: Ximin Luo <infinity0 at debian.org>
Date:   Thu Sep 21 13:49:06 2017 +0200

    comparators: add a fallback_recognizes to work around file(1) #876316. (Closes: #875282)
---
 diffoscope/comparators/directory.py        |  4 +++
 diffoscope/comparators/gzip.py             |  4 +++
 diffoscope/comparators/utils/file.py       | 48 ++++++++++++++++++++++++++----
 diffoscope/comparators/utils/specialize.py | 36 ++++++++++++++--------
 diffoscope/comparators/xz.py               |  4 +++
 5 files changed, 78 insertions(+), 18 deletions(-)

diff --git a/diffoscope/comparators/directory.py b/diffoscope/comparators/directory.py
index 3b195bc..44c39dd 100644
--- a/diffoscope/comparators/directory.py
+++ b/diffoscope/comparators/directory.py
@@ -150,6 +150,10 @@ class Directory(object):
     def recognizes(file):
         return file.is_directory()
 
+    @classmethod
+    def fallback_recognizes(cls, file):
+        return False
+
 
 class FilesystemDirectory(Directory):
     def __init__(self, path):
diff --git a/diffoscope/comparators/gzip.py b/diffoscope/comparators/gzip.py
index 6dd1d0b..31843d6 100644
--- a/diffoscope/comparators/gzip.py
+++ b/diffoscope/comparators/gzip.py
@@ -56,5 +56,9 @@ class GzipFile(File):
     CONTAINER_CLASS = GzipContainer
     FILE_TYPE_RE = re.compile(r'^gzip compressed data\b')
 
+    # Work around file(1) Debian bug #876316
+    FALLBACK_FILE_EXTENSION_SUFFIX = ".gz"
+    FALLBACK_FILE_TYPE_HEADER_PREFIX = b"\x1f\x8b"
+
     def compare_details(self, other, source=None):
         return [Difference.from_text(self.magic_file_type, other.magic_file_type, self, other, source='metadata')]
diff --git a/diffoscope/comparators/utils/file.py b/diffoscope/comparators/utils/file.py
index 14d5412..c5d85d3 100644
--- a/diffoscope/comparators/utils/file.py
+++ b/diffoscope/comparators/utils/file.py
@@ -57,6 +57,10 @@ def path_apparent_size(path=".", visited=None):
     return sum(visited.values())
 
 
+def _run_tests(fold, tests):
+    return fold(t(y, x) for x, t, y in tests)
+
+
 class File(object, metaclass=abc.ABCMeta):
     if hasattr(magic, 'open'):  # use Magic-file-extensions from file
         @classmethod
@@ -112,14 +116,26 @@ class File(object, metaclass=abc.ABCMeta):
 
     @classmethod
     def recognizes(cls, file):
+        """Check if a file's type matches the one represented by this class.
+
+        The default test returns True if the file matches these tests:
+
+        (cls.FILE_TYPE_RE OR
+         cls.FILE_TYPE_HEADER_PREFIX) AND
+        (cls.FILE_EXTENSION_SUFFIX)
+
+        If any test is None then the test is ignored and effectively deleted
+        from the above definition.
+
+        By default, the tests are all None and the test returns False for all
+        files. Subclasses may override them with specific values, or override
+        this method to implement a totally different test.
+        """
         # The structure below allows us to construct a boolean tree of tests
         # that can be combined with all() and any(). Tests that are not defined
         # for a class are filtered out, so that we don't get into a "vacuous
         # truth" situation like a naive all([]) invocation would give.
 
-        def run_tests(fold, tests):
-            return fold(t(y, x) for x, t, y in tests)
-
         file_type_tests = [test for test in (
             (cls.FILE_TYPE_RE,
              lambda m, t: t.search(m), file.magic_file_type),
@@ -131,10 +147,32 @@ class File(object, metaclass=abc.ABCMeta):
             (cls.FILE_EXTENSION_SUFFIX,
              str.endswith, file.name),
             (file_type_tests,
-             run_tests, any),
+             _run_tests, any),
+        ) if test[0]]  # filter out undefined tests, inc. file_type_tests if it's empty
+
+        return _run_tests(all, all_tests) if all_tests else False
+
+    FALLBACK_FILE_EXTENSION_SUFFIX = None
+    FALLBACK_FILE_TYPE_HEADER_PREFIX = None
+
+    @classmethod
+    def fallback_recognizes(cls, file):
+        """This is checked if the file could not be identified by recognizes().
+        This helps to work around bugs in file(1), see Debian bug #876316.
+
+        The default test returns True if the file matches these tests:
+
+        cls.FALLBACK_FILE_EXTENSION_SUFFIX AND
+        cls.FALLBACK_FILE_TYPE_HEADER_PREFIX
+        """
+        all_tests = [test for test in (
+            (cls.FALLBACK_FILE_EXTENSION_SUFFIX,
+             str.endswith, file.name),
+            (cls.FALLBACK_FILE_TYPE_HEADER_PREFIX,
+             bytes.startswith, file.file_header),
         ) if test[0]]  # filter out undefined tests, inc. file_type_tests if it's empty
 
-        return run_tests(all, all_tests) if all_tests else False
+        return _run_tests(all, all_tests) if all_tests else False
 
     # This might be different from path and is used to do file extension matching
     @property
diff --git a/diffoscope/comparators/utils/specialize.py b/diffoscope/comparators/utils/specialize.py
index 82bc501..55b8aa7 100644
--- a/diffoscope/comparators/utils/specialize.py
+++ b/diffoscope/comparators/utils/specialize.py
@@ -26,23 +26,33 @@ from .. import ComparatorManager
 logger = logging.getLogger(__name__)
 
 
+def try_recognize(file, cls, recognizes):
+    if isinstance(file, cls):
+        return True
+
+    # Does this file class match?
+    with profile('recognizes', file):
+        #logger.debug("trying %s on %s", cls, file)
+        if not recognizes(file):
+            return False
+
+    # Found a match; perform type magic
+    logger.debug("Using %s for %s", cls.__name__, file.name)
+    new_cls = type(cls.__name__, (cls, type(file)), {})
+    file.__class__ = new_cls
+
+    return True
+
+
 def specialize(file):
     for cls in ComparatorManager().classes:
-        if isinstance(file, cls):
+        if try_recognize(file, cls, cls.recognizes):
             return file
 
-        # Does this file class match?
-        with profile('recognizes', file):
-            if not cls.recognizes(file):
-                continue
-
-        # Found a match; perform type magic
-        logger.debug("Using %s for %s", cls.__name__, file.name)
-        new_cls = type(cls.__name__, (cls, type(file)), {})
-        file.__class__ = new_cls
-
-        return file
+    for cls in ComparatorManager().classes:
+        if try_recognize(file, cls, cls.fallback_recognizes):
+            logger.debug("File recognized by fallback. Magic says: %s", file.magic_file_type)
+            return file
 
     logger.debug("Unidentified file. Magic says: %s", file.magic_file_type)
-
     return file
diff --git a/diffoscope/comparators/xz.py b/diffoscope/comparators/xz.py
index da75050..a76408c 100644
--- a/diffoscope/comparators/xz.py
+++ b/diffoscope/comparators/xz.py
@@ -54,3 +54,7 @@ class XzContainer(Archive):
 class XzFile(File):
     CONTAINER_CLASS = XzContainer
     FILE_TYPE_RE = re.compile(r'^XZ compressed data$')
+
+    # Work around file(1) Debian bug #876316
+    FALLBACK_FILE_EXTENSION_SUFFIX = ".xz"
+    FALLBACK_FILE_TYPE_HEADER_PREFIX = b"\xfd7zXZ\x00"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list