[Git][reproducible-builds/diffoscope][master] 2 commits: add specialize_as(), use it to speed up .smali comparison in APKs

Chris Lamb (@lamby) gitlab at salsa.debian.org
Thu Aug 31 15:50:56 UTC 2023



Chris Lamb pushed to branch master at Reproducible Builds / diffoscope


Commits:
fe513c02 by FC Stegerman at 2023-08-31T08:29:10-07:00
add specialize_as(), use it to speed up .smali comparison in APKs

- - - - -
1f8d9e17 by Chris Lamb at 2023-08-31T08:50:13-07:00
Add documentation for the new specialize_as, and expand the documentation of `specialize` too. (Re: reproducible-builds/diffoscope!108)

- - - - -


2 changed files:

- diffoscope/comparators/apk.py
- diffoscope/comparators/utils/specialize.py


Changes:

=====================================
diffoscope/comparators/apk.py
=====================================
@@ -35,9 +35,11 @@ from diffoscope.tools import (
 )
 from diffoscope.tempfiles import get_temporary_directory
 
-from .utils.archive import Archive
+from .text import TextFile
+from .utils.archive import Archive, ArchiveMember
 from .utils.command import Command
 from .utils.compare import compare_files
+from .utils.specialize import specialize_as
 from .zip import ZipContainer, zipinfo_differences, ZipFileBase
 from .missing_file import MissingFile
 
@@ -157,6 +159,14 @@ class ApkContainer(Archive):
     def get_member_names(self):
         return self._members
 
+    def get_member(self, member_name):
+        member = ArchiveMember(self, member_name)
+        if member_name.endswith(".smali") and member_name.startswith("smali"):
+            # smali{,_classesN}/**/*.smali files from apktool are always text,
+            # and using libmagic on thousands of these files takes minutes
+            return specialize_as(TextFile, member)
+        return member
+
     def extract(self, member_name, dest_dir):
         return os.path.join(self._tmpdir.name, member_name)
 


=====================================
diffoscope/comparators/utils/specialize.py
=====================================
@@ -28,9 +28,6 @@ logger = logging.getLogger(__name__)
 
 
 def try_recognize(file, cls, recognizes):
-    if isinstance(file, cls):
-        return True
-
     # Does this file class match?
     with profile("recognizes", file):
         # logger.debug("trying %s on %s", cls, file)
@@ -43,17 +40,40 @@ def try_recognize(file, cls, recognizes):
         format_class(cls, strip="diffoscope.comparators."),
         file.name,
     )
-    new_cls = type(cls.__name__, (cls, type(file)), {})
-    file.__class__ = new_cls
+    specialize_as(cls, file)
 
     return True
 
 
+def specialize_as(cls, file):
+    """
+    Sometimes it is near-certain that files within a Container with a given
+    extension (say) are of a known File type. We therefore do not need to run
+    libmagic on these files, especially in cases where the Container contains
+    hundreds of similar/small files. (This can be seen in the case of apktool
+    and .smali files). In this case, this method can be used to essentially
+    fix/force the type. Care should naturally be taken within Container
+    implementations; such as checking the file extension and so forth.
+    """
+
+    new_cls = type(cls.__name__, (cls, type(file)), {})
+    file.__class__ = new_cls
+    return file
+
+
 def specialize(file):
+    # If we already know the class (i.e. via `specialize_as`), then we do not
+    # need to run `File.recognizes` at all.
+    for cls in ComparatorManager().classes:
+        if isinstance(file, cls):
+            return file
+
+    # Run the usual `File.recognizes` implementation.
     for cls in ComparatorManager().classes:
         if try_recognize(file, cls, cls.recognizes):
             return file
 
+    # If there are no matches, run the fallback implementation.
     for cls in ComparatorManager().classes:
         if try_recognize(file, cls, cls.fallback_recognizes):
             logger.debug(



View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/bf334e1d83d5c1a0ccaa0da4f2d8ecaeb2f1b84d...1f8d9e17b01c12c773ad8ba9e93458b054654bc7

-- 
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/bf334e1d83d5c1a0ccaa0da4f2d8ecaeb2f1b84d...1f8d9e17b01c12c773ad8ba9e93458b054654bc7
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20230831/06178326/attachment.htm>


More information about the rb-commits mailing list