[Git][reproducible-builds/diffoscope][master] 4 commits: file: Improve performance of has_same_content by spawning cmp less frequently

Chris Lamb gitlab at salsa.debian.org
Thu Jan 7 10:56:07 UTC 2021



Chris Lamb pushed to branch master at Reproducible Builds / diffoscope


Commits:
80bf1aef by Dimitrios Apostolou at 2021-01-06T07:26:43+01:00
file: Improve performance of has_same_content by spawning cmp less frequently

* If the two sizes are different, files are different
* Else read the first SMALL_FILE_THRESHOLD bytes (64K)
  * If the first bytes are different, files are different
  * Else first bytes are identical, and if file size is
    <= SMALL_FILE_THRESHOLD then files are identical

Only if all these checks fail, spawn external command `cmp -s` to do the
comparison.

- - - - -
cc38d8e4 by Dimitrios Apostolou at 2021-01-06T07:26:43+01:00
Log when cmp command is spawned

- - - - -
223b1588 by Dimitrios Apostolou at 2021-01-07T06:44:28+01:00
Avoid invoking diff for short outputs that are identical

Commands with short outputs, like `stat`, `getfacl` and `lsattr`, now store
all their output in memory buffers and we run an in-memory comparison first.
We proceed to call `diff` only if the buffers are not identical.

- - - - -
a1cc1c0c by Chris Lamb at 2021-01-07T10:55:52+00:00
Wrap our external call to cmp(1) with a profile to match the internal one..

- - - - -


4 changed files:

- diffoscope/comparators/directory.py
- diffoscope/comparators/utils/file.py
- diffoscope/difference.py
- diffoscope/feeders.py


Changes:

=====================================
diffoscope/comparators/directory.py
=====================================
@@ -161,13 +161,17 @@ def compare_meta(path1, path2):
         return differences
 
     try:
-        differences.append(Difference.from_operation(Stat, path1, path2))
+        differences.append(
+            Difference.from_operation(Stat, path1, path2, short=True)
+        )
     except RequiredToolNotFound:
         logger.error("Unable to find 'stat'! Is PATH wrong?")
     if os.path.islink(path1) or os.path.islink(path2):
         return [d for d in differences if d is not None]
     try:
-        differences.append(Difference.from_operation(Getfacl, path1, path2))
+        differences.append(
+            Difference.from_operation(Getfacl, path1, path2, short=True)
+        )
     except RequiredToolNotFound:
         logger.info(
             "Unable to find 'getfacl', some directory metadata differences might not be noticed."


=====================================
diffoscope/comparators/utils/file.py
=====================================
@@ -1,7 +1,7 @@
 #
 # diffoscope: in-depth comparison of files, archives, and directories
 #
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -447,29 +447,39 @@ class File(metaclass=abc.ABCMeta):
             # files not readable (e.g. broken symlinks) or something else,
             # just assume they are different
             return False
-        if my_size == other_size and my_size <= SMALL_FILE_THRESHOLD:
-            try:
-                with profile("command", "cmp (internal)"):
-                    with open(self.path, "rb") as file1, open(
-                        other.path, "rb"
-                    ) as file2:
-                        return file1.read() == file2.read()
-            except OSError:
-                # one or both files could not be opened for some reason,
-                # assume they are different
-                return False
+        if my_size != other_size:
+            return False
+
+        # Compare from python the first bytes, and only if they are
+        # identical then call external command.
+        assert my_size == other_size
+        try:
+            with profile("command", "cmp (internal)"):
+                with open(self.path, "rb") as file1, open(
+                    other.path, "rb"
+                ) as file2:
+                    content1 = file1.read(SMALL_FILE_THRESHOLD)
+                    content2 = file2.read(SMALL_FILE_THRESHOLD)
+        except OSError:
+            # one or both files could not be opened for some reason,
+            # assume they are different
+            return False
 
+        if content1 != content2:
+            return False
+        if my_size <= SMALL_FILE_THRESHOLD:
+            return True
+
+        # Big files, same size, same first bytes
         return self.cmp_external(other)
 
     @tool_required("cmp")
     def cmp_external(self, other):
-        return (
-            subprocess.call(
-                ("cmp", "-s", self.path, other.path),
-                close_fds=True,
-            )
-            == 0
-        )
+        cmdline = ("cmp", "-s", self.path, other.path)
+        logger.debug("Executing: %s", " ".join(cmdline))
+
+        with profile("command", "cmp (external)"):
+            return subprocess.call(cmdline, close_fds=True) == 0
 
     # To be specialized directly, or by implementing compare_details
     def compare(self, other, source=None):


=====================================
diffoscope/difference.py
=====================================
@@ -18,6 +18,7 @@
 # along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
 
 import heapq
+import io
 import logging
 import subprocess
 
@@ -224,6 +225,12 @@ class Difference:
 
     @staticmethod
     def from_text(content1, content2, *args, **kwargs):
+        """
+        Works for both bytes and str objects.
+        """
+        # Avoid spawning diff if buffers have same contents
+        if content1 == content2:
+            return None
         return Difference.from_feeder(
             feeders.from_text(content1),
             feeders.from_text(content2),
@@ -283,9 +290,26 @@ class Difference:
             kwargs["source"] = source_op.full_name(truncate=120)
 
         try:
-            difference = Difference.from_feeder(
-                feeder1, feeder2, path1, path2, *args, **kwargs
-            )
+            short = kwargs.pop("short", False)
+            # If the outputs are expected to be short, store them in memory
+            # and do a direct comparison, and only spawn diff if needed.
+            if short:
+                memfile1 = io.BytesIO()
+                feeder1(memfile1)
+                memfile2 = io.BytesIO()
+                feeder2(memfile2)
+                bytes1 = memfile1.getbuffer().tobytes()
+                bytes2 = memfile2.getbuffer().tobytes()
+                # Check if the buffers are the same before invoking diff
+                if bytes1 == bytes2:
+                    return None, True
+                difference = Difference.from_text(
+                    bytes1, bytes2, path1, path2, *args, **kwargs
+                )
+            else:
+                difference = Difference.from_feeder(
+                    feeder1, feeder2, path1, path2, *args, **kwargs
+                )
         except subprocess.CalledProcessError as exc:
             if exc.returncode in ignore_returncodes:
                 return None, False


=====================================
diffoscope/feeders.py
=====================================
@@ -146,10 +146,17 @@ def from_operation(operation):
 
 
 def from_text(content):
+    """
+    Works for both bytes and str objects.
+    """
+
     def feeder(f):
         for offset in range(0, len(content), DIFF_CHUNK):
             buf = filter_reader(content[offset : offset + DIFF_CHUNK])
-            f.write(buf.encode("utf-8"))
+            if isinstance(buf, str):
+                f.write(buf.encode("utf-8"))
+            else:
+                f.write(buf)
 
         return content and content[-1] == "\n"
 



View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/ebd3bdcb6127350be8a4e3716f81dc640690e4c8...a1cc1c0cc7c5a084bb1a5f5dbc5c4f35d09b1c2c

-- 
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/ebd3bdcb6127350be8a4e3716f81dc640690e4c8...a1cc1c0cc7c5a084bb1a5f5dbc5c4f35d09b1c2c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20210107/6fe34efd/attachment.htm>


More information about the rb-commits mailing list