[Git][reproducible-builds/diffoscope][master] 4 commits: file: Improve performance of has_same_content by spawning cmp less frequently
Chris Lamb
gitlab at salsa.debian.org
Thu Jan 7 10:56:07 UTC 2021
Chris Lamb pushed to branch master at Reproducible Builds / diffoscope
Commits:
80bf1aef by Dimitrios Apostolou at 2021-01-06T07:26:43+01:00
file: Improve performance of has_same_content by spawning cmp less frequently
* If the two sizes are different, files are different
* Else read the first SMALL_FILE_THRESHOLD bytes (64K)
* If the first bytes are different, files are different
* Else first bytes are identical, and if file size is
<= SMALL_FILE_THRESHOLD then files are identical
Only if all these checks fail, spawn external command `cmp -s` to do the
comparison.
- - - - -
cc38d8e4 by Dimitrios Apostolou at 2021-01-06T07:26:43+01:00
Log when cmp command is spawned
- - - - -
223b1588 by Dimitrios Apostolou at 2021-01-07T06:44:28+01:00
Avoid invoking diff for short outputs that are identical
Commands with short outputs, like `stat`, `getfacl` and `lsattr`, now store
all their output in memory buffers and we run an in-memory comparison first.
We proceed to call `diff` only if the buffers are not identical.
- - - - -
a1cc1c0c by Chris Lamb at 2021-01-07T10:55:52+00:00
Wrap our external call to cmp(1) with a profile to match the internal one..
- - - - -
4 changed files:
- diffoscope/comparators/directory.py
- diffoscope/comparators/utils/file.py
- diffoscope/difference.py
- diffoscope/feeders.py
Changes:
=====================================
diffoscope/comparators/directory.py
=====================================
@@ -161,13 +161,17 @@ def compare_meta(path1, path2):
return differences
try:
- differences.append(Difference.from_operation(Stat, path1, path2))
+ differences.append(
+ Difference.from_operation(Stat, path1, path2, short=True)
+ )
except RequiredToolNotFound:
logger.error("Unable to find 'stat'! Is PATH wrong?")
if os.path.islink(path1) or os.path.islink(path2):
return [d for d in differences if d is not None]
try:
- differences.append(Difference.from_operation(Getfacl, path1, path2))
+ differences.append(
+ Difference.from_operation(Getfacl, path1, path2, short=True)
+ )
except RequiredToolNotFound:
logger.info(
"Unable to find 'getfacl', some directory metadata differences might not be noticed."
=====================================
diffoscope/comparators/utils/file.py
=====================================
@@ -1,7 +1,7 @@
#
# diffoscope: in-depth comparison of files, archives, and directories
#
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -447,29 +447,39 @@ class File(metaclass=abc.ABCMeta):
# files not readable (e.g. broken symlinks) or something else,
# just assume they are different
return False
- if my_size == other_size and my_size <= SMALL_FILE_THRESHOLD:
- try:
- with profile("command", "cmp (internal)"):
- with open(self.path, "rb") as file1, open(
- other.path, "rb"
- ) as file2:
- return file1.read() == file2.read()
- except OSError:
- # one or both files could not be opened for some reason,
- # assume they are different
- return False
+ if my_size != other_size:
+ return False
+
+ # Compare from python the first bytes, and only if they are
+ # identical then call external command.
+ assert my_size == other_size
+ try:
+ with profile("command", "cmp (internal)"):
+ with open(self.path, "rb") as file1, open(
+ other.path, "rb"
+ ) as file2:
+ content1 = file1.read(SMALL_FILE_THRESHOLD)
+ content2 = file2.read(SMALL_FILE_THRESHOLD)
+ except OSError:
+ # one or both files could not be opened for some reason,
+ # assume they are different
+ return False
+ if content1 != content2:
+ return False
+ if my_size <= SMALL_FILE_THRESHOLD:
+ return True
+
+ # Big files, same size, same first bytes
return self.cmp_external(other)
@tool_required("cmp")
def cmp_external(self, other):
- return (
- subprocess.call(
- ("cmp", "-s", self.path, other.path),
- close_fds=True,
- )
- == 0
- )
+ cmdline = ("cmp", "-s", self.path, other.path)
+ logger.debug("Executing: %s", " ".join(cmdline))
+
+ with profile("command", "cmp (external)"):
+ return subprocess.call(cmdline, close_fds=True) == 0
# To be specialized directly, or by implementing compare_details
def compare(self, other, source=None):
=====================================
diffoscope/difference.py
=====================================
@@ -18,6 +18,7 @@
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import heapq
+import io
import logging
import subprocess
@@ -224,6 +225,12 @@ class Difference:
@staticmethod
def from_text(content1, content2, *args, **kwargs):
+ """
+ Works for both bytes and str objects.
+ """
+ # Avoid spawning diff if buffers have same contents
+ if content1 == content2:
+ return None
return Difference.from_feeder(
feeders.from_text(content1),
feeders.from_text(content2),
@@ -283,9 +290,26 @@ class Difference:
kwargs["source"] = source_op.full_name(truncate=120)
try:
- difference = Difference.from_feeder(
- feeder1, feeder2, path1, path2, *args, **kwargs
- )
+ short = kwargs.pop("short", False)
+ # If the outputs are expected to be short, store them in memory
+ # and do a direct comparison, and only spawn diff if needed.
+ if short:
+ memfile1 = io.BytesIO()
+ feeder1(memfile1)
+ memfile2 = io.BytesIO()
+ feeder2(memfile2)
+ bytes1 = memfile1.getbuffer().tobytes()
+ bytes2 = memfile2.getbuffer().tobytes()
+ # Check if the buffers are the same before invoking diff
+ if bytes1 == bytes2:
+ return None, True
+ difference = Difference.from_text(
+ bytes1, bytes2, path1, path2, *args, **kwargs
+ )
+ else:
+ difference = Difference.from_feeder(
+ feeder1, feeder2, path1, path2, *args, **kwargs
+ )
except subprocess.CalledProcessError as exc:
if exc.returncode in ignore_returncodes:
return None, False
=====================================
diffoscope/feeders.py
=====================================
@@ -146,10 +146,17 @@ def from_operation(operation):
def from_text(content):
+ """
+ Works for both bytes and str objects.
+ """
+
def feeder(f):
for offset in range(0, len(content), DIFF_CHUNK):
buf = filter_reader(content[offset : offset + DIFF_CHUNK])
- f.write(buf.encode("utf-8"))
+ if isinstance(buf, str):
+ f.write(buf.encode("utf-8"))
+ else:
+ f.write(buf)
return content and content[-1] == "\n"
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/ebd3bdcb6127350be8a4e3716f81dc640690e4c8...a1cc1c0cc7c5a084bb1a5f5dbc5c4f35d09b1c2c
--
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/ebd3bdcb6127350be8a4e3716f81dc640690e4c8...a1cc1c0cc7c5a084bb1a5f5dbc5c4f35d09b1c2c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20210107/6fe34efd/attachment.htm>
More information about the rb-commits
mailing list