[Git][reproducible-builds/diffoscope][master] 5 commits: Show fuzziness amount in percentage terms, not out of the rather-arbitrary 400.

Chris Lamb gitlab at salsa.debian.org
Tue Jan 26 17:28:25 UTC 2021



Chris Lamb pushed to branch master at Reproducible Builds / diffoscope


Commits:
42ebb627 by Chris Lamb at 2021-01-26T12:36:52+00:00
Show fuzziness amount in percentage terms, not out of the rather-arbitrary 400.

- - - - -
62ae0bbb by Chris Lamb at 2021-01-26T12:38:35+00:00
comparators.utils.fuzzy: Tidy module.

- - - - -
005b2743 by Chris Lamb at 2021-01-26T12:43:32+00:00
Improve the logging of fuzzy matching.

- - - - -
8c50cb83 by Chris Lamb at 2021-01-26T12:47:13+00:00
Increase fuzzy matching threshold to ensure that we show more differences. (Closes: reproducible-builds/diffoscope#232)

The two files in this bug report had a score of 74, but another file had a
score of 106.

- - - - -
9a3553b8 by Chris Lamb at 2021-01-26T17:27:36+00:00
Update my copyright years.

- - - - -


8 changed files:

- diffoscope/__init__.py
- diffoscope/comparators/directory.py
- diffoscope/comparators/json.py
- diffoscope/comparators/utils/container.py
- diffoscope/comparators/utils/fuzzy.py
- diffoscope/config.py
- diffoscope/main.py
- tests/test_tools.py


Changes:

=====================================
diffoscope/__init__.py
=====================================
@@ -2,7 +2,7 @@
 # diffoscope: in-depth comparison of files, archives, and directories
 #
 # Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2015-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2015-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by


=====================================
diffoscope/comparators/directory.py
=====================================
@@ -2,7 +2,7 @@
 # diffoscope: in-depth comparison of files, archives, and directories
 #
 # Copyright © 2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2015-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2015-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by


=====================================
diffoscope/comparators/json.py
=====================================
@@ -1,7 +1,7 @@
 #
 # diffoscope: in-depth comparison of files, archives, and directories
 #
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by


=====================================
diffoscope/comparators/utils/container.py
=====================================
@@ -1,7 +1,7 @@
 #
 # diffoscope: in-depth comparison of files, archives, and directories
 #
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -164,10 +164,8 @@ class Container(metaclass=abc.ABCMeta):
             for my_name, other_name, score in self.perform_fuzzy_matching(
                 my_members, other_members
             ):
-                comment = (
-                    "Files similar despite different names"
-                    " (score: {}, lower is more similar)".format(score)
-                )
+                score_display = score / 400.0 * 100
+                comment = f"Files {score_display:.0f}% similar despite different names"
                 if score == 0:
                     comment = "Files identical despite different names"
                 yield prep_yield(my_name, other_name, comment)


=====================================
diffoscope/comparators/utils/fuzzy.py
=====================================
@@ -2,7 +2,7 @@
 # diffoscope: in-depth comparison of files, archives, and directories
 #
 # Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -31,35 +31,44 @@ logger = logging.getLogger(__name__)
 
 
 def perform_fuzzy_matching(members1, members2):
-    if tlsh is None or Config().fuzzy_threshold == 0:
+    threshold = Config().fuzzy_threshold
+
+    if tlsh is None or threshold == 0:
         return
-    already_compared = set()
+
     # Create local copies because they will be modified by consumer
     members1 = dict(members1)
     members2 = dict(members2)
+
+    seen = set()
     for name1, (file1, _) in members1.items():
         if file1.is_directory() or not file1.fuzzy_hash:
             continue
+
         comparisons = []
         for name2, (file2, _) in members2.items():
-            if (
-                name2 in already_compared
-                or file2.is_directory()
-                or not file2.fuzzy_hash
-            ):
+            if name2 in seen or file2.is_directory() or not file2.fuzzy_hash:
                 continue
             comparisons.append(
                 (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
             )
-        if comparisons:
-            comparisons.sort(key=operator.itemgetter(0))
-            score, name2 = comparisons[0]
-            logger.debug(
-                "fuzzy top match %s %s: %d difference score",
-                name1,
-                name2,
-                score,
-            )
-            if score < Config().fuzzy_threshold:
-                yield name1, name2, score
-                already_compared.add(name2)
+
+        if not comparisons:
+            continue
+
+        comparisons.sort(key=operator.itemgetter(0))
+        score, name2 = comparisons[0]
+
+        suffix = "will not compare files"
+        if score < threshold:
+            seen.add(name2)
+            yield name1, name2, score
+            suffix = "will compare files"
+
+        logger.debug(
+            "Fuzzy matching %s %s (score: %d/400): %s",
+            name1,
+            name2,
+            score,
+            suffix,
+        )


=====================================
diffoscope/config.py
=====================================
@@ -2,7 +2,7 @@
 # diffoscope: in-depth comparison of files, archives, and directories
 #
 # Copyright © 2015 Reiner Herrmann <reiner at reiner-h.de>
-# Copyright © 2016-2017, 2019-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2017, 2019-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -56,7 +56,7 @@ class Config:
         self.difftool = None
         self.diff_masks = ()
         self.new_file = False
-        self.fuzzy_threshold = 60
+        self.fuzzy_threshold = 120
         self.enforce_constraints = True
         self.excludes = ()
         self.exclude_commands = ()


=====================================
diffoscope/main.py
=====================================
@@ -4,7 +4,7 @@
 # diffoscope: in-depth comparison of files, archives, and directories
 #
 # Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by


=====================================
tests/test_tools.py
=====================================
@@ -1,7 +1,7 @@
 #
 # diffoscope: in-depth comparison of files, archives, and directories
 #
-# Copyright © 2017, 2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2017, 2020-2021 Chris Lamb <lamby at debian.org>
 #
 # diffoscope is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by



View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/1b82ceff578b4e12fba4e0b176105afff76d0720...9a3553b8a267304e904d237462735f3ec4fc8ca5

-- 
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/1b82ceff578b4e12fba4e0b176105afff76d0720...9a3553b8a267304e904d237462735f3ec4fc8ca5
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20210126/a611b1dc/attachment.htm>


More information about the rb-commits mailing list