[Git][reproducible-builds/diffoscope][master] 5 commits: Show fuzziness amount in percentage terms, not out of the rather-arbitrary 400.
Chris Lamb
gitlab at salsa.debian.org
Tue Jan 26 17:28:25 UTC 2021
Chris Lamb pushed to branch master at Reproducible Builds / diffoscope
Commits:
42ebb627 by Chris Lamb at 2021-01-26T12:36:52+00:00
Show fuzziness amount in percentage terms, not out of the rather-arbitrary 400.
- - - - -
62ae0bbb by Chris Lamb at 2021-01-26T12:38:35+00:00
comparators.utils.fuzzy: Tidy module.
- - - - -
005b2743 by Chris Lamb at 2021-01-26T12:43:32+00:00
Improve the logging of fuzzy matching.
- - - - -
8c50cb83 by Chris Lamb at 2021-01-26T12:47:13+00:00
Increase fuzzy matching threshold to ensure that we show more differences. (Closes: reproducible-builds/diffoscope#232)
The two files in this bug report had a score of 74, but another file had a
score of 106.
- - - - -
9a3553b8 by Chris Lamb at 2021-01-26T17:27:36+00:00
Update my copyright years.
- - - - -
8 changed files:
- diffoscope/__init__.py
- diffoscope/comparators/directory.py
- diffoscope/comparators/json.py
- diffoscope/comparators/utils/container.py
- diffoscope/comparators/utils/fuzzy.py
- diffoscope/config.py
- diffoscope/main.py
- tests/test_tools.py
Changes:
=====================================
diffoscope/__init__.py
=====================================
@@ -2,7 +2,7 @@
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2015-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2015-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=====================================
diffoscope/comparators/directory.py
=====================================
@@ -2,7 +2,7 @@
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2015-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2015-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=====================================
diffoscope/comparators/json.py
=====================================
@@ -1,7 +1,7 @@
#
# diffoscope: in-depth comparison of files, archives, and directories
#
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=====================================
diffoscope/comparators/utils/container.py
=====================================
@@ -1,7 +1,7 @@
#
# diffoscope: in-depth comparison of files, archives, and directories
#
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -164,10 +164,8 @@ class Container(metaclass=abc.ABCMeta):
for my_name, other_name, score in self.perform_fuzzy_matching(
my_members, other_members
):
- comment = (
- "Files similar despite different names"
- " (score: {}, lower is more similar)".format(score)
- )
+ score_display = score / 400.0 * 100
+ comment = f"Files {score_display:.0f}% similar despite different names"
if score == 0:
comment = "Files identical despite different names"
yield prep_yield(my_name, other_name, comment)
=====================================
diffoscope/comparators/utils/fuzzy.py
=====================================
@@ -2,7 +2,7 @@
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -31,35 +31,44 @@ logger = logging.getLogger(__name__)
def perform_fuzzy_matching(members1, members2):
- if tlsh is None or Config().fuzzy_threshold == 0:
+ threshold = Config().fuzzy_threshold
+
+ if tlsh is None or threshold == 0:
return
- already_compared = set()
+
# Create local copies because they will be modified by consumer
members1 = dict(members1)
members2 = dict(members2)
+
+ seen = set()
for name1, (file1, _) in members1.items():
if file1.is_directory() or not file1.fuzzy_hash:
continue
+
comparisons = []
for name2, (file2, _) in members2.items():
- if (
- name2 in already_compared
- or file2.is_directory()
- or not file2.fuzzy_hash
- ):
+ if name2 in seen or file2.is_directory() or not file2.fuzzy_hash:
continue
comparisons.append(
(tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
)
- if comparisons:
- comparisons.sort(key=operator.itemgetter(0))
- score, name2 = comparisons[0]
- logger.debug(
- "fuzzy top match %s %s: %d difference score",
- name1,
- name2,
- score,
- )
- if score < Config().fuzzy_threshold:
- yield name1, name2, score
- already_compared.add(name2)
+
+ if not comparisons:
+ continue
+
+ comparisons.sort(key=operator.itemgetter(0))
+ score, name2 = comparisons[0]
+
+ suffix = "will not compare files"
+ if score < threshold:
+ seen.add(name2)
+ yield name1, name2, score
+ suffix = "will compare files"
+
+ logger.debug(
+ "Fuzzy matching %s %s (score: %d/400): %s",
+ name1,
+ name2,
+ score,
+ suffix,
+ )
=====================================
diffoscope/config.py
=====================================
@@ -2,7 +2,7 @@
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2015 Reiner Herrmann <reiner at reiner-h.de>
-# Copyright © 2016-2017, 2019-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2017, 2019-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -56,7 +56,7 @@ class Config:
self.difftool = None
self.diff_masks = ()
self.new_file = False
- self.fuzzy_threshold = 60
+ self.fuzzy_threshold = 120
self.enforce_constraints = True
self.excludes = ()
self.exclude_commands = ()
=====================================
diffoscope/main.py
=====================================
@@ -4,7 +4,7 @@
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar at debian.org>
-# Copyright © 2016-2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2016-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=====================================
tests/test_tools.py
=====================================
@@ -1,7 +1,7 @@
#
# diffoscope: in-depth comparison of files, archives, and directories
#
-# Copyright © 2017, 2020 Chris Lamb <lamby at debian.org>
+# Copyright © 2017, 2020-2021 Chris Lamb <lamby at debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/1b82ceff578b4e12fba4e0b176105afff76d0720...9a3553b8a267304e904d237462735f3ec4fc8ca5
--
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/1b82ceff578b4e12fba4e0b176105afff76d0720...9a3553b8a267304e904d237462735f3ec4fc8ca5
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20210126/a611b1dc/attachment.htm>
More information about the rb-commits mailing list