[diffoscope] 04/04: Add fuzzy-matching across containers.
Maria Glukhova
siamezzze-guest at moszumanska.debian.org
Sat Jan 21 17:07:31 CET 2017
This is an automated email from the git hooks/post-receive script.
siamezzze-guest pushed a commit to branch siamezzze/containers
in repository diffoscope.
commit 3218f0c81b64616adcc80a1230cc5a7c1f5d14e8
Author: Maria Glukhova <siamezzze at gmail.com>
Date: Sat Jan 21 16:11:56 2017 +0200
Add fuzzy-matching across containers.
That would enable diffoscope to always treat sub-containers with similar
content as similar files (and compare them), no matter their name or
container type.
For example, this allows a.tar.gz and b.zip.bz2 to be compared correctly
if the contents of a and b are similar.
The downside is that files must be specialized before they are compared
(to find out whether they are containers or not).
---
diffoscope/comparators/utils/fuzzy.py | 75 ++++++++++++++++++++++++++++++++---
1 file changed, 70 insertions(+), 5 deletions(-)
diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
index 4d6b865..d05ca96 100644
--- a/diffoscope/comparators/utils/fuzzy.py
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -21,6 +21,8 @@ import logging
import operator
from diffoscope.config import Config
+from diffoscope.comparators.missing_file import MissingFile
+from .specialize import specialize
try:
import tlsh
@@ -30,6 +32,49 @@ except ImportError:
logger = logging.getLogger(__name__)
+def fuzzy_matching_containers(container1, container2):
+ if container1 is None or container2 is None:
+ return None
+
+ already_compared = set()
+ members1 = container1.get_members()
+ members2 = container2.get_members()
+ nonpaired = set(members2)
+
+ # Set difference score of containers to the maximum difference score
+ # for each matched member, if every member is matched, and fuzzy_threshold
+ # otherwise.
+ score_containers = 0
+
+ # "Lighter" version of fuzzy-matching, containers are not taken into
+ # account (because specializing every file in the tree would be rather
+ # non-effective).
+ for name1, file1 in members1.items():
+ if file1.is_directory() or not file1.fuzzy_hash:
+ continue
+ comparisons = []
+ for name2, file2 in members2.items():
+ if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+ nonpaired.discard(name2)
+ continue
+ comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+ if comparisons:
+ score, name2 = min(comparisons, key=operator.itemgetter(0))
+ logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+ if score < Config().fuzzy_threshold:
+ nonpaired.discard(name2)
+ already_compared.add(name2)
+ score_containers = max(score_containers, score)
+ else:
+ # Haven't found a pair for some file in container1.
+ return Config().fuzzy_threshold
+
+ # Found a pair for every file.
+ if len(nonpaired) == 0:
+ return score_containers
+ # Haven't found a pair for some files in container2.
+ return Config().fuzzy_threshold
+
def perform_fuzzy_matching(members1, members2):
if tlsh == None or Config().fuzzy_threshold == 0:
return
@@ -40,15 +85,35 @@ def perform_fuzzy_matching(members1, members2):
for name1, file1 in members1.items():
if file1.is_directory() or not file1.fuzzy_hash:
continue
+ specialize(file1)
+ if isinstance(file1, MissingFile):
+ continue
comparisons = []
for name2, file2 in members2.items():
- if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+ if name2 in already_compared or file2.is_directory() \
+ or not file2.fuzzy_hash:
continue
- comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+
+ matched_containers = None
+ # Avoid specializing file2 if file1 is definitely not a container.
+ if file1.as_container:
+ specialize(file2)
+ if isinstance(file2, MissingFile):
+ continue
+
+ # Containers are matched if we are able to match every member.
+ matched_containers = \
+ fuzzy_matching_containers(file1.as_container,
+ file2.as_container)
+
+ diff_value = tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
+ if matched_containers:
+ diff_value = min(diff_value, matched_containers)
+ comparisons.append((diff_value, name2))
if comparisons:
- comparisons.sort(key=operator.itemgetter(0))
- score, name2 = comparisons[0]
- logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+ score, name2 = min(comparisons, key=operator.itemgetter(0))
+ logger.debug('fuzzy top match %s %s: %d difference score',
+ name1, name2, score)
if score < Config().fuzzy_threshold:
yield name1, name2, score
already_compared.add(name2)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the diffoscope
mailing list