[diffoscope] 04/04: Add fuzzy-matching across containers.

Maria Glukhova siamezzze-guest at moszumanska.debian.org
Sat Jan 21 17:07:31 CET 2017


This is an automated email from the git hooks/post-receive script.

siamezzze-guest pushed a commit to branch siamezzze/containers
in repository diffoscope.

commit 3218f0c81b64616adcc80a1230cc5a7c1f5d14e8
Author: Maria Glukhova <siamezzze at gmail.com>
Date:   Sat Jan 21 16:11:56 2017 +0200

    Add fuzzy-matching across containers.
    
    This enables diffoscope to always treat sub-containers with similar
    content as similar files (and compare them), no matter their name or
    container type.
    For example, this allows a.tar.gz and b.zip.bz2 to be compared correctly
    if the contents of a and b are similar.
    The downside is that files must be specialized before they are compared
    (to find out whether they are containers or not).
---
 diffoscope/comparators/utils/fuzzy.py | 75 ++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 5 deletions(-)

diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
index 4d6b865..d05ca96 100644
--- a/diffoscope/comparators/utils/fuzzy.py
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -21,6 +21,8 @@ import logging
 import operator
 
 from diffoscope.config import Config
+from diffoscope.comparators.missing_file import MissingFile
+from .specialize import specialize
 
 try:
     import tlsh
@@ -30,6 +32,49 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+def fuzzy_matching_containers(container1, container2):
+    if container1 is None or container2 is None:
+        return None
+
+    already_compared = set()
+    members1 = container1.get_members()
+    members2 = container2.get_members()
+    nonpaired = set(members2)
+
+    # The containers' difference score is the maximum difference score
+    # among the matched members if every member is matched, and
+    # fuzzy_threshold otherwise.
+    score_containers = 0
+
+    # "Lighter" version of fuzzy-matching: containers are not taken into
+    # account (because specializing every file in the tree would be rather
+    # inefficient).
+    for name1, file1 in members1.items():
+        if file1.is_directory() or not file1.fuzzy_hash:
+            continue
+        comparisons = []
+        for name2, file2 in members2.items():
+            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+                nonpaired.discard(name2)
+                continue
+            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+        if comparisons:
+            score, name2 = min(comparisons, key=operator.itemgetter(0))
+            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+            if score < Config().fuzzy_threshold:
+                nonpaired.discard(name2)
+                already_compared.add(name2)
+                score_containers = max(score_containers, score)
+            else:
+                # Haven't found a pair for some file in container1.
+                return Config().fuzzy_threshold
+
+    # Found a pair for every file.
+    if len(nonpaired) == 0:
+        return score_containers
+    # Haven't found a pair for some files in container2.
+    return Config().fuzzy_threshold
+
 def perform_fuzzy_matching(members1, members2):
     if tlsh == None or Config().fuzzy_threshold == 0:
         return
@@ -40,15 +85,35 @@ def perform_fuzzy_matching(members1, members2):
     for name1, file1 in members1.items():
         if file1.is_directory() or not file1.fuzzy_hash:
             continue
+        specialize(file1)
+        if isinstance(file1, MissingFile):
+            continue
         comparisons = []
         for name2, file2 in members2.items():
-            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
+            if name2 in already_compared or file2.is_directory() \
+                    or not file2.fuzzy_hash:
                 continue
-            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
+
+            matched_containers = None
+            # Avoid specializing file2 if file1 is definitely not a container.
+            if file1.as_container:
+                specialize(file2)
+                if isinstance(file2, MissingFile):
+                    continue
+
+                # Containers are matched if we are able to match every member.
+                matched_containers = \
+                    fuzzy_matching_containers(file1.as_container,
+                                              file2.as_container)
+
+            diff_value = tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
+            if matched_containers:
+                diff_value = min(diff_value, matched_containers)
+            comparisons.append((diff_value, name2))
         if comparisons:
-            comparisons.sort(key=operator.itemgetter(0))
-            score, name2 = comparisons[0]
-            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
+            score, name2 = min(comparisons, key=operator.itemgetter(0))
+            logger.debug('fuzzy top match %s %s: %d difference score',
+                         name1, name2, score)
             if score < Config().fuzzy_threshold:
                 yield name1, name2, score
                 already_compared.add(name2)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list