[diffoscope] 01/03: Revert "Add fuzzy-matching across containers."

Maria Glukhova siamezzze-guest at moszumanska.debian.org
Sun Jan 22 19:01:19 CET 2017


This is an automated email from the git hooks/post-receive script.

siamezzze-guest pushed a commit to branch siamezzze/containers
in repository diffoscope.

commit ce81bcbd386de3c0e3b6d3dae1892162a3744cba
Author: Maria Glukhova <siamezzze at gmail.com>
Date:   Sun Jan 22 04:49:22 2017 +0200

    Revert "Add fuzzy-matching across containers."
    
    This reverts commit 3218f0c81b64616adcc80a1230cc5a7c1f5d14e8.
---
 diffoscope/comparators/utils/fuzzy.py | 75 +++--------------------------------
 1 file changed, 5 insertions(+), 70 deletions(-)

diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
index d05ca96..4d6b865 100644
--- a/diffoscope/comparators/utils/fuzzy.py
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -21,8 +21,6 @@ import logging
 import operator
 
 from diffoscope.config import Config
-from diffoscope.comparators.missing_file import MissingFile
-from .specialize import specialize
 
 try:
     import tlsh
@@ -32,49 +30,6 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-def fuzzy_matching_containers(container1, container2):
-    if container1 is None or container2 is None:
-        return None
-
-    already_compared = set()
-    members1 = container1.get_members()
-    members2 = container2.get_members()
-    nonpaired = set(members2)
-
-    # Set difference score of containers to the maximum difference score
-    # for each matched member, if every member is matched, and fuzzy_threshold
-    # otherwise.
-    score_containers = 0
-
-    # "Lighter" version of fuzzy-matching, containers are not taken into
-    # account (because specializing every file in the tree would be rather
-    # non-effective).
-    for name1, file1 in members1.items():
-        if file1.is_directory() or not file1.fuzzy_hash:
-            continue
-        comparisons = []
-        for name2, file2 in members2.items():
-            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
-                nonpaired.discard(name2)
-                continue
-            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
-        if comparisons:
-            score, name2 = min(comparisons, key=operator.itemgetter(0))
-            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
-            if score < Config().fuzzy_threshold:
-                nonpaired.discard(name2)
-                already_compared.add(name2)
-                score_containers = max(score_containers, score)
-            else:
-                # Haven't fount a pair for some file in container1.
-                return Config().fuzzy_threshold
-
-    # Found a pair for every file.
-    if len(nonpaired) == 0:
-        return score_containers
-    # Haven't fount a pair for some files in container2.
-    return Config().fuzzy_threshold
-
 def perform_fuzzy_matching(members1, members2):
     if tlsh == None or Config().fuzzy_threshold == 0:
         return
@@ -85,35 +40,15 @@ def perform_fuzzy_matching(members1, members2):
     for name1, file1 in members1.items():
         if file1.is_directory() or not file1.fuzzy_hash:
             continue
-        specialize(file1)
-        if isinstance(file1, MissingFile):
-            continue
         comparisons = []
         for name2, file2 in members2.items():
-            if name2 in already_compared or file2.is_directory() \
-                    or not file2.fuzzy_hash:
+            if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
                 continue
-
-            matched_containers = None
-            # Avoid specializing file2 if file1 is definitely not a container.
-            if file1.as_container:
-                specialize(file2)
-                if isinstance(file2, MissingFile):
-                    continue
-
-                # Containers are matched if we are able to match every member.
-                matched_containers = \
-                    fuzzy_matching_containers(file1.as_container,
-                                              file2.as_container)
-
-            diff_value = tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
-            if matched_containers:
-                diff_value = min(diff_value, matched_containers)
-            comparisons.append((diff_value, name2))
+            comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
         if comparisons:
-            score, name2 = min(comparisons, key=operator.itemgetter(0))
-            logger.debug('fuzzy top match %s %s: %d difference score',
-                         name1, name2, score)
+            comparisons.sort(key=operator.itemgetter(0))
+            score, name2 = comparisons[0]
+            logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
             if score < Config().fuzzy_threshold:
                 yield name1, name2, score
                 already_compared.add(name2)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list