[diffoscope] 01/03: Revert "Add fuzzy-matching across containers."
Maria Glukhova
siamezzze-guest at moszumanska.debian.org
Sun Jan 22 19:01:19 CET 2017
This is an automated email from the git hooks/post-receive script.
siamezzze-guest pushed a commit to branch siamezzze/containers
in repository diffoscope.
commit ce81bcbd386de3c0e3b6d3dae1892162a3744cba
Author: Maria Glukhova <siamezzze at gmail.com>
Date: Sun Jan 22 04:49:22 2017 +0200
Revert "Add fuzzy-matching across containers."
This reverts commit 3218f0c81b64616adcc80a1230cc5a7c1f5d14e8.
---
diffoscope/comparators/utils/fuzzy.py | 75 +++--------------------------------
1 file changed, 5 insertions(+), 70 deletions(-)
diff --git a/diffoscope/comparators/utils/fuzzy.py b/diffoscope/comparators/utils/fuzzy.py
index d05ca96..4d6b865 100644
--- a/diffoscope/comparators/utils/fuzzy.py
+++ b/diffoscope/comparators/utils/fuzzy.py
@@ -21,8 +21,6 @@ import logging
import operator
from diffoscope.config import Config
-from diffoscope.comparators.missing_file import MissingFile
-from .specialize import specialize
try:
import tlsh
@@ -32,49 +30,6 @@ except ImportError:
logger = logging.getLogger(__name__)
-def fuzzy_matching_containers(container1, container2):
- if container1 is None or container2 is None:
- return None
-
- already_compared = set()
- members1 = container1.get_members()
- members2 = container2.get_members()
- nonpaired = set(members2)
-
- # Set difference score of containers to the maximum difference score
- # for each matched member, if every member is matched, and fuzzy_threshold
- # otherwise.
- score_containers = 0
-
- # "Lighter" version of fuzzy-matching, containers are not taken into
- # account (because specializing every file in the tree would be rather
- # non-effective).
- for name1, file1 in members1.items():
- if file1.is_directory() or not file1.fuzzy_hash:
- continue
- comparisons = []
- for name2, file2 in members2.items():
- if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
- nonpaired.discard(name2)
- continue
- comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
- if comparisons:
- score, name2 = min(comparisons, key=operator.itemgetter(0))
- logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
- if score < Config().fuzzy_threshold:
- nonpaired.discard(name2)
- already_compared.add(name2)
- score_containers = max(score_containers, score)
- else:
- # Haven't fount a pair for some file in container1.
- return Config().fuzzy_threshold
-
- # Found a pair for every file.
- if len(nonpaired) == 0:
- return score_containers
- # Haven't fount a pair for some files in container2.
- return Config().fuzzy_threshold
-
def perform_fuzzy_matching(members1, members2):
if tlsh == None or Config().fuzzy_threshold == 0:
return
@@ -85,35 +40,15 @@ def perform_fuzzy_matching(members1, members2):
for name1, file1 in members1.items():
if file1.is_directory() or not file1.fuzzy_hash:
continue
- specialize(file1)
- if isinstance(file1, MissingFile):
- continue
comparisons = []
for name2, file2 in members2.items():
- if name2 in already_compared or file2.is_directory() \
- or not file2.fuzzy_hash:
+ if name2 in already_compared or file2.is_directory() or not file2.fuzzy_hash:
continue
-
- matched_containers = None
- # Avoid specializing file2 if file1 is definitely not a container.
- if file1.as_container:
- specialize(file2)
- if isinstance(file2, MissingFile):
- continue
-
- # Containers are matched if we are able to match every member.
- matched_containers = \
- fuzzy_matching_containers(file1.as_container,
- file2.as_container)
-
- diff_value = tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
- if matched_containers:
- diff_value = min(diff_value, matched_containers)
- comparisons.append((diff_value, name2))
+ comparisons.append((tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
if comparisons:
- score, name2 = min(comparisons, key=operator.itemgetter(0))
- logger.debug('fuzzy top match %s %s: %d difference score',
- name1, name2, score)
+ comparisons.sort(key=operator.itemgetter(0))
+ score, name2 = comparisons[0]
+ logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
if score < Config().fuzzy_threshold:
yield name1, name2, score
already_compared.add(name2)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the diffoscope
mailing list