[diffoscope] 01/04: progress: weigh elements by their size
Ximin Luo
infinity0 at debian.org
Fri May 26 16:55:54 CEST 2017
This is an automated email from the git hooks/post-receive script.
infinity0 pushed a commit to branch experimental
in repository diffoscope.
commit 6e05d2be4ddb982ae8ecf014163d5c1d27cd8716
Author: Ximin Luo <infinity0 at debian.org>
Date: Thu May 25 14:43:34 2017 +0200
progress: weigh elements by their size
---
diffoscope/comparators/directory.py | 18 ++++-----
diffoscope/comparators/utils/container.py | 62 ++++++++++++++++++++----------
diffoscope/comparators/utils/file.py | 16 ++++++++
diffoscope/comparators/utils/libarchive.py | 2 -
diffoscope/excludes.py | 12 +++---
5 files changed, 72 insertions(+), 38 deletions(-)
diff --git a/diffoscope/comparators/directory.py b/diffoscope/comparators/directory.py
index 91535a9..08fec63 100644
--- a/diffoscope/comparators/directory.py
+++ b/diffoscope/comparators/directory.py
@@ -22,11 +22,11 @@ import os
import re
import logging
import subprocess
+from collections import OrderedDict
from diffoscope.exc import RequiredToolNotFound
from diffoscope.tools import tool_required
from diffoscope.progress import Progress
-from diffoscope.excludes import filter_excludes
from diffoscope.difference import Difference
from .binary import FilesystemFile
@@ -191,17 +191,17 @@ class DirectoryContainer(Container):
return FilesystemFile(os.path.join(self.source.path, member_name), container=self)
def comparisons(self, other):
- my_names = self.get_member_names()
- other_names = other.get_member_names()
- to_compare = set(my_names).intersection(other_names)
- to_compare = set(filter_excludes(to_compare))
+ my_members = OrderedDict(self.get_filtered_members_sizes())
+ other_members = OrderedDict(other.get_filtered_members_sizes())
+ total_size = sum(x[1] for x in my_members.values()) + sum(x[1] for x in other_members.values())
- with Progress(len(to_compare)) as p:
+ to_compare = set(my_members.keys()).intersection(other_members.keys())
+ with Progress(total_size) as p:
for name in sorted(to_compare):
- my_file = self.get_member(name)
- other_file = other.get_member(name)
+ my_file, my_size = my_members[name]
+ other_file, other_size = other_members[name]
yield my_file, other_file, name
- p.step(msg=name)
+ p.step(my_size + other_size, msg=name)
def compare(self, other, source=None):
from .utils.compare import compare_files
diff --git a/diffoscope/comparators/utils/container.py b/diffoscope/comparators/utils/container.py
index d46b085..7114dfb 100644
--- a/diffoscope/comparators/utils/container.py
+++ b/diffoscope/comparators/utils/container.py
@@ -20,14 +20,16 @@
import abc
import logging
import itertools
-import collections
+from collections import OrderedDict
from diffoscope.config import Config
from diffoscope.difference import Difference
+from diffoscope.excludes import filter_excludes
from diffoscope.progress import Progress
from ..missing_file import MissingFile
+from .file import path_apparent_size
from .fuzzy import perform_fuzzy_matching
NO_COMMENT = None
@@ -61,8 +63,7 @@ class Container(object, metaclass=abc.ABCMeta):
Returns a dictionary. The key is what is used to match when comparing
containers.
"""
-
- return collections.OrderedDict(self.get_all_members())
+ return OrderedDict(self.get_all_members())
def lookup_file(self, *names):
"""
@@ -96,41 +97,60 @@ class Container(object, metaclass=abc.ABCMeta):
def get_member(self, member_name):
raise NotImplementedError()
+ def get_filtered_member_names(self):
+ return filter_excludes(self.get_member_names())
+
+ def get_filtered_members_sizes(self):
+ for name in self.get_filtered_member_names():
+ member = self.get_member(name)
+ if member.is_directory():
+ size = 4096 # default "size" of a directory
+ else:
+ size = path_apparent_size(member.path)
+ yield name, (member, size)
+
def get_all_members(self):
# If your get_member implementation is O(n) then this will be O(n^2)
# cost. In such cases it is HIGHLY RECOMMENDED to override this as well
- for name in self.get_member_names():
+ for name in self.get_filtered_member_names():
yield name, self.get_member(name)
def comparisons(self, other):
- my_members = self.get_members()
- my_reminders = collections.OrderedDict()
- other_members = other.get_members()
+ my_members = OrderedDict(self.get_filtered_members_sizes())
+ my_remainders = OrderedDict()
+ other_members = OrderedDict(other.get_filtered_members_sizes())
+ total_size = sum(x[1] for x in my_members.values()) + sum(x[1] for x in other_members.values())
+ # TODO: progress could be a bit more accurate here, give more weight to fuzzy-hashed files
- with Progress(max(len(my_members), len(other_members))) as p:
+ with Progress(total_size) as p:
# keep it sorted like my members
while my_members:
- my_member_name, my_member = my_members.popitem(last=False)
+ my_member_name, (my_member, my_size) = my_members.popitem(last=False)
if my_member_name in other_members:
- p.step(msg=my_member.progress_name)
- yield my_member, other_members.pop(my_member_name), NO_COMMENT
+ other_member, other_size = other_members.pop(my_member_name)
+ p.step(my_size + other_size, msg=my_member.progress_name)
+ yield my_member, other_member, NO_COMMENT
else:
- my_reminders[my_member_name] = my_member
-
- my_members = my_reminders
- for my_name, other_name, score in perform_fuzzy_matching(my_members, other_members):
+ my_remainders[my_member_name] = (my_member, my_size)
+
+ my_members = my_remainders
+ my_members_fuzz = OrderedDict((k, v[0]) for k, v in my_members.items())
+ other_members_fuzz = OrderedDict((k, v[0]) for k, v in other_members.items())
+ for my_name, other_name, score in perform_fuzzy_matching(my_members_fuzz, other_members_fuzz):
+ my_member, my_size = my_members.pop(my_name)
+ other_member, other_size = other_members.pop(other_name)
comment = "Files similar despite different names" \
" (difference score: {})".format(score)
- p.step(2, msg=my_name)
- yield my_members.pop(my_name), other_members.pop(other_name), comment
+ p.step(my_size + other_size, msg=my_name)
+ yield my_member, other_member, comment
if Config().new_file:
- for my_member in my_members.values():
- p.step(msg=my_member.progress_name)
+ for my_member, my_size in my_members.values():
+ p.step(my_size, msg=my_member.progress_name)
yield my_member, MissingFile('/dev/null', my_member), NO_COMMENT
- for other_member in other_members.values():
- p.step(msg=other_member.progress_name)
+ for other_member, other_size in other_members.values():
+ p.step(other_size, msg=other_member.progress_name)
yield MissingFile('/dev/null', other_member), other_member, NO_COMMENT
def compare(self, other, source=None, no_recurse=False):
diff --git a/diffoscope/comparators/utils/file.py b/diffoscope/comparators/utils/file.py
index ff0774f..f38cead 100644
--- a/diffoscope/comparators/utils/file.py
+++ b/diffoscope/comparators/utils/file.py
@@ -41,6 +41,22 @@ SMALL_FILE_THRESHOLD = 65536 # 64 kiB
logger = logging.getLogger(__name__)
+def path_apparent_size(path=".", visited=None):
+ # should output the same as `du --apparent-size -bs "$path"`
+ if not visited:
+ stat = os.stat(path, follow_symlinks=False)
+ visited = { stat.st_ino: stat.st_size }
+ if os.path.isdir(path) and not os.path.islink(path):
+ for entry in os.scandir(path):
+ inode = entry.inode()
+ if inode in visited:
+ continue
+ visited[inode] = entry.stat(follow_symlinks=False).st_size
+ if entry.is_dir(follow_symlinks=False):
+ folder_size(entry.path, visited)
+ return sum(visited.values())
+
+
class File(object, metaclass=abc.ABCMeta):
RE_FILE_TYPE = None
RE_FILE_EXTENSION = None
diff --git a/diffoscope/comparators/utils/libarchive.py b/diffoscope/comparators/utils/libarchive.py
index 2895c67..7ec2b0f 100644
--- a/diffoscope/comparators/utils/libarchive.py
+++ b/diffoscope/comparators/utils/libarchive.py
@@ -187,8 +187,6 @@ class LibarchiveContainer(Archive):
def get_all_members(self):
with libarchive.file_reader(self.source.path) as archive:
for entry in archive:
- if any_excluded(entry.pathname):
- continue
yield entry.pathname, self.get_subclass(entry)
def get_subclass(self, entry):
diff --git a/diffoscope/excludes.py b/diffoscope/excludes.py
index 0bb509a..2ce60ee 100644
--- a/diffoscope/excludes.py
+++ b/diffoscope/excludes.py
@@ -26,17 +26,17 @@ logger = logging.getLogger(__name__)
def filter_excludes(filenames):
- result = []
-
for x in filenames:
for y in Config().excludes:
if fnmatch.fnmatchcase(x, y):
logger.debug("Excluding %s as it matches pattern '%s'", x, y)
break
else:
- result.append(x)
-
- return result
+ yield x
def any_excluded(*filenames):
- return len(filter_excludes(filenames)) != len(filenames)
+ for x in filenames:
+ for y in Config().excludes:
+ if fnmatch.fnmatchcase(x, y):
+ return True
+ return False
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the diffoscope mailing list