[diffoscope] 01/04: progress: weigh elements by their size

Ximin Luo infinity0 at debian.org
Fri May 26 16:55:54 CEST 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch experimental
in repository diffoscope.

commit 6e05d2be4ddb982ae8ecf014163d5c1d27cd8716
Author: Ximin Luo <infinity0 at debian.org>
Date:   Thu May 25 14:43:34 2017 +0200

    progress: weigh elements by their size
---
 diffoscope/comparators/directory.py        | 18 ++++-----
 diffoscope/comparators/utils/container.py  | 62 ++++++++++++++++++++----------
 diffoscope/comparators/utils/file.py       | 16 ++++++++
 diffoscope/comparators/utils/libarchive.py |  2 -
 diffoscope/excludes.py                     | 12 +++---
 5 files changed, 72 insertions(+), 38 deletions(-)

diff --git a/diffoscope/comparators/directory.py b/diffoscope/comparators/directory.py
index 91535a9..08fec63 100644
--- a/diffoscope/comparators/directory.py
+++ b/diffoscope/comparators/directory.py
@@ -22,11 +22,11 @@ import os
 import re
 import logging
 import subprocess
+from collections import OrderedDict
 
 from diffoscope.exc import RequiredToolNotFound
 from diffoscope.tools import tool_required
 from diffoscope.progress import Progress
-from diffoscope.excludes import filter_excludes
 from diffoscope.difference import Difference
 
 from .binary import FilesystemFile
@@ -191,17 +191,17 @@ class DirectoryContainer(Container):
             return FilesystemFile(os.path.join(self.source.path, member_name), container=self)
 
     def comparisons(self, other):
-        my_names = self.get_member_names()
-        other_names = other.get_member_names()
-        to_compare = set(my_names).intersection(other_names)
-        to_compare = set(filter_excludes(to_compare))
+        my_members = OrderedDict(self.get_filtered_members_sizes())
+        other_members = OrderedDict(other.get_filtered_members_sizes())
+        total_size = sum(x[1] for x in my_members.values()) + sum(x[1] for x in other_members.values())
 
-        with Progress(len(to_compare)) as p:
+        to_compare = set(my_members.keys()).intersection(other_members.keys())
+        with Progress(total_size) as p:
             for name in sorted(to_compare):
-                my_file = self.get_member(name)
-                other_file = other.get_member(name)
+                my_file, my_size = my_members[name]
+                other_file, other_size = other_members[name]
                 yield my_file, other_file, name
-                p.step(msg=name)
+                p.step(my_size + other_size, msg=name)
 
     def compare(self, other, source=None):
         from .utils.compare import compare_files
diff --git a/diffoscope/comparators/utils/container.py b/diffoscope/comparators/utils/container.py
index d46b085..7114dfb 100644
--- a/diffoscope/comparators/utils/container.py
+++ b/diffoscope/comparators/utils/container.py
@@ -20,14 +20,16 @@
 import abc
 import logging
 import itertools
-import collections
+from collections import OrderedDict
 
 from diffoscope.config import Config
 from diffoscope.difference import Difference
+from diffoscope.excludes import filter_excludes
 from diffoscope.progress import Progress
 
 from ..missing_file import MissingFile
 
+from .file import path_apparent_size
 from .fuzzy import perform_fuzzy_matching
 
 NO_COMMENT = None
@@ -61,8 +63,7 @@ class Container(object, metaclass=abc.ABCMeta):
         Returns a dictionary. The key is what is used to match when comparing
         containers.
         """
-
-        return collections.OrderedDict(self.get_all_members())
+        return OrderedDict(self.get_all_members())
 
     def lookup_file(self, *names):
         """
@@ -96,41 +97,60 @@ class Container(object, metaclass=abc.ABCMeta):
     def get_member(self, member_name):
         raise NotImplementedError()
 
+    def get_filtered_member_names(self):
+        return filter_excludes(self.get_member_names())
+
+    def get_filtered_members_sizes(self):
+        for name in self.get_filtered_member_names():
+            member = self.get_member(name)
+            if member.is_directory():
+                size = 4096 # default "size" of a directory
+            else:
+                size = path_apparent_size(member.path)
+            yield name, (member, size)
+
     def get_all_members(self):
         # If your get_member implementation is O(n) then this will be O(n^2)
         # cost. In such cases it is HIGHLY RECOMMENDED to override this as well
-        for name in self.get_member_names():
+        for name in self.get_filtered_member_names():
             yield name, self.get_member(name)
 
     def comparisons(self, other):
-        my_members = self.get_members()
-        my_reminders = collections.OrderedDict()
-        other_members = other.get_members()
+        my_members = OrderedDict(self.get_filtered_members_sizes())
+        my_remainders = OrderedDict()
+        other_members = OrderedDict(other.get_filtered_members_sizes())
+        total_size = sum(x[1] for x in my_members.values()) + sum(x[1] for x in other_members.values())
+        # TODO: progress could be a bit more accurate here, give more weight to fuzzy-hashed files
 
-        with Progress(max(len(my_members), len(other_members))) as p:
+        with Progress(total_size) as p:
             # keep it sorted like my members
             while my_members:
-                my_member_name, my_member = my_members.popitem(last=False)
+                my_member_name, (my_member, my_size) = my_members.popitem(last=False)
                 if my_member_name in other_members:
-                    p.step(msg=my_member.progress_name)
-                    yield my_member, other_members.pop(my_member_name), NO_COMMENT
+                    other_member, other_size = other_members.pop(my_member_name)
+                    p.step(my_size + other_size, msg=my_member.progress_name)
+                    yield my_member, other_member, NO_COMMENT
                 else:
-                    my_reminders[my_member_name] = my_member
-
-            my_members = my_reminders
-            for my_name, other_name, score in perform_fuzzy_matching(my_members, other_members):
+                    my_remainders[my_member_name] = (my_member, my_size)
+
+            my_members = my_remainders
+            my_members_fuzz = OrderedDict((k, v[0]) for k, v in my_members.items())
+            other_members_fuzz = OrderedDict((k, v[0]) for k, v in other_members.items())
+            for my_name, other_name, score in perform_fuzzy_matching(my_members_fuzz, other_members_fuzz):
+                my_member, my_size = my_members.pop(my_name)
+                other_member, other_size = other_members.pop(other_name)
                 comment = "Files similar despite different names" \
                     " (difference score: {})".format(score)
-                p.step(2, msg=my_name)
-                yield my_members.pop(my_name), other_members.pop(other_name), comment
+                p.step(my_size + other_size, msg=my_name)
+                yield my_member, other_member, comment
 
             if Config().new_file:
-                for my_member in my_members.values():
-                    p.step(msg=my_member.progress_name)
+                for my_member, my_size in my_members.values():
+                    p.step(my_size, msg=my_member.progress_name)
                     yield my_member, MissingFile('/dev/null', my_member), NO_COMMENT
 
-                for other_member in other_members.values():
-                    p.step(msg=other_member.progress_name)
+                for other_member, other_size in other_members.values():
+                    p.step(other_size, msg=other_member.progress_name)
                     yield MissingFile('/dev/null', other_member), other_member, NO_COMMENT
 
     def compare(self, other, source=None, no_recurse=False):
diff --git a/diffoscope/comparators/utils/file.py b/diffoscope/comparators/utils/file.py
index ff0774f..f38cead 100644
--- a/diffoscope/comparators/utils/file.py
+++ b/diffoscope/comparators/utils/file.py
@@ -41,6 +41,22 @@ SMALL_FILE_THRESHOLD = 65536 # 64 kiB
 logger = logging.getLogger(__name__)
 
 
+def path_apparent_size(path=".", visited=None):
+    # should output the same as `du --apparent-size -bs "$path"`
+    if not visited:
+        stat = os.stat(path, follow_symlinks=False)
+        visited = { stat.st_ino: stat.st_size }
+    if os.path.isdir(path) and not os.path.islink(path):
+        for entry in os.scandir(path):
+            inode = entry.inode()
+            if inode in visited:
+                continue
+            visited[inode] = entry.stat(follow_symlinks=False).st_size
+            if entry.is_dir(follow_symlinks=False):
+                folder_size(entry.path, visited)
+    return sum(visited.values())
+
+
 class File(object, metaclass=abc.ABCMeta):
     RE_FILE_TYPE = None
     RE_FILE_EXTENSION = None
diff --git a/diffoscope/comparators/utils/libarchive.py b/diffoscope/comparators/utils/libarchive.py
index 2895c67..7ec2b0f 100644
--- a/diffoscope/comparators/utils/libarchive.py
+++ b/diffoscope/comparators/utils/libarchive.py
@@ -187,8 +187,6 @@ class LibarchiveContainer(Archive):
     def get_all_members(self):
         with libarchive.file_reader(self.source.path) as archive:
             for entry in archive:
-                if any_excluded(entry.pathname):
-                    continue
                 yield entry.pathname, self.get_subclass(entry)
 
     def get_subclass(self, entry):
diff --git a/diffoscope/excludes.py b/diffoscope/excludes.py
index 0bb509a..2ce60ee 100644
--- a/diffoscope/excludes.py
+++ b/diffoscope/excludes.py
@@ -26,17 +26,17 @@ logger = logging.getLogger(__name__)
 
 
 def filter_excludes(filenames):
-    result = []
-
     for x in filenames:
         for y in Config().excludes:
             if fnmatch.fnmatchcase(x, y):
                 logger.debug("Excluding %s as it matches pattern '%s'", x, y)
                 break
         else:
-            result.append(x)
-
-    return result
+            yield x
 
 def any_excluded(*filenames):
-    return len(filter_excludes(filenames)) != len(filenames)
+    for x in filenames:
+        for y in Config().excludes:
+            if fnmatch.fnmatchcase(x, y):
+                return True
+    return False

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list