[diffoscope] 05/05: WIP: more intuitive "limit" flags

Ximin Luo infinity0 at debian.org
Wed Jun 21 23:10:30 CEST 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch WIP/humungous-diffs
in repository diffoscope.

commit 2fdd410fec6d0d4ccff520abb4058c384827e4e1
Author: Ximin Luo <infinity0 at debian.org>
Date:   Wed Jun 21 23:10:01 2017 +0200

    WIP: more intuitive "limit" flags
---
 diffoscope/config.py               | 35 +++++++++-------
 diffoscope/main.py                 | 86 +++++++++++++++++++++-----------------
 diffoscope/presenters/html/html.py | 58 ++++++++++++++++---------
 3 files changed, 105 insertions(+), 74 deletions(-)

diff --git a/diffoscope/config.py b/diffoscope/config.py
index bd6b3a9..6572980 100644
--- a/diffoscope/config.py
+++ b/diffoscope/config.py
@@ -20,16 +20,21 @@
 
 
 class Config(object):
-    max_diff_block_lines = 256
-    max_diff_block_lines_parent = 50
     max_diff_block_lines_saved = float("inf")
-    # html-dir output uses ratio * max-diff-block-lines as its limit
-    max_diff_block_lines_html_dir_ratio = 4
     # GNU diff cannot process arbitrary large files :(
     max_diff_input_lines = 2 ** 20
-    max_report_size = 2000 * 2 ** 10 # 2000 kB
+
+    # hard limits, restricts single-file and multi-file formats
+    max_report_size = 40 * 2 ** 20 # 40 MB
+    max_diff_block_lines = 2 ** 10 # 1024 lines
+    # structural limits, restricts single-file formats
+    # semi-restricts multi-file formats
+    max_page_size_parent = 400 * 2 ** 10 # 400 kB
+    max_page_size_child = 200 * 2 ** 10 # 200 kB
+    max_diff_block_lines_parent = 2 ** 7 # 128 lines
+
     max_text_report_size = 0
-    max_report_child_size = 500 * 2 ** 10
+
     new_file = False
     fuzzy_threshold = 60
     enforce_constraints = True
@@ -46,15 +51,13 @@ class Config(object):
     def __setattr__(self, k, v):
         super(Config, self).__setattr__(k, v)
 
-        if self.enforce_constraints:
-            self.check_constraints()
+    def check_ge(self, a, b):
+        va = getattr(self, a)
+        vb = getattr(self, b)
+        if va < vb:
+            raise ValueError("{0} ({1}) cannot be smaller than {2} ({3})".format(a, va, b, vb))
 
     def check_constraints(self):
-        max_ = self.max_diff_block_lines_html_dir_ratio * \
-            self.max_diff_block_lines
-        if self.max_diff_block_lines_saved < max_:  # noqa
-            raise ValueError("max_diff_block_lines_saved "
-                "({0.max_diff_block_lines_saved}) cannot be smaller than "
-                "{0.max_diff_block_lines_html_dir_ratio} * "
-                "max_diff_block_lines ({1})".format(self, max_),
-            )
+        self.check_ge("max_diff_block_lines", "max_diff_block_lines_parent")
+        self.check_ge("max_report_size", "max_page_size_parent")
+        self.check_ge("max_report_size", "max_page_size_child")
diff --git a/diffoscope/main.py b/diffoscope/main.py
index 504a8fc..8e3d2ae 100644
--- a/diffoscope/main.py
+++ b/diffoscope/main.py
@@ -116,46 +116,50 @@ def create_parser():
     group2.add_argument('--no-default-limits', action='store_true', default=False,
                         help='Disable most default limits. Note that text '
                         'output already ignores most of these.')
+    # everything marked with default=None below is affected by no-default-limits
     group2.add_argument('--max-text-report-size', metavar='BYTES', type=int,
                         help='Maximum bytes written in --text report. (0 to '
-                        'disable)', default=None).completer=RangeCompleter(0,
-                        Config().max_text_report_size, 200000)
+                        'disable)', default=None)
     group2.add_argument('--max-report-size', metavar='BYTES', type=int,
-                        help='Maximum bytes written in report. In html-dir '
-                        'output, this is the max bytes of the parent page. '
-                        '(0 to disable, default: %d)' %
-                        Config().max_report_size,
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_report_size, 200000)
-    group2.add_argument('--max-report-child-size', metavar='BYTES', type=int,
-                        help='In --html-dir output, this is the max bytes of '
-                        'each child page (0 to disable, default: %(default)s, '
-                        'remaining in effect even with --no-default-limits)',
-                        default=Config().max_report_child_size).completer=RangeCompleter(0,
-                        Config().max_report_child_size, 50000)
+                        help='Maximum bytes of a report in a given format, '
+                        'across all of its pages. Note that some formats, such '
+                        'as --html, may be restricted by even smaller limits '
+                        'such as --max-page-size. (0 to disable, default: %d)' %
+                        Config().max_report_size, default=None).completer=RangeCompleter(
+                        Config().max_report_size)
+    group2.add_argument('--max-page-size-parent', metavar='BYTES', type=int,
+                        help='Maximum bytes of the top-level (--html-dir) or sole '
+                        '(--html) page. (default: %(default)s, remains in effect '
+                        'even with --no-default-limits)', default=
+                        Config().max_page_size_parent).completer=RangeCompleter(
+                        Config().max_page_size_parent)
+    group2.add_argument('--max-page-size-child', metavar='BYTES', type=int,
+                        help='In --html-dir output, this is the maximum bytes of '
+                        'each child page (default: %(default)s, remains in '
+                        'effect even with --no-default-limits)', default=
+                        Config().max_page_size_child).completer=RangeCompleter(
+                        Config().max_page_size_child)
     group2.add_argument('--max-diff-block-lines', metavar='LINES', type=int,
-                        help='Maximum number of lines output per diff block. '
-                        'In --html-dir output, we use %d times this number instead, '
-                        'taken over all pages. (0 to disable, default: %d)' %
-                        (Config().max_diff_block_lines_html_dir_ratio,
-                        Config().max_diff_block_lines),
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_diff_block_lines, 5)
+                        help='In --html-dir output, this is the maximum number '
+                        'of lines output per unified-diff block, across all child '
+                        'pages. (0 to disable, default: %d)' %
+                        Config().max_diff_block_lines, default=None).completer=RangeCompleter(
+                        Config().max_diff_block_lines)
     group2.add_argument('--max-diff-block-lines-parent', metavar='LINES', type=int,
-                        help='In --html-dir output, this is maximum number of '
-                        'lines output per diff block on the parent page '
-                        'before spilling it into child pages (0 to disable, '
-                        'default: %(default)s, remaining in effect even with '
-                        '--no-default-limits)',
-                        default=Config().max_diff_block_lines_parent).completer=RangeCompleter(0,
-                        Config().max_diff_block_lines_parent, 200)
+                        help='Maximum number of lines output per diff block on '
+                        'the top-level (--html-dir) or sole (--html) page, before '
+                        'spilling it into child pages (--html-dir) or skipping the '
+                        'rest of the diff block. (default: %(default)s, remains '
+                        'in effect even with --no-default-limits)', default=
+                        Config().max_diff_block_lines_parent).completer=RangeCompleter(
+                        Config().max_diff_block_lines_parent)
     group2.add_argument('--max-diff-block-lines-saved', metavar='LINES', type=int,
                         help='Maximum number of lines saved per diff block. '
                         'Most users should not need this, unless you run out '
                         'of memory. This truncates diff(1) output before even '
                         'trying to emit it in a report. This also affects --text '
                         'output. (0 to disable, default: 0)',
-                        default=0).completer=RangeCompleter(0, 0, 200)
+                        default=0)
 
     group3 = parser.add_argument_group('diff calculation')
     group3.add_argument('--new-file', action='store_true',
@@ -173,14 +177,13 @@ def create_parser():
     group3.add_argument('--fuzzy-threshold', type=int,
                         help='Threshold for fuzzy-matching '
                         '(0 to disable, %(default)s is default, 400 is high fuzziness)',
-                        default=Config().fuzzy_threshold).completer=RangeCompleter(0,
-                        400, 20)
+                        default=Config().fuzzy_threshold).completer=RangeCompleter(400)
     group3.add_argument('--max-diff-input-lines', metavar='LINES', type=int,
                         help='Maximum number of lines fed to diff(1) '
                         '(0 to disable, default: %d)' %
                         Config().max_diff_input_lines,
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_diff_input_lines, 5000)
+                        default=None).completer=RangeCompleter(
+                        Config().max_diff_input_lines)
     group3.add_argument('--max-container-depth', metavar='DEPTH', type=int,
                         help='Maximum depth to recurse into containers. '
                         '(Cannot be disabled for security reasons, default: '
@@ -214,8 +217,12 @@ def create_parser():
 
 
 class RangeCompleter(object):
-    def __init__(self, start, end, step):
-        self.choices = range(start, end + 1, step)
+    def __init__(self, start, end=0, divisions=16):
+        if end < start:
+            tmp = end
+            end = start
+            start = tmp
+        self.choices = range(start, end + 1, int((end-start+1)/divisions))
 
     def __call__(self, prefix, **kwargs):
         return (str(i) for i in self.choices if str(i).startswith(prefix))
@@ -289,11 +296,11 @@ def run_diffoscope(parsed_args):
         logger.warning('Fuzzy-matching is currently disabled as the "tlsh" module is unavailable.')
     maybe_set_limit(Config(), parsed_args, "max_report_size")
     maybe_set_limit(Config(), parsed_args, "max_text_report_size")
-    maybe_set_limit(Config(), parsed_args, "max_report_child_size")
-    # need to set them in this order due to Config._check_constraints
-    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
-    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_parent")
+    Config().max_page_size_parent = parsed_args.max_page_size_parent
+    Config().max_page_size_child = parsed_args.max_page_size_child
     maybe_set_limit(Config(), parsed_args, "max_diff_block_lines")
+    Config().max_diff_block_lines_parent = parsed_args.max_diff_block_lines_parent
+    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
     maybe_set_limit(Config(), parsed_args, "max_diff_input_lines")
     Config().max_container_depth = parsed_args.max_container_depth
     Config().fuzzy_threshold = parsed_args.fuzzy_threshold
@@ -301,6 +308,7 @@ def run_diffoscope(parsed_args):
     Config().excludes = parsed_args.excludes
     Config().exclude_commands = parsed_args.exclude_commands
     Config().compute_visual_diffs = PresenterManager().compute_visual_diffs()
+    Config().check_constraints()
     set_path()
     set_locale()
     path1, path2 = parsed_args.path1, parsed_args.path2
diff --git a/diffoscope/presenters/html/html.py b/diffoscope/presenters/html/html.py
index dcc3843..b358bc5 100644
--- a/diffoscope/presenters/html/html.py
+++ b/diffoscope/presenters/html/html.py
@@ -190,7 +190,7 @@ def output_node(difference, path, indentstr, indentnum, css_url, directory):
     if difference.unified_diff:
         def print_func(x, force=False):
             udiff.write(x)
-        HTMLPresenter().output_unified_diff(print_func, css_url, directory, difference.unified_diff, difference.has_internal_linenos)
+        HTMLSideBySidePresenter().output_unified_diff(print_func, css_url, directory, difference.unified_diff, difference.has_internal_linenos)
 
     # Construct a PartialString for this node
     # {3} gets mapped to {-1}, a continuation hole for later child nodes
@@ -251,7 +251,7 @@ def spl_file_printer(directory, filename):
         yield recording_print_func
 
 
-class HTMLPresenter(Presenter):
+class HTMLSideBySidePresenter(object):
     supports_visual_diffs = True
 
     def __init__(self):
@@ -323,29 +323,30 @@ class HTMLPresenter(Presenter):
     def row_was_output(self):
         self.spl_rows += 1
         _, rotation_params = self.spl_print_ctrl
-        max_lines = Config().max_diff_block_lines
+        max_lines = Config().max_diff_block_lines # only for html-dir
         max_lines_parent = Config().max_diff_block_lines_parent
-        max_lines_ratio = Config().max_diff_block_lines_html_dir_ratio
-        max_report_child_size = Config().max_report_child_size
+        max_page_size_child = Config().max_page_size_child
         if not rotation_params:
             # html-dir single output, don't need to rotate
-            if self.spl_rows >= max_lines:
+            if self.spl_rows >= max_lines_parent:
                 raise DiffBlockLimitReached()
             return
         else:
             # html-dir output, perhaps need to rotate
             directory, mainname, css_url = rotation_params
-            if self.spl_rows >= max_lines_ratio * max_lines:
+            if self.spl_rows >= max_lines:
                 raise DiffBlockLimitReached()
 
             if self.spl_current_page == 0: # on parent page
                 if self.spl_rows < max_lines_parent:
                     return
+                logger.debug("new unified-diff subpage, parent page went over %s lines", max_lines_parent)
             else: # on child page
                 # TODO: make this stay below the max, instead of going 1 row over the max
                 # will require some backtracking...
-                if self.spl_print_func.bytes_written < max_report_child_size:
+                if self.spl_print_func.bytes_written < max_page_size_child:
                     return
+                logger.debug("new unified-diff subpage, previous subpage went over %s bytes", max_page_size_child)
 
         self.spl_current_page += 1
         filename = "%s-%s.html" % (mainname, self.spl_current_page)
@@ -414,6 +415,10 @@ class HTMLPresenter(Presenter):
             text = "load diff (%s %s%s)" % (self.spl_current_page, noun, (", truncated" if truncated else ""))
             print_func(templates.UD_TABLE_FOOTER % {"filename": html.escape("%s-1.html" % mainname), "text": text}, force=True)
 
+
+class HTMLPresenter(Presenter):
+    supports_visual_diffs = True
+
     def output_node_placeholder(self, anchor, lazy_load):
         if lazy_load:
             return templates.DIFFNODE_LAZY_LOAD % anchor
@@ -424,13 +429,17 @@ class HTMLPresenter(Presenter):
         partial_outputs = {} # nodes to their partial output
         partial_ancestor = {} # child nodes to ancestor nodes
         placeholder_len = len(self.output_node_placeholder("XXXXXXXXXXXXXXXX", not single_page))
+        report_printed = [0]
+        report_limit = Config().max_report_size
 
         with contextlib.ExitStack() as xstack:
             printers = {} # nodes to their printers
             def maybe_print(node):
                 if partial_outputs[node].holes:
                     return
-                printers[node](partial_outputs[node].format())
+                out = partial_outputs[node].format()
+                report_printed[0] += len(out)
+                printers[node](out)
                 del partial_outputs[node]
                 del printers[node]
 
@@ -443,35 +452,45 @@ class HTMLPresenter(Presenter):
                 ancestor = partial_ancestor.pop(node, None)
                 logger.debug('html output for %s', node.source1)
                 path = score[2] + [node.source1]
+                # TODO: can make this more efficient by doing a "size estimate" first
+                # TODO: take into account the size of the unified-diff child pages
                 node_output = output_node(node, path, "  ", len(path)-1, css_url, None if single_page else target)
                 anchor = output_anchor(path)
 
+                add_to_existing = False
                 if ancestor:
-                    limit = Config().max_report_child_size
-                    logger.debug("output size: %s, %s",
-                        partial_outputs[ancestor].size(placeholder_len), node_output.size(placeholder_len))
-                else:
-                    limit = Config().max_report_size
+                    page_limit = Config().max_page_size_parent if ancestor is difference else Config().max_page_size_child
+                    page_current = partial_outputs[ancestor].size(placeholder_len)
+                    report_current = report_printed[0] + sum(p.size(placeholder_len) for p in partial_outputs.values())
+                    want_to_add = node_output.size(placeholder_len)
+                    logger.debug("report size: %s/%s, page size: %s/%s, want to add %s)", report_current, report_limit, page_current, page_limit, want_to_add)
+                    if report_current + want_to_add > report_limit:
+                        make_new_subpage = False
+                    elif page_current + want_to_add < page_limit:
+                        add_to_existing = True
+                    else:
+                        make_new_subpage = not single_page
 
-                if ancestor and partial_outputs[ancestor].size(placeholder_len) + node_output.size(placeholder_len) < limit:
+                if add_to_existing:
                     # under limit, add it to an existing page
                     partial_outputs[ancestor] = partial_outputs[ancestor].pformat({node: node_output})
                     stored = ancestor
 
                 else:
-                    # over limit (or root), new subpage
+                    # over limit (or root), new subpage or continue/break
                     if ancestor:
-                        placeholder = self.output_node_placeholder(anchor, not single_page)
+                        placeholder = self.output_node_placeholder(anchor, make_new_subpage)
                         partial_outputs[ancestor] = partial_outputs[ancestor].pformat({node: placeholder})
                         maybe_print(ancestor)
                         footer = get_footer()
-                        if single_page:
+                        if not make_new_subpage: # we hit a limit, either max-report-size or single-page
                             if not partial_outputs:
-                                # already output a single page, don't iterate through any more children
+                                # no more holes, don't iterate through any more children
                                 break
                             else:
                                 continue
                     else:
+                        # unconditionally write the root node regardless of limits
                         assert node is difference
                         footer = get_footer(jquery_url)
                         anchor = "index"
@@ -479,6 +498,7 @@ class HTMLPresenter(Presenter):
                     partial_outputs[node] = node_output.frame(
                         get_header(css_url) + u'<div class="difference">\n',
                         u'</div>\n' + footer)
+                    assert not single_page or node is difference
                     printer = make_printer(target) if single_page else file_printer(target, "%s.html" % anchor)
                     printers[node] = xstack.enter_context(printer)
                     stored = node

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list