[diffoscope] 03/06: presenters: html: more intuitive "limit" flags, some backwards-incompatible changes:

Ximin Luo infinity0 at debian.org
Thu Jul 6 10:51:08 CEST 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch WIP/humungous-diffs
in repository diffoscope.

commit c6aaca0fc4caee4a6b2b2f672641ff8e5746b882
Author: Ximin Luo <infinity0 at debian.org>
Date:   Mon Jun 26 16:00:18 2017 +0200

    presenters: html: more intuitive "limit" flags, some backwards-incompatible changes:
    
    --max-report-size:
      Old: in --html-dir this limited only the parent page
      New: in --html-dir this applies across all pages
    
    --max-diff-block-lines:
      Old: in --html-dir 4 * this number applied across all pages (for a given diff block)
      New: in --html-dir this applies across all pages (for a given diff block)
    
    --max-page-size:
      New flag
      Applies to the sole --html page, or the top-level --html-dir page
    
    --max-report-child-size
      Renamed to
    --max-page-size-child:
      No behavioural changes
    
    --max-diff-block-lines-parent
      Renamed to
    --max-page-diff-block-lines:
      Old: Only applied to the top-level --html-dir page
      New: Applies to the sole --html page, or the top-level --html-dir page
    
    The reasoning behind these changes is that it's unlikely someone would want to
    generate a 500MB single html page, but they might in theory generate a 500MB html
    directory split up into several 200KB pages, plus a single 200KB html page as
    a summary. The new semantics for these flags allow both to be generated in one
    run using the same set of flags.
---
 diffoscope/config.py                    |  45 +++--
 diffoscope/main.py                      | 108 ++++++-----
 diffoscope/presenters/html/html.py      | 317 +++++++++++++++++++++++---------
 diffoscope/presenters/html/templates.py |  16 +-
 4 files changed, 317 insertions(+), 169 deletions(-)

diff --git a/diffoscope/config.py b/diffoscope/config.py
index d5f6007..4565573 100644
--- a/diffoscope/config.py
+++ b/diffoscope/config.py
@@ -20,16 +20,21 @@
 
 
 class Config(object):
-    max_diff_block_lines = 256
-    max_diff_block_lines_parent = 50
-    max_diff_block_lines_saved = float("inf")
-    # html-dir output uses ratio * max-diff-block-lines as its limit
-    max_diff_block_lines_html_dir_ratio = 4
     # GNU diff cannot process arbitrary large files :(
-    max_diff_input_lines = 2 ** 20
-    max_report_size = 2000 * 2 ** 10 # 2000 kB
+    max_diff_input_lines = 2 ** 22
+    max_diff_block_lines_saved = float("inf")
+
+    # hard limits, restricts single-file and multi-file formats
+    max_report_size = 40 * 2 ** 20 # 40 MB
+    max_diff_block_lines = 2 ** 10 # 1024 lines
+    # structural limits, restricts single-file formats
+    # semi-restricts multi-file formats
+    max_page_size = 400 * 2 ** 10 # 400 kB
+    max_page_size_child = 200 * 2 ** 10 # 200 kB
+    max_page_diff_block_lines = 2 ** 7 # 128 lines
+
     max_text_report_size = 0
-    max_report_child_size = 500 * 2 ** 10
+
     new_file = False
     fuzzy_threshold = 60
     enforce_constraints = True
@@ -47,21 +52,13 @@ class Config(object):
     def __setattr__(self, k, v):
         super(Config, self).__setattr__(k, v)
 
-        if self.enforce_constraints:
-            self.check_constraints()
+    def check_ge(self, a, b):
+        va = getattr(self, a)
+        vb = getattr(self, b)
+        if va < vb:
+            raise ValueError("{0} ({1}) cannot be smaller than {2} ({3})".format(a, va, b, vb))
 
     def check_constraints(self):
-        if self.max_diff_block_lines < self.max_diff_block_lines_parent:  # noqa
-            raise ValueError("max_diff_block_lines ({0.max_diff_block_lines}) "
-                "cannot be smaller than max_diff_block_lines_parent "
-                "({0.max_diff_block_lines_parent})".format(self),
-            )
-
-        max_ = self.max_diff_block_lines_html_dir_ratio * \
-            self.max_diff_block_lines
-        if self.max_diff_block_lines_saved < max_:  # noqa
-            raise ValueError("max_diff_block_lines_saved "
-                "({0.max_diff_block_lines_saved}) cannot be smaller than "
-                "{0.max_diff_block_lines_html_dir_ratio} * "
-                "max_diff_block_lines ({1})".format(self, max_),
-            )
+        self.check_ge("max_diff_block_lines", "max_page_diff_block_lines")
+        self.check_ge("max_report_size", "max_page_size")
+        self.check_ge("max_report_size", "max_page_size_child")
diff --git a/diffoscope/main.py b/diffoscope/main.py
index 07880ed..bf8f829 100644
--- a/diffoscope/main.py
+++ b/diffoscope/main.py
@@ -78,6 +78,8 @@ def create_parser():
                         const=True, help='Show an approximate progress bar')
     parser.add_argument('--no-progress', dest='progress', action='store_const',
                         const=False, help='Do not show any progress bar')
+    parser.add_argument('--no-default-limits', action='store_true', default=False,
+                        help='Disable most default output limits and diff calculation limits.')
 
     group1 = parser.add_argument_group('output types')
     group1.add_argument('--text', metavar='OUTPUT_FILE', dest='text_output',
@@ -114,49 +116,44 @@ def create_parser():
                         help='Write profiling info to given file (use - for stdout)')
 
     group2 = parser.add_argument_group('output limits')
-    group2.add_argument('--no-default-limits', action='store_true', default=False,
-                        help='Disable most default limits. Note that text '
-                        'output already ignores most of these.')
+    # everything marked with default=None below is affected by no-default-limits
     group2.add_argument('--max-text-report-size', metavar='BYTES', type=int,
                         help='Maximum bytes written in --text report. (0 to '
-                        'disable)', default=None).completer=RangeCompleter(0,
-                        Config().max_text_report_size, 200000)
+                        'disable, default: %d)' % Config().max_text_report_size,
+                        default=None)
     group2.add_argument('--max-report-size', metavar='BYTES', type=int,
-                        help='Maximum bytes written in report. In html-dir '
-                        'output, this is the max bytes of the parent page. '
-                        '(0 to disable, default: %d)' %
-                        Config().max_report_size,
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_report_size, 200000)
-    group2.add_argument('--max-report-child-size', metavar='BYTES', type=int,
-                        help='In --html-dir output, this is the max bytes of '
-                        'each child page (0 to disable, default: %(default)s, '
-                        'remaining in effect even with --no-default-limits)',
-                        default=Config().max_report_child_size).completer=RangeCompleter(0,
-                        Config().max_report_child_size, 50000)
+                        help='Maximum bytes of a report in a given format, '
+                        'across all of its pages. Note that some formats, such '
+                        'as --html, may be restricted by even smaller limits '
+                        'such as --max-page-size. (0 to disable, default: %d)' %
+                        Config().max_report_size, default=None).completer=RangeCompleter(
+                        Config().max_report_size)
     group2.add_argument('--max-diff-block-lines', metavar='LINES', type=int,
-                        help='Maximum number of lines output per diff block. '
-                        'In --html-dir output, we use %d times this number instead, '
-                        'taken over all pages. (0 to disable, default: %d)' %
-                        (Config().max_diff_block_lines_html_dir_ratio,
-                        Config().max_diff_block_lines),
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_diff_block_lines, 5)
-    group2.add_argument('--max-diff-block-lines-parent', metavar='LINES', type=int,
-                        help='In --html-dir output, this is maximum number of '
-                        'lines output per diff block on the parent page '
-                        'before spilling it into child pages (0 to disable, '
-                        'default: %(default)s, remaining in effect even with '
-                        '--no-default-limits)',
-                        default=Config().max_diff_block_lines_parent).completer=RangeCompleter(0,
-                        Config().max_diff_block_lines_parent, 200)
-    group2.add_argument('--max-diff-block-lines-saved', metavar='LINES', type=int,
-                        help='Maximum number of lines saved per diff block. '
-                        'Most users should not need this, unless you run out '
-                        'of memory. This truncates diff(1) output before even '
-                        'trying to emit it in a report. This also affects --text '
-                        'output. (0 to disable, default: 0)',
-                        default=0).completer=RangeCompleter(0, 0, 200)
+                        help='Maximum number of lines output per unified-diff '
+                        'block, across all pages. (0 to disable, default: %d)' %
+                        Config().max_diff_block_lines, default=None).completer=RangeCompleter(
+                        Config().max_diff_block_lines)
+    group2.add_argument('--max-page-size', metavar='BYTES', type=int,
+                        help='Maximum bytes of the top-level (--html-dir) or sole '
+                        '(--html) page. (default: %(default)s, remains in effect '
+                        'even with --no-default-limits)', default=
+                        Config().max_page_size).completer=RangeCompleter(
+                        Config().max_page_size)
+    group2.add_argument('--max-page-size-child', metavar='BYTES', type=int,
+                        help='In --html-dir output, this is the maximum bytes of '
+                        'each child page (default: %(default)s, remains in '
+                        'effect even with --no-default-limits)', default=
+                        Config().max_page_size_child).completer=RangeCompleter(
+                        Config().max_page_size_child)
+    group2.add_argument('--max-page-diff-block-lines', metavar='LINES', type=int,
+                        help='Maximum number of lines output per unified-diff block '
+                        'on the top-level (--html-dir) or sole (--html) page, before '
+                        'spilling it into child pages (--html-dir) or skipping the '
+                        'rest of the diff block. Child pages are limited instead by '
+                        '--max-page-size-child. (default: %(default)s, remains in '
+                        'effect even with --no-default-limits)', default=
+                        Config().max_page_diff_block_lines).completer=RangeCompleter(
+                        Config().max_page_diff_block_lines)
 
     group3 = parser.add_argument_group('diff calculation')
     group3.add_argument('--new-file', action='store_true',
@@ -185,19 +182,25 @@ def create_parser():
     group3.add_argument('--fuzzy-threshold', type=int,
                         help='Threshold for fuzzy-matching '
                         '(0 to disable, %(default)s is default, 400 is high fuzziness)',
-                        default=Config().fuzzy_threshold).completer=RangeCompleter(0,
-                        400, 20)
+                        default=Config().fuzzy_threshold).completer=RangeCompleter(400)
     group3.add_argument('--max-diff-input-lines', metavar='LINES', type=int,
                         help='Maximum number of lines fed to diff(1) '
                         '(0 to disable, default: %d)' %
                         Config().max_diff_input_lines,
-                        default=None).completer=RangeCompleter(0,
-                        Config().max_diff_input_lines, 5000)
+                        default=None).completer=RangeCompleter(
+                        Config().max_diff_input_lines)
     group3.add_argument('--max-container-depth', metavar='DEPTH', type=int,
                         help='Maximum depth to recurse into containers. '
                         '(Cannot be disabled for security reasons, default: '
                         '%(default)s)',
                         default=Config().max_container_depth)
+    group3.add_argument('--max-diff-block-lines-saved', metavar='LINES', type=int,
+                        help='Maximum number of lines saved per diff block. '
+                        'Most users should not need this, unless you run out '
+                        'of memory. This truncates diff(1) output before emitting '
+                        'it in a report, and affects all types of output, '
+                        'including --text and --json. (0 to disable, default: '
+                        '%(default)s)', default=0)
 
     group4 = parser.add_argument_group('information commands')
     group4.add_argument('--help', '-h', action='help',
@@ -226,8 +229,12 @@ def create_parser():
 
 
 class RangeCompleter(object):
-    def __init__(self, start, end, step):
-        self.choices = range(start, end + 1, step)
+    def __init__(self, start, end=0, divisions=16):
+        if end < start:
+            tmp = end
+            end = start
+            start = tmp
+        self.choices = range(start, end + 1, int((end-start+1)/divisions))
 
     def __call__(self, prefix, **kwargs):
         return (str(i) for i in self.choices if str(i).startswith(prefix))
@@ -284,6 +291,7 @@ class ListDebianSubstvarsAction(argparse._StoreTrueAction):
         sys.exit(0)
 
 def maybe_set_limit(config, parsed_args, key):
+    # apply limits affected by "no-default-limits"
     v = getattr(parsed_args, key)
     if v is not None:
         setattr(config, key, float("inf") if v == 0 else v)
@@ -301,11 +309,12 @@ def run_diffoscope(parsed_args):
         logger.warning('Fuzzy-matching is currently disabled as the "tlsh" module is unavailable.')
     maybe_set_limit(Config(), parsed_args, "max_report_size")
     maybe_set_limit(Config(), parsed_args, "max_text_report_size")
-    maybe_set_limit(Config(), parsed_args, "max_report_child_size")
-    # need to set them in this order due to Config._check_constraints
-    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
-    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_parent")
     maybe_set_limit(Config(), parsed_args, "max_diff_block_lines")
+    Config().max_page_size = parsed_args.max_page_size
+    Config().max_page_size_child = parsed_args.max_page_size_child
+    Config().max_page_diff_block_lines = parsed_args.max_page_diff_block_lines
+
+    maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
     maybe_set_limit(Config(), parsed_args, "max_diff_input_lines")
     Config().max_container_depth = parsed_args.max_container_depth
     Config().fuzzy_threshold = parsed_args.fuzzy_threshold
@@ -314,6 +323,7 @@ def run_diffoscope(parsed_args):
     Config().exclude_commands = parsed_args.exclude_commands
     Config().exclude_directory_metadata = parsed_args.exclude_directory_metadata
     Config().compute_visual_diffs = PresenterManager().compute_visual_diffs()
+    Config().check_constraints()
     set_path()
     set_locale()
     path1, path2 = parsed_args.path1, parsed_args.path2
diff --git a/diffoscope/presenters/html/html.py b/diffoscope/presenters/html/html.py
index 126bfbb..bbf5247 100644
--- a/diffoscope/presenters/html/html.py
+++ b/diffoscope/presenters/html/html.py
@@ -70,6 +70,20 @@ re_anchor_prefix = re.compile(r'^[^A-Za-z]')
 re_anchor_suffix = re.compile(r'[^A-Za-z-_:\.]')
 
 
+def send_and_exhaust(iterator, arg, default):
+    """Send a single value to a coroutine, exhaust it, and return the final
+    element or a default value if it was empty."""
+    # Python's coroutine syntax is still a bit rough when you want to do
+    # slightly more complex stuff. Watch this logic closely.
+    output = default
+    try:
+        output = iterator.send(arg)
+    except StopIteration:
+        pass
+    for output in iterator:
+        pass
+    return output
+
 def md5(s):
     return hashlib.md5(s.encode('utf-8')).hexdigest()
 
@@ -166,48 +180,63 @@ def output_node_frame(difference, path, indentstr, indentnum, body):
         html.escape(difference.source1),
         html.escape(difference.source2))
 
-    return u"""{0[1]}<div class="diffheader">
+    return PartialString.numl(u"""{0[1]}<div class="diffheader">
 {1}{0[1]}</div>
-{2}""".format(indent, header, body)
+{2}""", 3).pformatl(indent, header, body)
 
 def output_node(difference, path, indentstr, indentnum, css_url, directory):
+    """Returns a tuple (parent, continuation) where
+
+    - parent is a PartialString representing the body of the node, including
+      its comments, visuals, unified_diff and headers for its children - but
+      not the bodies of the children
+    - continuation is either None or (only in html-dir mode) a function which
+      when called with a single integer arg, the maximum size to print, will
+      print any remaining "split" pages for unified_diff up to the given size.
+    """
     indent = tuple(indentstr * (indentnum + x) for x in range(3))
     t, cont = PartialString.cont()
 
+    comments = u""
     if difference.comments:
         comments = u'{0[1]}<div class="comment">\n{1}{0[1]}</div>\n'.format(
             indent, "".join(u"{0[2]}{1}<br/>\n".format(indent, html.escape(x)) for x in difference.comments))
-    else:
-        comments = u""
 
     visuals = u""
     for visual in difference.visuals:
         visuals += output_visual(visual, path, indentstr, indentnum+1)
 
-    udiff = io.StringIO()
+    udiff = u""
+    ud_cont = None
     if difference.unified_diff:
-        def print_func(x, force=False):
-            udiff.write(x)
-        HTMLPresenter().output_unified_diff(print_func, css_url, directory, difference.unified_diff, difference.has_internal_linenos)
+        ud_cont = HTMLSideBySidePresenter().output_unified_diff(
+            css_url, directory, difference.unified_diff,
+            difference.has_internal_linenos)
+        udiff = next(ud_cont)
+        if isinstance(udiff, PartialString):
+            ud_cont = ud_cont.send
+            udiff = udiff.pformatl(PartialString.of(ud_cont))
+        else:
+            for _ in ud_cont: pass # exhaust the iterator, avoids GeneratorExit
+            ud_cont = None
 
-    # Construct a PartialString for this node
-    # {3} gets mapped to {-1}, a continuation hole for later child nodes
-    body = u"{0}{1}{2}{3}".format(comments, visuals, udiff.getvalue(), "{-1}")
+    # PartialString for this node
+    body = PartialString.numl(u"{0}{1}{2}{-1}", 3, cont).pformatl(comments, visuals, udiff)
     if len(path) == 1:
         # root node, frame it
-        t = cont(t, output_node_frame(difference, path, indentstr, indentnum, body))
-    else:
-        t = cont(t, body)
+        body = output_node_frame(difference, path, indentstr, indentnum, body)
+    t = cont(t, body)
 
     # Add holes for child nodes
     for d in difference.details:
-        # {0} hole, for the child node's contents
-        # {-1} continuation hole, for later child nodes
-        t = cont(t, u"""{0[1]}<div class="difference">
+        child = output_node_frame(d, path + [d], indentstr, indentnum+1, PartialString.of(d))
+        child = PartialString.numl(u"""{0[1]}<div class="difference">
 {1}{0[1]}</div>
-{{-1}}""".format(indent, output_node_frame(d, path + [d], indentstr, indentnum+1, "{0}")), d)
+{-1}""", 2, cont).pformatl(indent, child)
+        t = cont(t, child)
 
-    return cont(t, u"")
+    assert len(t.holes) >= len(difference.details) + 1 # there might be extra holes for the unified diff continuation
+    return cont(t, u""), ud_cont
 
 def output_header(css_url):
     if css_url:
@@ -232,32 +261,39 @@ def file_printer(directory, filename):
         yield f.write
 
 @contextlib.contextmanager
-def spl_file_printer(directory, filename):
+def spl_file_printer(directory, filename, accum):
     with codecs.open(os.path.join(directory,filename), 'w', encoding='utf-8') as f:
         print_func = f.write
-        def recording_print_func(s, force=False):
+        def recording_print_func(s):
             print_func(s)
             recording_print_func.bytes_written += len(s)
+            accum.bytes_written += len(s)
         recording_print_func.bytes_written = 0
         yield recording_print_func
 
 
-class HTMLPresenter(Presenter):
+class HTMLSideBySidePresenter(object):
     supports_visual_diffs = True
 
     def __init__(self):
-        self.new_unified_diff()
+        self.max_lines = Config().max_diff_block_lines # only for html-dir
+        self.max_lines_parent = Config().max_page_diff_block_lines
+        self.max_page_size_child = Config().max_page_size_child
 
     def new_unified_diff(self):
         self.spl_rows = 0
         self.spl_current_page = 0
         self.spl_print_func = None
         self.spl_print_ctrl = None
+        # the below apply to child pages only, the parent page limit works
+        # differently and is controlled by output_difference later below
+        self.bytes_max_total = 0
+        self.bytes_written = 0
+        self.error_row = None
 
     def output_hunk_header(self, hunk_off1, hunk_size1, hunk_off2, hunk_size2):
         self.spl_print_func(u'<tr class="diffhunk"><td colspan="2">Offset %d, %d lines modified</td>' % (hunk_off1, hunk_size1))
         self.spl_print_func(u'<td colspan="2">Offset %d, %d lines modified</td></tr>\n' % (hunk_off2, hunk_size2))
-        self.row_was_output()
 
     def output_line(self, has_internal_linenos, type_name, s1, line1, s2, line2):
         self.spl_print_func(u'<tr class="diff%s">' % type_name)
@@ -284,8 +320,7 @@ class HTMLPresenter(Presenter):
             else:
                 self.spl_print_func(u'<td colspan="2">\xa0</td>')
         finally:
-            self.spl_print_func(u"</tr>\n", force=True)
-            self.row_was_output()
+            self.spl_print_func(u"</tr>\n")
 
     def spl_print_enter(self, print_context, rotation_params):
         # Takes ownership of print_context
@@ -300,54 +335,71 @@ class HTMLPresenter(Presenter):
 
     def spl_print_exit(self, *exc_info):
         if not self.spl_had_entered_child(): return False
-        self.spl_print_func(output_footer(), force=True)
+        self.spl_print_func(output_footer())
         _exit, _ = self.spl_print_ctrl
         self.spl_print_func = None
         self.spl_print_ctrl = None
         return _exit(*exc_info)
 
-    def row_was_output(self):
-        self.spl_rows += 1
-        _, rotation_params = self.spl_print_ctrl
-        max_lines = Config().max_diff_block_lines
-        max_lines_parent = Config().max_diff_block_lines_parent
-        max_lines_ratio = Config().max_diff_block_lines_html_dir_ratio
-        max_report_child_size = Config().max_report_child_size
-        if not rotation_params:
+    def check_limits(self):
+        if not self.spl_print_ctrl[1]:
             # html-dir single output, don't need to rotate
-            if self.spl_rows >= max_lines:
+            if self.spl_rows >= self.max_lines_parent:
                 raise DiffBlockLimitReached()
-            return
+            return False
         else:
             # html-dir output, perhaps need to rotate
-            directory, mainname, css_url = rotation_params
-            if self.spl_rows >= max_lines_ratio * max_lines:
+            if self.spl_rows >= self.max_lines:
                 raise DiffBlockLimitReached()
 
             if self.spl_current_page == 0: # on parent page
-                if self.spl_rows < max_lines_parent:
-                    return
+                if self.spl_rows < self.max_lines_parent:
+                    return False
+                logger.debug("new unified-diff subpage, parent page went over %s lines", self.max_lines_parent)
             else: # on child page
-                # TODO: make this stay below the max, instead of going 1 row over the max
-                # will require some backtracking...
-                if self.spl_print_func.bytes_written < max_report_child_size:
-                    return
+                if self.bytes_max_total and self.bytes_written > self.bytes_max_total:
+                    raise PrintLimitReached()
+                if self.spl_print_func.bytes_written < self.max_page_size_child:
+                    return False
+                logger.debug("new unified-diff subpage, previous subpage went over %s bytes", self.max_page_size_child)
+            return True
 
+    def new_child_page(self):
+        _, rotation_params = self.spl_print_ctrl
+        directory, mainname, css_url = rotation_params
         self.spl_current_page += 1
         filename = "%s-%s.html" % (mainname, self.spl_current_page)
 
         if self.spl_current_page > 1:
             # previous page was a child, close it
-            self.spl_print_func(templates.UD_TABLE_FOOTER % {"filename": html.escape(filename), "text": "load diff"}, force=True)
+            self.spl_print_func(templates.UD_TABLE_FOOTER % {"filename": html.escape(filename), "text": "load diff"})
+            self.spl_print_func(u"</table>\n")
             self.spl_print_exit(None, None, None)
 
         # rotate to the next child page
-        context = spl_file_printer(directory, filename)
+        context = spl_file_printer(directory, filename, self)
         self.spl_print_enter(context, rotation_params)
         self.spl_print_func(templates.UD_TABLE_HEADER)
 
+    def output_limit_reached(self, limit_type, total, bytes_processed):
+        logger.debug('%s print limit reached', limit_type)
+        bytes_left = total - bytes_processed
+        self.error_row = templates.UD_TABLE_LIMIT_FOOTER % {
+            "limit_type": limit_type,
+            "bytes_left": bytes_left,
+            "bytes_total": total,
+            "percent": (bytes_left / total) * 100
+        }
+        self.spl_print_func(self.error_row)
+
     def output_unified_diff_table(self, unified_diff, has_internal_linenos):
-        self.spl_print_func(templates.UD_TABLE_HEADER)
+        """Output a unified diff <table> possibly over multiple pages.
+
+        It is the caller's responsibility to set up self.spl_* correctly.
+
+        Yields None for each extra child page, and then True or False depending
+        on whether the whole output was truncated.
+        """
         try:
             ydiff = SideBySideDiff(unified_diff)
             for t, args in ydiff.items():
@@ -359,67 +411,135 @@ class HTMLPresenter(Presenter):
                     self.spl_print_func(u'<td colspan="2">%s</td>\n' % args)
                 else:
                     raise AssertionError()
-            return True
+                self.spl_rows += 1
+                if not self.check_limits():
+                    continue
+                self.new_child_page()
+                new_limit = yield None
+                if new_limit:
+                    self.bytes_max_total = new_limit
+                    self.bytes_written = 0
+                    self.check_limits()
+            wrote_all = True
+        except GeneratorExit:
+            return
         except DiffBlockLimitReached:
-            total = len(unified_diff)
-            bytes_left = total - ydiff.bytes_processed
-            frac = bytes_left / total
-            self.spl_print_func(
-                u'<tr class="error">'
-                u'<td colspan="4">Max diff block lines reached; %s/%s bytes (%.2f%%) of diff not shown.'
-                u"</td></tr>" % (bytes_left, total, frac*100), force=True)
-            logger.debug('diff-block print limit reached')
-            return False
+            self.output_limit_reached("diff block lines", len(unified_diff), ydiff.bytes_processed)
+            wrote_all = False
         except PrintLimitReached:
-            assert not self.spl_had_entered_child() # limit reached on the parent page
-            self.spl_print_func(u'<tr class="error"><td colspan="4">Max output size reached.</td></tr>', force=True)
-            raise
+            self.output_limit_reached("report size", len(unified_diff), ydiff.bytes_processed)
+            wrote_all = False
         finally:
-            self.spl_print_func(u"</table>", force=True)
+            # no footer on the last page, just a close tag
+            self.spl_print_func(u"</table>")
+        yield wrote_all
 
-    def output_unified_diff(self, print_func, css_url, directory, unified_diff, has_internal_linenos):
+    def output_unified_diff(self, css_url, directory, unified_diff, has_internal_linenos):
         self.new_unified_diff()
         rotation_params = None
         if directory:
             mainname = md5(unified_diff)
             rotation_params = directory, mainname, css_url
+
         try:
-            self.spl_print_func = print_func
+            udiff = io.StringIO()
+            udiff.write(templates.UD_TABLE_HEADER)
+            self.spl_print_func = udiff.write
             self.spl_print_ctrl = None, rotation_params
-            truncated = not self.output_unified_diff_table(unified_diff, has_internal_linenos)
+
+            it = self.output_unified_diff_table(unified_diff, has_internal_linenos)
+            wrote_all = next(it)
+            if wrote_all is None:
+                assert self.spl_current_page == 1
+                # now pause the iteration and wait for consumer to give us a
+                # size-limit to write the remaining pages with
+                # exhaust the iterator and save the last item in wrote_all
+                new_limit = yield PartialString(PartialString.escape(udiff.getvalue()) + u"{0}</table>\n", None)
+                wrote_all = send_and_exhaust(it, new_limit, wrote_all)
+            else:
+                yield udiff.getvalue()
+                return
+
+        except GeneratorExit:
+            logger.debug("skip extra output for unified diff %s", mainname)
+            it.close()
+            self.spl_print_exit(None, None, None)
+            return
         except:
-            if not self.spl_print_exit(*sys.exc_info()): raise
+            import traceback
+            traceback.print_exc()
+            if self.spl_print_exit(*sys.exc_info()) is False: raise
         else:
             self.spl_print_exit(None, None, None)
         finally:
             self.spl_print_ctrl = None
             self.spl_print_func = None
 
-        if self.spl_current_page > 0:
+        truncated = not wrote_all
+        child_rows_written = self.spl_rows - self.max_lines_parent
+        if truncated and not child_rows_written:
+            # if we didn't write any child rows, just output the error message on the parent page
+            parent_last_row = self.error_row
+        else:
             noun = "pieces" if self.spl_current_page > 1 else "piece"
             text = "load diff (%s %s%s)" % (self.spl_current_page, noun, (", truncated" if truncated else ""))
-            print_func(templates.UD_TABLE_FOOTER % {"filename": html.escape("%s-1.html" % mainname), "text": text}, force=True)
+            parent_last_row = templates.UD_TABLE_FOOTER % {"filename": html.escape("%s-1.html" % mainname), "text": text}
+        yield self.bytes_written, parent_last_row
+
+
+class HTMLPresenter(Presenter):
+    supports_visual_diffs = True
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.report_printed = 0
+        self.report_limit = Config().max_report_size
+
+    @property
+    def report_remaining(self):
+        return self.report_limit - self.report_printed
+
+    def maybe_print(self, node, printers, outputs, continuations):
+        output = outputs[node]
+        node_cont = continuations[node]
+        if output.holes and set(output.holes) - set(node_cont):
+            return
+
+        # could be slightly more accurate, whatever
+        est_placeholder_len = max(len(templates.UD_TABLE_FOOTER), len(templates.UD_TABLE_LIMIT_FOOTER)) + 40
+        est_size = output.size(est_placeholder_len)
+
+        results = {}
+        for cont in node_cont:
+            remaining = self.report_remaining - est_size
+            printed, result = cont(remaining)
+            self.report_printed += printed
+            results[cont] = result
+
+        out = output.format(results)
+        printer_args = printers[node]
+        with printer_args[0](*printer_args[1:]) as printer:
+            printer(out)
+        self.report_printed += len(out)
+
+        del outputs[node]
+        del printers[node]
+        del continuations[node]
 
     def output_node_placeholder(self, anchor, lazy_load):
         if lazy_load:
             return templates.DIFFNODE_LAZY_LOAD % anchor
         else:
-            return '<div class="error">Max report size reached</div>\n'
+            return templates.DIFFNODE_LIMIT
 
     def output_difference(self, target, difference, css_url, jquery_url, single_page=False):
         outputs = {} # nodes to their partial output
         ancestors = {} # child nodes to ancestor nodes
         placeholder_len = len(self.output_node_placeholder("XXXXXXXXXXXXXXXX", not single_page))
-
+        continuations = {} # functions to print unified diff continuations (html-dir only)
         printers = {} # nodes to their printers
-        def maybe_print(node):
-            if outputs[node].holes:
-                return
-            printer_args = printers[node]
-            with printer_args[0](*printer_args[1:]) as printer:
-                printer(outputs[node].format())
-            del outputs[node]
-            del printers[node]
 
         def smallest_first(node, parscore):
             depth = parscore[0] + 1 if parscore else 0
@@ -433,34 +553,44 @@ class HTMLPresenter(Presenter):
             diff_path = output_diff_path(path)
             pagename = md5(diff_path)
             logger.debug('html output for %s', diff_path)
-            node_output = output_node(node, path, "  ", len(path)-1, css_url, None if single_page else target)
+            node_output, node_continuation = output_node(
+                node, path, "  ", len(path)-1, css_url, None if single_page else target)
 
+            add_to_existing = False
             if ancestor:
-                limit = Config().max_report_child_size
-                logger.debug("output size: %s, %s",
-                    outputs[ancestor].size(placeholder_len), node_output.size(placeholder_len))
-            else:
-                limit = Config().max_report_size
+                page_limit = Config().max_page_size if ancestor is difference else Config().max_page_size_child
+                page_current = outputs[ancestor].size(placeholder_len)
+                report_current = self.report_printed + sum(p.size(placeholder_len) for p in outputs.values())
+                want_to_add = node_output.size(placeholder_len)
+                logger.debug("report size: %s/%s, page size: %s/%s, want to add %s)", report_current, self.report_limit, page_current, page_limit, want_to_add)
+                if report_current + want_to_add > self.report_limit:
+                    make_new_subpage = False
+                elif page_current + want_to_add < page_limit:
+                    add_to_existing = True
+                else:
+                    make_new_subpage = not single_page
 
-            if ancestor and outputs[ancestor].size(placeholder_len) + node_output.size(placeholder_len) < limit:
+            if add_to_existing:
                 # under limit, add it to an existing page
                 outputs[ancestor] = outputs[ancestor].pformat({node: node_output})
                 stored = ancestor
 
             else:
-                # over limit (or root), new subpage
+                # over limit (or root), new subpage or continue/break
                 if ancestor:
-                    placeholder = self.output_node_placeholder(pagename, not single_page)
+                    placeholder = self.output_node_placeholder(pagename, make_new_subpage)
                     outputs[ancestor] = outputs[ancestor].pformat({node: placeholder})
-                    maybe_print(ancestor)
+                    self.maybe_print(ancestor, printers, outputs, continuations)
                     footer = output_footer()
-                    if single_page:
+                    if not make_new_subpage: # we hit a limit, either max-report-size or single-page
                         if not outputs:
-                            # already output a single page, don't iterate through any more children
+                            # no more holes, don't iterate through any more children
                             break
                         else:
+                            # more holes to fill up with "limit reached" placeholders
                             continue
                 else:
+                    # unconditionally write the root node regardless of limits
                     assert node is difference
                     footer = output_footer(jquery_url)
                     pagename = "index"
@@ -468,13 +598,18 @@ class HTMLPresenter(Presenter):
                 outputs[node] = node_output.frame(
                     output_header(css_url) + u'<div class="difference">\n',
                     u'</div>\n' + footer)
+                assert not single_page or node is difference
                 printers[node] = (make_printer, target) if single_page else (file_printer, target, "%s.html" % pagename)
                 stored = node
 
             for child in node.details:
                 ancestors[child] = stored
 
-            maybe_print(stored)
+            conts = continuations.setdefault(stored, [])
+            if node_continuation:
+                conts.append(node_continuation)
+
+            self.maybe_print(stored, printers, outputs, continuations)
 
         if outputs:
             import pprint
diff --git a/diffoscope/presenters/html/templates.py b/diffoscope/presenters/html/templates.py
index 595bc58..29ad061 100644
--- a/diffoscope/presenters/html/templates.py
+++ b/diffoscope/presenters/html/templates.py
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
 
-HEADER = """<!DOCTYPE html>
+HEADER = u"""<!DOCTYPE html>
 <html lang="en">
 <head>
   <meta charset="utf-8" />
@@ -147,12 +147,12 @@ HEADER = """<!DOCTYPE html>
 <body class="diffoscope">
 """
 
-FOOTER = """<div class="footer">Generated by <a href="https://diffoscope.org" rel="noopener noreferrer" target="_blank">diffoscope</a> %(version)s</div>
+FOOTER = u"""<div class="footer">Generated by <a href="https://diffoscope.org" rel="noopener noreferrer" target="_blank">diffoscope</a> %(version)s</div>
 </body>
 </html>
 """
 
-SCRIPTS = """<script src="%(jquery_url)s"></script>
+SCRIPTS = u"""<script src="%(jquery_url)s"></script>
 <script type="text/javascript">
 $(function() {
   // activate "loading" controls
@@ -209,7 +209,10 @@ $(function() {
 </script>
 """
 
-DIFFNODE_LAZY_LOAD = """<div class="ondemand-details">... <a href="%s.html">load details</a> ...</div>
+DIFFNODE_LAZY_LOAD = u"""<div class="ondemand-details">... <a href="%s.html">load details</a> ...</div>
+"""
+
+DIFFNODE_LIMIT = u"""<div class="error">Max report size reached</div>
 """
 
 UD_TABLE_HEADER = u"""<table class="diff">
@@ -220,5 +223,8 @@ UD_TABLE_HEADER = u"""<table class="diff">
 UD_TABLE_FOOTER = u"""<tr class="ondemand"><td colspan="4">
 ... <a href="%(filename)s">%(text)s</a> ...
 </td></tr>
-</table>
 """
+
+UD_TABLE_LIMIT_FOOTER = u"""<tr class="error"><td colspan="4">
+Max %(limit_type)s reached; %(bytes_left)s/%(bytes_total)s bytes (%(percent).2f%%) of diff not shown.
+</td></tr>"""

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list