[diffoscope] 05/05: WIP: more intuitive "limit" flags
Ximin Luo
infinity0 at debian.org
Wed Jun 21 23:10:30 CEST 2017
This is an automated email from the git hooks/post-receive script.
infinity0 pushed a commit to branch WIP/humungous-diffs
in repository diffoscope.
commit 2fdd410fec6d0d4ccff520abb4058c384827e4e1
Author: Ximin Luo <infinity0 at debian.org>
Date: Wed Jun 21 23:10:01 2017 +0200
WIP: more intuitive "limit" flags
---
diffoscope/config.py | 35 +++++++++-------
diffoscope/main.py | 86 +++++++++++++++++++++-----------------
diffoscope/presenters/html/html.py | 58 ++++++++++++++++---------
3 files changed, 105 insertions(+), 74 deletions(-)
diff --git a/diffoscope/config.py b/diffoscope/config.py
index bd6b3a9..6572980 100644
--- a/diffoscope/config.py
+++ b/diffoscope/config.py
@@ -20,16 +20,21 @@
class Config(object):
- max_diff_block_lines = 256
- max_diff_block_lines_parent = 50
max_diff_block_lines_saved = float("inf")
- # html-dir output uses ratio * max-diff-block-lines as its limit
- max_diff_block_lines_html_dir_ratio = 4
# GNU diff cannot process arbitrary large files :(
max_diff_input_lines = 2 ** 20
- max_report_size = 2000 * 2 ** 10 # 2000 kB
+
+ # hard limits, restricts single-file and multi-file formats
+ max_report_size = 40 * 2 ** 20 # 40 MB
+ max_diff_block_lines = 2 ** 10 # 1024 lines
+ # structural limits, restricts single-file formats
+ # semi-restricts multi-file formats
+ max_page_size_parent = 400 * 2 ** 10 # 400 kB
+ max_page_size_child = 200 * 2 ** 10 # 200 kB
+ max_diff_block_lines_parent = 2 ** 7 # 128 lines
+
max_text_report_size = 0
- max_report_child_size = 500 * 2 ** 10
+
new_file = False
fuzzy_threshold = 60
enforce_constraints = True
@@ -46,15 +51,13 @@ class Config(object):
def __setattr__(self, k, v):
super(Config, self).__setattr__(k, v)
- if self.enforce_constraints:
- self.check_constraints()
+ def check_ge(self, a, b):
+ va = getattr(self, a)
+ vb = getattr(self, b)
+ if va < vb:
+ raise ValueError("{0} ({1}) cannot be smaller than {2} ({3})".format(a, va, b, vb))
def check_constraints(self):
- max_ = self.max_diff_block_lines_html_dir_ratio * \
- self.max_diff_block_lines
- if self.max_diff_block_lines_saved < max_: # noqa
- raise ValueError("max_diff_block_lines_saved "
- "({0.max_diff_block_lines_saved}) cannot be smaller than "
- "{0.max_diff_block_lines_html_dir_ratio} * "
- "max_diff_block_lines ({1})".format(self, max_),
- )
+ self.check_ge("max_diff_block_lines", "max_diff_block_lines_parent")
+ self.check_ge("max_report_size", "max_page_size_parent")
+ self.check_ge("max_report_size", "max_page_size_child")
diff --git a/diffoscope/main.py b/diffoscope/main.py
index 504a8fc..8e3d2ae 100644
--- a/diffoscope/main.py
+++ b/diffoscope/main.py
@@ -116,46 +116,50 @@ def create_parser():
group2.add_argument('--no-default-limits', action='store_true', default=False,
help='Disable most default limits. Note that text '
'output already ignores most of these.')
+ # everything marked with default=None below is affected by no-default-limits
group2.add_argument('--max-text-report-size', metavar='BYTES', type=int,
help='Maximum bytes written in --text report. (0 to '
- 'disable)', default=None).completer=RangeCompleter(0,
- Config().max_text_report_size, 200000)
+ 'disable)', default=None)
group2.add_argument('--max-report-size', metavar='BYTES', type=int,
- help='Maximum bytes written in report. In html-dir '
- 'output, this is the max bytes of the parent page. '
- '(0 to disable, default: %d)' %
- Config().max_report_size,
- default=None).completer=RangeCompleter(0,
- Config().max_report_size, 200000)
- group2.add_argument('--max-report-child-size', metavar='BYTES', type=int,
- help='In --html-dir output, this is the max bytes of '
- 'each child page (0 to disable, default: %(default)s, '
- 'remaining in effect even with --no-default-limits)',
- default=Config().max_report_child_size).completer=RangeCompleter(0,
- Config().max_report_child_size, 50000)
+ help='Maximum bytes of a report in a given format, '
+ 'across all of its pages. Note that some formats, such '
+ 'as --html, may be restricted by even smaller limits '
+ 'such as --max-page-size. (0 to disable, default: %d)' %
+ Config().max_report_size, default=None).completer=RangeCompleter(
+ Config().max_report_size)
+ group2.add_argument('--max-page-size-parent', metavar='BYTES', type=int,
+ help='Maximum bytes of the top-level (--html-dir) or sole '
+ '(--html) page. (default: %(default)s, remains in effect '
+ 'even with --no-default-limits)', default=
+ Config().max_page_size_parent).completer=RangeCompleter(
+ Config().max_page_size_parent)
+ group2.add_argument('--max-page-size-child', metavar='BYTES', type=int,
+ help='In --html-dir output, this is the maximum bytes of '
+ 'each child page (default: %(default)s, remains in '
+ 'effect even with --no-default-limits)', default=
+ Config().max_page_size_child).completer=RangeCompleter(
+ Config().max_page_size_child)
group2.add_argument('--max-diff-block-lines', metavar='LINES', type=int,
- help='Maximum number of lines output per diff block. '
- 'In --html-dir output, we use %d times this number instead, '
- 'taken over all pages. (0 to disable, default: %d)' %
- (Config().max_diff_block_lines_html_dir_ratio,
- Config().max_diff_block_lines),
- default=None).completer=RangeCompleter(0,
- Config().max_diff_block_lines, 5)
+ help='In --html-dir output, this is the maximum number '
+ 'of lines output per unified-diff block, across all child '
+ 'pages. (0 to disable, default: %d)' %
+ Config().max_diff_block_lines, default=None).completer=RangeCompleter(
+ Config().max_diff_block_lines)
group2.add_argument('--max-diff-block-lines-parent', metavar='LINES', type=int,
- help='In --html-dir output, this is maximum number of '
- 'lines output per diff block on the parent page '
- 'before spilling it into child pages (0 to disable, '
- 'default: %(default)s, remaining in effect even with '
- '--no-default-limits)',
- default=Config().max_diff_block_lines_parent).completer=RangeCompleter(0,
- Config().max_diff_block_lines_parent, 200)
+ help='Maximum number of lines output per diff block on '
+ 'the top-level (--html-dir) or sole (--html) page, before '
+ 'spilling it into child pages (--html-dir) or skipping the '
+ 'rest of the diff block. (default: %(default)s, remains '
+ 'in effect even with --no-default-limits)', default=
+ Config().max_diff_block_lines_parent).completer=RangeCompleter(
+ Config().max_diff_block_lines_parent)
group2.add_argument('--max-diff-block-lines-saved', metavar='LINES', type=int,
help='Maximum number of lines saved per diff block. '
'Most users should not need this, unless you run out '
'of memory. This truncates diff(1) output before even '
'trying to emit it in a report. This also affects --text '
'output. (0 to disable, default: 0)',
- default=0).completer=RangeCompleter(0, 0, 200)
+ default=0)
group3 = parser.add_argument_group('diff calculation')
group3.add_argument('--new-file', action='store_true',
@@ -173,14 +177,13 @@ def create_parser():
group3.add_argument('--fuzzy-threshold', type=int,
help='Threshold for fuzzy-matching '
'(0 to disable, %(default)s is default, 400 is high fuzziness)',
- default=Config().fuzzy_threshold).completer=RangeCompleter(0,
- 400, 20)
+ default=Config().fuzzy_threshold).completer=RangeCompleter(400)
group3.add_argument('--max-diff-input-lines', metavar='LINES', type=int,
help='Maximum number of lines fed to diff(1) '
'(0 to disable, default: %d)' %
Config().max_diff_input_lines,
- default=None).completer=RangeCompleter(0,
- Config().max_diff_input_lines, 5000)
+ default=None).completer=RangeCompleter(
+ Config().max_diff_input_lines)
group3.add_argument('--max-container-depth', metavar='DEPTH', type=int,
help='Maximum depth to recurse into containers. '
'(Cannot be disabled for security reasons, default: '
@@ -214,8 +217,12 @@ def create_parser():
class RangeCompleter(object):
- def __init__(self, start, end, step):
- self.choices = range(start, end + 1, step)
+ def __init__(self, start, end=0, divisions=16):
+ if end < start:
+ tmp = end
+ end = start
+ start = tmp
+ self.choices = range(start, end + 1, int((end-start+1)/divisions))
def __call__(self, prefix, **kwargs):
return (str(i) for i in self.choices if str(i).startswith(prefix))
@@ -289,11 +296,11 @@ def run_diffoscope(parsed_args):
logger.warning('Fuzzy-matching is currently disabled as the "tlsh" module is unavailable.')
maybe_set_limit(Config(), parsed_args, "max_report_size")
maybe_set_limit(Config(), parsed_args, "max_text_report_size")
- maybe_set_limit(Config(), parsed_args, "max_report_child_size")
- # need to set them in this order due to Config._check_constraints
- maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
- maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_parent")
+ Config().max_page_size_parent = parsed_args.max_page_size_parent
+ Config().max_page_size_child = parsed_args.max_page_size_child
maybe_set_limit(Config(), parsed_args, "max_diff_block_lines")
+ Config().max_diff_block_lines_parent = parsed_args.max_diff_block_lines_parent
+ maybe_set_limit(Config(), parsed_args, "max_diff_block_lines_saved")
maybe_set_limit(Config(), parsed_args, "max_diff_input_lines")
Config().max_container_depth = parsed_args.max_container_depth
Config().fuzzy_threshold = parsed_args.fuzzy_threshold
@@ -301,6 +308,7 @@ def run_diffoscope(parsed_args):
Config().excludes = parsed_args.excludes
Config().exclude_commands = parsed_args.exclude_commands
Config().compute_visual_diffs = PresenterManager().compute_visual_diffs()
+ Config().check_constraints()
set_path()
set_locale()
path1, path2 = parsed_args.path1, parsed_args.path2
diff --git a/diffoscope/presenters/html/html.py b/diffoscope/presenters/html/html.py
index dcc3843..b358bc5 100644
--- a/diffoscope/presenters/html/html.py
+++ b/diffoscope/presenters/html/html.py
@@ -190,7 +190,7 @@ def output_node(difference, path, indentstr, indentnum, css_url, directory):
if difference.unified_diff:
def print_func(x, force=False):
udiff.write(x)
- HTMLPresenter().output_unified_diff(print_func, css_url, directory, difference.unified_diff, difference.has_internal_linenos)
+ HTMLSideBySidePresenter().output_unified_diff(print_func, css_url, directory, difference.unified_diff, difference.has_internal_linenos)
# Construct a PartialString for this node
# {3} gets mapped to {-1}, a continuation hole for later child nodes
@@ -251,7 +251,7 @@ def spl_file_printer(directory, filename):
yield recording_print_func
-class HTMLPresenter(Presenter):
+class HTMLSideBySidePresenter(object):
supports_visual_diffs = True
def __init__(self):
@@ -323,29 +323,30 @@ class HTMLPresenter(Presenter):
def row_was_output(self):
self.spl_rows += 1
_, rotation_params = self.spl_print_ctrl
- max_lines = Config().max_diff_block_lines
+ max_lines = Config().max_diff_block_lines # only for html-dir
max_lines_parent = Config().max_diff_block_lines_parent
- max_lines_ratio = Config().max_diff_block_lines_html_dir_ratio
- max_report_child_size = Config().max_report_child_size
+ max_page_size_child = Config().max_page_size_child
if not rotation_params:
# html-dir single output, don't need to rotate
- if self.spl_rows >= max_lines:
+ if self.spl_rows >= max_lines_parent:
raise DiffBlockLimitReached()
return
else:
# html-dir output, perhaps need to rotate
directory, mainname, css_url = rotation_params
- if self.spl_rows >= max_lines_ratio * max_lines:
+ if self.spl_rows >= max_lines:
raise DiffBlockLimitReached()
if self.spl_current_page == 0: # on parent page
if self.spl_rows < max_lines_parent:
return
+ logger.debug("new unified-diff subpage, parent page went over %s lines", max_lines_parent)
else: # on child page
# TODO: make this stay below the max, instead of going 1 row over the max
# will require some backtracking...
- if self.spl_print_func.bytes_written < max_report_child_size:
+ if self.spl_print_func.bytes_written < max_page_size_child:
return
+ logger.debug("new unified-diff subpage, previous subpage went over %s bytes", max_page_size_child)
self.spl_current_page += 1
filename = "%s-%s.html" % (mainname, self.spl_current_page)
@@ -414,6 +415,10 @@ class HTMLPresenter(Presenter):
text = "load diff (%s %s%s)" % (self.spl_current_page, noun, (", truncated" if truncated else ""))
print_func(templates.UD_TABLE_FOOTER % {"filename": html.escape("%s-1.html" % mainname), "text": text}, force=True)
+
+class HTMLPresenter(Presenter):
+ supports_visual_diffs = True
+
def output_node_placeholder(self, anchor, lazy_load):
if lazy_load:
return templates.DIFFNODE_LAZY_LOAD % anchor
@@ -424,13 +429,17 @@ class HTMLPresenter(Presenter):
partial_outputs = {} # nodes to their partial output
partial_ancestor = {} # child nodes to ancestor nodes
placeholder_len = len(self.output_node_placeholder("XXXXXXXXXXXXXXXX", not single_page))
+ report_printed = [0]
+ report_limit = Config().max_report_size
with contextlib.ExitStack() as xstack:
printers = {} # nodes to their printers
def maybe_print(node):
if partial_outputs[node].holes:
return
- printers[node](partial_outputs[node].format())
+ out = partial_outputs[node].format()
+ report_printed[0] += len(out)
+ printers[node](out)
del partial_outputs[node]
del printers[node]
@@ -443,35 +452,45 @@ class HTMLPresenter(Presenter):
ancestor = partial_ancestor.pop(node, None)
logger.debug('html output for %s', node.source1)
path = score[2] + [node.source1]
+ # TODO: can make this more efficient by doing a "size estimate" first
+ # TODO: take into account the size of the unified-diff child pages
node_output = output_node(node, path, " ", len(path)-1, css_url, None if single_page else target)
anchor = output_anchor(path)
+ add_to_existing = False
if ancestor:
- limit = Config().max_report_child_size
- logger.debug("output size: %s, %s",
- partial_outputs[ancestor].size(placeholder_len), node_output.size(placeholder_len))
- else:
- limit = Config().max_report_size
+ page_limit = Config().max_page_size_parent if ancestor is difference else Config().max_page_size_child
+ page_current = partial_outputs[ancestor].size(placeholder_len)
+ report_current = report_printed[0] + sum(p.size(placeholder_len) for p in partial_outputs.values())
+ want_to_add = node_output.size(placeholder_len)
+ logger.debug("report size: %s/%s, page size: %s/%s, want to add %s)", report_current, report_limit, page_current, page_limit, want_to_add)
+ if report_current + want_to_add > report_limit:
+ make_new_subpage = False
+ elif page_current + want_to_add < page_limit:
+ add_to_existing = True
+ else:
+ make_new_subpage = not single_page
- if ancestor and partial_outputs[ancestor].size(placeholder_len) + node_output.size(placeholder_len) < limit:
+ if add_to_existing:
# under limit, add it to an existing page
partial_outputs[ancestor] = partial_outputs[ancestor].pformat({node: node_output})
stored = ancestor
else:
- # over limit (or root), new subpage
+ # over limit (or root), new subpage or continue/break
if ancestor:
- placeholder = self.output_node_placeholder(anchor, not single_page)
+ placeholder = self.output_node_placeholder(anchor, make_new_subpage)
partial_outputs[ancestor] = partial_outputs[ancestor].pformat({node: placeholder})
maybe_print(ancestor)
footer = get_footer()
- if single_page:
+ if not make_new_subpage: # we hit a limit, either max-report-size or single-page
if not partial_outputs:
- # already output a single page, don't iterate through any more children
+ # no more holes, don't iterate through any more children
break
else:
continue
else:
+ # unconditionally write the root node regardless of limits
assert node is difference
footer = get_footer(jquery_url)
anchor = "index"
@@ -479,6 +498,7 @@ class HTMLPresenter(Presenter):
partial_outputs[node] = node_output.frame(
get_header(css_url) + u'<div class="difference">\n',
u'</div>\n' + footer)
+ assert not single_page or node is difference
printer = make_printer(target) if single_page else file_printer(target, "%s.html" % anchor)
printers[node] = xstack.enter_context(printer)
stored = node
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the diffoscope
mailing list