[diffoscope] 01/01: Convert HTML character entity references to UTF-8 characters to save space

Jérémy Bobbio lunar at moszumanska.debian.org
Thu Dec 17 14:55:32 CET 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository diffoscope.

commit 73b1cf40be6d7a1f4a9d75f5334a08fae864187a
Author: Esa Peuha <esa.peuha at gmail.com>
Date:   Thu Dec 17 14:53:26 2015 +0100

    Convert HTML character entity references to UTF-8 characters to save space
    
    Let's convert HTML character entity references (which each use 6-8
    characters and as many bytes in the HTML file) to actual characters
    (which UTF-8 encodes as 2-3 bytes). Since all diffoscope output files
    are peppered with abundant amounts of these things, this could reduce
    the file sizes by a few percent at least.
    
    We use Python string literals instead of the actual characters in the
    Python file, because 1) the non-breaking and zero-width spaces would be
    very hard to distinguish from ordinary space and missing string content,
    respectively, and 2) it is impossible to be sure that every piece of
    software that is ever going to be used to view or edit the file would
    handle non-ASCII characters correctly.
---
 diffoscope/presenters/html.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/diffoscope/presenters/html.py b/diffoscope/presenters/html.py
index f425889..70763da 100644
--- a/diffoscope/presenters/html.py
+++ b/diffoscope/presenters/html.py
@@ -290,9 +290,9 @@ def convert(s, ponct=0, tag=''):
             n = TABSIZE-(i%TABSIZE)
             if n == 0:
                 n = TABSIZE
-            t.write('<span class="diffponct">&raquo;</span>'+'&nbsp;'*(n-1))
+            t.write('<span class="diffponct">\xbb</span>'+'\xa0'*(n-1))
         elif c == " " and ponct == 1:
-            t.write('<span class="diffponct">&middot;</span>')
+            t.write('<span class="diffponct">\xb7</span>')
         elif c == "\n" and ponct == 1:
             t.write('<br/><span class="diffponct">\</span>')
         elif ord(c) < 32:
@@ -304,11 +304,11 @@ def convert(s, ponct=0, tag=''):
             i += 1
 
         if WORDBREAK.count(c) == 1:
-            t.write('&#8203;')
+            t.write('\u200b')
             i = 0
         if i > LINESIZE:
             i = 0
-            t.write("&#8203;")
+            t.write('\u200b')
 
     return t.getvalue()
 
@@ -353,7 +353,7 @@ def output_line(print_func, s1, s2):
             print_func(u'</td>')
         else:
             s1 = ""
-            print_func(u'<td colspan="2">&nbsp;</td>')
+            print_func(u'<td colspan="2">\xa0</td>')
 
         if s2 is not None:
             print_func(u'<td class="diffline">%d </td>' % line2)
@@ -362,7 +362,7 @@ def output_line(print_func, s1, s2):
             print_func(u'</td>')
         else:
             s2 = ""
-            print_func(u'<td colspan="2">&nbsp;</td>')
+            print_func(u'<td colspan="2">\xa0</td>')
     finally:
         print_func(u"</tr>\n", force=True)
 
@@ -522,7 +522,7 @@ def output_difference(difference, print_func, css_url, directory, parents):
             print_func(u"<div><span class='source'>%s</span>"
                        % escape(difference.source2))
         anchor = '/'.join(sources[1:])
-        print_func(u" <a class='anchor' href='#%s' name='%s'>&para;</a>" % (anchor, anchor))
+        print_func(u" <a class='anchor' href='#%s' name='%s'>\xb6</a>" % (anchor, anchor))
         print_func(u"</div>")
         if difference.comments:
             print_func(u"<div class='comment'>%s</div>"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list