[diffoscope] 01/01: diff: Use bytes as much as possible, to prevent possible encoding issues

Thu May 10 15:57:20 CEST 2018

This is an automated email from the git hooks/post-receive script.

mattia pushed a commit to branch more-bytes
in repository diffoscope.

commit 340d200ccfebec033750e62550a0c67f6b0b5d4f
Author: Mattia Rizzolo <mattia at debian.org>
Date:   Thu May 10 15:31:22 2018 +0200

    diff: Use bytes as much as possible, to prevent possible encoding issues
    
    In practice, this prevents weird codepoints to behave like linebreaks of
    sorts.
    And while I believe it also decreses memory consume, especially for
    larger diffs.
    
    Signed-off-by: Mattia Rizzolo <mattia at debian.org>
---
 diffoscope/diff.py                       |  61 +++++++++++++++++--------------
 tests/comparators/test_pdf.py            |   6 +++
 tests/data/test_weird_encoding_crap1.pdf | Bin 0 -> 286703 bytes
 tests/data/test_weird_encoding_crap2.pdf | Bin 0 -> 286518 bytes
 4 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/diffoscope/diff.py b/diffoscope/diff.py
index dc8cbfa..fecfcd8 100644
--- a/diffoscope/diff.py
+++ b/diffoscope/diff.py
@@ -43,7 +43,7 @@ re_diff_change = re.compile(r'^([+-@]).*', re.MULTILINE)
 class DiffParser(object):
     RANGE_RE = re.compile(
         # example: '@@ -26814,9 +26814,8 @@'
-        r'''
+        rb'''
           ^
             @@\s+
               -
@@ -62,7 +62,7 @@ class DiffParser(object):
         self._end_nl_q1 = end_nl_q1
         self._end_nl_q2 = end_nl_q2
         self._action = self.read_headers
-        self._diff = io.StringIO()
+        self._diff = io.BytesIO()
         self._success = False
         self._remaining_hunk_lines = None
         self._block_len = None
@@ -72,27 +72,27 @@ class DiffParser(object):
 
     @property
     def diff(self):
-        return self._diff.getvalue()
+        return self._diff.getvalue().decode('UTF-8', errors='replace')
 
     @property
     def success(self):
         return self._success
 
     def parse(self):
-        for line in self._output.splitlines(True):
-            self._action = self._action(line.decode('utf-8', errors='replace'))
+        for line in self._output.split(b'\n'):
+            self._action = self._action(line)
 
-        self._action('')
+        self._action = None
         self._success = True
 
     def read_headers(self, line):
         if not line:
             return None
 
-        if line.startswith('---'):
+        if line.startswith(b'---'):
             return self.read_headers
 
-        if line.startswith('+++'):
+        if line.startswith(b'+++'):
             return self.read_headers
 
         found = DiffParser.RANGE_RE.match(line)
@@ -100,7 +100,7 @@ class DiffParser(object):
         if not found:
             raise ValueError('Unable to parse diff headers: %r' % line)
 
-        self._diff.write(line)
+        self._diff.write(line + b'\n')
         if found.group('len1'):
             self._remaining_hunk_lines = int(found.group('len1'))
         else:
@@ -118,13 +118,13 @@ class DiffParser(object):
         if not line:
             return None
 
-        if line[0] == ' ':
+        if line[:1] == b' ':
             self._remaining_hunk_lines -= 2
-        elif line[0] == '+':
+        elif line[:1] == b'+':
             self._remaining_hunk_lines -= 1
-        elif line[0] == '-':
+        elif line[:1] == b'-':
             self._remaining_hunk_lines -= 1
-        elif line[0] == '\\':
+        elif line[:1] == b'\\':
             # When both files don't end with \n, do not show it as a difference
             if self._end_nl is None:
                 end_nl1 = self._end_nl_q1.get()
@@ -137,28 +137,28 @@ class DiffParser(object):
         else:
             raise ValueError('Unable to parse diff hunk: %r' % line)
 
-        self._diff.write(line)
+        self._diff.write(line + b'\n')
 
-        if line[0] in ('-', '+'):
-            if line[0] == self._direction:
+        if line[:1] in (b'-', b'+'):
+            if line[:1] == self._direction:
                 self._block_len += 1
             else:
                 self._block_len = 1
-                self._direction = line[0]
+                self._direction = line[:1]
 
             if self._block_len >= self._max_lines:
                 return self.skip_block
         else:
             self._block_len = 1
-            self._direction = line[0]
+            self._direction = line[:1]
 
         return self.read_hunk
 
     def skip_block(self, line):
-        if self._remaining_hunk_lines == 0 or line[0] != self._direction:
+        if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
             removed = self._block_len - Config().max_diff_block_lines_saved
             if removed:
-                self._diff.write('%s[ %d lines removed ]\n' % (
+                self._diff.write(b'%s[ %d lines removed ]\n' % (
                     self._direction,
                     removed,
                 ))
@@ -335,29 +335,34 @@ def diff_split_lines(diff, keepends=True):
 
 
 def reverse_unified_diff(diff):
+    assert isinstance(diff, str)
     res = []
+    # XXX - should move to just use plain bytes nearly everywhere until ready
+    # to print instead of using strings internally as well.
     for line in diff_split_lines(diff):
+        line = line.encode('utf-8', errors='replace')
+        assert isinstance(line, bytes)
         found = DiffParser.RANGE_RE.match(line)
 
         if found:
             before = found.group('start2')
             if found.group('len2') is not None:
-                before += ',' + found.group('len2')
+                before += b',' + found.group('len2')
 
             after = found.group('start1')
             if found.group('len1') is not None:
-                after += ',' + found.group('len1')
+                after += b',' + found.group('len1')
 
-            res.append('@@ -%s +%s @@\n' % (before, after))
-        elif line.startswith('-'):
-            res.append('+')
+            res.append(b'@@ -%s +%s @@\n' % (before, after))
+        elif line.startswith(b'-'):
+            res.append(b'+')
             res.append(line[1:])
-        elif line.startswith('+'):
-            res.append('-')
+        elif line.startswith(b'+'):
+            res.append(b'-')
             res.append(line[1:])
         else:
             res.append(line)
-    return ''.join(res)
+    return b''.join(res).decode('utf-8', errors='replace')
 
 
 def color_unified_diff(diff):
diff --git a/tests/comparators/test_pdf.py b/tests/comparators/test_pdf.py
index 3fbed1d..360f228 100644
--- a/tests/comparators/test_pdf.py
+++ b/tests/comparators/test_pdf.py
@@ -28,6 +28,8 @@ from ..utils.nonexisting import assert_non_existing
 
 pdf1 = load_fixture('test1.pdf')
 pdf2 = load_fixture('test2.pdf')
+pdf1a = load_fixture('test_weird_encoding_crap1.pdf')
+pdf2a = load_fixture('test_weird_encoding_crap2.pdf')
 
 
 def test_identification(pdf1):
@@ -38,6 +40,10 @@ def test_no_differences(pdf1):
     difference = pdf1.compare(pdf1)
     assert difference is None
 
+def test_differences_found_with_weird_encoding(pdf1a, pdf2a):
+    # diffoscope used to crash here due to weird encoding
+    difference = pdf1a.compare(pdf2a)
+    assert difference
 
 @pytest.fixture
 def differences(pdf1, pdf2):
diff --git a/tests/data/test_weird_encoding_crap1.pdf b/tests/data/test_weird_encoding_crap1.pdf
new file mode 100644
index 0000000..26b5115
Binary files /dev/null and b/tests/data/test_weird_encoding_crap1.pdf differ
diff --git a/tests/data/test_weird_encoding_crap2.pdf b/tests/data/test_weird_encoding_crap2.pdf
new file mode 100644
index 0000000..51d34f5
Binary files /dev/null and b/tests/data/test_weird_encoding_crap2.pdf differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git