[diffoscope] 01/01: Move ydiff/linediff from diffoscope.{difference => diff} to group unified_diff related things together

Fri Jun 16 17:58:46 CEST 2017

This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch experimental
in repository diffoscope.

commit 56fcdec5c6948dc5fe91c36d2dff731da39fd610
Author: Ximin Luo <infinity0 at debian.org>
Date:   Fri Jun 16 17:55:39 2017 +0200

    Move ydiff/linediff from diffoscope.{difference => diff} to group unified_diff related things together
---
 diffoscope/diff.py                 | 252 ++++++++++++++++++++++++++++++++++++
 diffoscope/difference.py           | 255 -------------------------------------
 diffoscope/presenters/html/html.py |   2 +-
 3 files changed, 253 insertions(+), 256 deletions(-)

diff --git a/diffoscope/diff.py b/diffoscope/diff.py
index 99b5230..6710361 100644
--- a/diffoscope/diff.py
+++ b/diffoscope/diff.py
@@ -289,3 +289,255 @@ def color_unified_diff(diff):
         }[m.group(1)], m.group(0), RESET)
 
     return re_diff_change.sub(repl, diff)
+
+DIFFON = "\x01"
+DIFFOFF = "\x02"
+
+def _linediff_sane(x):
+    r = ""
+    for i in x:
+        j = ord(i)
+        if i not in ['\t', '\n'] and (j < 32):
+            r = r + "."
+        else:
+            r = r + i
+    return r
+
+def linediff(s, t, diffon, diffoff):
+    '''
+    Original line diff algorithm of diff2html. It's character based.
+    '''
+    if len(s):
+        s = ''.join([ _linediff_sane(c) for c in s ])
+    if len(t):
+        t = ''.join([ _linediff_sane(c) for c in t ])
+
+    m, n = len(s), len(t)
+    d = [[(0, 0) for i in range(n+1)] for i in range(m+1)]
+
+
+    d[0][0] = (0, (0, 0))
+    for i in range(m+1)[1:]:
+        d[i][0] = (i,(i-1, 0))
+    for j in range(n+1)[1:]:
+        d[0][j] = (j,(0, j-1))
+
+    for i in range(m+1)[1:]:
+        for j in range(n+1)[1:]:
+            if s[i-1] == t[j-1]:
+                cost = 0
+            else:
+                cost = 1
+            d[i][j] = min((d[i-1][j][0] + 1, (i-1, j)),
+                          (d[i][j-1][0] + 1, (i, j-1)),
+                          (d[i-1][j-1][0] + cost, (i-1, j-1)))
+
+    l = []
+    coord = (m, n)
+    while coord != (0, 0):
+        l.insert(0, coord)
+        x, y = coord
+        coord = d[x][y][1]
+
+    l1 = []
+    l2 = []
+
+    for coord in l:
+        cx, cy = coord
+        child_val = d[cx][cy][0]
+
+        father_coord = d[cx][cy][1]
+        fx, fy = father_coord
+        father_val = d[fx][fy][0]
+
+        diff = (cx-fx, cy-fy)
+
+        if diff == (0, 1):
+            l1.append("")
+            l2.append(diffon + t[fy] + diffoff)
+        elif diff == (1, 0):
+            l1.append(diffon + s[fx] + diffoff)
+            l2.append("")
+        elif child_val-father_val == 1:
+            l1.append(diffon + s[fx] + diffoff)
+            l2.append(diffon + t[fy] + diffoff)
+        else:
+            l1.append(s[fx])
+            l2.append(t[fy])
+
+    return ''.join(l1).replace(diffoff + diffon, ''), ''.join(l2).replace(diffoff + diffon, '')
+
+
+class SideBySideDiff(object):
+    """Calculates a side-by-side diff from a unified diff."""
+
+    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
+        self.unified_diff = unified_diff
+        self.diffon = diffon
+        self.diffoff = diffoff
+        self.reset()
+
+    def reset(self):
+        self.buf = []
+        self.add_cpt = 0
+        self.del_cpt = 0
+        self.line1 = 0
+        self.line2 = 0
+        self.hunk_off1 = 0
+        self.hunk_size1 = 0
+        self.hunk_off2 = 0
+        self.hunk_size2 = 0
+        self._bytes_processed = 0
+
+    @property
+    def bytes_processed(self):
+        return self._bytes_processed
+
+    def empty_buffer(self):
+        if self.del_cpt == 0 or self.add_cpt == 0:
+            for l in self.buf:
+                yield from self.yield_line(l[0], l[1])
+
+        elif self.del_cpt != 0 and self.add_cpt != 0:
+            l0, l1 = [], []
+            for l in self.buf:
+                if l[0] != None:
+                    l0.append(l[0])
+                if l[1] != None:
+                    l1.append(l[1])
+            max_len = (len(l0) > len(l1)) and len(l0) or len(l1)
+            for i in range(max_len):
+                s0, s1 = "", ""
+                if i < len(l0):
+                    s0 = l0[i]
+                if i < len(l1):
+                    s1 = l1[i]
+                yield from self.yield_line(s0, s1)
+
+    def yield_line(self, s1, s2):
+        orig1 = s1
+        orig2 = s2
+
+        if s1 == None and s2 == None:
+            type_name = "unmodified"
+        elif s1 == "" and s2 == "":
+            type_name = "unmodified"
+        elif s1 == None or s1 == "":
+            type_name = "added"
+        elif s2 == None or s2 == "":
+            type_name = "deleted"
+        elif orig1 == orig2 and not s1.endswith('lines removed ]') and not s2.endswith('lines removed ]'):
+            type_name = "unmodified"
+        else:
+            type_name = "changed"
+            s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)
+
+        yield "L", (type_name, s1, self.line1, s2, self.line2)
+
+        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
+        if m:
+            self.line1 += int(m.group(1))
+        elif orig1:
+            self.line1 += 1
+        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
+        if m:
+            self.line2 += int(m.group(1))
+        elif orig2:
+            self.line2 += 1
+
+        self.add_cpt = 0
+        self.del_cpt = 0
+        self.buf = []
+
+    def items(self):
+        """Yield the items that form the side-by-side diff.
+
+        Each item is a (type, value) tuple, as follows:
+
+        type == "H", value is a tuple representing a hunk header
+            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
+            all ints
+
+        type == "L", value is a tuple representing a line of a hunk
+            mode, line1, lineno1, line2, lineno2 = value
+            where mode is one of {"unmodified", "added", "deleted", "changed"}
+            line* are strings
+            lineno* are ints
+
+        type == "C", value is a comment
+            comment = value
+            a string
+        """
+        self.reset()
+
+        for l in self.unified_diff.splitlines():
+            self._bytes_processed += len(l) + 1
+            m = re.match(r'^--- ([^\s]*)', l)
+            if m:
+                yield from self.empty_buffer()
+                continue
+            m = re.match(r'^\+\+\+ ([^\s]*)', l)
+            if m:
+                yield from self.empty_buffer()
+                continue
+
+            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
+            if m:
+                yield from self.empty_buffer()
+                hunk_data = map(lambda x:x=="" and 1 or int(x), m.groups())
+                self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2 = hunk_data
+                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
+                yield "H", (self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2)
+                continue
+
+            if re.match(r'^\[', l):
+                yield from self.empty_buffer()
+                yield "C", l
+
+            if re.match(r"^\\ No newline", l):
+                if self.hunk_size2 == 0:
+                    self.buf[-1] = (self.buf[-1][0], self.buf[-1][1] + '\n' + l[2:])
+                else:
+                    self.buf[-1] = (buf[-1][0] + '\n' + l[2:], self.buf[-1][1])
+                continue
+
+            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
+                yield from self.empty_buffer()
+                continue
+
+            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
+            if m:
+                self.add_cpt += int(m.group(1))
+                self.hunk_size2 -= int(m.group(1))
+                self.buf.append((None, l[1:]))
+                continue
+
+            if re.match(r"^\+", l):
+                self.add_cpt += 1
+                self.hunk_size2 -= 1
+                self.buf.append((None, l[1:]))
+                continue
+
+            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
+            if m:
+                self.del_cpt += int(m.group(1))
+                self.hunk_size1 -= int(m.group(1))
+                self.buf.append((l[1:], None))
+                continue
+
+            if re.match(r"^-", l):
+                self.del_cpt += 1
+                self.hunk_size1 -= 1
+                self.buf.append((l[1:], None))
+                continue
+
+            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
+                yield from self.empty_buffer()
+                self.hunk_size1 -= 1
+                self.hunk_size2 -= 1
+                self.buf.append((l[1:], l[1:]))
+                continue
+
+            yield from self.empty_buffer()
+
+        yield from self.empty_buffer()
diff --git a/diffoscope/difference.py b/diffoscope/difference.py
index ce5b1e9..54cb686 100644
--- a/diffoscope/difference.py
+++ b/diffoscope/difference.py
@@ -17,9 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
 
-import hashlib
 import heapq
-import re
 import logging
 
 from . import feeders
@@ -325,256 +323,3 @@ class VisualDifference(object):
 
     def size(self):
         return len(self.data_type) + len(self.content) + len(self.source)
-
-
-DIFFON = "\x01"
-DIFFOFF = "\x02"
-
-def _linediff_sane(x):
-    r = ""
-    for i in x:
-        j = ord(i)
-        if i not in ['\t', '\n'] and (j < 32):
-            r = r + "."
-        else:
-            r = r + i
-    return r
-
-def linediff(s, t, diffon, diffoff):
-    '''
-    Original line diff algorithm of diff2html. It's character based.
-    '''
-    if len(s):
-        s = ''.join([ _linediff_sane(c) for c in s ])
-    if len(t):
-        t = ''.join([ _linediff_sane(c) for c in t ])
-
-    m, n = len(s), len(t)
-    d = [[(0, 0) for i in range(n+1)] for i in range(m+1)]
-
-
-    d[0][0] = (0, (0, 0))
-    for i in range(m+1)[1:]:
-        d[i][0] = (i,(i-1, 0))
-    for j in range(n+1)[1:]:
-        d[0][j] = (j,(0, j-1))
-
-    for i in range(m+1)[1:]:
-        for j in range(n+1)[1:]:
-            if s[i-1] == t[j-1]:
-                cost = 0
-            else:
-                cost = 1
-            d[i][j] = min((d[i-1][j][0] + 1, (i-1, j)),
-                          (d[i][j-1][0] + 1, (i, j-1)),
-                          (d[i-1][j-1][0] + cost, (i-1, j-1)))
-
-    l = []
-    coord = (m, n)
-    while coord != (0, 0):
-        l.insert(0, coord)
-        x, y = coord
-        coord = d[x][y][1]
-
-    l1 = []
-    l2 = []
-
-    for coord in l:
-        cx, cy = coord
-        child_val = d[cx][cy][0]
-
-        father_coord = d[cx][cy][1]
-        fx, fy = father_coord
-        father_val = d[fx][fy][0]
-
-        diff = (cx-fx, cy-fy)
-
-        if diff == (0, 1):
-            l1.append("")
-            l2.append(diffon + t[fy] + diffoff)
-        elif diff == (1, 0):
-            l1.append(diffon + s[fx] + diffoff)
-            l2.append("")
-        elif child_val-father_val == 1:
-            l1.append(diffon + s[fx] + diffoff)
-            l2.append(diffon + t[fy] + diffoff)
-        else:
-            l1.append(s[fx])
-            l2.append(t[fy])
-
-    return ''.join(l1).replace(diffoff + diffon, ''), ''.join(l2).replace(diffoff + diffon, '')
-
-
-class SideBySideDiff(object):
-    """Calculates a side-by-side diff from a unified diff."""
-
-    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
-        self.unified_diff = unified_diff
-        self.diffon = diffon
-        self.diffoff = diffoff
-        self.reset()
-
-    def reset(self):
-        self.buf = []
-        self.add_cpt = 0
-        self.del_cpt = 0
-        self.line1 = 0
-        self.line2 = 0
-        self.hunk_off1 = 0
-        self.hunk_size1 = 0
-        self.hunk_off2 = 0
-        self.hunk_size2 = 0
-        self._bytes_processed = 0
-
-    @property
-    def bytes_processed(self):
-        return self._bytes_processed
-
-    def empty_buffer(self):
-        if self.del_cpt == 0 or self.add_cpt == 0:
-            for l in self.buf:
-                yield from self.yield_line(l[0], l[1])
-
-        elif self.del_cpt != 0 and self.add_cpt != 0:
-            l0, l1 = [], []
-            for l in self.buf:
-                if l[0] != None:
-                    l0.append(l[0])
-                if l[1] != None:
-                    l1.append(l[1])
-            max_len = (len(l0) > len(l1)) and len(l0) or len(l1)
-            for i in range(max_len):
-                s0, s1 = "", ""
-                if i < len(l0):
-                    s0 = l0[i]
-                if i < len(l1):
-                    s1 = l1[i]
-                yield from self.yield_line(s0, s1)
-
-    def yield_line(self, s1, s2):
-        orig1 = s1
-        orig2 = s2
-
-        if s1 == None and s2 == None:
-            type_name = "unmodified"
-        elif s1 == "" and s2 == "":
-            type_name = "unmodified"
-        elif s1 == None or s1 == "":
-            type_name = "added"
-        elif s2 == None or s2 == "":
-            type_name = "deleted"
-        elif orig1 == orig2 and not s1.endswith('lines removed ]') and not s2.endswith('lines removed ]'):
-            type_name = "unmodified"
-        else:
-            type_name = "changed"
-            s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)
-
-        yield "L", (type_name, s1, self.line1, s2, self.line2)
-
-        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
-        if m:
-            self.line1 += int(m.group(1))
-        elif orig1:
-            self.line1 += 1
-        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
-        if m:
-            self.line2 += int(m.group(1))
-        elif orig2:
-            self.line2 += 1
-
-        self.add_cpt = 0
-        self.del_cpt = 0
-        self.buf = []
-
-    def items(self):
-        """Yield the items that form the side-by-side diff.
-
-        Each item is a (type, value) tuple, as follows:
-
-        type == "H", value is a tuple representing a hunk header
-            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
-            all ints
-
-        type == "L", value is a tuple representing a line of a hunk
-            mode, line1, lineno1, line2, lineno2 = value
-            where mode is one of {"unmodified", "added", "deleted", "changed"}
-            line* are strings
-            lineno* are ints
-
-        type == "C", value is a comment
-            comment = value
-            a string
-        """
-        self.reset()
-
-        for l in self.unified_diff.splitlines():
-            self._bytes_processed += len(l) + 1
-            m = re.match(r'^--- ([^\s]*)', l)
-            if m:
-                yield from self.empty_buffer()
-                continue
-            m = re.match(r'^\+\+\+ ([^\s]*)', l)
-            if m:
-                yield from self.empty_buffer()
-                continue
-
-            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
-            if m:
-                yield from self.empty_buffer()
-                hunk_data = map(lambda x:x=="" and 1 or int(x), m.groups())
-                self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2 = hunk_data
-                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
-                yield "H", (self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2)
-                continue
-
-            if re.match(r'^\[', l):
-                yield from self.empty_buffer()
-                yield "C", l
-
-            if re.match(r"^\\ No newline", l):
-                if self.hunk_size2 == 0:
-                    self.buf[-1] = (self.buf[-1][0], self.buf[-1][1] + '\n' + l[2:])
-                else:
-                    self.buf[-1] = (buf[-1][0] + '\n' + l[2:], self.buf[-1][1])
-                continue
-
-            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
-                yield from self.empty_buffer()
-                continue
-
-            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
-            if m:
-                self.add_cpt += int(m.group(1))
-                self.hunk_size2 -= int(m.group(1))
-                self.buf.append((None, l[1:]))
-                continue
-
-            if re.match(r"^\+", l):
-                self.add_cpt += 1
-                self.hunk_size2 -= 1
-                self.buf.append((None, l[1:]))
-                continue
-
-            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
-            if m:
-                self.del_cpt += int(m.group(1))
-                self.hunk_size1 -= int(m.group(1))
-                self.buf.append((l[1:], None))
-                continue
-
-            if re.match(r"^-", l):
-                self.del_cpt += 1
-                self.hunk_size1 -= 1
-                self.buf.append((l[1:], None))
-                continue
-
-            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
-                yield from self.empty_buffer()
-                self.hunk_size1 -= 1
-                self.hunk_size2 -= 1
-                self.buf.append((l[1:], l[1:]))
-                continue
-
-            yield from self.empty_buffer()
-
-        yield from self.empty_buffer()
diff --git a/diffoscope/presenters/html/html.py b/diffoscope/presenters/html/html.py
index e7ad83c..bb847a7 100644
--- a/diffoscope/presenters/html/html.py
+++ b/diffoscope/presenters/html/html.py
@@ -43,7 +43,7 @@ import contextlib
 
 from diffoscope import VERSION
 from diffoscope.config import Config
-from diffoscope.difference import SideBySideDiff, DIFFON, DIFFOFF
+from diffoscope.diff import SideBySideDiff, DIFFON, DIFFOFF
 
 from ..icon import FAVICON_BASE64
 from ..utils import PrintLimitReached, DiffBlockLimitReached, \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git