[diffoscope] 01/01: Add support for .docx and .odt files via docx2txt & odt2txt. (Closes: #859056)

Chris Lamb chris at chris-lamb.co.uk
Wed Mar 29 21:55:54 CEST 2017


This is an automated email from the git hooks/post-receive script.

lamby pushed a commit to branch experimental
in repository diffoscope.

commit 5d96a92c783ddc595e840784b6630e654dc60c09
Author: Chris Lamb <lamby at debian.org>
Date:   Wed Mar 29 20:47:27 2017 +0100

    Add support for .docx and .odt files via docx2txt & odt2txt. (Closes: #859056)
    
    Signed-off-by: Chris Lamb <lamby at debian.org>
---
 debian/control                     |   2 ++
 diffoscope/comparators/__init__.py |   2 ++
 diffoscope/comparators/docx.py     |  48 +++++++++++++++++++++++++++++++++
 diffoscope/comparators/odt.py      |  48 +++++++++++++++++++++++++++++++++
 diffoscope/external_tools.py       |   6 +++++
 tests/comparators/test_docx.py     |  54 +++++++++++++++++++++++++++++++++++++
 tests/comparators/test_odt.py      |  54 +++++++++++++++++++++++++++++++++++++
 tests/data/docx_expected_diff      |   3 +++
 tests/data/odt_expected_diff       |   5 ++++
 tests/data/test1.docx              | Bin 0 -> 4046 bytes
 tests/data/test1.odt               | Bin 0 -> 7922 bytes
 tests/data/test1.txt               |   1 +
 tests/data/test2.docx              | Bin 0 -> 4046 bytes
 tests/data/test2.odt               | Bin 0 -> 7931 bytes
 14 files changed, 223 insertions(+)

diff --git a/debian/control b/debian/control
index 29259b8..a327610 100644
--- a/debian/control
+++ b/debian/control
@@ -17,6 +17,7 @@ Build-Depends:
  debhelper (>= 10),
  default-jdk-headless <!nocheck> | default-jdk <!nocheck>,
  dh-python (>= 2.20160818~),
+ docx2txt <!nocheck>,
  dpkg-dev (>= 1.17.14),
  enjarify <!nocheck>,
  fontforge-extras <!nocheck>,
@@ -33,6 +34,7 @@ Build-Depends:
  libjs-jquery-throttle-debounce <!nocheck>,
  llvm <!nocheck>,
  mono-utils <!nocheck>,
+ odt2txt <!nocheck>,
  openssh-client <!nocheck>,
  pdftk <!nocheck>,
  pgpdump <!nocheck>,
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 56fa166..45f6ca4 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -70,6 +70,8 @@ class ComparatorManager(object):
         ('tar.TarFile',),
         ('xz.XzFile',),
         ('apk.ApkFile',),
+        ('odt.OdtFile',),
+        ('docx.DocxFile',),
         ('zip.ZipFile',),
         ('zip.MozillaZipFile',),
         ('image.JPEGImageFile',),
diff --git a/diffoscope/comparators/docx.py b/diffoscope/comparators/docx.py
new file mode 100644
index 0000000..6988af6
--- /dev/null
+++ b/diffoscope/comparators/docx.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
+
+import re
+
+from diffoscope.tools import tool_required
+from diffoscope.difference import Difference
+
+from .utils.file import File
+from .utils.command import Command
+
+
+class Docx2txt(Command):
+    @tool_required('docx2txt')
+    def cmdline(self):
+        return (
+            'docx2txt',
+            self.path,
+            '-',
+        )
+
+
+class DocxFile(File):
+    RE_FILE_TYPE = re.compile(r'^Microsoft Word 2007+\b')
+
+    def compare_details(self, other, source=None):
+        return [Difference.from_command(
+            Docx2txt,
+            self.path,
+            other.path,
+            source='docx2txt',
+       )]
diff --git a/diffoscope/comparators/odt.py b/diffoscope/comparators/odt.py
new file mode 100644
index 0000000..78dead3
--- /dev/null
+++ b/diffoscope/comparators/odt.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
+
+import re
+
+from diffoscope.tools import tool_required
+from diffoscope.difference import Difference
+
+from .utils.file import File
+from .utils.command import Command
+
+
+class Odt2txt(Command):
+    @tool_required('odt2txt')
+    def cmdline(self):
+        return (
+            'odt2txt',
+            '--encoding=UTF-8',
+            self.path,
+        )
+
+
+class OdtFile(File):
+    RE_FILE_TYPE = re.compile(r'^OpenDocument Text\b')
+
+    def compare_details(self, other, source=None):
+        return [Difference.from_command(
+            Odt2txt,
+            self.path,
+            other.path,
+            source='odt2txt',
+       )]
diff --git a/diffoscope/external_tools.py b/diffoscope/external_tools.py
index 5579994..d68db1f 100644
--- a/diffoscope/external_tools.py
+++ b/diffoscope/external_tools.py
@@ -44,6 +44,9 @@ EXTERNAL_TOOLS = {
         'debian': 'diffutils',
         'arch': 'diffutils',
     },
+    'docx2txt': {
+        'debian': 'docx2txt',
+    },
     'enjarify': {
         'debian': 'enjarify',
         'arch': 'enjarify',
@@ -136,6 +139,9 @@ EXTERNAL_TOOLS = {
         'debian': 'binutils-multiarch',
         'arch': 'binutils',
     },
+    'odt2txt': {
+        'debian': 'odt2txt',
+    },
     'pgpdump': {
         'debian': 'pgpdump',
         'arch': 'pgpdump',
diff --git a/tests/comparators/test_docx.py b/tests/comparators/test_docx.py
new file mode 100644
index 0000000..675374c
--- /dev/null
+++ b/tests/comparators/test_docx.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from diffoscope.comparators.docx import DocxFile
+
+from utils.data import load_fixture, get_data
+from utils.tools import skip_unless_tools_exist
+from utils.nonexisting import assert_non_existing
+
+docx1 = load_fixture('test1.docx')
+docx2 = load_fixture('test2.docx')
+
+
+def test_identification(docx1):
+    assert isinstance(docx1, DocxFile)
+
+
+def test_no_differences(docx1):
+    difference = docx1.compare(docx1)
+    assert difference is None
+
+
+@pytest.fixture
+def differences(docx1, docx2):
+    return docx1.compare(docx2).details
+
+
+@skip_unless_tools_exist('docx2txt')
+def test_diff(differences):
+    expected_diff = get_data('docx_expected_diff')
+    assert differences[0].unified_diff == expected_diff
+
+
+@skip_unless_tools_exist('docx2txt')
+def test_compare_non_existing(monkeypatch, docx1):
+    assert_non_existing(monkeypatch, docx1, has_null_source=False)
diff --git a/tests/comparators/test_odt.py b/tests/comparators/test_odt.py
new file mode 100644
index 0000000..5e2bcda
--- /dev/null
+++ b/tests/comparators/test_odt.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from diffoscope.comparators.odt import OdtFile
+
+from utils.data import load_fixture, get_data
+from utils.tools import skip_unless_tools_exist
+from utils.nonexisting import assert_non_existing
+
+odt1 = load_fixture('test1.odt')
+odt2 = load_fixture('test2.odt')
+
+
+def test_identification(odt1):
+    assert isinstance(odt1, OdtFile)
+
+
+def test_no_differences(odt1):
+    difference = odt1.compare(odt1)
+    assert difference is None
+
+
+@pytest.fixture
+def differences(odt1, odt2):
+    return odt1.compare(odt2).details
+
+
+@skip_unless_tools_exist('odt2txt')
+def test_diff(differences):
+    expected_diff = get_data('odt_expected_diff')
+    assert differences[0].unified_diff == expected_diff
+
+
+@skip_unless_tools_exist('odt2txt')
+def test_compare_non_existing(monkeypatch, odt1):
+    assert_non_existing(monkeypatch, odt1, has_null_source=False)
diff --git a/tests/data/docx_expected_diff b/tests/data/docx_expected_diff
new file mode 100644
index 0000000..a2319c2
--- /dev/null
+++ b/tests/data/docx_expected_diff
@@ -0,0 +1,3 @@
+@@ -1 +1 @@
+-a
++b
diff --git a/tests/data/odt_expected_diff b/tests/data/odt_expected_diff
new file mode 100644
index 0000000..7a20d9a
--- /dev/null
+++ b/tests/data/odt_expected_diff
@@ -0,0 +1,5 @@
+@@ -1,3 +1,3 @@
+ 
+-a
++b
+ 
diff --git a/tests/data/test1.docx b/tests/data/test1.docx
new file mode 100644
index 0000000..f262842
Binary files /dev/null and b/tests/data/test1.docx differ
diff --git a/tests/data/test1.odt b/tests/data/test1.odt
new file mode 100644
index 0000000..d39ee4c
Binary files /dev/null and b/tests/data/test1.odt differ
diff --git a/tests/data/test1.txt b/tests/data/test1.txt
new file mode 100644
index 0000000..7898192
--- /dev/null
+++ b/tests/data/test1.txt
@@ -0,0 +1 @@
+a
diff --git a/tests/data/test2.docx b/tests/data/test2.docx
new file mode 100644
index 0000000..bc5b405
Binary files /dev/null and b/tests/data/test2.docx differ
diff --git a/tests/data/test2.odt b/tests/data/test2.odt
new file mode 100644
index 0000000..a02a7f8
Binary files /dev/null and b/tests/data/test2.odt differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git


More information about the diffoscope mailing list