[Git][reproducible-builds/diffoscope][master] Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within.

Mon Aug 15 22:52:17 UTC 2022


Chris Lamb pushed to branch master at Reproducible Builds / diffoscope


Commits:
dbeab9e3 by Chris Lamb at 2022-08-15T15:51:52-07:00
Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within. (Closes: reproducible-builds/diffoscope#311)

- - - - -


1 changed file:

- diffoscope/comparators/pdf.py


Changes:

=====================================
diffoscope/comparators/pdf.py
=====================================
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
 
+import logging
+import os
 import re
 
 from diffoscope.tools import python_module_missing, tool_required
@@ -25,6 +27,8 @@ from diffoscope.difference import Difference
 from .utils.file import File
 from .utils.command import Command
 
+logger = logging.getLogger(__name__)
+
 try:
     import PyPDF2
 
@@ -95,40 +99,44 @@ class PdfFile(File):
 
         return xs
 
-    @staticmethod
-    def dump_pypdf2_metadata(file):
+    def dump_pypdf2_metadata(self, file):
         try:
             pdf = PyPDF2.PdfFileReader(file.path)
             document_info = pdf.getDocumentInfo()
-        except PdfReadError as e:
-            return f"(Could not extract metadata: {e})"
 
-        if document_info is None:
-            return ""
+            if document_info is None:
+                return ""
 
-        xs = []
-        for k, v in sorted(document_info.items()):
-            xs.append("{}: {!r}".format(k.lstrip("/"), v))
+            xs = []
+            for k, v in sorted(document_info.items()):
+                xs.append("{}: {!r}".format(k.lstrip("/"), v))
 
-        return "\n".join(xs)
+            return "\n".join(xs)
+        except PdfReadError as e:
+            msg = f"Could not extract PyPDF2 metadata from {os.path.basename(file.name)}: {e}"
+            self.add_comment(msg)
+            logger.error(msg)
+            return ""
 
-    @staticmethod
-    def dump_pypdf2_annotations(file):
+    def dump_pypdf2_annotations(self, file):
         try:
             pdf = PyPDF2.PdfFileReader(file.path)
-        except PdfReadError as e:
-            return f"(Could not open file: {e})"
 
-        xs = []
-        for x in range(pdf.getNumPages()):
-            page = pdf.getPage(x)
-
-            try:
-                for annot in page["/Annots"]:
-                    subtype = annot.getObject()["/Subtype"]
-                    if subtype == "/Text":
-                        xs.append(annot.getObject()["/Contents"])
-            except:
-                pass
-
-        return "\n".join(xs)
+            xs = []
+            for x in range(pdf.getNumPages()):
+                page = pdf.getPage(x)
+
+                try:
+                    for annot in page["/Annots"]:
+                        subtype = annot.getObject()["/Subtype"]
+                        if subtype == "/Text":
+                            xs.append(annot.getObject()["/Contents"])
+                except:
+                    pass
+
+            return "\n".join(xs)
+        except PdfReadError as e:
+            msg = f"Could not extract PyPDF2 annotations from {os.path.basename(file.name)}: {e}"
+            file.add_comment(msg)
+            logger.error(msg)
+            return ""



View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/commit/dbeab9e365900736bea3285587e303241c8e996c

-- 
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/commit/dbeab9e365900736bea3285587e303241c8e996c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20220815/bf4d01cf/attachment.htm>