[Git][reproducible-builds/diffoscope][master] Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within.
Chris Lamb (@lamby)
gitlab at salsa.debian.org
Mon Aug 15 22:52:17 UTC 2022
Chris Lamb pushed to branch master at Reproducible Builds / diffoscope
Commits:
dbeab9e3 by Chris Lamb at 2022-08-15T15:51:52-07:00
Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within. (Closes: reproducible-builds/diffoscope#311)
- - - - -
1 changed file:
- diffoscope/comparators/pdf.py
Changes:
=====================================
diffoscope/comparators/pdf.py
=====================================
@@ -17,6 +17,8 @@
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+import logging
+import os
import re
from diffoscope.tools import python_module_missing, tool_required
@@ -25,6 +27,8 @@ from diffoscope.difference import Difference
from .utils.file import File
from .utils.command import Command
+logger = logging.getLogger(__name__)
+
try:
import PyPDF2
@@ -95,40 +99,44 @@ class PdfFile(File):
return xs
- @staticmethod
- def dump_pypdf2_metadata(file):
+ def dump_pypdf2_metadata(self, file):
try:
pdf = PyPDF2.PdfFileReader(file.path)
document_info = pdf.getDocumentInfo()
- except PdfReadError as e:
- return f"(Could not extract metadata: {e})"
- if document_info is None:
- return ""
+ if document_info is None:
+ return ""
- xs = []
- for k, v in sorted(document_info.items()):
- xs.append("{}: {!r}".format(k.lstrip("/"), v))
+ xs = []
+ for k, v in sorted(document_info.items()):
+ xs.append("{}: {!r}".format(k.lstrip("/"), v))
- return "\n".join(xs)
+ return "\n".join(xs)
+ except PdfReadError as e:
+ msg = f"Could not extract PyPDF2 metadata from {os.path.basename(file.name)}: {e}"
+ self.add_comment(msg)
+ logger.error(msg)
+ return ""
- @staticmethod
- def dump_pypdf2_annotations(file):
+ def dump_pypdf2_annotations(self, file):
try:
pdf = PyPDF2.PdfFileReader(file.path)
- except PdfReadError as e:
- return f"(Could not open file: {e})"
- xs = []
- for x in range(pdf.getNumPages()):
- page = pdf.getPage(x)
-
- try:
- for annot in page["/Annots"]:
- subtype = annot.getObject()["/Subtype"]
- if subtype == "/Text":
- xs.append(annot.getObject()["/Contents"])
- except:
- pass
-
- return "\n".join(xs)
+ xs = []
+ for x in range(pdf.getNumPages()):
+ page = pdf.getPage(x)
+
+ try:
+ for annot in page["/Annots"]:
+ subtype = annot.getObject()["/Subtype"]
+ if subtype == "/Text":
+ xs.append(annot.getObject()["/Contents"])
+ except:
+ pass
+
+ return "\n".join(xs)
+ except PdfReadError as e:
+ msg = f"Could not extract PyPDF2 annotations from {os.path.basename(file.name)}: {e}"
+ file.add_comment(msg)
+ logger.error(msg)
+ return ""
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/commit/dbeab9e365900736bea3285587e303241c8e996c
--
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/commit/dbeab9e365900736bea3285587e303241c8e996c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20220815/bf4d01cf/attachment.htm>
More information about the rb-commits mailing list