[Git][reproducible-builds/diffoscope][master] 3 commits: Add support for comparing the 'eXtensible ARchive' (.XAR/.PKG) file format
Chris Lamb (@lamby)
gitlab at salsa.debian.org
Fri Jan 19 17:34:51 UTC 2024
Chris Lamb pushed to branch master at Reproducible Builds / diffoscope
Commits:
241c92af by Seth Michael Larson at 2024-01-08T10:15:12+00:00
Add support for comparing the 'eXtensible ARchive' (.XAR/.PKG) file format
- - - - -
9db36bc1 by Chris Lamb at 2024-01-08T10:15:35+00:00
Reflow according to black.
- - - - -
bc480005 by Chris Lamb at 2024-01-19T09:32:32-08:00
releasing package diffoscope version 254
- - - - -
4 changed files:
- debian/changelog
- diffoscope/__init__.py
- diffoscope/comparators/__init__.py
- + diffoscope/comparators/xar.py
Changes:
=====================================
debian/changelog
=====================================
@@ -1,8 +1,15 @@
-diffoscope (254) UNRELEASED; urgency=medium
+diffoscope (254) unstable; urgency=medium
- * WIP (generated upon release).
+ [ Chris Lamb ]
+ * Reflow some code according to black.
+
+ [ Seth Michael Larson ]
+ * Add support for comparing the 'eXtensible ARchive' (.XAR/.PKG) file format.
+
+ [ Vagrant Cascadian ]
+ * Add external tool on GNU Guix for 7z.
- -- Chris Lamb <lamby at debian.org> Fri, 08 Dec 2023 12:39:27 +0000
+ -- Chris Lamb <lamby at debian.org> Fri, 19 Jan 2024 09:32:28 -0800
diffoscope (253) unstable; urgency=medium
=====================================
diffoscope/__init__.py
=====================================
@@ -17,4 +17,4 @@
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
-VERSION = "253"
+VERSION = "254"
=====================================
diffoscope/comparators/__init__.py
=====================================
@@ -123,6 +123,7 @@ class ComparatorManager:
("zst.ZstFile",),
("vmlinuz.VmlinuzFile",),
("arsc.ArscFile",),
+ ("xar.XarFile",),
)
_singleton = {}
=====================================
diffoscope/comparators/xar.py
=====================================
@@ -0,0 +1,176 @@
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2024 Seth Michael Larson <seth at python.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+import hashlib
+import re
+import logging
+import struct
+import xml.etree.ElementTree as ET
+import zlib
+import os
+
+from .utils.file import File
+from .utils.archive import Archive
+from diffoscope.difference import Difference
+
+logger = logging.getLogger(__name__)
+
+
+class XarContainer(Archive):
+ def get_member_names(self):
+ toc_xml = self.parse_toc_xml()
+ for file_tag in toc_xml.iter("file"):
+ yield file_tag.get("id") # Use IDs instead of names for members.
+
+ def open_archive(self):
+ pass
+
+ def close_archive(self):
+ pass
+
+ def extract(self, member_name, dest_dir):
+ toc_xml = self.parse_toc_xml()
+
+ # Find the file name and data heap offset.
+ for file_tag in toc_xml.iter("file"):
+ if file_tag.get("id") == member_name:
+ file_name = file_tag.find(".//name").text
+ data_offset = int(file_tag.find(".//data/offset").text)
+ data_length = int(file_tag.find(".//data/length").text)
+ break
+ else:
+ raise KeyError(member_name)
+
+ # Write data from the heap into the temporary directory.
+ # We automatically handle gzipped data thanks to the header.
+ dest_path = os.path.join(dest_dir, file_name)
+ with open(dest_path, mode="wb") as fw, open(
+ self._source.path, mode="rb"
+ ) as fr:
+ fr.seek(self._heap_offset + data_offset, 0)
+ fw.write(fr.read(data_length))
+
+ return dest_path
+
+ def parse_toc_xml(self):
+ if getattr(self, "_toc_xml", None) is None:
+ with open(self._source.path, mode="rb") as f:
+ # Skip the magic and format, we're looking for
+ # header and TOC compressed lengths.
+ (
+ header_length,
+ toc_compressed_length,
+ ) = struct.unpack(">xxxxHxxQ", f.read(16))
+
+ # Read, decompress, and parse the TOC as XML. Save heap offset for later.
+ f.seek(header_length, 0)
+ toc_bytes = f.read(toc_compressed_length)
+ toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")
+ self._toc_xml = ET.XML(toc_as_text)
+ self._heap_offset = header_length + toc_compressed_length
+
+ return self._toc_xml
+
+
+class XarFile(File):
+ DESCRIPTION = "eXtensible ARchive files"
+ CONTAINER_CLASSES = [XarContainer]
+ FILE_TYPE_RE = re.compile(r"\bpkg\b")
+ FALLBACK_FILE_EXTENSION_SUFFIX = {
+ ".xar",
+ ".pkg",
+ } # NOTE: Facebook's Executable Archive format also uses '.xar'.
+ FALLBACK_FILE_TYPE_HEADER_PREFIX = b"xar!"
+
+ def compare_details(self, other, source=None):
+ self_xar_header, self_xar_toc = describe_xar(self.path)
+ other_xar_header, other_xar_toc = describe_xar(other.path)
+ return [
+ Difference.from_text(
+ self_xar_header,
+ other_xar_header,
+ self.path,
+ other.path,
+ source="XAR Header",
+ ),
+ Difference.from_text(
+ self_xar_toc,
+ other_xar_toc,
+ self.path,
+ other.path,
+ source="XAR Table of Contents",
+ ),
+ ]
+
+
+def describe_xar(path):
+ with open(path, mode="rb") as f:
+ magic = f.read(4)
+
+ # Read the fixed portion of the XAR header
+ # Padding length is calculated using header length.
+ (
+ header_length,
+ format_version,
+ toc_compressed_length,
+ toc_uncompressed_length,
+ checksum_alg,
+ ) = struct.unpack(">HHQQI", f.read(24))
+
+ known_checksum_algs = {
+ 0: "NONE",
+ 1: "SHA1",
+ 2: "MD5",
+ 3: "SHA-256",
+ 4: "SHA-512",
+ }
+
+ header_lines = [
+ "magic: {}".format(magic),
+ "format version: {}".format(format_version),
+ "TOC compressed length: {}".format(toc_compressed_length),
+ "TOC uncompressed length: {}".format(toc_uncompressed_length),
+ "checksum: {} ({})".format(
+ checksum_alg, known_checksum_algs.get(checksum_alg, "???")
+ ),
+ ]
+
+ # Note that this 'header length' includes the 4 bytes of magic, hence 28.
+ padding_length = header_length - 28
+ if padding_length > 0: # Padding is optional.
+ padding = f.read(padding_length)
+ header_lines.append("padding: {}".format(padding))
+
+ # Read the TOC which is always DEFLATE compressed.
+ toc_bytes = f.read(toc_compressed_length)
+ toc_as_text = zlib.decompress(toc_bytes).decode("utf-8")
+
+ # Read the entire heap and add properties that allow detecting
+ # "invisible" differences in the heap, for example if data is inserted
+ # but isn't referenced in the TOC. This shouldn't happen in a normal XAR file.
+ heap_bytes = f.read()
+ header_lines.extend(
+ [
+ "heap length: {}".format(len(heap_bytes)),
+ "heap checksum: {}".format(
+ hashlib.sha256(heap_bytes).hexdigest()
+ ),
+ ]
+ )
+
+ header_as_text = "\n".join(header_lines)
+ return header_as_text, toc_as_text
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/f1822463eb39ba673b1037e105a5af59fd04262b...bc48000574be677284f5567cb8a8d6b3883176f2
--
View it on GitLab: https://salsa.debian.org/reproducible-builds/diffoscope/-/compare/f1822463eb39ba673b1037e105a5af59fd04262b...bc48000574be677284f5567cb8a8d6b3883176f2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.reproducible-builds.org/pipermail/rb-commits/attachments/20240119/9f6d282e/attachment.htm>
More information about the rb-commits
mailing list