bpo-32072: Fix issues with binary plists. (#4455)

* Fixed saving bytearrays. * Identical objects will be saved only once. * Equal references will be load as identical objects. * Added support for saving and loading recursive data structures.

bpo-32072: Fix issues with binary plists. (#4455)
* Fixed saving bytearrays. * Identical objects will be saved only once. * Equal references will be load as identical objects. * Added support for saving and loading recursive data structures.
a897aeee · Serhiy Storchaka · GitHub · b4d1e1f7 · a897aeee · a897aeee
Unverified Kaydet (Commit) a897aeee authored Kas 30, 2017 tarafından Serhiy Storchaka Kaydeden (comit) GitHub Kas 30, 2017
Showing with 112 additions and 36 deletions

plistlib.py Lib/plistlib.py +51 -36

test_plistlib.py Lib/test/test_plistlib.py +55 -0

2017-11-18-21-13-52.bpo-32072.nwDV8L.rst ...S.d/next/Library/2017-11-18-21-13-52.bpo-32072.nwDV8L.rst +6 -0

No files found.
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py
@@ -525,6 +525,8 @@ class InvalidFileException (ValueError):
 _BINARY_FORMAT = {1: 'B', 2: 'H', 4: 'L', 8: 'Q'}
+_undefined = object()
 class _BinaryPlistParser:
    """
    Read or write a binary plist file, following the description of the binary
@@ -555,7 +557,8 @@ class _BinaryPlistParser:
            ) = struct.unpack('>6xBBQQQ', trailer)
            self._fp.seek(offset_table_offset)
            self._object_offsets = self._read_ints(num_objects, offset_size)
-            return self._read_object(self._object_offsets[top_object])
+            self._objects = [_undefined] * num_objects
+            return self._read_object(top_object)
        except (OSError, IndexError, struct.error, OverflowError,
                UnicodeDecodeError):
@@ -584,62 +587,68 @@ class _BinaryPlistParser:
    def _read_refs(self, n):
        return self._read_ints(n, self._ref_size)
-    def _read_object(self, offset):
+    def _read_object(self, ref):
        """
-        read the object at offset.
+        read the object by reference.
        May recursively read sub-objects (content of an array/dict/set)
        """
+        result = self._objects[ref]
+        if result is not _undefined:
+            return result
+        offset = self._object_offsets[ref]
        self._fp.seek(offset)
        token = self._fp.read(1)[0]
        tokenH, tokenL = token & 0xF0, token & 0x0F
        if token == 0x00:
-            return None
+            result = None
        elif token == 0x08:
-            return False
+            result = False
        elif token == 0x09:
-            return True
+            result = True
        # The referenced source code also mentions URL (0x0c, 0x0d) and
        # UUID (0x0e), but neither can be generated using the Cocoa libraries.
        elif token == 0x0f:
-            return b''
+            result = b''
        elif tokenH == 0x10:  # int
-            return int.from_bytes(self._fp.read(1 << tokenL),
+            result = int.from_bytes(self._fp.read(1 << tokenL),
-                                  'big', signed=tokenL >= 3)
+                                    'big', signed=tokenL >= 3)
        elif token == 0x22: # real
-            return struct.unpack('>f', self._fp.read(4))[0]
+            result = struct.unpack('>f', self._fp.read(4))[0]
        elif token == 0x23: # real
-            return struct.unpack('>d', self._fp.read(8))[0]
+            result = struct.unpack('>d', self._fp.read(8))[0]
        elif token == 0x33:  # date
            f = struct.unpack('>d', self._fp.read(8))[0]
            # timestamp 0 of binary plists corresponds to 1/1/2001
            # (year of Mac OS X 10.0), instead of 1/1/1970.
-            return datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=f)
+            result = (datetime.datetime(2001, 1, 1) +
+                      datetime.timedelta(seconds=f))
        elif tokenH == 0x40:  # data
            s = self._get_size(tokenL)
            if self._use_builtin_types:
-                return self._fp.read(s)
+                result = self._fp.read(s)
            else:
-                return Data(self._fp.read(s))
+                result = Data(self._fp.read(s))
        elif tokenH == 0x50:  # ascii string
            s = self._get_size(tokenL)
            result =  self._fp.read(s).decode('ascii')
-            return result
+            result = result
        elif tokenH == 0x60:  # unicode string
            s = self._get_size(tokenL)
-            return self._fp.read(s * 2).decode('utf-16be')
+            result = self._fp.read(s * 2).decode('utf-16be')
        # tokenH == 0x80 is documented as 'UID' and appears to be used for
        # keyed-archiving, not in plists.
@@ -647,8 +656,9 @@ class _BinaryPlistParser:
        elif tokenH == 0xA0:  # array
            s = self._get_size(tokenL)
            obj_refs = self._read_refs(s)
-            return [self._read_object(self._object_offsets[x])
+            result = []
-                for x in obj_refs]
+            self._objects[ref] = result
+            result.extend(self._read_object(x) for x in obj_refs)
        # tokenH == 0xB0 is documented as 'ordset', but is not actually
        # implemented in the Apple reference code.
@@ -661,12 +671,15 @@ class _BinaryPlistParser:
            key_refs = self._read_refs(s)
            obj_refs = self._read_refs(s)
            result = self._dict_type()
+            self._objects[ref] = result
            for k, o in zip(key_refs, obj_refs):
-                result[self._read_object(self._object_offsets[k])
+                result[self._read_object(k)] = self._read_object(o)
-                    ] = self._read_object(self._object_offsets[o])
-            return result
-        raise InvalidFileException()
+        else:
+            raise InvalidFileException()
+        self._objects[ref] = result
+        return result
 def _count_to_size(count):
    if count < 1 << 8:
@@ -681,6 +694,8 @@ def _count_to_size(count):
    else:
        return 8
+_scalars = (str, int, float, datetime.datetime, bytes)
 class _BinaryPlistWriter (object):
    def __init__(self, fp, sort_keys, skipkeys):
        self._fp = fp
@@ -736,8 +751,7 @@ class _BinaryPlistWriter (object):
        # First check if the object is in the object table, not used for
        # containers to ensure that two subcontainers with the same contents
        # will be serialized as distinct values.
-        if isinstance(value, (
+        if isinstance(value, _scalars):
-                str, int, float, datetime.datetime, bytes, bytearray)):
            if (type(value), value) in self._objtable:
                return
@@ -745,15 +759,17 @@ class _BinaryPlistWriter (object):
            if (type(value.data), value.data) in self._objtable:
                return
+        elif id(value) in self._objidtable:
+            return
        # Add to objectreference map
        refnum = len(self._objlist)
        self._objlist.append(value)
-        try:
+        if isinstance(value, _scalars):
-            if isinstance(value, Data):
+            self._objtable[(type(value), value)] = refnum
-                self._objtable[(type(value.data), value.data)] = refnum
+        elif isinstance(value, Data):
-            else:
+            self._objtable[(type(value.data), value.data)] = refnum
-                self._objtable[(type(value), value)] = refnum
+        else:
-        except TypeError:
            self._objidtable[id(value)] = refnum
        # And finally recurse into containers
@@ -780,12 +796,11 @@ class _BinaryPlistWriter (object):
                self._flatten(o)
    def _getrefnum(self, value):
-        try:
+        if isinstance(value, _scalars):
-            if isinstance(value, Data):
+            return self._objtable[(type(value), value)]
-                return self._objtable[(type(value.data), value.data)]
+        elif isinstance(value, Data):
-            else:
+            return self._objtable[(type(value.data), value.data)]
-                return self._objtable[(type(value), value)]
+        else:
-        except TypeError:
            return self._objidtable[id(value)]
    def _write_size(self, token, size):

--- a/Lib/test/test_plistlib.py
+++ b/Lib/test/test_plistlib.py
@@ -169,6 +169,17 @@ class TestPlistlib(unittest.TestCase):
                    self.assertRaises(OverflowError, plistlib.dumps,
                                      pl, fmt=fmt)
+    def test_bytearray(self):
+        for pl in (b'<binary gunk>', b"<lots of binary gunk>\0\1\2\3" * 10):
+            for fmt in ALL_FORMATS:
+                with self.subTest(pl=pl, fmt=fmt):
+                    data = plistlib.dumps(bytearray(pl), fmt=fmt)
+                    pl2 = plistlib.loads(data)
+                    self.assertIsInstance(pl2, bytes)
+                    self.assertEqual(pl2, pl)
+                    data2 = plistlib.dumps(pl2, fmt=fmt)
+                    self.assertEqual(data, data2)
    def test_bytes(self):
        pl = self._create()
        data = plistlib.dumps(pl)
@@ -431,6 +442,9 @@ class TestPlistlib(unittest.TestCase):
                pl2 = plistlib.loads(data)
                self.assertEqual(dict(pl), dict(pl2))
+class TestBinaryPlistlib(unittest.TestCase):
    def test_nonstandard_refs_size(self):
        # Issue #21538: Refs and offsets are 24-bit integers
        data = (b'bplist00'
@@ -443,6 +457,47 @@ class TestPlistlib(unittest.TestCase):
                b'\x00\x00\x00\x00\x00\x00\x00\x13')
        self.assertEqual(plistlib.loads(data), {'a': 'b'})
+    def test_dump_duplicates(self):
+        # Test effectiveness of saving duplicated objects
+        for x in (None, False, True, 12345, 123.45, 'abcde', b'abcde',
+                  datetime.datetime(2004, 10, 26, 10, 33, 33),
+                  plistlib.Data(b'abcde'), bytearray(b'abcde'),
+                  [12, 345], (12, 345), {'12': 345}):
+            with self.subTest(x=x):
+                data = plistlib.dumps([x]*1000, fmt=plistlib.FMT_BINARY)
+                self.assertLess(len(data), 1100, repr(data))
+    def test_identity(self):
+        for x in (None, False, True, 12345, 123.45, 'abcde', b'abcde',
+                  datetime.datetime(2004, 10, 26, 10, 33, 33),
+                  plistlib.Data(b'abcde'), bytearray(b'abcde'),
+                  [12, 345], (12, 345), {'12': 345}):
+            with self.subTest(x=x):
+                data = plistlib.dumps([x]*2, fmt=plistlib.FMT_BINARY)
+                a, b = plistlib.loads(data)
+                if isinstance(x, tuple):
+                    x = list(x)
+                self.assertEqual(a, x)
+                self.assertEqual(b, x)
+                self.assertIs(a, b)
+    def test_cycles(self):
+        # recursive list
+        a = []
+        a.append(a)
+        b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
+        self.assertIs(b[0], b)
+        # recursive tuple
+        a = ([],)
+        a[0].append(a)
+        b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
+        self.assertIs(b[0][0], b)
+        # recursive dict
+        a = {}
+        a['x'] = a
+        b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
+        self.assertIs(b['x'], b)
    def test_large_timestamp(self):
        # Issue #26709: 32-bit timestamp out of range
        for ts in -2**31-1, 2**31:

--- a/Misc/NEWS.d/next/Library/2017-11-18-21-13-52.bpo-32072.nwDV8L.rst
+++ b/Misc/NEWS.d/next/Library/2017-11-18-21-13-52.bpo-32072.nwDV8L.rst
+Fixed issues with binary plists:
+* Fixed saving bytearrays.
+* Identical objects will be saved only once.
+* Equal references will be load as identical objects.
+* Added support for saving and loading recursive data structures.