This patch by Martin v. Loewis changes the UTF-16 codec to write a BOM (byte order mark) only at the start of the stream, and likewise to interpret it as a BOM only when it is read at the start of a stream. Any BOM read or written after that point is treated as an ordinary ZWNBSP (ZERO WIDTH NO-BREAK SPACE, U+FEFF) character. This is in sync with the Unicode specifications. Note that UTF-16 files will now *have* to start with a BOM in order to be readable by the codec.

This patch by Martin v. Loewis changes the UTF-16 codec to write a BOM (byte order mark) only at the start of the stream, and likewise to interpret it as a BOM only when it is read at the start of a stream. Any BOM read or written after that point is treated as an ordinary ZWNBSP (ZERO WIDTH NO-BREAK SPACE, U+FEFF) character. This is in sync with the Unicode specifications. Note that UTF-16 files will now *have* to start with a BOM in order to be readable by the codec.
92b550cd · Marc-André Lemburg · 8c78d3a5 · 92b550cd
Kaydet (Commit) 92b550cd authored Haz 19, 2001 tarafından Marc-André Lemburg
Hide whitespace changes
Inline Side-by-side

Showing with 33 additions and 3 deletions

utf_16.py Lib/encodings/utf_16.py +33 -3

No files found.
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -6,7 +6,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

 """
-import codecs
+import codecs, sys

 ### Codec APIs

@@ -18,10 +18,40 @@ class Codec(codecs.Codec):
    decode = codecs.utf_16_decode

 class StreamWriter(Codec,codecs.StreamWriter):
-    pass
+    def __init__(self, stream, errors='strict'):
+        self.bom_written = 0
+        codecs.StreamWriter.__init__(self, stream, errors)
+
+    def write(self, data):
+        result = codecs.StreamWriter.write(self, data)
+        if not self.bom_written:
+            self.bom_written = 1
+            if sys.byteorder == 'little':
+                self.encode = codecs.utf_16_le_encode
+            else:
+                self.encode = codecs.utf_16_be_encode
+        return result
        
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def __init__(self, stream, errors='strict'):
+        self.bom_read = 0
+        codecs.StreamReader.__init__(self, stream, errors)
+
+    def read(self, size=-1):
+        if not self.bom_read:
+            signature = self.stream.read(2)
+            if signature == codecs.BOM_BE:
+                self.decode = codecs.utf_16_be_decode
+            elif signature == codecs.BOM_LE:
+                self.decode = codecs.utf_16_le_decode
+            else:
+                raise UnicodeError,"UTF-16 stream does not start with BOM"
+            if size > 2:
+                size -= 2
+            elif size >= 0:
+                size = 0
+            self.bom_read = 1
+        return codecs.StreamReader.read(self, size)

 ### encodings module API