Kaydet (Commit) 92b550cd authored tarafından Marc-André Lemburg's avatar Marc-André Lemburg

This patch by Martin v. Loewis changes the UTF-16 codec to write a BOM
only at the start of the stream, and likewise to interpret a BOM as a
byte order mark only when it is read at the start of the stream.

Any BOM read or written after the start of the stream is treated as a
ZWNBSP (zero-width no-break space) character. This is in sync with the
Unicode specifications.

Note that UTF-16 files will now *have* to start with a BOM
in order to be readable by the codec.
üst 8c78d3a5
......@@ -6,7 +6,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""
import codecs
import codecs, sys
### Codec APIs
......@@ -18,10 +18,40 @@ class Codec(codecs.Codec):
decode = codecs.utf_16_decode
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer for the UTF-16 codec.

    The first call to write() goes through whatever ``encode`` the
    instance starts out with (inherited; presumably the BOM-emitting
    UTF-16 encoder -- confirm against the Codec class).  After that
    first call the instance's ``encode`` is replaced by the
    byte-order-specific encoder matching ``sys.byteorder``, so that a
    BOM is only written at the start of the stream.
    """

    def __init__(self, stream, errors='strict'):
        """Set up the writer on *stream* with the given *errors* mode."""
        # Flag: 0 until the first write() has completed, 1 afterwards.
        self.bom_written = 0
        codecs.StreamWriter.__init__(self, stream, errors)

    def write(self, data):
        """Encode *data*, write it to the stream and return the result
        of the base class write().
        """
        outcome = codecs.StreamWriter.write(self, data)
        if self.bom_written:
            # Encoder was already switched by an earlier call.
            return outcome

        # First write is done: pick the encoder for the platform's
        # native byte order and use it for all subsequent writes.
        if sys.byteorder == 'little':
            native_encode = codecs.utf_16_le_encode
        else:
            native_encode = codecs.utf_16_be_encode
        self.encode = native_encode
        self.bom_written = 1
        return outcome
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader for the UTF-16 codec.

    The first call to read() consumes a two-byte signature from the
    underlying stream and uses it to select the big-endian or
    little-endian decoder for the rest of the stream.  A stream that
    does not start with one of the two BOMs is rejected.
    """

    def __init__(self, stream, errors='strict'):
        """Set up the reader on *stream* with the given *errors* mode."""
        # Flag: 0 until the signature has been read and checked,
        # 1 afterwards.
        self.bom_read = 0
        codecs.StreamReader.__init__(self, stream, errors)

    def read(self, size=-1):
        """Read from the stream and return the base class read() result.

        On the first call the leading signature is read and checked
        first; *size*, when non-negative, is reduced by the two units
        already taken from the stream (never below zero).  A negative
        *size* is passed through unchanged.

        Raises UnicodeError if the stream does not begin with
        codecs.BOM_BE or codecs.BOM_LE.
        """
        if not self.bom_read:
            signature = self.stream.read(2)
            if signature == codecs.BOM_BE:
                self.decode = codecs.utf_16_be_decode
            elif signature == codecs.BOM_LE:
                self.decode = codecs.utf_16_le_decode
            else:
                # Call form of raise: same behaviour as the old
                # "raise Class, message" statement, but also valid
                # syntax on Python 3.
                raise UnicodeError("UTF-16 stream does not start with BOM")
            # Account for the signature that was just consumed.
            if size > 2:
                size -= 2
            elif size >= 0:
                size = 0
            self.bom_read = 1
        return codecs.StreamReader.read(self, size)
### encodings module API
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment