source: trunk/packages/pyyaml/lib/yaml/reader.py @ 1046

Last change on this file since 1046 was 898, checked in by hartmans, 16 years ago

Add pyyaml and libyaml packages
backported from lenny.
There is discussion about how these should go in the repository; these are added in this form
in order to make forward progress.

File size: 7.9 KB
Line 
# This module contains abstractions for the input stream. You don't have to
# look further; there is no pretty code here.
#
# We define two classes here.
#
#   Mark(source, line, column)
# It's just a record and its only use is producing nice error messages.
# Parser does not use it for any other purposes.
#
#   Reader(stream)
# Reader determines the encoding of the input and converts it to unicode.
# Reader provides the following methods and attributes:
#   reader.peek(index=0) - return the character `index` positions ahead.
#   reader.prefix(length=1) - return the next `length` characters.
#   reader.forward(length=1) - move the current position by `length` characters.
#   reader.index - the number of the current character.
#   reader.line, reader.column - the line and the column of the current character.
17
18__all__ = ['Reader', 'ReaderError']
19
20from error import YAMLError, Mark
21
22import codecs, re
23
# Unfortunately, the codec functions in Python 2.3 do not support the `finish`
# argument, so we have to write our own wrappers.
26
try:
    # Probe whether the built-in codec functions accept the third (`finish`)
    # argument.  If they do, use them directly.
    codecs.utf_8_decode('', 'strict', False)
    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode

except TypeError:

    # Fallback wrappers.  Each takes (data, errors, finish=False) and returns
    # the (decoded_text, consumed_length) pair produced by the matching
    # `codecs` function.  When `finish` is false, a trailing incomplete code
    # unit is cut off before decoding, so it is not reported as consumed and
    # the caller can retry once more input has arrived.

    def utf_16_le_decode(data, errors, finish=False):
        # UTF-16 code units are two bytes wide; hold back a dangling odd byte.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_le_decode(data, errors)

    def utf_16_be_decode(data, errors, finish=False):
        # UTF-16 code units are two bytes wide; hold back a dangling odd byte.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_be_decode(data, errors)

    def utf_8_decode(data, errors, finish=False):
        if not finish:
            # We are trying to remove a possible incomplete multibyte character
            # from the suffix of the data.
            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
            # All further bytes are in the range 0x80 to 0xbf.
            # UTF-8 encoded UCS characters may be up to six bytes long.
            size = len(data)
            # Count the continuation bytes at the very end of the data,
            # walking backwards.  One-element slices are passed to ord() so
            # the check does not depend on what plain indexing returns.
            count = 0
            while count < 5 and count < size:
                byte = ord(data[size-count-1:size-count])
                if not 0x80 <= byte <= 0xBF:
                    break
                count += 1
            if count < 5 and count < size:
                lead = ord(data[size-count-1:size-count])
                if 0xC0 <= lead <= 0xFD:
                    # Number of continuation bytes the lead byte announces.
                    if lead < 0xE0:
                        needed = 1
                    elif lead < 0xF0:
                        needed = 2
                    elif lead < 0xF8:
                        needed = 3
                    elif lead < 0xFC:
                        needed = 4
                    else:
                        needed = 5
                    # Only hold the tail back when the sequence is really
                    # incomplete; a complete character is decoded right away.
                    if count < needed:
                        data = data[:size-count-1]
        return codecs.utf_8_decode(data, errors)
58
class ReaderError(YAMLError):
    # Raised by Reader when the input cannot be decoded or contains a
    # character that is not allowed.
    #
    #   name      - name of the input source, shown in the message
    #   position  - offset of the offending byte or character
    #   character - the offending item: a `str` byte for decoding failures,
    #               otherwise the rejected character
    #   encoding  - name of the codec that was in use
    #   reason    - short explanation of the failure

    def __init__(self, name, position, character, encoding, reason):
        self.name = name
        self.character = character
        self.position = position
        self.encoding = encoding
        self.reason = reason

    def __str__(self):
        # The first line depends on the kind of failure; the second line,
        # which names the source and the position, is common to both.
        if isinstance(self.character, str):
            headline = "'%s' codec can't decode byte #x%02x: %s\n"  \
                    % (self.encoding, ord(self.character), self.reason)
        else:
            headline = "unacceptable character #x%04x: %s\n"    \
                    % (ord(self.character), self.reason)
        location = "  in \"%s\", position %d" % (self.name, self.position)
        return headline + location
79
80class Reader(object):
81    # Reader:
82    # - determines the data encoding and converts it to unicode,
83    # - checks if characters are in allowed range,
84    # - adds '\0' to the end.
85
86    # Reader accepts
87    #  - a `str` object,
88    #  - a `unicode` object,
89    #  - a file-like object with its `read` method returning `str`,
90    #  - a file-like object with its `read` method returning `unicode`.
91
92    # Yeah, it's ugly and slow.
93
94    def __init__(self, stream):
95        self.name = None
96        self.stream = None
97        self.stream_pointer = 0
98        self.eof = True
99        self.buffer = u''
100        self.pointer = 0
101        self.raw_buffer = None
102        self.raw_decode = None
103        self.encoding = None
104        self.index = 0
105        self.line = 0
106        self.column = 0
107        if isinstance(stream, unicode):
108            self.name = "<unicode string>"
109            self.check_printable(stream)
110            self.buffer = stream+u'\0'
111        elif isinstance(stream, str):
112            self.name = "<string>"
113            self.raw_buffer = stream
114            self.determine_encoding()
115        else:
116            self.stream = stream
117            self.name = getattr(stream, 'name', "<file>")
118            self.eof = False
119            self.raw_buffer = ''
120            self.determine_encoding()
121
122    def peek(self, index=0):
123        try:
124            return self.buffer[self.pointer+index]
125        except IndexError:
126            self.update(index+1)
127            return self.buffer[self.pointer+index]
128
129    def prefix(self, length=1):
130        if self.pointer+length >= len(self.buffer):
131            self.update(length)
132        return self.buffer[self.pointer:self.pointer+length]
133
134    def forward(self, length=1):
135        if self.pointer+length+1 >= len(self.buffer):
136            self.update(length+1)
137        while length:
138            ch = self.buffer[self.pointer]
139            self.pointer += 1
140            self.index += 1
141            if ch in u'\n\x85\u2028\u2029'  \
142                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
143                self.line += 1
144                self.column = 0
145            elif ch != u'\uFEFF':
146                self.column += 1
147            length -= 1
148
149    def get_mark(self):
150        if self.stream is None:
151            return Mark(self.name, self.index, self.line, self.column,
152                    self.buffer, self.pointer)
153        else:
154            return Mark(self.name, self.index, self.line, self.column,
155                    None, None)
156
157    def determine_encoding(self):
158        while not self.eof and len(self.raw_buffer) < 2:
159            self.update_raw()
160        if not isinstance(self.raw_buffer, unicode):
161            if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
162                self.raw_decode = utf_16_le_decode
163                self.encoding = 'utf-16-le'
164            elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
165                self.raw_decode = utf_16_be_decode
166                self.encoding = 'utf-16-be'
167            else:
168                self.raw_decode = utf_8_decode
169                self.encoding = 'utf-8'
170        self.update(1)
171
172    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
173    def check_printable(self, data):
174        match = self.NON_PRINTABLE.search(data)
175        if match:
176            character = match.group()
177            position = self.index+(len(self.buffer)-self.pointer)+match.start()
178            raise ReaderError(self.name, position, character,
179                    'unicode', "special characters are not allowed")
180
181    def update(self, length):
182        if self.raw_buffer is None:
183            return
184        self.buffer = self.buffer[self.pointer:]
185        self.pointer = 0
186        while len(self.buffer) < length:
187            if not self.eof:
188                self.update_raw()
189            if self.raw_decode is not None:
190                try:
191                    data, converted = self.raw_decode(self.raw_buffer,
192                            'strict', self.eof)
193                except UnicodeDecodeError, exc:
194                    character = exc.object[exc.start]
195                    if self.stream is not None:
196                        position = self.stream_pointer-len(self.raw_buffer)+exc.start
197                    else:
198                        position = exc.start
199                    raise ReaderError(self.name, position, character,
200                            exc.encoding, exc.reason)
201            else:
202                data = self.raw_buffer
203                converted = len(data)
204            self.check_printable(data)
205            self.buffer += data
206            self.raw_buffer = self.raw_buffer[converted:]
207            if self.eof:
208                self.buffer += u'\0'
209                self.raw_buffer = None
210                break
211
212    def update_raw(self, size=1024):
213        data = self.stream.read(size)
214        if data:
215            self.raw_buffer += data
216            self.stream_pointer += len(data)
217        else:
218            self.eof = True
219
220#try:
221#    import psyco
222#    psyco.bind(Reader)
223#except ImportError:
224#    pass
225
Note: See TracBrowser for help on using the repository browser.