1 | # This module contains abstractions for the input stream. You don't have to |
---|
2 | # look any further; there is no pretty code here. |
---|
3 | # |
---|
4 | # We define two classes here. |
---|
5 | # |
---|
6 | # Mark(source, line, column) |
---|
7 | # It's just a record and its only use is producing nice error messages. |
---|
8 | # Parser does not use it for any other purposes. |
---|
9 | # |
---|
10 | # Reader(stream) |
---|
11 | # Reader determines the encoding of `stream` and converts it to unicode. |
---|
12 | # Reader provides the following methods and attributes: |
---|
13 | # reader.peek(index=0) - return the character `index` positions after the current one. |
---|
14 | # reader.forward(length=1) - advance the current position by `length` characters. |
---|
15 | # reader.index - the number of the current character. |
---|
16 | # reader.line, reader.column - the line and the column of the current character. |
---|
17 | |
---|
18 | __all__ = ['Reader', 'ReaderError'] |
---|
19 | |
---|
20 | from error import YAMLError, Mark |
---|
21 | |
---|
22 | import codecs, re |
---|
23 | |
---|
24 | # Unfortunately, the codec functions in Python 2.3 do not support the `finish` |
---|
25 | # argument, so we have to write our own wrappers. |
---|
26 | |
---|
try:
    # Probe for the three-argument form of the codec functions; a TypeError
    # here means the `finish` flag is not accepted and wrappers are needed.
    codecs.utf_8_decode('', 'strict', False)
    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode

except TypeError:

    def utf_16_le_decode(data, errors, finish=False):
        # Decode little-endian UTF-16 `data`; returns what
        # `codecs.utf_16_le_decode` returns.
        # Unless `finish` is true, a dangling odd byte is dropped before
        # decoding: it is only half of a code unit.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_le_decode(data, errors)

    def utf_16_be_decode(data, errors, finish=False):
        # Decode big-endian UTF-16 `data`; returns what
        # `codecs.utf_16_be_decode` returns.
        # Unless `finish` is true, a dangling odd byte is dropped before
        # decoding: it is only half of a code unit.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_be_decode(data, errors)

    def utf_8_decode(data, errors, finish=False):
        # Decode UTF-8 `data`; returns what `codecs.utf_8_decode` returns.
        # Unless `finish` is true, a multi-byte sequence that starts within
        # the last six bytes and runs to the end of `data` is dropped before
        # decoding, because it may be incomplete.
        if not finish:
            # We are trying to remove a possible incomplete multibyte character
            # from the suffix of the data.
            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
            # All further bytes are in the range 0x80 to 0xbf.
            # UTF-8 encoded UCS characters may be up to six bytes long.
            size = len(data)
            count = 0
            # Walk backwards from the end over trailing continuation bytes.
            # `count` must grow: position `size-count-1` is the byte being
            # examined, and at most five continuation bytes can follow a
            # lead byte.  One-element slices plus ord() keep the comparison
            # numeric whether indexing yields a character or an integer.
            while count < 5 and count < size \
                    and 0x80 <= ord(data[size-count-1:size-count]) <= 0xBF:
                count += 1
            # If the byte just before the continuation run is a lead byte,
            # cut the sequence off (lead byte included).
            if count < 5 and count < size \
                    and 0xC0 <= ord(data[size-count-1:size-count]) <= 0xFD:
                data = data[:-count-1]
        return codecs.utf_8_decode(data, errors)
---|
58 | |
---|
class ReaderError(YAMLError):
    # Raised by Reader when the input cannot be decoded or when it contains
    # a character that is not allowed.
    #
    # Attributes:
    #   name      - name of the input the error was found in,
    #   position  - offset of the offending byte or character,
    #   character - the offending item: a `str` byte for decoding failures,
    #               otherwise the rejected character,
    #   encoding  - name of the codec involved,
    #   reason    - short explanation of the failure.

    def __init__(self, name, position, character, encoding, reason):
        self.name, self.position = name, position
        self.character = character
        self.encoding, self.reason = encoding, reason

    def __str__(self):
        # Both messages end with the same "where" part.
        where = (self.name, self.position)
        code = ord(self.character)
        if not isinstance(self.character, str):
            template = "unacceptable character #x%04x: %s\n" \
                    " in \"%s\", position %d"
            return template % ((code, self.reason) + where)
        template = "'%s' codec can't decode byte #x%02x: %s\n" \
                " in \"%s\", position %d"
        return template % ((self.encoding, code, self.reason) + where)
---|
79 | |
---|
80 | class Reader(object): |
---|
81 | # Reader: |
---|
82 | # - determines the data encoding and converts it to unicode, |
---|
83 | # - checks if characters are in allowed range, |
---|
84 | # - adds '\0' to the end. |
---|
85 | |
---|
86 | # Reader accepts |
---|
87 | # - a `str` object, |
---|
88 | # - a `unicode` object, |
---|
89 | # - a file-like object with its `read` method returning `str`, |
---|
90 | # - a file-like object with its `read` method returning `unicode`. |
---|
91 | |
---|
92 | # Yeah, it's ugly and slow. |
---|
93 | |
---|
94 | def __init__(self, stream): |
---|
95 | self.name = None |
---|
96 | self.stream = None |
---|
97 | self.stream_pointer = 0 |
---|
98 | self.eof = True |
---|
99 | self.buffer = u'' |
---|
100 | self.pointer = 0 |
---|
101 | self.raw_buffer = None |
---|
102 | self.raw_decode = None |
---|
103 | self.encoding = None |
---|
104 | self.index = 0 |
---|
105 | self.line = 0 |
---|
106 | self.column = 0 |
---|
107 | if isinstance(stream, unicode): |
---|
108 | self.name = "<unicode string>" |
---|
109 | self.check_printable(stream) |
---|
110 | self.buffer = stream+u'\0' |
---|
111 | elif isinstance(stream, str): |
---|
112 | self.name = "<string>" |
---|
113 | self.raw_buffer = stream |
---|
114 | self.determine_encoding() |
---|
115 | else: |
---|
116 | self.stream = stream |
---|
117 | self.name = getattr(stream, 'name', "<file>") |
---|
118 | self.eof = False |
---|
119 | self.raw_buffer = '' |
---|
120 | self.determine_encoding() |
---|
121 | |
---|
122 | def peek(self, index=0): |
---|
123 | try: |
---|
124 | return self.buffer[self.pointer+index] |
---|
125 | except IndexError: |
---|
126 | self.update(index+1) |
---|
127 | return self.buffer[self.pointer+index] |
---|
128 | |
---|
129 | def prefix(self, length=1): |
---|
130 | if self.pointer+length >= len(self.buffer): |
---|
131 | self.update(length) |
---|
132 | return self.buffer[self.pointer:self.pointer+length] |
---|
133 | |
---|
134 | def forward(self, length=1): |
---|
135 | if self.pointer+length+1 >= len(self.buffer): |
---|
136 | self.update(length+1) |
---|
137 | while length: |
---|
138 | ch = self.buffer[self.pointer] |
---|
139 | self.pointer += 1 |
---|
140 | self.index += 1 |
---|
141 | if ch in u'\n\x85\u2028\u2029' \ |
---|
142 | or (ch == u'\r' and self.buffer[self.pointer] != u'\n'): |
---|
143 | self.line += 1 |
---|
144 | self.column = 0 |
---|
145 | elif ch != u'\uFEFF': |
---|
146 | self.column += 1 |
---|
147 | length -= 1 |
---|
148 | |
---|
149 | def get_mark(self): |
---|
150 | if self.stream is None: |
---|
151 | return Mark(self.name, self.index, self.line, self.column, |
---|
152 | self.buffer, self.pointer) |
---|
153 | else: |
---|
154 | return Mark(self.name, self.index, self.line, self.column, |
---|
155 | None, None) |
---|
156 | |
---|
157 | def determine_encoding(self): |
---|
158 | while not self.eof and len(self.raw_buffer) < 2: |
---|
159 | self.update_raw() |
---|
160 | if not isinstance(self.raw_buffer, unicode): |
---|
161 | if self.raw_buffer.startswith(codecs.BOM_UTF16_LE): |
---|
162 | self.raw_decode = utf_16_le_decode |
---|
163 | self.encoding = 'utf-16-le' |
---|
164 | elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): |
---|
165 | self.raw_decode = utf_16_be_decode |
---|
166 | self.encoding = 'utf-16-be' |
---|
167 | else: |
---|
168 | self.raw_decode = utf_8_decode |
---|
169 | self.encoding = 'utf-8' |
---|
170 | self.update(1) |
---|
171 | |
---|
172 | NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') |
---|
173 | def check_printable(self, data): |
---|
174 | match = self.NON_PRINTABLE.search(data) |
---|
175 | if match: |
---|
176 | character = match.group() |
---|
177 | position = self.index+(len(self.buffer)-self.pointer)+match.start() |
---|
178 | raise ReaderError(self.name, position, character, |
---|
179 | 'unicode', "special characters are not allowed") |
---|
180 | |
---|
181 | def update(self, length): |
---|
182 | if self.raw_buffer is None: |
---|
183 | return |
---|
184 | self.buffer = self.buffer[self.pointer:] |
---|
185 | self.pointer = 0 |
---|
186 | while len(self.buffer) < length: |
---|
187 | if not self.eof: |
---|
188 | self.update_raw() |
---|
189 | if self.raw_decode is not None: |
---|
190 | try: |
---|
191 | data, converted = self.raw_decode(self.raw_buffer, |
---|
192 | 'strict', self.eof) |
---|
193 | except UnicodeDecodeError, exc: |
---|
194 | character = exc.object[exc.start] |
---|
195 | if self.stream is not None: |
---|
196 | position = self.stream_pointer-len(self.raw_buffer)+exc.start |
---|
197 | else: |
---|
198 | position = exc.start |
---|
199 | raise ReaderError(self.name, position, character, |
---|
200 | exc.encoding, exc.reason) |
---|
201 | else: |
---|
202 | data = self.raw_buffer |
---|
203 | converted = len(data) |
---|
204 | self.check_printable(data) |
---|
205 | self.buffer += data |
---|
206 | self.raw_buffer = self.raw_buffer[converted:] |
---|
207 | if self.eof: |
---|
208 | self.buffer += u'\0' |
---|
209 | self.raw_buffer = None |
---|
210 | break |
---|
211 | |
---|
212 | def update_raw(self, size=1024): |
---|
213 | data = self.stream.read(size) |
---|
214 | if data: |
---|
215 | self.raw_buffer += data |
---|
216 | self.stream_pointer += len(data) |
---|
217 | else: |
---|
218 | self.eof = True |
---|
219 | |
---|
220 | #try: |
---|
221 | # import psyco |
---|
222 | # psyco.bind(Reader) |
---|
223 | #except ImportError: |
---|
224 | # pass |
---|
225 | |
---|