Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/libyaml/src/reader.c @ 898

Last change on this file since 898 was 898, checked in by hartmans, 16 years ago
Add pyyaml and libyaml packages backported from lenny. There is discussion about how these should go in the repository; these are added in this form in order to make forward progress.
File size: 16.0 KB

Line
1
2	#include "yaml_private.h"
3
4	/*
5	* Declarations.
6	*/
7
8	static int
9	yaml_parser_set_reader_error(yaml_parser_t parser, const char problem,
10	size_t offset, int value);
11
12	static int
13	yaml_parser_update_raw_buffer(yaml_parser_t *parser);
14
15	static int
16	yaml_parser_determine_encoding(yaml_parser_t *parser);
17
18	YAML_DECLARE(int)
19	yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
20
21	/*
22	* Set the reader error and return 0.
23	*/
24
25	static int
26	yaml_parser_set_reader_error(yaml_parser_t parser, const char problem,
27	size_t offset, int value)
28	{
29	parser->error = YAML_READER_ERROR;
30	parser->problem = problem;
31	parser->problem_offset = offset;
32	parser->problem_value = value;
33
34	return 0;
35	}
36
/*
 * Byte order marks, as raw byte strings.  They are compared with memcmp()
 * against the first bytes of the input, so only the listed bytes matter
 * (the implicit string terminator is never compared).
 */

/* UTF-8 signature: three bytes. */
#define BOM_UTF8    "\xef\xbb\xbf"
/* UTF-16 little-endian mark: two bytes. */
#define BOM_UTF16LE "\xff\xfe"
/* UTF-16 big-endian mark: two bytes. */
#define BOM_UTF16BE "\xfe\xff"
44
45	/*
46	* Determine the input stream encoding by checking the BOM symbol. If no BOM is
47	* found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
48	*/
49
50	static int
51	yaml_parser_determine_encoding(yaml_parser_t *parser)
52	{
53	/* Ensure that we had enough bytes in the raw buffer. */
54
55	while (!parser->eof
56	&& parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
57	if (!yaml_parser_update_raw_buffer(parser)) {
58	return 0;
59	}
60	}
61
62	/* Determine the encoding. */
63
64	if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
65	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
66	parser->encoding = YAML_UTF16LE_ENCODING;
67	parser->raw_buffer.pointer += 2;
68	parser->offset += 2;
69	}
70	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
71	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
72	parser->encoding = YAML_UTF16BE_ENCODING;
73	parser->raw_buffer.pointer += 2;
74	parser->offset += 2;
75	}
76	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
77	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
78	parser->encoding = YAML_UTF8_ENCODING;
79	parser->raw_buffer.pointer += 3;
80	parser->offset += 3;
81	}
82	else {
83	parser->encoding = YAML_UTF8_ENCODING;
84	}
85
86	return 1;
87	}
88
89	/*
90	* Update the raw buffer.
91	*/
92
93	static int
94	yaml_parser_update_raw_buffer(yaml_parser_t *parser)
95	{
96	size_t size_read = 0;
97
98	/* Return if the raw buffer is full. */
99
100	if (parser->raw_buffer.start == parser->raw_buffer.pointer
101	&& parser->raw_buffer.last == parser->raw_buffer.end)
102	return 1;
103
104	/* Return on EOF. */
105
106	if (parser->eof) return 1;
107
108	/* Move the remaining bytes in the raw buffer to the beginning. */
109
110	if (parser->raw_buffer.start < parser->raw_buffer.pointer
111	&& parser->raw_buffer.pointer < parser->raw_buffer.last) {
112	memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
113	parser->raw_buffer.last - parser->raw_buffer.pointer);
114	}
115	parser->raw_buffer.last -=
116	parser->raw_buffer.pointer - parser->raw_buffer.start;
117	parser->raw_buffer.pointer = parser->raw_buffer.start;
118
119	/* Call the read handler to fill the buffer. */
120
121	if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
122	parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
123	return yaml_parser_set_reader_error(parser, "Input error",
124	parser->offset, -1);
125	}
126	parser->raw_buffer.last += size_read;
127	if (!size_read) {
128	parser->eof = 1;
129	}
130
131	return 1;
132	}
133
134	/*
135	* Ensure that the buffer contains at least `length` characters.
136	* Return 1 on success, 0 on failure.
137	*
138	* The length is supposed to be significantly less that the buffer size.
139	*/
140
141	YAML_DECLARE(int)
142	yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
143	{
144	assert(parser->read_handler); /* Read handler must be set. */
145
146	/* If the EOF flag is set and the raw buffer is empty, do nothing. */
147
148	if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
149	return 1;
150
151	/* Return if the buffer contains enough characters. */
152
153	if (parser->unread >= length)
154	return 1;
155
156	/* Determine the input encoding if it is not known yet. */
157
158	if (!parser->encoding) {
159	if (!yaml_parser_determine_encoding(parser))
160	return 0;
161	}
162
163	/* Move the unread characters to the beginning of the buffer. */
164
165	if (parser->buffer.start < parser->buffer.pointer
166	&& parser->buffer.pointer < parser->buffer.last) {
167	size_t size = parser->buffer.last - parser->buffer.pointer;
168	memmove(parser->buffer.start, parser->buffer.pointer, size);
169	parser->buffer.pointer = parser->buffer.start;
170	parser->buffer.last = parser->buffer.start + size;
171	}
172	else if (parser->buffer.pointer == parser->buffer.last) {
173	parser->buffer.pointer = parser->buffer.start;
174	parser->buffer.last = parser->buffer.start;
175	}
176
177	/* Fill the buffer until it has enough characters. */
178
179	while (parser->unread < length)
180	{
181	/* Fill the raw buffer. */
182
183	if (!yaml_parser_update_raw_buffer(parser)) return 0;
184
185	/* Decode the raw buffer. */
186
187	while (parser->raw_buffer.pointer != parser->raw_buffer.last)
188	{
189	unsigned int value = 0, value2 = 0;
190	int incomplete = 0;
191	unsigned char octet;
192	unsigned int width = 0;
193	int low, high;
194	size_t k;
195	size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
196
197	/* Decode the next character. */
198
199	switch (parser->encoding)
200	{
201	case YAML_UTF8_ENCODING:
202
203	/*
204	* Decode a UTF-8 character. Check RFC 3629
205	* (http://www.ietf.org/rfc/rfc3629.txt) for more details.
206	*
207	* The following table (taken from the RFC) is used for
208	* decoding.
209	*
210	* Char. number range \| UTF-8 octet sequence
211	* (hexadecimal) \| (binary)
212	* --------------------+------------------------------------
213	* 0000 0000-0000 007F \| 0xxxxxxx
214	* 0000 0080-0000 07FF \| 110xxxxx 10xxxxxx
215	* 0000 0800-0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
216	* 0001 0000-0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
217	*
218	* Additionally, the characters in the range 0xD800-0xDFFF
219	* are prohibited as they are reserved for use with UTF-16
220	* surrogate pairs.
221	*/
222
223	/* Determine the length of the UTF-8 sequence. */
224
225	octet = parser->raw_buffer.pointer[0];
226	width = (octet & 0x80) == 0x00 ? 1 :
227	(octet & 0xE0) == 0xC0 ? 2 :
228	(octet & 0xF0) == 0xE0 ? 3 :
229	(octet & 0xF8) == 0xF0 ? 4 : 0;
230
231	/* Check if the leading octet is valid. */
232
233	if (!width)
234	return yaml_parser_set_reader_error(parser,
235	"Invalid leading UTF-8 octet",
236	parser->offset, octet);
237
238	/* Check if the raw buffer contains an incomplete character. */
239
240	if (width > raw_unread) {
241	if (parser->eof) {
242	return yaml_parser_set_reader_error(parser,
243	"Incomplete UTF-8 octet sequence",
244	parser->offset, -1);
245	}
246	incomplete = 1;
247	break;
248	}
249
250	/* Decode the leading octet. */
251
252	value = (octet & 0x80) == 0x00 ? octet & 0x7F :
253	(octet & 0xE0) == 0xC0 ? octet & 0x1F :
254	(octet & 0xF0) == 0xE0 ? octet & 0x0F :
255	(octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
256
257	/* Check and decode the trailing octets. */
258
259	for (k = 1; k < width; k ++)
260	{
261	octet = parser->raw_buffer.pointer[k];
262
263	/* Check if the octet is valid. */
264
265	if ((octet & 0xC0) != 0x80)
266	return yaml_parser_set_reader_error(parser,
267	"Invalid trailing UTF-8 octet",
268	parser->offset+k, octet);
269
270	/* Decode the octet. */
271
272	value = (value << 6) + (octet & 0x3F);
273	}
274
275	/* Check the length of the sequence against the value. */
276
277	if (!((width == 1) \|\|
278	(width == 2 && value >= 0x80) \|\|
279	(width == 3 && value >= 0x800) \|\|
280	(width == 4 && value >= 0x10000)))
281	return yaml_parser_set_reader_error(parser,
282	"Invalid length of a UTF-8 sequence",
283	parser->offset, -1);
284
285	/* Check the range of the value. */
286
287	if ((value >= 0xD800 && value <= 0xDFFF) \|\| value > 0x10FFFF)
288	return yaml_parser_set_reader_error(parser,
289	"Invalid Unicode character",
290	parser->offset, value);
291
292	break;
293
294	case YAML_UTF16LE_ENCODING:
295	case YAML_UTF16BE_ENCODING:
296
297	low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
298	high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
299
300	/*
301	* The UTF-16 encoding is not as simple as one might
302	* naively think. Check RFC 2781
303	* (http://www.ietf.org/rfc/rfc2781.txt).
304	*
305	* Normally, two subsequent bytes describe a Unicode
306	* character. However a special technique (called a
307	* surrogate pair) is used for specifying character
308	* values larger than 0xFFFF.
309	*
310	* A surrogate pair consists of two pseudo-characters:
311	* high surrogate area (0xD800-0xDBFF)
312	* low surrogate area (0xDC00-0xDFFF)
313	*
314	* The following formulas are used for decoding
315	* and encoding characters using surrogate pairs:
316	*
317	* U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
318	* U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
319	* W1 = 110110yyyyyyyyyy
320	* W2 = 110111xxxxxxxxxx
321	*
322	* where U is the character value, W1 is the high surrogate
323	* area, W2 is the low surrogate area.
324	*/
325
326	/* Check for incomplete UTF-16 character. */
327
328	if (raw_unread < 2) {
329	if (parser->eof) {
330	return yaml_parser_set_reader_error(parser,
331	"Incomplete UTF-16 character",
332	parser->offset, -1);
333	}
334	incomplete = 1;
335	break;
336	}
337
338	/* Get the character. */
339
340	value = parser->raw_buffer.pointer[low]
341	+ (parser->raw_buffer.pointer[high] << 8);
342
343	/* Check for unexpected low surrogate area. */
344
345	if ((value & 0xFC00) == 0xDC00)
346	return yaml_parser_set_reader_error(parser,
347	"Unexpected low surrogate area",
348	parser->offset, value);
349
350	/* Check for a high surrogate area. */
351
352	if ((value & 0xFC00) == 0xD800) {
353
354	width = 4;
355
356	/* Check for incomplete surrogate pair. */
357
358	if (raw_unread < 4) {
359	if (parser->eof) {
360	return yaml_parser_set_reader_error(parser,
361	"Incomplete UTF-16 surrogate pair",
362	parser->offset, -1);
363	}
364	incomplete = 1;
365	break;
366	}
367
368	/* Get the next character. */
369
370	value2 = parser->raw_buffer.pointer[low+2]
371	+ (parser->raw_buffer.pointer[high+2] << 8);
372
373	/* Check for a low surrogate area. */
374
375	if ((value2 & 0xFC00) != 0xDC00)
376	return yaml_parser_set_reader_error(parser,
377	"Expected low surrogate area",
378	parser->offset+2, value2);
379
380	/* Generate the value of the surrogate pair. */
381
382	value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
383	}
384
385	else {
386	width = 2;
387	}
388
389	break;
390
391	default:
392	assert(1); /* Impossible. */
393	}
394
395	/* Check if the raw buffer contains enough bytes to form a character. */
396
397	if (incomplete) break;
398
399	/*
400	* Check if the character is in the allowed range:
401	* #x9 \| #xA \| #xD \| [#x20-#x7E] (8 bit)
402	* \| #x85 \| [#xA0-#xD7FF] \| [#xE000-#xFFFD] (16 bit)
403	* \| [#x10000-#x10FFFF] (32 bit)
404	*/
405
406	if (! (value == 0x09 \|\| value == 0x0A \|\| value == 0x0D
407	\|\| (value >= 0x20 && value <= 0x7E)
408	\|\| (value == 0x85) \|\| (value >= 0xA0 && value <= 0xD7FF)
409	\|\| (value >= 0xE000 && value <= 0xFFFD)
410	\|\| (value >= 0x10000 && value <= 0x10FFFF)))
411	return yaml_parser_set_reader_error(parser,
412	"Control characters are not allowed",
413	parser->offset, value);
414
415	/* Move the raw pointers. */
416
417	parser->raw_buffer.pointer += width;
418	parser->offset += width;
419
420	/* Finally put the character into the buffer. */
421
422	/* 0000 0000-0000 007F -> 0xxxxxxx */
423	if (value <= 0x7F) {
424	*(parser->buffer.last++) = value;
425	}
426	/* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
427	else if (value <= 0x7FF) {
428	*(parser->buffer.last++) = 0xC0 + (value >> 6);
429	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
430	}
431	/* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
432	else if (value <= 0xFFFF) {
433	*(parser->buffer.last++) = 0xE0 + (value >> 12);
434	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
435	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
436	}
437	/* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
438	else {
439	*(parser->buffer.last++) = 0xF0 + (value >> 18);
440	*(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
441	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
442	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
443	}
444
445	parser->unread ++;
446	}
447
448	/* On EOF, put NUL into the buffer and return. */
449
450	if (parser->eof) {
451	*(parser->buffer.last++) = '\0';
452	parser->unread ++;
453	return 1;
454	}
455
456	}
457
458	return 1;
459	}
460

Note: See TracBrowser for help on using the repository browser.

Download in other formats: