Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/libyaml/src/reader.c @ 1550

Last change on this file since 1550 was 898, checked in by hartmans, 18 years ago
Add pyyaml and libyaml packages backported from lenny. There is discussion about how these should go in the repository; these are added in this form in order to make forward progress.
File size: 16.0 KB

Rev	Line
[898]	1
	2	#include "yaml_private.h"
	3
	4	/*
	5	* Declarations.
	6	*/
	7
	8	static int
	9	yaml_parser_set_reader_error(yaml_parser_t parser, const char problem,
	10	size_t offset, int value);
	11
	12	static int
	13	yaml_parser_update_raw_buffer(yaml_parser_t *parser);
	14
	15	static int
	16	yaml_parser_determine_encoding(yaml_parser_t *parser);
	17
	18	YAML_DECLARE(int)
	19	yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
	20
	21	/*
	22	* Set the reader error and return 0.
	23	*/
	24
	25	static int
	26	yaml_parser_set_reader_error(yaml_parser_t parser, const char problem,
	27	size_t offset, int value)
	28	{
	29	parser->error = YAML_READER_ERROR;
	30	parser->problem = problem;
	31	parser->problem_offset = offset;
	32	parser->problem_value = value;
	33
	34	return 0;
	35	}
	36
	37	/*
	38	* Byte order marks.
	39	*/
	40
	41	#define BOM_UTF8 "\xef\xbb\xbf"
	42	#define BOM_UTF16LE "\xff\xfe"
	43	#define BOM_UTF16BE "\xfe\xff"
	44
	45	/*
	46	* Determine the input stream encoding by checking the BOM symbol. If no BOM is
	47	* found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
	48	*/
	49
	50	static int
	51	yaml_parser_determine_encoding(yaml_parser_t *parser)
	52	{
	53	/* Ensure that we had enough bytes in the raw buffer. */
	54
	55	while (!parser->eof
	56	&& parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
	57	if (!yaml_parser_update_raw_buffer(parser)) {
	58	return 0;
	59	}
	60	}
	61
	62	/* Determine the encoding. */
	63
	64	if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
	65	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
	66	parser->encoding = YAML_UTF16LE_ENCODING;
	67	parser->raw_buffer.pointer += 2;
	68	parser->offset += 2;
	69	}
	70	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
	71	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
	72	parser->encoding = YAML_UTF16BE_ENCODING;
	73	parser->raw_buffer.pointer += 2;
	74	parser->offset += 2;
	75	}
	76	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
	77	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
	78	parser->encoding = YAML_UTF8_ENCODING;
	79	parser->raw_buffer.pointer += 3;
	80	parser->offset += 3;
	81	}
	82	else {
	83	parser->encoding = YAML_UTF8_ENCODING;
	84	}
	85
	86	return 1;
	87	}
	88
	89	/*
	90	* Update the raw buffer.
	91	*/
	92
	93	static int
	94	yaml_parser_update_raw_buffer(yaml_parser_t *parser)
	95	{
	96	size_t size_read = 0;
	97
	98	/* Return if the raw buffer is full. */
	99
	100	if (parser->raw_buffer.start == parser->raw_buffer.pointer
	101	&& parser->raw_buffer.last == parser->raw_buffer.end)
	102	return 1;
	103
	104	/* Return on EOF. */
	105
	106	if (parser->eof) return 1;
	107
	108	/* Move the remaining bytes in the raw buffer to the beginning. */
	109
	110	if (parser->raw_buffer.start < parser->raw_buffer.pointer
	111	&& parser->raw_buffer.pointer < parser->raw_buffer.last) {
	112	memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
	113	parser->raw_buffer.last - parser->raw_buffer.pointer);
	114	}
	115	parser->raw_buffer.last -=
	116	parser->raw_buffer.pointer - parser->raw_buffer.start;
	117	parser->raw_buffer.pointer = parser->raw_buffer.start;
	118
	119	/* Call the read handler to fill the buffer. */
	120
	121	if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
	122	parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
	123	return yaml_parser_set_reader_error(parser, "Input error",
	124	parser->offset, -1);
	125	}
	126	parser->raw_buffer.last += size_read;
	127	if (!size_read) {
	128	parser->eof = 1;
	129	}
	130
	131	return 1;
	132	}
	133
	134	/*
	135	* Ensure that the buffer contains at least `length` characters.
	136	* Return 1 on success, 0 on failure.
	137	*
	138	* The length is supposed to be significantly less that the buffer size.
	139	*/
	140
	141	YAML_DECLARE(int)
	142	yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
	143	{
	144	assert(parser->read_handler); /* Read handler must be set. */
	145
	146	/* If the EOF flag is set and the raw buffer is empty, do nothing. */
	147
	148	if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
	149	return 1;
	150
	151	/* Return if the buffer contains enough characters. */
	152
	153	if (parser->unread >= length)
	154	return 1;
	155
	156	/* Determine the input encoding if it is not known yet. */
	157
	158	if (!parser->encoding) {
	159	if (!yaml_parser_determine_encoding(parser))
	160	return 0;
	161	}
	162
	163	/* Move the unread characters to the beginning of the buffer. */
	164
	165	if (parser->buffer.start < parser->buffer.pointer
	166	&& parser->buffer.pointer < parser->buffer.last) {
	167	size_t size = parser->buffer.last - parser->buffer.pointer;
	168	memmove(parser->buffer.start, parser->buffer.pointer, size);
	169	parser->buffer.pointer = parser->buffer.start;
	170	parser->buffer.last = parser->buffer.start + size;
	171	}
	172	else if (parser->buffer.pointer == parser->buffer.last) {
	173	parser->buffer.pointer = parser->buffer.start;
	174	parser->buffer.last = parser->buffer.start;
	175	}
	176
	177	/* Fill the buffer until it has enough characters. */
	178
	179	while (parser->unread < length)
	180	{
	181	/* Fill the raw buffer. */
	182
	183	if (!yaml_parser_update_raw_buffer(parser)) return 0;
	184
	185	/* Decode the raw buffer. */
	186
	187	while (parser->raw_buffer.pointer != parser->raw_buffer.last)
	188	{
	189	unsigned int value = 0, value2 = 0;
	190	int incomplete = 0;
	191	unsigned char octet;
	192	unsigned int width = 0;
	193	int low, high;
	194	size_t k;
	195	size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
	196
	197	/* Decode the next character. */
	198
	199	switch (parser->encoding)
	200	{
	201	case YAML_UTF8_ENCODING:
	202
	203	/*
	204	* Decode a UTF-8 character. Check RFC 3629
	205	* (http://www.ietf.org/rfc/rfc3629.txt) for more details.
	206	*
	207	* The following table (taken from the RFC) is used for
	208	* decoding.
	209	*
	210	* Char. number range \| UTF-8 octet sequence
	211	* (hexadecimal) \| (binary)
	212	* --------------------+------------------------------------
	213	* 0000 0000-0000 007F \| 0xxxxxxx
	214	* 0000 0080-0000 07FF \| 110xxxxx 10xxxxxx
	215	* 0000 0800-0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
	216	* 0001 0000-0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	217	*
	218	* Additionally, the characters in the range 0xD800-0xDFFF
	219	* are prohibited as they are reserved for use with UTF-16
	220	* surrogate pairs.
	221	*/
	222
	223	/* Determine the length of the UTF-8 sequence. */
	224
	225	octet = parser->raw_buffer.pointer[0];
	226	width = (octet & 0x80) == 0x00 ? 1 :
	227	(octet & 0xE0) == 0xC0 ? 2 :
	228	(octet & 0xF0) == 0xE0 ? 3 :
	229	(octet & 0xF8) == 0xF0 ? 4 : 0;
	230
	231	/* Check if the leading octet is valid. */
	232
	233	if (!width)
	234	return yaml_parser_set_reader_error(parser,
	235	"Invalid leading UTF-8 octet",
	236	parser->offset, octet);
	237
	238	/* Check if the raw buffer contains an incomplete character. */
	239
	240	if (width > raw_unread) {
	241	if (parser->eof) {
	242	return yaml_parser_set_reader_error(parser,
	243	"Incomplete UTF-8 octet sequence",
	244	parser->offset, -1);
	245	}
	246	incomplete = 1;
	247	break;
	248	}
	249
	250	/* Decode the leading octet. */
	251
	252	value = (octet & 0x80) == 0x00 ? octet & 0x7F :
	253	(octet & 0xE0) == 0xC0 ? octet & 0x1F :
	254	(octet & 0xF0) == 0xE0 ? octet & 0x0F :
	255	(octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
	256
	257	/* Check and decode the trailing octets. */
	258
	259	for (k = 1; k < width; k ++)
	260	{
	261	octet = parser->raw_buffer.pointer[k];
	262
	263	/* Check if the octet is valid. */
	264
	265	if ((octet & 0xC0) != 0x80)
	266	return yaml_parser_set_reader_error(parser,
	267	"Invalid trailing UTF-8 octet",
	268	parser->offset+k, octet);
	269
	270	/* Decode the octet. */
	271
	272	value = (value << 6) + (octet & 0x3F);
	273	}
	274
	275	/* Check the length of the sequence against the value. */
	276
	277	if (!((width == 1) \|\|
	278	(width == 2 && value >= 0x80) \|\|
	279	(width == 3 && value >= 0x800) \|\|
	280	(width == 4 && value >= 0x10000)))
	281	return yaml_parser_set_reader_error(parser,
	282	"Invalid length of a UTF-8 sequence",
	283	parser->offset, -1);
	284
	285	/* Check the range of the value. */
	286
	287	if ((value >= 0xD800 && value <= 0xDFFF) \|\| value > 0x10FFFF)
	288	return yaml_parser_set_reader_error(parser,
	289	"Invalid Unicode character",
	290	parser->offset, value);
	291
	292	break;
	293
	294	case YAML_UTF16LE_ENCODING:
	295	case YAML_UTF16BE_ENCODING:
	296
	297	low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
	298	high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
	299
	300	/*
	301	* The UTF-16 encoding is not as simple as one might
	302	* naively think. Check RFC 2781
	303	* (http://www.ietf.org/rfc/rfc2781.txt).
	304	*
	305	* Normally, two subsequent bytes describe a Unicode
	306	* character. However a special technique (called a
	307	* surrogate pair) is used for specifying character
	308	* values larger than 0xFFFF.
	309	*
	310	* A surrogate pair consists of two pseudo-characters:
	311	* high surrogate area (0xD800-0xDBFF)
	312	* low surrogate area (0xDC00-0xDFFF)
	313	*
	314	* The following formulas are used for decoding
	315	* and encoding characters using surrogate pairs:
	316	*
	317	* U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
	318	* U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
	319	* W1 = 110110yyyyyyyyyy
	320	* W2 = 110111xxxxxxxxxx
	321	*
	322	* where U is the character value, W1 is the high surrogate
	323	* area, W2 is the low surrogate area.
	324	*/
	325
	326	/* Check for incomplete UTF-16 character. */
	327
	328	if (raw_unread < 2) {
	329	if (parser->eof) {
	330	return yaml_parser_set_reader_error(parser,
	331	"Incomplete UTF-16 character",
	332	parser->offset, -1);
	333	}
	334	incomplete = 1;
	335	break;
	336	}
	337
	338	/* Get the character. */
	339
	340	value = parser->raw_buffer.pointer[low]
	341	+ (parser->raw_buffer.pointer[high] << 8);
	342
	343	/* Check for unexpected low surrogate area. */
	344
	345	if ((value & 0xFC00) == 0xDC00)
	346	return yaml_parser_set_reader_error(parser,
	347	"Unexpected low surrogate area",
	348	parser->offset, value);
	349
	350	/* Check for a high surrogate area. */
	351
	352	if ((value & 0xFC00) == 0xD800) {
	353
	354	width = 4;
	355
	356	/* Check for incomplete surrogate pair. */
	357
	358	if (raw_unread < 4) {
	359	if (parser->eof) {
	360	return yaml_parser_set_reader_error(parser,
	361	"Incomplete UTF-16 surrogate pair",
	362	parser->offset, -1);
	363	}
	364	incomplete = 1;
	365	break;
	366	}
	367
	368	/* Get the next character. */
	369
	370	value2 = parser->raw_buffer.pointer[low+2]
	371	+ (parser->raw_buffer.pointer[high+2] << 8);
	372
	373	/* Check for a low surrogate area. */
	374
	375	if ((value2 & 0xFC00) != 0xDC00)
	376	return yaml_parser_set_reader_error(parser,
	377	"Expected low surrogate area",
	378	parser->offset+2, value2);
	379
	380	/* Generate the value of the surrogate pair. */
	381
	382	value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
	383	}
	384
	385	else {
	386	width = 2;
	387	}
	388
	389	break;
	390
	391	default:
	392	assert(1); /* Impossible. */
	393	}
	394
	395	/* Check if the raw buffer contains enough bytes to form a character. */
	396
	397	if (incomplete) break;
	398
	399	/*
	400	* Check if the character is in the allowed range:
	401	* #x9 \| #xA \| #xD \| [#x20-#x7E] (8 bit)
	402	* \| #x85 \| [#xA0-#xD7FF] \| [#xE000-#xFFFD] (16 bit)
	403	* \| [#x10000-#x10FFFF] (32 bit)
	404	*/
	405
	406	if (! (value == 0x09 \|\| value == 0x0A \|\| value == 0x0D
	407	\|\| (value >= 0x20 && value <= 0x7E)
	408	\|\| (value == 0x85) \|\| (value >= 0xA0 && value <= 0xD7FF)
	409	\|\| (value >= 0xE000 && value <= 0xFFFD)
	410	\|\| (value >= 0x10000 && value <= 0x10FFFF)))
	411	return yaml_parser_set_reader_error(parser,
	412	"Control characters are not allowed",
	413	parser->offset, value);
	414
	415	/* Move the raw pointers. */
	416
	417	parser->raw_buffer.pointer += width;
	418	parser->offset += width;
	419
	420	/* Finally put the character into the buffer. */
	421
	422	/* 0000 0000-0000 007F -> 0xxxxxxx */
	423	if (value <= 0x7F) {
	424	*(parser->buffer.last++) = value;
	425	}
	426	/* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
	427	else if (value <= 0x7FF) {
	428	*(parser->buffer.last++) = 0xC0 + (value >> 6);
	429	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
	430	}
	431	/* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
	432	else if (value <= 0xFFFF) {
	433	*(parser->buffer.last++) = 0xE0 + (value >> 12);
	434	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
	435	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
	436	}
	437	/* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	438	else {
	439	*(parser->buffer.last++) = 0xF0 + (value >> 18);
	440	*(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
	441	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
	442	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
	443	}
	444
	445	parser->unread ++;
	446	}
	447
	448	/* On EOF, put NUL into the buffer and return. */
	449
	450	if (parser->eof) {
	451	*(parser->buffer.last++) = '\0';
	452	parser->unread ++;
	453	return 1;
	454	}
	455
	456	}
	457
	458	return 1;
	459	}
	460

Note: See TracBrowser for help on using the repository browser.

Download in other formats: