Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/libyaml/tests/test-reader.c @ 1242

Last change on this file since 1242 was 898, checked in by hartmans, 16 years ago
Add pyyaml and libyaml packages backported from lenny. There is discussion about how these should go in the repository; these are added in this form in order to make forward progress.
File size: 12.0 KB

Rev	Line
[898]	1	#include <yaml.h>
	2
	3	YAML_DECLARE(int)
	4	yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
	5
	6	#include <stdlib.h>
	7	#include <stdio.h>
	8
	9	#ifdef NDEBUG
	10	#undef NDEBUG
	11	#endif
	12	#include <assert.h>
	13
	14	/*
	15	* Test cases are stolen from
	16	* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
	17	*/
	18
	/* One row of a test table: a label, the raw input bytes and the expected value. */
	19	typedef struct {
	20	char *title;	/* label printed before the row's results */
	21	char *test;	/* input bytes; the checks scan forward to a '!' end marker */
	22	int result;	/* expected value: compared with the yaml_parser_update_buffer() return in check_utf8_sequences() and with parser.unread in check_boms() */
	23	} test_case;
	24
	25	test_case utf8_sequences[] = {
	26	/* {"title", "test 1\|test 2\|...\|test N!", (0 or 1)}, */
	27
	28	{"a simple test", "'test' is '\xd0\xbf\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb5\xd1\x80\xd0\xba\xd0\xb0' in Russian!", 1},
	29	{"an empty line", "!", 1},
	30
	31	{"u-0 is a control character", "\x00!", 0},
	32	{"u-80 is a control character", "\xc2\x80!", 0},
	33	{"u-800 is valid", "\xe0\xa0\x80!", 1},
	34	{"u-10000 is valid", "\xf0\x90\x80\x80!", 1},
	35	{"5 bytes sequences are not allowed", "\xf8\x88\x80\x80\x80!", 0},
	36	{"6 bytes sequences are not allowed", "\xfc\x84\x80\x80\x80\x80!", 0},
	37
	38	{"u-7f is a control character", "\x7f!", 0},
	39	{"u-7FF is valid", "\xdf\xbf!", 1},
	40	{"u-FFFF is a control character", "\xef\xbf\xbf!", 0},
	41	{"u-1FFFFF is too large", "\xf7\xbf\xbf\xbf!", 0},
	42	{"u-3FFFFFF is 5 bytes", "\xfb\xbf\xbf\xbf\xbf!", 0},
	43	{"u-7FFFFFFF is 6 bytes", "\xfd\xbf\xbf\xbf\xbf\xbf!", 0},
	44
	45	{"u-D7FF", "\xed\x9f\xbf!", 1},
	46	{"u-E000", "\xee\x80\x80!", 1},
	47	{"u-FFFD", "\xef\xbf\xbd!", 1},
	48	{"u-10FFFF", "\xf4\x8f\xbf\xbf!", 1},
	49	{"u-110000", "\xf4\x90\x80\x80!", 0},
	50
	51	{"first continuation byte", "\x80!", 0},
	52	{"last continuation byte", "\xbf!", 0},
	53
	54	{"2 continuation bytes", "\x80\xbf!", 0},
	55	{"3 continuation bytes", "\x80\xbf\x80!", 0},
	56	{"4 continuation bytes", "\x80\xbf\x80\xbf!", 0},
	57	{"5 continuation bytes", "\x80\xbf\x80\xbf\x80!", 0},
	58	{"6 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf!", 0},
	59	{"7 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf\x80!", 0},
	60
	61	{"sequence of all 64 possible continuation bytes",
	62	"\x80\|\x81\|\x82\|\x83\|\x84\|\x85\|\x86\|\x87\|\x88\|\x89\|\x8a\|\x8b\|\x8c\|\x8d\|\x8e\|\x8f\|"
	63	"\x90\|\x91\|\x92\|\x93\|\x94\|\x95\|\x96\|\x97\|\x98\|\x99\|\x9a\|\x9b\|\x9c\|\x9d\|\x9e\|\x9f\|"
	64	"\xa0\|\xa1\|\xa2\|\xa3\|\xa4\|\xa5\|\xa6\|\xa7\|\xa8\|\xa9\|\xaa\|\xab\|\xac\|\xad\|\xae\|\xaf\|"
	65	"\xb0\|\xb1\|\xb2\|\xb3\|\xb4\|\xb5\|\xb6\|\xb7\|\xb8\|\xb9\|\xba\|\xbb\|\xbc\|\xbd\|\xbe\|\xbf!", 0},
	66	{"32 first bytes of 2-byte sequences {0xc0-0xdf}",
	67	"\xc0 \|\xc1 \|\xc2 \|\xc3 \|\xc4 \|\xc5 \|\xc6 \|\xc7 \|\xc8 \|\xc9 \|\xca \|\xcb \|\xcc \|\xcd \|\xce \|\xcf \|"
	68	"\xd0 \|\xd1 \|\xd2 \|\xd3 \|\xd4 \|\xd5 \|\xd6 \|\xd7 \|\xd8 \|\xd9 \|\xda \|\xdb \|\xdc \|\xdd \|\xde \|\xdf !", 0},
	69	{"16 first bytes of 3-byte sequences {0xe0-0xef}",
	70	"\xe0 \|\xe1 \|\xe2 \|\xe3 \|\xe4 \|\xe5 \|\xe6 \|\xe7 \|\xe8 \|\xe9 \|\xea \|\xeb \|\xec \|\xed \|\xee \|\xef !", 0},
	71	{"8 first bytes of 4-byte sequences {0xf0-0xf7}", "\xf0 \|\xf1 \|\xf2 \|\xf3 \|\xf4 \|\xf5 \|\xf6 \|\xf7 !", 0},
	72	{"4 first bytes of 5-byte sequences {0xf8-0xfb}", "\xf8 \|\xf9 \|\xfa \|\xfb !", 0},
	73	{"2 first bytes of 6-byte sequences {0xfc-0xfd}", "\xfc \|\xfd !", 0},
	74
	75	{"sequences with last byte missing {u-0}",
	76	"\xc0\|\xe0\x80\|\xf0\x80\x80\|\xf8\x80\x80\x80\|\xfc\x80\x80\x80\x80!", 0},
	77	{"sequences with last byte missing {u-...FF}",
	78	"\xdf\|\xef\xbf\|\xf7\xbf\xbf\|\xfb\xbf\xbf\xbf\|\xfd\xbf\xbf\xbf\xbf!", 0},
	79
	80	{"impossible bytes", "\xfe\|\xff\|\xfe\xfe\xff\xff!", 0},
	81
	82	{"overlong sequences {u-2f}",
	83	"\xc0\xaf\|\xe0\x80\xaf\|\xf0\x80\x80\xaf\|\xf8\x80\x80\x80\xaf\|\xfc\x80\x80\x80\x80\xaf!", 0},
	84
	85	{"maximum overlong sequences",
	86	"\xc1\xbf\|\xe0\x9f\xbf\|\xf0\x8f\xbf\xbf\|\xf8\x87\xbf\xbf\xbf\|\xfc\x83\xbf\xbf\xbf\xbf!", 0},
	87
	88	{"overlong representation of the NUL character",
	89	"\xc0\x80\|\xe0\x80\x80\|\xf0\x80\x80\x80\|\xf8\x80\x80\x80\x80\|\xfc\x80\x80\x80\x80\x80!", 0},
	90
	91	{"single UTF-16 surrogates",
	92	"\xed\xa0\x80\|\xed\xad\xbf\|\xed\xae\x80\|\xed\xaf\xbf\|\xed\xb0\x80\|\xed\xbe\x80\|\xed\xbf\xbf!", 0},
	93
	94	{"paired UTF-16 surrogates",
	95	"\xed\xa0\x80\xed\xb0\x80\|\xed\xa0\x80\xed\xbf\xbf\|\xed\xad\xbf\xed\xb0\x80\|"
	96	"\xed\xad\xbf\xed\xbf\xbf\|\xed\xae\x80\xed\xb0\x80\|\xed\xae\x80\xed\xbf\xbf\|"
	97	"\xed\xaf\xbf\xed\xb0\x80\|\xed\xaf\xbf\xed\xbf\xbf!", 0},
	98
	99	{"other illegal code positions", "\xef\xbf\xbe\|\xef\xbf\xbf!", 0},
	100
	101	{NULL, NULL, 0}
	102	};
	103
	/*
	 * Inputs for check_boms(): each `test` string ends at the '!' marker, and
	 * `result` is the value check_boms() expects in parser.unread after the
	 * buffer update.  The decoded buffer is then compared with bom_original.
	 * An entry with a NULL `test` pointer ends the table.
	 */
	104	test_case boms[] = {
	105
	106	/* {"title", "test!", length}, */
	107
	108	{"no bom (utf-8)", "Hi is \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82!", 13},
	109	{"bom (utf-8)", "\xef\xbb\xbfHi is \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82!", 13},
	110	{"bom (utf-16-le)", "\xff\xfeH\x00i\x00 \x00i\x00s\x00 \x00\x1f\x04@\x04""8\x04""2\x04""5\x04""B\x04!", 13},
	111	{"bom (utf-16-be)", "\xfe\xff\x00H\x00i\x00 \x00i\x00s\x00 \x04\x1f\x04@\x04""8\x04""2\x04""5\x04""B!", 13},
	112	{NULL, NULL, 0}
	113	};
	114
	115	char *bom_original = "Hi is \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82";	/* reference bytes that check_boms() compares the parser buffer against */
	116
	117	int check_utf8_sequences(void)
	118	{
	119	yaml_parser_t parser;
	120	int failed = 0;
	121	int k;
	122	printf("checking utf-8 sequences...\n");
	123	for (k = 0; utf8_sequences[k].test; k++) {
	124	char *title = utf8_sequences[k].title;
	125	int check = utf8_sequences[k].result;
	126	int result;
	127	char *start = utf8_sequences[k].test;
	128	char *end = start;
	129	printf("\t%s:\n", title);
	130	while(1) {
	131	while (end != '\|' && end != '!') end++;
	132	yaml_parser_initialize(&parser);
	133	yaml_parser_set_input_string(&parser, (unsigned char *)start, end-start);
	134	result = yaml_parser_update_buffer(&parser, end-start);
	135	if (result != check) {
	136	printf("\t\t- ");
	137	failed ++;
	138	}
	139	else {
	140	printf("\t\t+ ");
	141	}
	142	if (!parser.error) {
	143	printf("(no error)\n");
	144	}
	145	else if (parser.error == YAML_READER_ERROR) {
	146	if (parser.problem_value != -1) {
	147	printf("(reader error: %s: #%X at %d)\n",
	148	parser.problem, parser.problem_value, parser.problem_offset);
	149	}
	150	else {
	151	printf("(reader error: %s at %d)\n",
	152	parser.problem, parser.problem_offset);
	153	}
	154	}
	155	if (*end == '!') break;
	156	start = ++end;
	157	yaml_parser_delete(&parser);
	158	};
	159	printf("\n");
	160	}
	161	printf("checking utf-8 sequences: %d fail(s)\n", failed);
	162	return failed;
	163	}
	164
	165	int check_boms(void)
	166	{
	167	yaml_parser_t parser;
	168	int failed = 0;
	169	int k;
	170	printf("checking boms...\n");
	171	for (k = 0; boms[k].test; k++) {
	172	char *title = boms[k].title;
	173	int check = boms[k].result;
	174	int result;
	175	char *start = boms[k].test;
	176	char *end = start;
	177	while (*end != '!') end++;
	178	printf("\t%s: ", title);
	179	yaml_parser_initialize(&parser);
	180	yaml_parser_set_input_string(&parser, (unsigned char *)start, end-start);
	181	result = yaml_parser_update_buffer(&parser, end-start);
	182	if (!result) {
	183	printf("- (reader error: %s at %d)\n", parser.problem, parser.problem_offset);
	184	failed++;
	185	}
	186	else {
	187	if (parser.unread != check) {
	188	printf("- (length=%d while expected length=%d)\n", parser.unread, check);
	189	failed++;
	190	}
	191	else if (memcmp(parser.buffer.start, bom_original, check) != 0) {
	192	printf("- (value '%s' does not equal to the original value '%s')\n", parser.buffer.start, bom_original);
	193	failed++;
	194	}
	195	else {
	196	printf("+\n");
	197	}
	198	}
	199	yaml_parser_delete(&parser);
	200	}
	201	printf("checking boms: %d fail(s)\n", failed);
	202	return failed;
	203	}
	204
	205	#define LONG 100000	/* number of two-byte units generated and verified by check_long_utf8() and check_long_utf16() */
	206
	207	int check_long_utf8(void)
	208	{
	209	yaml_parser_t parser;
	210	int k = 0;
	211	int j;
	212	int failed = 0;
	213	unsigned char ch0, ch1;
	214	unsigned char buffer = malloc(3+LONG2);
	215	assert(buffer);
	216	printf("checking a long utf8 sequence...\n");
	217	buffer[k++] = '\xef';
	218	buffer[k++] = '\xbb';
	219	buffer[k++] = '\xbf';
	220	for (j = 0; j < LONG; j ++) {
	221	if (j % 2) {
	222	buffer[k++] = '\xd0';
	223	buffer[k++] = '\x90';
	224	}
	225	else {
	226	buffer[k++] = '\xd0';
	227	buffer[k++] = '\xaf';
	228	}
	229	}
	230	yaml_parser_initialize(&parser);
	231	yaml_parser_set_input_string(&parser, buffer, 3+LONG*2);
	232	for (k = 0; k < LONG; k++) {
	233	if (!parser.unread) {
	234	if (!yaml_parser_update_buffer(&parser, 1)) {
	235	printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset);
	236	failed = 1;
	237	break;
	238	}
	239	}
	240	if (!parser.unread) {
	241	printf("\tnot enough characters at %d\n", k);
	242	failed = 1;
	243	break;
	244	}
	245	if (k % 2) {
	246	ch0 = '\xd0';
	247	ch1 = '\x90';
	248	}
	249	else {
	250	ch0 = '\xd0';
	251	ch1 = '\xaf';
	252	}
	253	if (parser.buffer.pointer[0] != ch0 \|\| parser.buffer.pointer[1] != ch1) {
	254	printf("\tincorrect UTF-8 sequence: %X %X instead of %X %X\n",
	255	(int)parser.buffer.pointer[0], (int)parser.buffer.pointer[1],
	256	(int)ch0, (int)ch1);
	257	failed = 1;
	258	break;
	259	}
	260	parser.buffer.pointer += 2;
	261	parser.unread -= 1;
	262	}
	263	if (!failed) {
	264	if (!yaml_parser_update_buffer(&parser, 1)) {
	265	printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset);
	266	failed = 1;
	267	}
	268	else if (parser.buffer.pointer[0] != '\0') {
	269	printf("\texpected NUL, found %X (eof=%d, unread=%d)\n", (int)parser.buffer.pointer[0], parser.eof, parser.unread);
	270	failed = 1;
	271	}
	272	}
	273	yaml_parser_delete(&parser);
	274	free(buffer);
	275	printf("checking a long utf8 sequence: %d fail(s)\n", failed);
	276	return failed;
	277	}
	278
	279	int check_long_utf16(void)
	280	{
	281	yaml_parser_t parser;
	282	int k = 0;
	283	int j;
	284	int failed = 0;
	285	unsigned char ch0, ch1;
	286	unsigned char buffer = malloc(2+LONG2);
	287	assert(buffer);
	288	printf("checking a long utf16 sequence...\n");
	289	buffer[k++] = '\xff';
	290	buffer[k++] = '\xfe';
	291	for (j = 0; j < LONG; j ++) {
	292	if (j % 2) {
	293	buffer[k++] = '\x10';
	294	buffer[k++] = '\x04';
	295	}
	296	else {
	297	buffer[k++] = '/';
	298	buffer[k++] = '\x04';
	299	}
	300	}
	301	yaml_parser_initialize(&parser);
	302	yaml_parser_set_input_string(&parser, buffer, 2+LONG*2);
	303	for (k = 0; k < LONG; k++) {
	304	if (!parser.unread) {
	305	if (!yaml_parser_update_buffer(&parser, 1)) {
	306	printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset);
	307	failed = 1;
	308	break;
	309	}
	310	}
	311	if (!parser.unread) {
	312	printf("\tnot enough characters at %d\n", k);
	313	failed = 1;
	314	break;
	315	}
	316	if (k % 2) {
	317	ch0 = '\xd0';
	318	ch1 = '\x90';
	319	}
	320	else {
	321	ch0 = '\xd0';
	322	ch1 = '\xaf';
	323	}
	324	if (parser.buffer.pointer[0] != ch0 \|\| parser.buffer.pointer[1] != ch1) {
	325	printf("\tincorrect UTF-8 sequence: %X %X instead of %X %X\n",
	326	(int)parser.buffer.pointer[0], (int)parser.buffer.pointer[1],
	327	(int)ch0, (int)ch1);
	328	failed = 1;
	329	break;
	330	}
	331	parser.buffer.pointer += 2;
	332	parser.unread -= 1;
	333	}
	334	if (!failed) {
	335	if (!yaml_parser_update_buffer(&parser, 1)) {
	336	printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset);
	337	failed = 1;
	338	}
	339	else if (parser.buffer.pointer[0] != '\0') {
	340	printf("\texpected NUL, found %X (eof=%d, unread=%d)\n", (int)parser.buffer.pointer[0], parser.eof, parser.unread);
	341	failed = 1;
	342	}
	343	}
	344	yaml_parser_delete(&parser);
	345	free(buffer);
	346	printf("checking a long utf16 sequence: %d fail(s)\n", failed);
	347	return failed;
	348	}
	349
	/*
	 * Run all reader checks in a fixed order and report overall success.
	 *
	 * The checks are called as separate statements because the operands of `+`
	 * may be evaluated in any order, which would make the order of the printed
	 * output unspecified.  The exit status is clamped to EXIT_SUCCESS or
	 * EXIT_FAILURE: a raw failure count can wrap to 0 modulo 256 or collide with
	 * exit codes that test harnesses treat specially (e.g. 77 and 99).
	 */
	int
	main(void)
	{
	    int failed = 0;

	    failed += check_utf8_sequences();
	    failed += check_boms();
	    failed += check_long_utf8();
	    failed += check_long_utf16();

	    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: