Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/pyyaml/lib/yaml/scanner.py @ 1069

Last change on this file since 1069 was 898, checked in by hartmans, 16 years ago
Add pyyaml and libyaml packages backported from lenny. There is discussion about how these should go in the repository; these are added in this form in order to make forward progress.
File size: 51.4 KB

Rev	Line
[898]	1
	2	# Scanner produces tokens of the following types:
	3	# STREAM-START
	4	# STREAM-END
	5	# DIRECTIVE(name, value)
	6	# DOCUMENT-START
	7	# DOCUMENT-END
	8	# BLOCK-SEQUENCE-START
	9	# BLOCK-MAPPING-START
	10	# BLOCK-END
	11	# FLOW-SEQUENCE-START
	12	# FLOW-MAPPING-START
	13	# FLOW-SEQUENCE-END
	14	# FLOW-MAPPING-END
	15	# BLOCK-ENTRY
	16	# FLOW-ENTRY
	17	# KEY
	18	# VALUE
	19	# ALIAS(value)
	20	# ANCHOR(value)
	21	# TAG(value)
	22	# SCALAR(value, plain, style)
	23	#
	24	# Read comments in the Scanner code for more details.
	25	#
	26
	27	__all__ = ['Scanner', 'ScannerError']
	28
	29	from error import MarkedYAMLError
	30	from tokens import *
	31
	32	class ScannerError(MarkedYAMLError):
	33	pass
	34
	35	class SimpleKey(object):
	36	# See below simple keys treatment.
	37
	38	def __init__(self, token_number, required, index, line, column, mark):
	39	self.token_number = token_number
	40	self.required = required
	41	self.index = index
	42	self.line = line
	43	self.column = column
	44	self.mark = mark
	45
	46	class Scanner(object):
	47
    def __init__(self):
        """Initialize the scanner.

        Sets up the token queue, the indentation stack and the simple-key
        bookkeeping, and queues the STREAM-START token.
        """
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)       # peek the next i-th character
        #   self.prefix(l=1)     # peek the next l characters
        #   self.forward(l=1)    # read the next l characters and move the pointer.

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token. This appends to `self.tokens`, so it
        # must come after the queue above has been created.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
	110
	111	# Public methods.
	112
	113	def check_token(self, *choices):
	114	# Check if the next token is one of the given types.
	115	while self.need_more_tokens():
	116	self.fetch_more_tokens()
	117	if self.tokens:
	118	if not choices:
	119	return True
	120	for choice in choices:
	121	if isinstance(self.tokens[0], choice):
	122	return True
	123	return False
	124
	125	def peek_token(self):
	126	# Return the next token, but do not delete if from the queue.
	127	while self.need_more_tokens():
	128	self.fetch_more_tokens()
	129	if self.tokens:
	130	return self.tokens[0]
	131
	132	def get_token(self):
	133	# Return the next token.
	134	while self.need_more_tokens():
	135	self.fetch_more_tokens()
	136	if self.tokens:
	137	self.tokens_taken += 1
	138	return self.tokens.pop(0)
	139
	140	# Private methods.
	141
	142	def need_more_tokens(self):
	143	if self.done:
	144	return False
	145	if not self.tokens:
	146	return True
	147	# The current token may be a potential simple key, so we
	148	# need to look further.
	149	self.stale_possible_simple_keys()
	150	if self.next_possible_simple_key() == self.tokens_taken:
	151	return True
	152
	153	def fetch_more_tokens(self):
	154
	155	# Eat whitespaces and comments until we reach the next token.
	156	self.scan_to_next_token()
	157
	158	# Remove obsolete possible simple keys.
	159	self.stale_possible_simple_keys()
	160
	161	# Compare the current indentation and column. It may add some tokens
	162	# and decrease the current indentation level.
	163	self.unwind_indent(self.column)
	164
	165	# Peek the next character.
	166	ch = self.peek()
	167
	168	# Is it the end of stream?
	169	if ch == u'\0':
	170	return self.fetch_stream_end()
	171
	172	# Is it a directive?
	173	if ch == u'%' and self.check_directive():
	174	return self.fetch_directive()
	175
	176	# Is it the document start?
	177	if ch == u'-' and self.check_document_start():
	178	return self.fetch_document_start()
	179
	180	# Is it the document end?
	181	if ch == u'.' and self.check_document_end():
	182	return self.fetch_document_end()
	183
	184	# TODO: support for BOM within a stream.
	185	#if ch == u'\uFEFF':
	186	# return self.fetch_bom() <-- issue BOMToken
	187
	188	# Note: the order of the following checks is NOT significant.
	189
	190	# Is it the flow sequence start indicator?
	191	if ch == u'[':
	192	return self.fetch_flow_sequence_start()
	193
	194	# Is it the flow mapping start indicator?
	195	if ch == u'{':
	196	return self.fetch_flow_mapping_start()
	197
	198	# Is it the flow sequence end indicator?
	199	if ch == u']':
	200	return self.fetch_flow_sequence_end()
	201
	202	# Is it the flow mapping end indicator?
	203	if ch == u'}':
	204	return self.fetch_flow_mapping_end()
	205
	206	# Is it the flow entry indicator?
	207	if ch == u',':
	208	return self.fetch_flow_entry()
	209
	210	# Is it the block entry indicator?
	211	if ch == u'-' and self.check_block_entry():
	212	return self.fetch_block_entry()
	213
	214	# Is it the key indicator?
	215	if ch == u'?' and self.check_key():
	216	return self.fetch_key()
	217
	218	# Is it the value indicator?
	219	if ch == u':' and self.check_value():
	220	return self.fetch_value()
	221
	222	# Is it an alias?
	223	if ch == u'*':
	224	return self.fetch_alias()
	225
	226	# Is it an anchor?
	227	if ch == u'&':
	228	return self.fetch_anchor()
	229
	230	# Is it a tag?
	231	if ch == u'!':
	232	return self.fetch_tag()
	233
	234	# Is it a literal scalar?
	235	if ch == u'\|' and not self.flow_level:
	236	return self.fetch_literal()
	237
	238	# Is it a folded scalar?
	239	if ch == u'>' and not self.flow_level:
	240	return self.fetch_folded()
	241
	242	# Is it a single quoted scalar?
	243	if ch == u'\'':
	244	return self.fetch_single()
	245
	246	# Is it a double quoted scalar?
	247	if ch == u'\"':
	248	return self.fetch_double()
	249
	250	# It must be a plain scalar then.
	251	if self.check_plain():
	252	return self.fetch_plain()
	253
	254	# No? It's an error. Let's produce a nice error message.
	255	raise ScannerError("while scanning for the next token", None,
	256	"found character %r that cannot start any token"
	257	% ch.encode('utf-8'), self.get_mark())
	258
	259	# Simple keys treatment.
	260
	261	def next_possible_simple_key(self):
	262	# Return the number of the nearest possible simple key. Actually we
	263	# don't need to loop through the whole dictionary. We may replace it
	264	# with the following code:
	265	# if not self.possible_simple_keys:
	266	# return None
	267	# return self.possible_simple_keys[
	268	# min(self.possible_simple_keys.keys())].token_number
	269	min_token_number = None
	270	for level in self.possible_simple_keys:
	271	key = self.possible_simple_keys[level]
	272	if min_token_number is None or key.token_number < min_token_number:
	273	min_token_number = key.token_number
	274	return min_token_number
	275
	276	def stale_possible_simple_keys(self):
	277	# Remove entries that are no longer possible simple keys. According to
	278	# the YAML specification, simple keys
	279	# - should be limited to a single line,
	280	# - should be no longer than 1024 characters.
	281	# Disabling this procedure will allow simple keys of any length and
	282	# height (may cause problems if indentation is broken though).
	283	for level in self.possible_simple_keys.keys():
	284	key = self.possible_simple_keys[level]
	285	if key.line != self.line \
	286	or self.index-key.index > 1024:
	287	if key.required:
	288	raise ScannerError("while scanning a simple key", key.mark,
	289	"could not found expected ':'", self.get_mark())
	290	del self.possible_simple_keys[level]
	291
	292	def save_possible_simple_key(self):
	293	# The next token may start a simple key. We check if it's possible
	294	# and save its position. This function is called for
	295	# ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
	296
	297	# Check if a simple key is required at the current position.
	298	required = not self.flow_level and self.indent == self.column
	299
	300	# A simple key is required only if it is the first token in the current
	301	# line. Therefore it is always allowed.
	302	assert self.allow_simple_key or not required
	303
	304	# The next token might be a simple key. Let's save it's number and
	305	# position.
	306	if self.allow_simple_key:
	307	self.remove_possible_simple_key()
	308	token_number = self.tokens_taken+len(self.tokens)
	309	key = SimpleKey(token_number, required,
	310	self.index, self.line, self.column, self.get_mark())
	311	self.possible_simple_keys[self.flow_level] = key
	312
	313	def remove_possible_simple_key(self):
	314	# Remove the saved possible key position at the current flow level.
	315	if self.flow_level in self.possible_simple_keys:
	316	key = self.possible_simple_keys[self.flow_level]
	317
	318	if key.required:
	319	raise ScannerError("while scanning a simple key", key.mark,
	320	"could not found expected ':'", self.get_mark())
	321
	322	del self.possible_simple_keys[self.flow_level]
	323
	324	# Indentation functions.
	325
	326	def unwind_indent(self, column):
	327
	328	## In flow context, tokens should respect indentation.
	329	## Actually the condition should be `self.indent >= column` according to
	330	## the spec. But this condition will prohibit intuitively correct
	331	## constructions such as
	332	## key : {
	333	## }
	334	#if self.flow_level and self.indent > column:
	335	# raise ScannerError(None, None,
	336	# "invalid intendation or unclosed '[' or '{'",
	337	# self.get_mark())
	338
	339	# In the flow context, indentation is ignored. We make the scanner less
	340	# restrictive then specification requires.
	341	if self.flow_level:
	342	return
	343
	344	# In block context, we may need to issue the BLOCK-END tokens.
	345	while self.indent > column:
	346	mark = self.get_mark()
	347	self.indent = self.indents.pop()
	348	self.tokens.append(BlockEndToken(mark, mark))
	349
	350	def add_indent(self, column):
	351	# Check if we need to increase indentation.
	352	if self.indent < column:
	353	self.indents.append(self.indent)
	354	self.indent = column
	355	return True
	356	return False
	357
	358	# Fetchers.
	359
	360	def fetch_stream_start(self):
	361	# We always add STREAM-START as the first token and STREAM-END as the
	362	# last token.
	363
	364	# Read the token.
	365	mark = self.get_mark()
	366
	367	# Add STREAM-START.
	368	self.tokens.append(StreamStartToken(mark, mark,
	369	encoding=self.encoding))
	370
	371
	372	def fetch_stream_end(self):
	373
	374	# Set the current intendation to -1.
	375	self.unwind_indent(-1)
	376
	377	# Reset everything (not really needed).
	378	self.allow_simple_key = False
	379	self.possible_simple_keys = {}
	380
	381	# Read the token.
	382	mark = self.get_mark()
	383
	384	# Add STREAM-END.
	385	self.tokens.append(StreamEndToken(mark, mark))
	386
	387	# The steam is finished.
	388	self.done = True
	389
	390	def fetch_directive(self):
	391
	392	# Set the current intendation to -1.
	393	self.unwind_indent(-1)
	394
	395	# Reset simple keys.
	396	self.remove_possible_simple_key()
	397	self.allow_simple_key = False
	398
	399	# Scan and add DIRECTIVE.
	400	self.tokens.append(self.scan_directive())
	401
	402	def fetch_document_start(self):
	403	self.fetch_document_indicator(DocumentStartToken)
	404
	405	def fetch_document_end(self):
	406	self.fetch_document_indicator(DocumentEndToken)
	407
	408	def fetch_document_indicator(self, TokenClass):
	409
	410	# Set the current intendation to -1.
	411	self.unwind_indent(-1)
	412
	413	# Reset simple keys. Note that there could not be a block collection
	414	# after '---'.
	415	self.remove_possible_simple_key()
	416	self.allow_simple_key = False
	417
	418	# Add DOCUMENT-START or DOCUMENT-END.
	419	start_mark = self.get_mark()
	420	self.forward(3)
	421	end_mark = self.get_mark()
	422	self.tokens.append(TokenClass(start_mark, end_mark))
	423
	424	def fetch_flow_sequence_start(self):
	425	self.fetch_flow_collection_start(FlowSequenceStartToken)
	426
	427	def fetch_flow_mapping_start(self):
	428	self.fetch_flow_collection_start(FlowMappingStartToken)
	429
	430	def fetch_flow_collection_start(self, TokenClass):
	431
	432	# '[' and '{' may start a simple key.
	433	self.save_possible_simple_key()
	434
	435	# Increase the flow level.
	436	self.flow_level += 1
	437
	438	# Simple keys are allowed after '[' and '{'.
	439	self.allow_simple_key = True
	440
	441	# Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
	442	start_mark = self.get_mark()
	443	self.forward()
	444	end_mark = self.get_mark()
	445	self.tokens.append(TokenClass(start_mark, end_mark))
	446
	447	def fetch_flow_sequence_end(self):
	448	self.fetch_flow_collection_end(FlowSequenceEndToken)
	449
	450	def fetch_flow_mapping_end(self):
	451	self.fetch_flow_collection_end(FlowMappingEndToken)
	452
	453	def fetch_flow_collection_end(self, TokenClass):
	454
	455	# Reset possible simple key on the current level.
	456	self.remove_possible_simple_key()
	457
	458	# Decrease the flow level.
	459	self.flow_level -= 1
	460
	461	# No simple keys after ']' or '}'.
	462	self.allow_simple_key = False
	463
	464	# Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
	465	start_mark = self.get_mark()
	466	self.forward()
	467	end_mark = self.get_mark()
	468	self.tokens.append(TokenClass(start_mark, end_mark))
	469
	470	def fetch_flow_entry(self):
	471
	472	# Simple keys are allowed after ','.
	473	self.allow_simple_key = True
	474
	475	# Reset possible simple key on the current level.
	476	self.remove_possible_simple_key()
	477
	478	# Add FLOW-ENTRY.
	479	start_mark = self.get_mark()
	480	self.forward()
	481	end_mark = self.get_mark()
	482	self.tokens.append(FlowEntryToken(start_mark, end_mark))
	483
	484	def fetch_block_entry(self):
	485
	486	# Block context needs additional checks.
	487	if not self.flow_level:
	488
	489	# Are we allowed to start a new entry?
	490	if not self.allow_simple_key:
	491	raise ScannerError(None, None,
	492	"sequence entries are not allowed here",
	493	self.get_mark())
	494
	495	# We may need to add BLOCK-SEQUENCE-START.
	496	if self.add_indent(self.column):
	497	mark = self.get_mark()
	498	self.tokens.append(BlockSequenceStartToken(mark, mark))
	499
	500	# It's an error for the block entry to occur in the flow context,
	501	# but we let the parser detect this.
	502	else:
	503	pass
	504
	505	# Simple keys are allowed after '-'.
	506	self.allow_simple_key = True
	507
	508	# Reset possible simple key on the current level.
	509	self.remove_possible_simple_key()
	510
	511	# Add BLOCK-ENTRY.
	512	start_mark = self.get_mark()
	513	self.forward()
	514	end_mark = self.get_mark()
	515	self.tokens.append(BlockEntryToken(start_mark, end_mark))
	516
	517	def fetch_key(self):
	518
	519	# Block context needs additional checks.
	520	if not self.flow_level:
	521
	522	# Are we allowed to start a key (not nessesary a simple)?
	523	if not self.allow_simple_key:
	524	raise ScannerError(None, None,
	525	"mapping keys are not allowed here",
	526	self.get_mark())
	527
	528	# We may need to add BLOCK-MAPPING-START.
	529	if self.add_indent(self.column):
	530	mark = self.get_mark()
	531	self.tokens.append(BlockMappingStartToken(mark, mark))
	532
	533	# Simple keys are allowed after '?' in the block context.
	534	self.allow_simple_key = not self.flow_level
	535
	536	# Reset possible simple key on the current level.
	537	self.remove_possible_simple_key()
	538
	539	# Add KEY.
	540	start_mark = self.get_mark()
	541	self.forward()
	542	end_mark = self.get_mark()
	543	self.tokens.append(KeyToken(start_mark, end_mark))
	544
	545	def fetch_value(self):
	546
	547	# Do we determine a simple key?
	548	if self.flow_level in self.possible_simple_keys:
	549
	550	# Add KEY.
	551	key = self.possible_simple_keys[self.flow_level]
	552	del self.possible_simple_keys[self.flow_level]
	553	self.tokens.insert(key.token_number-self.tokens_taken,
	554	KeyToken(key.mark, key.mark))
	555
	556	# If this key starts a new block mapping, we need to add
	557	# BLOCK-MAPPING-START.
	558	if not self.flow_level:
	559	if self.add_indent(key.column):
	560	self.tokens.insert(key.token_number-self.tokens_taken,
	561	BlockMappingStartToken(key.mark, key.mark))
	562
	563	# There cannot be two simple keys one after another.
	564	self.allow_simple_key = False
	565
	566	# It must be a part of a complex key.
	567	else:
	568
	569	# Block context needs additional checks.
	570	# (Do we really need them? They will be catched by the parser
	571	# anyway.)
	572	if not self.flow_level:
	573
	574	# We are allowed to start a complex value if and only if
	575	# we can start a simple key.
	576	if not self.allow_simple_key:
	577	raise ScannerError(None, None,
	578	"mapping values are not allowed here",
	579	self.get_mark())
	580
	581	# If this value starts a new block mapping, we need to add
	582	# BLOCK-MAPPING-START. It will be detected as an error later by
	583	# the parser.
	584	if not self.flow_level:
	585	if self.add_indent(self.column):
	586	mark = self.get_mark()
	587	self.tokens.append(BlockMappingStartToken(mark, mark))
	588
	589	# Simple keys are allowed after ':' in the block context.
	590	self.allow_simple_key = not self.flow_level
	591
	592	# Reset possible simple key on the current level.
	593	self.remove_possible_simple_key()
	594
	595	# Add VALUE.
	596	start_mark = self.get_mark()
	597	self.forward()
	598	end_mark = self.get_mark()
	599	self.tokens.append(ValueToken(start_mark, end_mark))
	600
	601	def fetch_alias(self):
	602
	603	# ALIAS could be a simple key.
	604	self.save_possible_simple_key()
	605
	606	# No simple keys after ALIAS.
	607	self.allow_simple_key = False
	608
	609	# Scan and add ALIAS.
	610	self.tokens.append(self.scan_anchor(AliasToken))
	611
	612	def fetch_anchor(self):
	613
	614	# ANCHOR could start a simple key.
	615	self.save_possible_simple_key()
	616
	617	# No simple keys after ANCHOR.
	618	self.allow_simple_key = False
	619
	620	# Scan and add ANCHOR.
	621	self.tokens.append(self.scan_anchor(AnchorToken))
	622
	623	def fetch_tag(self):
	624
	625	# TAG could start a simple key.
	626	self.save_possible_simple_key()
	627
	628	# No simple keys after TAG.
	629	self.allow_simple_key = False
	630
	631	# Scan and add TAG.
	632	self.tokens.append(self.scan_tag())
	633
	634	def fetch_literal(self):
	635	self.fetch_block_scalar(style='\|')
	636
	637	def fetch_folded(self):
	638	self.fetch_block_scalar(style='>')
	639
	640	def fetch_block_scalar(self, style):
	641
	642	# A simple key may follow a block scalar.
	643	self.allow_simple_key = True
	644
	645	# Reset possible simple key on the current level.
	646	self.remove_possible_simple_key()
	647
	648	# Scan and add SCALAR.
	649	self.tokens.append(self.scan_block_scalar(style))
	650
	651	def fetch_single(self):
	652	self.fetch_flow_scalar(style='\'')
	653
	654	def fetch_double(self):
	655	self.fetch_flow_scalar(style='"')
	656
	657	def fetch_flow_scalar(self, style):
	658
	659	# A flow scalar could be a simple key.
	660	self.save_possible_simple_key()
	661
	662	# No simple keys after flow scalars.
	663	self.allow_simple_key = False
	664
	665	# Scan and add SCALAR.
	666	self.tokens.append(self.scan_flow_scalar(style))
	667
	668	def fetch_plain(self):
	669
	670	# A plain scalar could be a simple key.
	671	self.save_possible_simple_key()
	672
	673	# No simple keys after plain scalars. But note that `scan_plain` will
	674	# change this flag if the scan is finished at the beginning of the
	675	# line.
	676	self.allow_simple_key = False
	677
	678	# Scan and add SCALAR. May change `allow_simple_key`.
	679	self.tokens.append(self.scan_plain())
	680
	681	# Checkers.
	682
	683	def check_directive(self):
	684
	685	# DIRECTIVE: ^ '%' ...
	686	# The '%' indicator is already checked.
	687	if self.column == 0:
	688	return True
	689
	690	def check_document_start(self):
	691
	692	# DOCUMENT-START: ^ '---' (' '\|'\n')
	693	if self.column == 0:
	694	if self.prefix(3) == u'---' \
	695	and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
	696	return True
	697
	698	def check_document_end(self):
	699
	700	# DOCUMENT-END: ^ '...' (' '\|'\n')
	701	if self.column == 0:
	702	if self.prefix(3) == u'...' \
	703	and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
	704	return True
	705
	706	def check_block_entry(self):
	707
	708	# BLOCK-ENTRY: '-' (' '\|'\n')
	709	return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
	710
	711	def check_key(self):
	712
	713	# KEY(flow context): '?'
	714	if self.flow_level:
	715	return True
	716
	717	# KEY(block context): '?' (' '\|'\n')
	718	else:
	719	return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
	720
	721	def check_value(self):
	722
	723	# VALUE(flow context): ':'
	724	if self.flow_level:
	725	return True
	726
	727	# VALUE(block context): ':' (' '\|'\n')
	728	else:
	729	return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
	730
	731	def check_plain(self):
	732
	733	# A plain scalar may start with any non-space character except:
	734	# '-', '?', ':', ',', '[', ']', '{', '}',
	735	# '#', '&', '*', '!', '\|', '>', '\'', '\"',
	736	# '%', '@', '`'.
	737	#
	738	# It may also start with
	739	# '-', '?', ':'
	740	# if it is followed by a non-space character.
	741	#
	742	# Note that we limit the last rule to the block context (except the
	743	# '-' character) because we want the flow context to be space
	744	# independent.
	745	ch = self.peek()
	746	return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!\|>\'\"%@`' \
	747	or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
	748	and (ch == u'-' or (not self.flow_level and ch in u'?:')))
	749
	750	# Scanners.
	751
	752	def scan_to_next_token(self):
	753	# We ignore spaces, line breaks and comments.
	754	# If we find a line break in the block context, we set the flag
	755	# `allow_simple_key` on.
	756	# The byte order mark is stripped if it's the first character in the
	757	# stream. We do not yet support BOM inside the stream as the
	758	# specification requires. Any such mark will be considered as a part
	759	# of the document.
	760	#
	761	# TODO: We need to make tab handling rules more sane. A good rule is
	762	# Tabs cannot precede tokens
	763	# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
	764	# KEY(block), VALUE(block), BLOCK-ENTRY
	765	# So the checking code is
	766	# if <TAB>:
	767	# self.allow_simple_keys = False
	768	# We also need to add the check for `allow_simple_keys == True` to
	769	# `unwind_indent` before issuing BLOCK-END.
	770	# Scanners for block, flow, and plain scalars need to be modified.
	771
	772	if self.index == 0 and self.peek() == u'\uFEFF':
	773	self.forward()
	774	found = False
	775	while not found:
	776	while self.peek() == u' ':
	777	self.forward()
	778	if self.peek() == u'#':
	779	while self.peek() not in u'\0\r\n\x85\u2028\u2029':
	780	self.forward()
	781	if self.scan_line_break():
	782	if not self.flow_level:
	783	self.allow_simple_key = True
	784	else:
	785	found = True
	786
	787	def scan_directive(self):
	788	# See the specification for details.
	789	start_mark = self.get_mark()
	790	self.forward()
	791	name = self.scan_directive_name(start_mark)
	792	value = None
	793	if name == u'YAML':
	794	value = self.scan_yaml_directive_value(start_mark)
	795	end_mark = self.get_mark()
	796	elif name == u'TAG':
	797	value = self.scan_tag_directive_value(start_mark)
	798	end_mark = self.get_mark()
	799	else:
	800	end_mark = self.get_mark()
	801	while self.peek() not in u'\0\r\n\x85\u2028\u2029':
	802	self.forward()
	803	self.scan_directive_ignored_line(start_mark)
	804	return DirectiveToken(name, value, start_mark, end_mark)
	805
	806	def scan_directive_name(self, start_mark):
	807	# See the specification for details.
	808	length = 0
	809	ch = self.peek(length)
	810	while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
	811	or ch in u'-_':
	812	length += 1
	813	ch = self.peek(length)
	814	if not length:
	815	raise ScannerError("while scanning a directive", start_mark,
	816	"expected alphabetic or numeric character, but found %r"
	817	% ch.encode('utf-8'), self.get_mark())
	818	value = self.prefix(length)
	819	self.forward(length)
	820	ch = self.peek()
	821	if ch not in u'\0 \r\n\x85\u2028\u2029':
	822	raise ScannerError("while scanning a directive", start_mark,
	823	"expected alphabetic or numeric character, but found %r"
	824	% ch.encode('utf-8'), self.get_mark())
	825	return value
	826
	827	def scan_yaml_directive_value(self, start_mark):
	828	# See the specification for details.
	829	while self.peek() == u' ':
	830	self.forward()
	831	major = self.scan_yaml_directive_number(start_mark)
	832	if self.peek() != '.':
	833	raise ScannerError("while scanning a directive", start_mark,
	834	"expected a digit or '.', but found %r"
	835	% self.peek().encode('utf-8'),
	836	self.get_mark())
	837	self.forward()
	838	minor = self.scan_yaml_directive_number(start_mark)
	839	if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
	840	raise ScannerError("while scanning a directive", start_mark,
	841	"expected a digit or ' ', but found %r"
	842	% self.peek().encode('utf-8'),
	843	self.get_mark())
	844	return (major, minor)
	845
	846	def scan_yaml_directive_number(self, start_mark):
	847	# See the specification for details.
	848	ch = self.peek()
	849	if not (u'0' <= ch <= '9'):
	850	raise ScannerError("while scanning a directive", start_mark,
	851	"expected a digit, but found %r" % ch.encode('utf-8'),
	852	self.get_mark())
	853	length = 0
	854	while u'0' <= self.peek(length) <= u'9':
	855	length += 1
	856	value = int(self.prefix(length))
	857	self.forward(length)
	858	return value
	859
	860	def scan_tag_directive_value(self, start_mark):
	861	# See the specification for details.
	862	while self.peek() == u' ':
	863	self.forward()
	864	handle = self.scan_tag_directive_handle(start_mark)
	865	while self.peek() == u' ':
	866	self.forward()
	867	prefix = self.scan_tag_directive_prefix(start_mark)
	868	return (handle, prefix)
	869
	870	def scan_tag_directive_handle(self, start_mark):
	871	# See the specification for details.
	872	value = self.scan_tag_handle('directive', start_mark)
	873	ch = self.peek()
	874	if ch != u' ':
	875	raise ScannerError("while scanning a directive", start_mark,
	876	"expected ' ', but found %r" % ch.encode('utf-8'),
	877	self.get_mark())
	878	return value
	879
	880	def scan_tag_directive_prefix(self, start_mark):
	881	# See the specification for details.
	882	value = self.scan_tag_uri('directive', start_mark)
	883	ch = self.peek()
	884	if ch not in u'\0 \r\n\x85\u2028\u2029':
	885	raise ScannerError("while scanning a directive", start_mark,
	886	"expected ' ', but found %r" % ch.encode('utf-8'),
	887	self.get_mark())
	888	return value
	889
	890	def scan_directive_ignored_line(self, start_mark):
	891	# See the specification for details.
	892	while self.peek() == u' ':
	893	self.forward()
	894	if self.peek() == u'#':
	895	while self.peek() not in u'\0\r\n\x85\u2028\u2029':
	896	self.forward()
	897	ch = self.peek()
	898	if ch not in u'\0\r\n\x85\u2028\u2029':
	899	raise ScannerError("while scanning a directive", start_mark,
	900	"expected a comment or a line break, but found %r"
	901	% ch.encode('utf-8'), self.get_mark())
	902	self.scan_line_break()
	903
	904	def scan_anchor(self, TokenClass):
	905	# The specification does not restrict characters for anchors and
	906	# aliases. This may lead to problems, for instance, the document:
	907	# [ *alias, value ]
	908	# can be interpteted in two ways, as
	909	# [ "value" ]
	910	# and
	911	# [ *alias , "value" ]
	912	# Therefore we restrict aliases to numbers and ASCII letters.
	913	start_mark = self.get_mark()
	914	indicator = self.peek()
	915	if indicator == '*':
	916	name = 'alias'
	917	else:
	918	name = 'anchor'
	919	self.forward()
	920	length = 0
	921	ch = self.peek(length)
	922	while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
	923	or ch in u'-_':
	924	length += 1
	925	ch = self.peek(length)
	926	if not length:
	927	raise ScannerError("while scanning an %s" % name, start_mark,
	928	"expected alphabetic or numeric character, but found %r"
	929	% ch.encode('utf-8'), self.get_mark())
	930	value = self.prefix(length)
	931	self.forward(length)
	932	ch = self.peek()
	933	if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
	934	raise ScannerError("while scanning an %s" % name, start_mark,
	935	"expected alphabetic or numeric character, but found %r"
	936	% ch.encode('utf-8'), self.get_mark())
	937	end_mark = self.get_mark()
	938	return TokenClass(value, start_mark, end_mark)
	939
	940	def scan_tag(self):
	941	# See the specification for details.
	942	start_mark = self.get_mark()
	943	ch = self.peek(1)
	944	if ch == u'<':
	945	handle = None
	946	self.forward(2)
	947	suffix = self.scan_tag_uri('tag', start_mark)
	948	if self.peek() != u'>':
	949	raise ScannerError("while parsing a tag", start_mark,
	950	"expected '>', but found %r" % self.peek().encode('utf-8'),
	951	self.get_mark())
	952	self.forward()
	953	elif ch in u'\0 \t\r\n\x85\u2028\u2029':
	954	handle = None
	955	suffix = u'!'
	956	self.forward()
	957	else:
	958	length = 1
	959	use_handle = False
	960	while ch not in u'\0 \r\n\x85\u2028\u2029':
	961	if ch == u'!':
	962	use_handle = True
	963	break
	964	length += 1
	965	ch = self.peek(length)
	966	handle = u'!'
	967	if use_handle:
	968	handle = self.scan_tag_handle('tag', start_mark)
	969	else:
	970	handle = u'!'
	971	self.forward()
	972	suffix = self.scan_tag_uri('tag', start_mark)
	973	ch = self.peek()
	974	if ch not in u'\0 \r\n\x85\u2028\u2029':
	975	raise ScannerError("while scanning a tag", start_mark,
	976	"expected ' ', but found %r" % ch.encode('utf-8'),
	977	self.get_mark())
	978	value = (handle, suffix)
	979	end_mark = self.get_mark()
	980	return TagToken(value, start_mark, end_mark)
	981
    def scan_block_scalar(self, style):
        """Scan a literal or folded block scalar and return its ScalarToken.

        `style` is the indicator character the scalar started with; '>'
        selects folding, any other value leaves line breaks untouched.
        The header (chomping and indentation indicators) is read first,
        then the content lines at the resulting indentation, and finally
        the tail is chomped according to the chomping indicator.
        """
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: take the indentation from
            # what the leading lines report, but never less than min_indent.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != u'\0':
            chunks.extend(breaks)
            # Remembered for the folding decision below.
            leading_non_space = self.peek() not in u' \t'
            length = 0
            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != u'\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.peek() not in u' \t':
                    # A single '\n' between two non-indented lines folds
                    # into a space, unless empty lines follow it.
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail. `chomping` is three-valued: False drops the final
        # line break, None keeps only it, True also keeps the trailing
        # empty lines.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
	1057
	1058	def scan_block_scalar_indicators(self, start_mark):
	1059	# See the specification for details.
	1060	chomping = None
	1061	increment = None
	1062	ch = self.peek()
	1063	if ch in u'+-':
	1064	if ch == '+':
	1065	chomping = True
	1066	else:
	1067	chomping = False
	1068	self.forward()
	1069	ch = self.peek()
	1070	if ch in u'0123456789':
	1071	increment = int(ch)
	1072	if increment == 0:
	1073	raise ScannerError("while scanning a block scalar", start_mark,
	1074	"expected indentation indicator in the range 1-9, but found 0",
	1075	self.get_mark())
	1076	self.forward()
	1077	elif ch in u'0123456789':
	1078	increment = int(ch)
	1079	if increment == 0:
	1080	raise ScannerError("while scanning a block scalar", start_mark,
	1081	"expected indentation indicator in the range 1-9, but found 0",
	1082	self.get_mark())
	1083	self.forward()
	1084	ch = self.peek()
	1085	if ch in u'+-':
	1086	if ch == '+':
	1087	chomping = True
	1088	else:
	1089	chomping = False
	1090	self.forward()
	1091	ch = self.peek()
	1092	if ch not in u'\0 \r\n\x85\u2028\u2029':
	1093	raise ScannerError("while scanning a block scalar", start_mark,
	1094	"expected chomping or indentation indicators, but found %r"
	1095	% ch.encode('utf-8'), self.get_mark())
	1096	return chomping, increment
	1097
	1098	def scan_block_scalar_ignored_line(self, start_mark):
	1099	# See the specification for details.
	1100	while self.peek() == u' ':
	1101	self.forward()
	1102	if self.peek() == u'#':
	1103	while self.peek() not in u'\0\r\n\x85\u2028\u2029':
	1104	self.forward()
	1105	ch = self.peek()
	1106	if ch not in u'\0\r\n\x85\u2028\u2029':
	1107	raise ScannerError("while scanning a block scalar", start_mark,
	1108	"expected a comment or a line break, but found %r"
	1109	% ch.encode('utf-8'), self.get_mark())
	1110	self.scan_line_break()
	1111
	1112	def scan_block_scalar_indentation(self):
	1113	# See the specification for details.
	1114	chunks = []
	1115	max_indent = 0
	1116	end_mark = self.get_mark()
	1117	while self.peek() in u' \r\n\x85\u2028\u2029':
	1118	if self.peek() != u' ':
	1119	chunks.append(self.scan_line_break())
	1120	end_mark = self.get_mark()
	1121	else:
	1122	self.forward()
	1123	if self.column > max_indent:
	1124	max_indent = self.column
	1125	return chunks, max_indent, end_mark
	1126
	1127	def scan_block_scalar_breaks(self, indent):
	1128	# See the specification for details.
	1129	chunks = []
	1130	end_mark = self.get_mark()
	1131	while self.column < indent and self.peek() == u' ':
	1132	self.forward()
	1133	while self.peek() in u'\r\n\x85\u2028\u2029':
	1134	chunks.append(self.scan_line_break())
	1135	end_mark = self.get_mark()
	1136	while self.column < indent and self.peek() == u' ':
	1137	self.forward()
	1138	return chunks, end_mark
	1139
	1140	def scan_flow_scalar(self, style):
	1141	# See the specification for details.
	1142	# Note that we loose indentation rules for quoted scalars. Quoted
	1143	# scalars don't need to adhere indentation because " and ' clearly
	1144	# mark the beginning and the end of them. Therefore we are less
	1145	# restrictive then the specification requires. We only need to check
	1146	# that document separators are not included in scalars.
	1147	if style == '"':
	1148	double = True
	1149	else:
	1150	double = False
	1151	chunks = []
	1152	start_mark = self.get_mark()
	1153	quote = self.peek()
	1154	self.forward()
	1155	chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
	1156	while self.peek() != quote:
	1157	chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
	1158	chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
	1159	self.forward()
	1160	end_mark = self.get_mark()
	1161	return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
	1162	style)
	1163
	1164	ESCAPE_REPLACEMENTS = {
	1165	u'0': u'\0',
	1166	u'a': u'\x07',
	1167	u'b': u'\x08',
	1168	u't': u'\x09',
	1169	u'\t': u'\x09',
	1170	u'n': u'\x0A',
	1171	u'v': u'\x0B',
	1172	u'f': u'\x0C',
	1173	u'r': u'\x0D',
	1174	u'e': u'\x1B',
	1175	u' ': u'\x20',
	1176	u'\"': u'\"',
	1177	u'\\': u'\\',
	1178	u'N': u'\x85',
	1179	u'_': u'\xA0',
	1180	u'L': u'\u2028',
	1181	u'P': u'\u2029',
	1182	}
	1183
	1184	ESCAPE_CODES = {
	1185	u'x': 2,
	1186	u'u': 4,
	1187	u'U': 8,
	1188	}
	1189
	1190	def scan_flow_scalar_non_spaces(self, double, start_mark):
	1191	# See the specification for details.
	1192	chunks = []
	1193	while True:
	1194	length = 0
	1195	while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
	1196	length += 1
	1197	if length:
	1198	chunks.append(self.prefix(length))
	1199	self.forward(length)
	1200	ch = self.peek()
	1201	if not double and ch == u'\'' and self.peek(1) == u'\'':
	1202	chunks.append(u'\'')
	1203	self.forward(2)
	1204	elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
	1205	chunks.append(ch)
	1206	self.forward()
	1207	elif double and ch == u'\\':
	1208	self.forward()
	1209	ch = self.peek()
	1210	if ch in self.ESCAPE_REPLACEMENTS:
	1211	chunks.append(self.ESCAPE_REPLACEMENTS[ch])
	1212	self.forward()
	1213	elif ch in self.ESCAPE_CODES:
	1214	length = self.ESCAPE_CODES[ch]
	1215	self.forward()
	1216	for k in range(length):
	1217	if self.peek(k) not in u'0123456789ABCDEFabcdef':
	1218	raise ScannerError("while scanning a double-quoted scalar", start_mark,
	1219	"expected escape sequence of %d hexdecimal numbers, but found %r" %
	1220	(length, self.peek(k).encode('utf-8')), self.get_mark())
	1221	code = int(self.prefix(length), 16)
	1222	chunks.append(unichr(code))
	1223	self.forward(length)
	1224	elif ch in u'\r\n\x85\u2028\u2029':
	1225	self.scan_line_break()
	1226	chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
	1227	else:
	1228	raise ScannerError("while scanning a double-quoted scalar", start_mark,
	1229	"found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
	1230	else:
	1231	return chunks
	1232
	1233	def scan_flow_scalar_spaces(self, double, start_mark):
	1234	# See the specification for details.
	1235	chunks = []
	1236	length = 0
	1237	while self.peek(length) in u' \t':
	1238	length += 1
	1239	whitespaces = self.prefix(length)
	1240	self.forward(length)
	1241	ch = self.peek()
	1242	if ch == u'\0':
	1243	raise ScannerError("while scanning a quoted scalar", start_mark,
	1244	"found unexpected end of stream", self.get_mark())
	1245	elif ch in u'\r\n\x85\u2028\u2029':
	1246	line_break = self.scan_line_break()
	1247	breaks = self.scan_flow_scalar_breaks(double, start_mark)
	1248	if line_break != u'\n':
	1249	chunks.append(line_break)
	1250	elif not breaks:
	1251	chunks.append(u' ')
	1252	chunks.extend(breaks)
	1253	else:
	1254	chunks.append(whitespaces)
	1255	return chunks
	1256
	1257	def scan_flow_scalar_breaks(self, double, start_mark):
	1258	# See the specification for details.
	1259	chunks = []
	1260	while True:
	1261	# Instead of checking indentation, we check for document
	1262	# separators.
	1263	prefix = self.prefix(3)
	1264	if (prefix == u'---' or prefix == u'...') \
	1265	and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
	1266	raise ScannerError("while scanning a quoted scalar", start_mark,
	1267	"found unexpected document separator", self.get_mark())
	1268	while self.peek() in u' \t':
	1269	self.forward()
	1270	if self.peek() in u'\r\n\x85\u2028\u2029':
	1271	chunks.append(self.scan_line_break())
	1272	else:
	1273	return chunks
	1274
	1275	def scan_plain(self):
	1276	# See the specification for details.
	1277	# We add an additional restriction for the flow context:
	1278	# plain scalars in the flow context cannot contain ',', ':' and '?'.
	1279	# We also keep track of the `allow_simple_key` flag here.
	1280	# Indentation rules are loosed for the flow context.
	1281	chunks = []
	1282	start_mark = self.get_mark()
	1283	end_mark = start_mark
	1284	indent = self.indent+1
	1285	# We allow zero indentation for scalars, but then we need to check for
	1286	# document separators at the beginning of the line.
	1287	#if indent == 0:
	1288	# indent = 1
	1289	spaces = []
	1290	while True:
	1291	length = 0
	1292	if self.peek() == u'#':
	1293	break
	1294	while True:
	1295	ch = self.peek(length)
	1296	if ch in u'\0 \t\r\n\x85\u2028\u2029' \
	1297	or (not self.flow_level and ch == u':' and
	1298	self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
	1299	or (self.flow_level and ch in u',:?[]{}'):
	1300	break
	1301	length += 1
	1302	# It's not clear what we should do with ':' in the flow context.
	1303	if (self.flow_level and ch == u':'
	1304	and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
	1305	self.forward(length)
	1306	raise ScannerError("while scanning a plain scalar", start_mark,
	1307	"found unexpected ':'", self.get_mark(),
	1308	"Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
	1309	if length == 0:
	1310	break
	1311	self.allow_simple_key = False
	1312	chunks.extend(spaces)
	1313	chunks.append(self.prefix(length))
	1314	self.forward(length)
	1315	end_mark = self.get_mark()
	1316	spaces = self.scan_plain_spaces(indent, start_mark)
	1317	if not spaces or self.peek() == u'#' \
	1318	or (not self.flow_level and self.column < indent):
	1319	break
	1320	return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
	1321
	1322	def scan_plain_spaces(self, indent, start_mark):
	1323	# See the specification for details.
	1324	# The specification is really confusing about tabs in plain scalars.
	1325	# We just forbid them completely. Do not use tabs in YAML!
	1326	chunks = []
	1327	length = 0
	1328	while self.peek(length) in u' ':
	1329	length += 1
	1330	whitespaces = self.prefix(length)
	1331	self.forward(length)
	1332	ch = self.peek()
	1333	if ch in u'\r\n\x85\u2028\u2029':
	1334	line_break = self.scan_line_break()
	1335	self.allow_simple_key = True
	1336	prefix = self.prefix(3)
	1337	if (prefix == u'---' or prefix == u'...') \
	1338	and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
	1339	return
	1340	breaks = []
	1341	while self.peek() in u' \r\n\x85\u2028\u2029':
	1342	if self.peek() == ' ':
	1343	self.forward()
	1344	else:
	1345	breaks.append(self.scan_line_break())
	1346	prefix = self.prefix(3)
	1347	if (prefix == u'---' or prefix == u'...') \
	1348	and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
	1349	return
	1350	if line_break != u'\n':
	1351	chunks.append(line_break)
	1352	elif not breaks:
	1353	chunks.append(u' ')
	1354	chunks.extend(breaks)
	1355	elif whitespaces:
	1356	chunks.append(whitespaces)
	1357	return chunks
	1358
	1359	def scan_tag_handle(self, name, start_mark):
	1360	# See the specification for details.
	1361	# For some strange reasons, the specification does not allow '_' in
	1362	# tag handles. I have allowed it anyway.
	1363	ch = self.peek()
	1364	if ch != u'!':
	1365	raise ScannerError("while scanning a %s" % name, start_mark,
	1366	"expected '!', but found %r" % ch.encode('utf-8'),
	1367	self.get_mark())
	1368	length = 1
	1369	ch = self.peek(length)
	1370	if ch != u' ':
	1371	while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
	1372	or ch in u'-_':
	1373	length += 1
	1374	ch = self.peek(length)
	1375	if ch != u'!':
	1376	self.forward(length)
	1377	raise ScannerError("while scanning a %s" % name, start_mark,
	1378	"expected '!', but found %r" % ch.encode('utf-8'),
	1379	self.get_mark())
	1380	length += 1
	1381	value = self.prefix(length)
	1382	self.forward(length)
	1383	return value
	1384
	1385	def scan_tag_uri(self, name, start_mark):
	1386	# See the specification for details.
	1387	# Note: we do not check if URI is well-formed.
	1388	chunks = []
	1389	length = 0
	1390	ch = self.peek(length)
	1391	while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
	1392	or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
	1393	if ch == u'%':
	1394	chunks.append(self.prefix(length))
	1395	self.forward(length)
	1396	length = 0
	1397	chunks.append(self.scan_uri_escapes(name, start_mark))
	1398	else:
	1399	length += 1
	1400	ch = self.peek(length)
	1401	if length:
	1402	chunks.append(self.prefix(length))
	1403	self.forward(length)
	1404	length = 0
	1405	if not chunks:
	1406	raise ScannerError("while parsing a %s" % name, start_mark,
	1407	"expected URI, but found %r" % ch.encode('utf-8'),
	1408	self.get_mark())
	1409	return u''.join(chunks)
	1410
	1411	def scan_uri_escapes(self, name, start_mark):
	1412	# See the specification for details.
	1413	bytes = []
	1414	mark = self.get_mark()
	1415	while self.peek() == u'%':
	1416	self.forward()
	1417	for k in range(2):
	1418	if self.peek(k) not in u'0123456789ABCDEFabcdef':
	1419	raise ScannerError("while scanning a %s" % name, start_mark,
	1420	"expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
	1421	(self.peek(k).encode('utf-8')), self.get_mark())
	1422	bytes.append(chr(int(self.prefix(2), 16)))
	1423	self.forward(2)
	1424	try:
	1425	value = unicode(''.join(bytes), 'utf-8')
	1426	except UnicodeDecodeError, exc:
	1427	raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
	1428	return value
	1429
	1430	def scan_line_break(self):
	1431	# Transforms:
	1432	# '\r\n' : '\n'
	1433	# '\r' : '\n'
	1434	# '\n' : '\n'
	1435	# '\x85' : '\n'
	1436	# '\u2028' : '\u2028'
	1437	# '\u2029 : '\u2029'
	1438	# default : ''
	1439	ch = self.peek()
	1440	if ch in u'\r\n\x85':
	1441	if self.prefix(2) == u'\r\n':
	1442	self.forward(2)
	1443	else:
	1444	self.forward()
	1445	return u'\n'
	1446	elif ch in u'\u2028\u2029':
	1447	self.forward()
	1448	return ch
	1449	return u''
	1450
	1451	#try:
	1452	# import psyco
	1453	# psyco.bind(Scanner)
	1454	#except ImportError:
	1455	# pass
	1456

Note: See TracBrowser for help on using the repository browser.

Download in other formats: