# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer.  The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates.  On the other hand it separates
    template code and Python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""

import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# bind operators to token types
operators = {
    '+': 'add', '-': 'sub', '/': 'div', '//': 'floordiv', '*': 'mul',
    '%': 'mod', '**': 'pow', '~': 'tilde', '[': 'lbracket',
    ']': 'rbracket', '(': 'lparen', ')': 'rparen', '{': 'lbrace',
    '}': 'rbrace', '==': 'eq', '!=': 'ne', '>': 'gt', '>=': 'gteq',
    '<': 'lt', '<=': 'lteq', '=': 'assign', '.': 'dot', ':': 'colon',
    '|': 'pipe', ',': 'comma', ';': 'semicolon'
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
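
# NB: sorting by decreasing length above matters because regex alternation
# takes the first branch that matches, e.g. ``**`` must be tried before ``*``
# and ``//`` before ``/`` so that ``a ** b`` lexes as one 'pow' operator
# rather than two 'mul' tokens.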


def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
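
# A minimal usage sketch: '\r\n' counts once, because newline_re matches it
# before the lone '\r' alternative.
#
#   >>> count_newlines(u'foo\nbar\r\nbaz')
#   2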


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
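
# A Failure instance can stand in for a token type inside a lexing rule;
# "calling" it reports the error at the current position, roughly:
#
#   >>> Failure('Missing end of comment tag')(3, 'index.html')
#   Traceback (most recent call last):
#     ...
#   TemplateSyntaxError: Missing end of comment tag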


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of uninterned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
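
    # A short sketch of token expressions: either a bare type, or a
    # ``'type:value'`` pair.
    #
    #   >>> tok = Token(1, 'name', 'foo')
    #   >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
    #   (True, True, False)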


class TokenStreamIterator(object):
    """The iterator for tokenstreams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type == 'eof':
            self.stream.close()
            raise StopIteration()
        self.stream.next()
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, 'initial', '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """True as long as the stream is not exhausted."""
        return bool(self._pushed) or self.current.type != 'eof'

    eos = property(lambda x: not x.__nonzero__(),
                   doc='Are we at the end of the stream?')

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)
185 """Look at the next token."""
186 old_token = self.next()
187 result = self.current
189 self.current = old_token
193 """Got n tokens ahead."""

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
209 """Go one token ahead and return the old one"""
212 self.current = self._pushed.popleft()
213 elif self.current.type is not 'eof':
215 self.current = self._next()
216 except StopIteration:
221 """Close the stream."""
222 self.current = Token(self.current.lineno, 'eof', '')

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type == 'eof':
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
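
    # A minimal usage sketch of the stream interface (assuming a default
    # `Environment`, whose `lexer` property returns the shared lexer):
    #
    #   >>> from jinja2 import Environment
    #   >>> stream = Environment().lexer.tokenize(u'{{ foo }}')
    #   >>> stream.current.type
    #   'variable_begin'
    #   >>> stream.expect('variable_begin').type   # returns it, then advances
    #   'variable_begin'
    #   >>> stream.current.value
    #   'foo'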


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
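
# Because the cache key covers only the syntax-relevant settings, two
# environments configured the same way share one lexer instance, e.g.:
#
#   >>> from jinja2 import Environment
#   >>> get_lexer(Environment()) is get_lexer(Environment())
#   True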
267 """Class that implements a lexer for a given environment. Automatically
268 created by the environment class, usually you don't have to do that.
270 Note that the lexer is not automatically bound to an environment.
271 Multiple environments can share the same lexer.

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, 'whitespace', None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        # assemble the root lexing rule. because "|" takes the first
        # alternative that matches we have to sort by length so that the
        # lexer continues working as expected when we have parsing rules
        # like <% for block and <%= for variables. (if someone wants asp
        # like syntax) variables are just part of the rules if variable
        # processing is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules. This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that. We insert this rule in front of all the others so that
        # it takes precedence.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', r'^\s*' + prefix))
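
        # e.g. with ``line_statement_prefix='#'`` a source line such as
        # ``# for item in seq`` is lexed like ``{% for item in seq %}``
        # (linestatement tokens are renamed to block tokens in `wrap`).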

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter and wraps the resulting generator in a
        :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
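
    # A minimal sketch of what tokenize produces for a tiny template
    # (whitespace tokens are filtered out by `wrap`):
    #
    #   >>> from jinja2 import Environment
    #   >>> lexer = Environment().lexer
    #   >>> [(t.type, t.value) for t in lexer.tokenize(u'a {{ b }}')]
    #   [('data', u'a '), ('variable_begin', u'{{'),
    #    ('name', 'b'), ('variable_end', u'}}')]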

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ('comment_begin', 'comment', 'comment_end',
                         'whitespace'):
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # such as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)
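
    # e.g. the raw tuple ``(1, 'string', u"'foo\\nbar'")`` coming out of
    # `tokeniter` is wrapped as ``Token(1, 'string', 'foo\nbar')``: quotes
    # stripped, escapes resolved, and the value coerced back to a
    # bytestring when it is plain ASCII.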

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators.
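                # e.g. in ``{{ {'a': 1}}}`` the first ``}}`` belongs to the
                # dict literal; the pending ``}`` on the balancing stack
                # defers it to the operator rule, and only the final ``}}``
                # closes the variable tag.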
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # plain string token types are yielded as is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield the token
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key + '_begin')
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise an error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # advance the position and start over with the new rules
                pos = pos2
                break
            # if the for loop terminated without break we haven't found a
            # single match; either we are at the end of the file or we
            # have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
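
    # A minimal sketch of the raw (lineno, token, value) tuples tokeniter
    # yields, before `wrap` filters and converts them:
    #
    #   >>> from jinja2 import Environment
    #   >>> list(Environment().lexer.tokeniter(u'{{ 1 }}', 'tmpl'))
    #   [(1, 'variable_begin', u'{{'), (1, 'whitespace', u' '),
    #    (1, 'integer', u'1'), (1, 'whitespace', u' '),
    #    (1, 'variable_end', u'}}')]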