# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re

from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache, next
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)
# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')
# bind operators to token types
operators = {
    '+':            TOKEN_ADD,
    '-':            TOKEN_SUB,
    '/':            TOKEN_DIV,
    '//':           TOKEN_FLOORDIV,
    '*':            TOKEN_MUL,
    '%':            TOKEN_MOD,
    '**':           TOKEN_POW,
    '~':            TOKEN_TILDE,
    '[':            TOKEN_LBRACKET,
    ']':            TOKEN_RBRACKET,
    '(':            TOKEN_LPAREN,
    ')':            TOKEN_RPAREN,
    '{':            TOKEN_LBRACE,
    '}':            TOKEN_RBRACE,
    '==':           TOKEN_EQ,
    '!=':           TOKEN_NE,
    '>':            TOKEN_GT,
    '>=':           TOKEN_GTEQ,
    '<':            TOKEN_LT,
    '<=':           TOKEN_LTEQ,
    '=':            TOKEN_ASSIGN,
    '.':            TOKEN_DOT,
    ':':            TOKEN_COLON,
    '|':            TOKEN_PIPE,
    ',':            TOKEN_COMMA,
    ';':            TOKEN_SEMICOLON
}
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
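
# A quick illustration of why the longest-first sort matters: regex
# alternation is tried left to right, so '**' has to come before '*' or
# the power operator would never match as one token.  Sketch only, not
# part of the module's behaviour:
#
#     >>> operator_re.match(u'**').group()
#     u'**'
#     >>> operators[operator_re.match(u'**').group()]
#     'pow'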
ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])
def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return {
        TOKEN_COMMENT_BEGIN: 'begin of comment',
        TOKEN_COMMENT_END: 'end of comment',
        TOKEN_COMMENT: 'comment',
        TOKEN_LINECOMMENT: 'comment',
        TOKEN_BLOCK_BEGIN: 'begin of statement block',
        TOKEN_BLOCK_END: 'end of statement block',
        TOKEN_VARIABLE_BEGIN: 'begin of print statement',
        TOKEN_VARIABLE_END: 'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement',
        TOKEN_LINESTATEMENT_END: 'end of line statement',
        TOKEN_DATA: 'template data / text',
        TOKEN_EOF: 'end of template'
    }.get(token_type, token_type)
def describe_token(token):
    """Returns a description of the token."""
    if token.type == 'name':
        return token.value
    return _describe_token_type(token.type)
def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ':' in expr:
        type, value = expr.split(':', 1)
        if type == 'name':
            return value
    else:
        type = expr
    return _describe_token_type(type)
def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
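
# Doctest-style sketch of ``count_newlines`` (illustrative only):
#
#     >>> count_newlines(u'foo\nbar\r\nbaz')
#     2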
def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]
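
# For the default delimiters the returned list looks roughly like this
# (a sketch; the exact escaping comes from ``re.escape``):
#
#     >>> from jinja2 import Environment
#     >>> compile_rules(Environment())
#     [('variable', '\\{\\{'), ('comment', '\\{\\#'), ('block', '\\{\\%')]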
class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
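
# ``Failure`` instances are placed in the rule tables below as pseudo
# tokens; when the tokenizer reaches one it calls it, which raises.  A
# minimal sketch:
#
#     >>> Failure('Missing end of comment tag')(42, 'example.html')
#     Traceback (most recent call last):
#       ...
#     TemplateSyntaxError: Missing end of comment tag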
class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (self.lineno, self.type, self.value)
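
# A :class:`Token` behaves like its ``(lineno, type, value)`` tuple plus
# the test helpers above.  Illustrative session:
#
#     >>> tok = Token(1, 'name', 'foo')
#     >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
#     (True, True, False)
#     >>> tok.test_any('integer', 'name:foo')
#     True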
class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")
    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            next(self)
    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)
    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
337 """Go one token ahead and return the old one"""
340 self.current = self._pushed.popleft()
341 elif self.current.type is not TOKEN_EOF:
343 self.current = self._next()
344 except StopIteration:
349 """Close the stream."""
350 self.current = Token(self.current.lineno, TOKEN_EOF, '')
    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)
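
# How a parser typically drives the stream API above (a sketch; assumes
# ``stream`` came from ``Lexer.tokenize`` over ``u'{{ foo }}'``):
#
#     >>> stream.expect('variable_begin').type
#     'variable_begin'
#     >>> stream.look().test('variable_end')
#     True
#     >>> stream.expect('name').value
#     'foo'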
def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
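
# Environments that share all lexer-relevant settings share one ``Lexer``
# instance via the cache above (sketch; assumes no eviction in between):
#
#     >>> from jinja2 import Environment
#     >>> get_lexer(Environment()) is get_lexer(Environment())
#     True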
395 """Class that implements a lexer for a given environment. Automatically
396 created by the environment class, usually you don't have to do that.
398 Note that the lexer is not automatically bound to an environment.
399 Multiple environments can share the same lexer.
    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }
    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)
    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
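
    # A minimal end-to-end sketch (assumes a default ``Environment``):
    #
    #     >>> from jinja2 import Environment
    #     >>> lexer = get_lexer(Environment())
    #     >>> stream = lexer.tokenize(u'Hello {{ name }}!')
    #     >>> [t.type for t in stream]
    #     ['data', 'variable_begin', 'name', 'variable_end', 'data']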
    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime. On python 3 this
                # call becomes a noop thanks to 2to3
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)
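
    # The string branch above strips the quotes and resolves backslash
    # escapes, so a template literal like ``"a\nb"`` reaches the parser
    # as a real two line string (sketch; ``lexer`` as in the example
    # above):
    #
    #     >>> toks = list(lexer.tokenize(u'{{ "a\\nb" }}'))
    #     >>> toks[1].type, toks[1].value
    #     ('string', 'a\nb')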
    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token are just yielded as is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match, so either we are at the end of the file or we have a
            # problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
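
    # Raw ``tokeniter`` output before :meth:`wrap` gets at it (a sketch,
    # assuming default delimiters; note the whitespace tokens that
    # ``wrap`` later filters out):
    #
    #     >>> list(lexer.tokeniter(u'{{ 1 }}', 'tmpl'))
    #     [(1, 'variable_begin', u'{{'), (1, 'whitespace', u' '),
    #      (1, 'integer', u'1'), (1, 'whitespace', u' '),
    #      (1, 'variable_end', u'}}')]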