# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache, next


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))
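
# Illustrative note (added; not in the original source): when the compile()
# probe above succeeds, name_re also accepts non-ASCII identifiers, e.g.:
#
#     >>> name_re.match(u'föö') is not None
#     True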

float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+':            TOKEN_ADD,
    '-':            TOKEN_SUB,
    '/':            TOKEN_DIV,
    '//':           TOKEN_FLOORDIV,
    '*':            TOKEN_MUL,
    '%':            TOKEN_MOD,
    '**':           TOKEN_POW,
    '~':            TOKEN_TILDE,
    '[':            TOKEN_LBRACKET,
    ']':            TOKEN_RBRACKET,
    '(':            TOKEN_LPAREN,
    ')':            TOKEN_RPAREN,
    '{':            TOKEN_LBRACE,
    '}':            TOKEN_RBRACE,
    '==':           TOKEN_EQ,
    '!=':           TOKEN_NE,
    '>':            TOKEN_GT,
    '>=':           TOKEN_GTEQ,
    '<':            TOKEN_LT,
    '<=':           TOKEN_LTEQ,
    '=':            TOKEN_ASSIGN,
    '.':            TOKEN_DOT,
    ':':            TOKEN_COLON,
    '|':            TOKEN_PIPE,
    ',':            TOKEN_COMMA,
    ';':            TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
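
# Note (added; not in the original source): regex alternation tries branches
# left to right, so the longest-first sort above is what makes '**' lex as
# TOKEN_POW instead of two TOKEN_MUL tokens, e.g.:
#
#     >>> operator_re.match('**').group()
#     '**'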

ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])


def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return {
        TOKEN_COMMENT_BEGIN:        'begin of comment',
        TOKEN_COMMENT_END:          'end of comment',
        TOKEN_COMMENT:              'comment',
        TOKEN_LINECOMMENT:          'comment',
        TOKEN_BLOCK_BEGIN:          'begin of statement block',
        TOKEN_BLOCK_END:            'end of statement block',
        TOKEN_VARIABLE_BEGIN:       'begin of print statement',
        TOKEN_VARIABLE_END:         'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN:  'begin of line statement',
        TOKEN_LINESTATEMENT_END:    'end of line statement',
        TOKEN_DATA:                 'template data / text',
        TOKEN_EOF:                  'end of template'
    }.get(token_type, token_type)


def describe_token(token):
    """Returns a description of the token."""
    if token.type == 'name':
        return token.value
    return _describe_token_type(token.type)


def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ':' in expr:
        type, value = expr.split(':', 1)
        if type == 'name':
            return value
    else:
        type = expr
    return _describe_token_type(type)
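
# Illustrative examples (added; not in the original source):
#
#     >>> describe_token_expr('name:for')
#     'for'
#     >>> describe_token_expr('comment_end')
#     'end of comment'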


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
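
# For example (added; not in the original source), '\r\n' counts once:
#
#     >>> count_newlines(u'foo\nbar\r\nbaz')
#     2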


def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]
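
# Illustrative sketch (added; `Environment` is not imported in this module):
# for a default environment this returns (tag, escaped start string) pairs,
# longest start strings first:
#
#     >>> compile_rules(Environment())
#     [('variable', '\\{\\{'), ('comment', '\\{\\#'), ('block', '\\{\\%')]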


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
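
# Illustrative usage (added; not in the original source):
#
#     >>> tok = Token(1, 'name', 'foo')
#     >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
#     (True, True, False)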


class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
337 """Go one token ahead and return the old one"""
340 self.current = self._pushed.popleft()
341 elif self.current.type is not TOKEN_EOF:
343 self.current = self._next()
344 except StopIteration:
349 """Close the stream."""
350 self.current = Token(self.current.lineno, TOKEN_EOF, '')

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)
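
    # Illustrative usage sketch (added; not in the original source), assuming
    # `stream` came from `Lexer.tokenize`:
    #
    #     stream.expect('variable_begin')   # consume the opening tag or fail
    #     if stream.skip_if('name:foo'):    # consume 'foo' only if present
    #         ...
    #     nxt = stream.look()               # peek without net consumption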


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
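
# Note (added; not in the original source): environments whose lexer-relevant
# options match share a single Lexer, so with two hypothetical
# identically-configured environments `env_a` and `env_b`:
#
#     assert get_lexer(env_a) is get_lexer(env_b)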
395 """Class that implements a lexer for a given environment. Automatically
396 created by the environment class, usually you don't have to do that.
398 Note that the lexer is not automatically bound to an environment.
399 Multiple environments can share the same lexer.
402 def __init__(self, environment):
404 c = lambda x: re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence
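
        # Illustrative (added; not in the original source): with trim_blocks
        # enabled, block_suffix_re is the pattern '\n?', which is appended to
        # the end-tag rules below so '{% if x %}\n' also swallows the newline.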

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime. On python 3 this
                # call becomes a noop thanks to 2to3
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)
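
    # Illustrative (added; not in the original source): wrap() turns the raw
    # triples from tokeniter into parser-ready tokens, e.g.
    # (1, 'operator', '+') becomes Token(1, 'add', '+') and
    # (1, 'integer', '42') becomes Token(1, 'integer', 42).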

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token are just yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
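
    # Illustrative sketch (added; not in the original source): with the
    # default delimiters, tokeniter(u'{{ foo }}', 'tmpl') yields roughly:
    #
    #     (1, 'variable_begin', u'{{')
    #     (1, 'whitespace', u' ')
    #     (1, 'name', u'foo')
    #     (1, 'whitespace', u' ')
    #     (1, 'variable_end', u'}}')
    #
    # wrap() then drops the whitespace tokens and builds Token objects.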