392 lines
10 KiB
Python
392 lines
10 KiB
Python
import re
|
|
import sys
|
|
|
|
from .ply import lex
|
|
|
|
if sys.version_info < (3,):
|
|
text_type = unicode
|
|
else:
|
|
text_type = str
|
|
|
|
|
|
def _raise_error(t, message=None):
|
|
lexpos = t.lexer.lexpos
|
|
lexdata = t.lexer.lexdata
|
|
lineno = t.lexer.lineno
|
|
column = _find_column(lexdata, t)
|
|
if message is None:
|
|
message = "Illegal character '%s'" % lexdata[lexpos]
|
|
raise ValueError(
|
|
"Line %d, column %d, index %d: %s" % (lineno, column, lexpos, message)
|
|
)
|
|
|
|
|
|
def _find_column(input, token):
|
|
last_cr = input.rfind('\n', 0, token.lexpos)
|
|
column = (token.lexpos - last_cr) - 1
|
|
return column
|
|
|
|
|
|
class Lexer(object):
|
|
|
|
tokens = (
|
|
'BOOL',
|
|
'FLOAT',
|
|
'NUMBER',
|
|
'COMMA',
|
|
'COMMENT',
|
|
'MULTICOMMENT',
|
|
'IDENTIFIER',
|
|
'EQUAL',
|
|
'STRING',
|
|
'ADD',
|
|
'MINUS',
|
|
'MULTIPLY',
|
|
'DIVIDE',
|
|
'LEFTBRACE',
|
|
'RIGHTBRACE',
|
|
'LEFTBRACKET',
|
|
'RIGHTBRACKET',
|
|
'PERIOD',
|
|
'EPLUS',
|
|
'EMINUS',
|
|
'LEFTPAREN',
|
|
'RIGHTPAREN',
|
|
'QMARK',
|
|
'COLON',
|
|
'ASTERISK_PERIOD',
|
|
'GT',
|
|
'LT',
|
|
'EQ',
|
|
'NE',
|
|
'LE',
|
|
'GE',
|
|
)
|
|
|
|
states = (
|
|
('stringdollar', 'exclusive'),
|
|
('string', 'exclusive'),
|
|
('heredoc', 'exclusive'),
|
|
('tabbedheredoc', 'exclusive'),
|
|
)
|
|
|
|
can_export_comments = []
|
|
|
|
def t_BOOL(self, t):
|
|
r'(true)|(false)'
|
|
t.value = t.value == 'true'
|
|
return t
|
|
|
|
def t_EMINUS(self, t):
|
|
r'(?<=\d|\.)[eE]-'
|
|
return t
|
|
|
|
def t_EPLUS(self, t):
|
|
r'(?<=\d)[eE]\+?|(?<=\d\.)[eE]\+?'
|
|
return t
|
|
|
|
def t_FLOAT(self, t):
|
|
r'-?((\d+\.\d*)|(\d*\.\d+))'
|
|
t.value = float(t.value)
|
|
return t
|
|
|
|
def t_hexnumber(self, t):
|
|
r'-?0[xX][0-9a-fA-F]+'
|
|
t.value = int(t.value, base=16)
|
|
t.type = 'NUMBER'
|
|
return t
|
|
|
|
def t_intnumber(self, t):
|
|
r'-?\d+'
|
|
t.value = int(t.value)
|
|
t.type = 'NUMBER'
|
|
return t
|
|
|
|
def t_PERIOD(self, t):
|
|
r'\.'
|
|
return t
|
|
|
|
def t_COMMA(self, t):
|
|
r','
|
|
return t
|
|
|
|
def t_QMARK(self, t):
|
|
r'\?'
|
|
return t
|
|
|
|
def t_COLON(self, t):
|
|
r':'
|
|
return t
|
|
|
|
def t_ASTERISK_PERIOD(self, t):
|
|
r'\*\.'
|
|
return t
|
|
|
|
def t_GT(self, t):
|
|
r'(?<!>)>(?!>|=)'
|
|
return t
|
|
|
|
def t_LT(self, t):
|
|
r'(?<!<)<(?!<|=)'
|
|
return t
|
|
|
|
def t_EQ(self, t):
|
|
r'=='
|
|
return t
|
|
|
|
def t_NE(self, t):
|
|
r'!='
|
|
return t
|
|
|
|
def t_LE(self, t):
|
|
r'<='
|
|
return t
|
|
|
|
def t_GE(self, t):
|
|
r'>='
|
|
return t
|
|
|
|
def t_IDENTIFIER(self, t):
|
|
r'[^\W\d][\w.-]*'
|
|
t.value = text_type(t.value)
|
|
return t
|
|
|
|
# Strings
|
|
def t_string(self, t):
|
|
# Start of a string
|
|
r'\"'
|
|
# abs_start is the absolute start of the string. We use this at the end
|
|
# to know how many new lines we've consumed
|
|
t.lexer.abs_start = t.lexer.lexpos
|
|
# rel_pos is the begining of the unconsumed part of the string. It will
|
|
# get modified when consuming escaped characters
|
|
t.lexer.rel_pos = t.lexer.lexpos
|
|
# The value of the consumed part of the string
|
|
t.lexer.string_value = u''
|
|
t.lexer.begin('string')
|
|
|
|
def t_string_escapedchar(self, t):
|
|
# If a quote or backslash is escaped, build up the string by ignoring
|
|
# the escape character. Should this be done for other characters?
|
|
r'(?<=\\)(\"|\\)'
|
|
t.lexer.string_value += (
|
|
t.lexer.lexdata[t.lexer.rel_pos : t.lexer.lexpos - 2] + t.value
|
|
)
|
|
t.lexer.rel_pos = t.lexer.lexpos
|
|
pass
|
|
|
|
def t_string_stringdollar(self, t):
|
|
# Left brace preceeded by a dollar
|
|
r'(?<=\$)\{'
|
|
t.lexer.braces = 1
|
|
t.lexer.begin('stringdollar')
|
|
|
|
def t_string_ignoring(self, t):
|
|
# Ignore everything except for a quote
|
|
r'[^\"]'
|
|
pass
|
|
|
|
def t_string_STRING(self, t):
|
|
# End of the string
|
|
r'\"'
|
|
t.value = (
|
|
t.lexer.string_value + t.lexer.lexdata[t.lexer.rel_pos : t.lexer.lexpos - 1]
|
|
)
|
|
t.lexer.lineno += t.lexer.lexdata[t.lexer.abs_start : t.lexer.lexpos - 1].count(
|
|
'\n'
|
|
)
|
|
t.lexer.begin('INITIAL')
|
|
return t
|
|
|
|
def t_string_eof(self, t):
|
|
t.lexer.lineno += t.lexer.lexdata[t.lexer.abs_start : t.lexer.lexpos].count(
|
|
'\n'
|
|
)
|
|
_raise_error(t, 'EOF before closing string quote')
|
|
|
|
def t_stringdollar_dontcare(self, t):
|
|
# Ignore everything except for braces
|
|
r'[^\{\}]'
|
|
pass
|
|
|
|
def t_stringdollar_lbrace(self, t):
|
|
r'\{'
|
|
t.lexer.braces += 1
|
|
|
|
def t_stringdollar_rbrace(self, t):
|
|
r'\}'
|
|
t.lexer.braces -= 1
|
|
|
|
if t.lexer.braces == 0:
|
|
# End of the dollar brace, back to the rest of the string
|
|
t.lexer.begin('string')
|
|
|
|
def t_stringdollar_eof(self, t):
|
|
t.lexer.lineno += t.lexer.lexdata[t.lexer.abs_start : t.lexer.lexpos].count(
|
|
'\n'
|
|
)
|
|
_raise_error(t, "EOF before closing '${}' expression")
|
|
|
|
def _init_heredoc(self, t):
|
|
t.lexer.here_start = t.lexer.lexpos
|
|
|
|
if t.value.endswith('\r\n'):
|
|
t.lexer.newline_chars = 2
|
|
else:
|
|
t.lexer.newline_chars = 1
|
|
|
|
if t.lexer.is_tabbed:
|
|
# Chop '<<-'
|
|
chop = 3
|
|
else:
|
|
# Chop '<<'
|
|
chop = 2
|
|
|
|
t.lexer.here_identifier = t.value[chop : -t.lexer.newline_chars]
|
|
# We consumed a newline in the regex so bump the counter
|
|
t.lexer.lineno += 1
|
|
|
|
def t_tabbedheredoc(self, t):
|
|
r'<<-\S+\r?\n'
|
|
t.lexer.is_tabbed = True
|
|
self._init_heredoc(t)
|
|
t.lexer.begin('tabbedheredoc')
|
|
|
|
def t_heredoc(self, t):
|
|
r'<<\S+\r?\n'
|
|
t.lexer.is_tabbed = False
|
|
self._init_heredoc(t)
|
|
t.lexer.begin('heredoc')
|
|
|
|
def _end_heredoc(self, t):
|
|
if t.lexer.is_tabbed:
|
|
# Strip leading tabs
|
|
value = t.value.strip()
|
|
else:
|
|
value = t.value
|
|
|
|
if value == t.lexer.here_identifier:
|
|
# Handle case where identifier is on a line of its own. Need to
|
|
# subtract the current line and the newline characters from
|
|
# the previous line to get the endpos
|
|
endpos = t.lexer.lexpos - (t.lexer.newline_chars + len(t.value))
|
|
elif value.endswith(t.lexer.here_identifier):
|
|
# Handle case where identifier is at the end of the line. Need to
|
|
# subtract the identifier from to get the endpos
|
|
endpos = t.lexer.lexpos - len(t.lexer.here_identifier)
|
|
else:
|
|
return
|
|
|
|
entire_string = t.lexer.lexdata[t.lexer.here_start : endpos]
|
|
|
|
if t.lexer.is_tabbed:
|
|
# Get rid of any initial tabs, and remove any tabs preceded by
|
|
# a new line
|
|
chopped_starting_tabs = re.sub('^\t*', '', entire_string)
|
|
t.value = re.sub('\n\t*', '\n', chopped_starting_tabs)
|
|
else:
|
|
t.value = entire_string
|
|
|
|
t.lexer.lineno += t.lexer.lexdata[t.lexer.here_start : t.lexer.lexpos].count(
|
|
'\n'
|
|
)
|
|
t.lexer.begin('INITIAL')
|
|
return t
|
|
|
|
def t_tabbedheredoc_STRING(self, t):
|
|
r'^\t*.+?(?=\r?$)'
|
|
return self._end_heredoc(t)
|
|
|
|
def t_heredoc_STRING(self, t):
|
|
r'^.+?(?=\r?$)'
|
|
return self._end_heredoc(t)
|
|
|
|
def t_heredoc_ignoring(self, t):
|
|
r'.+|\n'
|
|
pass
|
|
|
|
def t_heredoc_eof(self, t):
|
|
t.lexer.lineno += t.lexer.lexdata[t.lexer.here_start : t.lexer.lexpos].count(
|
|
'\n'
|
|
)
|
|
_raise_error(t, 'EOF before closing heredoc')
|
|
|
|
t_tabbedheredoc_ignoring = t_heredoc_ignoring
|
|
t_tabbedheredoc_eof = t_heredoc_eof
|
|
|
|
t_LEFTBRACE = r'\{'
|
|
t_RIGHTBRACE = r'\}'
|
|
t_LEFTBRACKET = r'\['
|
|
t_RIGHTBRACKET = r'\]'
|
|
t_LEFTPAREN = r'\('
|
|
t_RIGHTPAREN = r'\)'
|
|
|
|
def t_COMMENT(self, t):
|
|
r'(\#|(//)).*'
|
|
if 'COMMENT' in self.can_export_comments:
|
|
t.value = t.value.lstrip('#').lstrip('//').lstrip()
|
|
return t
|
|
|
|
def t_MULTICOMMENT(self, t):
|
|
r'/\*(.|\n)*?(\*/)'
|
|
t.lexer.lineno += t.value.count('\n')
|
|
if 'MULTICOMMENT' in self.can_export_comments:
|
|
return t
|
|
|
|
# Define a rule so we can track line numbers
|
|
def t_newline(self, t):
|
|
r'\n+'
|
|
t.lexer.lineno += len(t.value)
|
|
|
|
t_ignore = ' \t\r\f\v'
|
|
|
|
t_EQUAL = r'(?<!=)=(?!=)'
|
|
t_ADD = r'\+'
|
|
t_MINUS = r'-'
|
|
t_MULTIPLY = r'\*'
|
|
t_DIVIDE = r'/'
|
|
|
|
# Error handling rule
|
|
def t_error(self, t):
|
|
if t.value.startswith('/*'):
|
|
_raise_error(t, 'EOF before closing multiline comment')
|
|
elif t.value.startswith('*/'):
|
|
_raise_error(t, "Found '*/' before start of multiline comment")
|
|
elif t.value.startswith('/'):
|
|
c = t.value[1]
|
|
_raise_error(t, "Expected '//' for comment, got '/%s'" % c)
|
|
elif t.value.startswith('<<'):
|
|
_raise_error(t, "Heredoc must have a marker, e.g. '<<FOO'")
|
|
elif t.value.startswith('<'):
|
|
c = t.value[1]
|
|
_raise_error(t, "Heredoc must start with '<<', got '<%s'" % c)
|
|
else:
|
|
_raise_error(t)
|
|
|
|
def __init__(self, export_comments=None):
|
|
if export_comments is not None:
|
|
if export_comments == 'LINE':
|
|
self.can_export_comments = ['COMMENT']
|
|
elif export_comments == 'MULTILINE':
|
|
self.can_export_comments = ['MULTICOMMENT']
|
|
elif export_comments == 'ALL':
|
|
self.can_export_comments = ['COMMENT', 'MULTICOMMENT']
|
|
else:
|
|
raise ValueError(
|
|
'Only `LINE`, `MULTILINE` and `ALL` value are allowed for '
|
|
'`export_comments`. given: `%s`.' % export_comments
|
|
)
|
|
|
|
self.lex = lex.lex(
|
|
module=self,
|
|
debug=False,
|
|
reflags=(re.UNICODE | re.MULTILINE),
|
|
errorlog=lex.NullLogger(),
|
|
)
|
|
|
|
def input(self, s):
|
|
return self.lex.input(s)
|
|
|
|
def token(self):
|
|
return self.lex.token()
|