83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
#
|
||
|
# Copyright (C) 2009-2018 the sqlparse authors and contributors
|
||
|
# <see AUTHORS file>
|
||
|
#
|
||
|
# This module is part of python-sqlparse and is released under
|
||
|
# the BSD License: https://opensource.org/licenses/BSD-3-Clause
|
||
|
|
||
|
"""SQL Lexer"""
|
||
|
|
||
|
# This code is based on the SqlLexer in pygments.
|
||
|
# http://pygments.org/
|
||
|
# It's separated from the rest of pygments to increase performance
|
||
|
# and to allow some customizations.
|
||
|
|
||
|
from sqlparse import tokens
|
||
|
from sqlparse.keywords import SQL_REGEX
|
||
|
from sqlparse.compat import text_type, file_types
|
||
|
from sqlparse.utils import consume
|
||
|
|
||
|
|
||
|
class Lexer(object):
    """Namespace for the SQL tokenizer.

    The class holds no state of its own; it only exists so that
    ``Lexer().get_tokens(...)`` keeps working for backwards-compatibility.
    """

    @staticmethod
    def get_tokens(text, encoding=None):
        """Lazily split ``text`` into ``(tokentype, value)`` pairs.

        ``text`` may be a text string, a byte string, or a file-like object
        (one of ``file_types``), in which case its whole content is read
        first. Byte strings are decoded with ``encoding`` when one is given;
        otherwise UTF-8 is tried and ``unicode-escape`` is the fallback.

        At every position the entries of ``SQL_REGEX`` are tried in order
        and the first one that matches decides the token: a token-type
        action is yielded together with the matched text, a callable action
        is called with the matched text and its result is yielded. A
        character that no pattern matches is yielded as ``tokens.Error``.

        This is a generator, so the ``TypeError`` for unsupported input is
        raised when iteration starts, not when the function is called.
        """
        # File-like input: pull the whole content in before inspecting it.
        if isinstance(text, file_types):
            text = text.read()

        # Normalise everything to a text string; text passes through as-is.
        if not isinstance(text, text_type):
            if not isinstance(text, bytes):
                raise TypeError(u"Expected text or file-like object, got {!r}".
                                format(type(text)))
            if encoding:
                text = text.decode(encoding)
            else:
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError:
                    text = text.decode('unicode-escape')

        # The enumerate iterator is shared with ``consume`` below so that
        # characters covered by a match are not visited again.
        positions = enumerate(text)
        for start, ch in positions:
            # Find the first pattern that matches at the current position.
            match = None
            action = None
            for matcher, candidate in SQL_REGEX:
                match = matcher(text, start)
                if match:
                    action = candidate
                    break

            if not match:
                # Nothing matched here: report the single character.
                yield tokens.Error, ch
                continue

            value = match.group()
            if isinstance(action, tokens._TokenType):
                yield action, value
            elif callable(action):
                yield action(value)

            # The loop already took one character of the match; skip the
            # rest (presumably ``consume`` advances the iterator by the
            # given count -- helper is defined in sqlparse.utils).
            consume(positions, match.end() - start - 1)
|
||
|
|
||
|
|
||
|
def tokenize(sql, encoding=None):
    """Tokenize *sql* with the :class:`Lexer`.

    Both arguments are handed unchanged to :meth:`Lexer.get_tokens`; the
    result is its stream of ``(token type, value)`` 2-tuples.
    """
    lexer = Lexer()
    token_stream = lexer.get_tokens(sql, encoding)
    return token_stream
|