f1643565b8
* modulo operator, % * cos, sin and tan functions * acos, asin, atan and atan2 functions * cosh, sinh and tanh functions * fact and combin functions * degrees and radians functions * odd and even functions git-svn-id: http://google-refine.googlecode.com/svn/trunk@1908 7d457c2a-affb-35e4-300a-418c747d4874
348 lines
11 KiB
Java
348 lines
11 KiB
Java
/*

Copyright 2010, Google Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
    * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
|
|
|
|
package com.google.refine.grel;
|
|
|
|
/**
 * Hand-written tokenizer that splits an expression string into
 * {@link Token}s one at a time via {@link #next(boolean)}.
 *
 * The scanner keeps a cursor ({@code _index}) into the input text and never
 * reads at or beyond {@code _limit}.
 */
public class Scanner {

    /** Kinds of token that {@link Scanner#next(boolean)} can produce. */
    static public enum TokenType {
        Error,
        Delimiter,
        Operator,
        Identifier,
        Number,
        String,
        Regex
    }

    /**
     * A lexical token: its type, its text, and the half-open range
     * {@code [start, end)} of the input it was scanned from.
     */
    static public class Token {
        final public int start;      // index of the token's first character
        final public int end;        // index just past the token's last character
        final public TokenType type;
        final public String text;    // for String and Regex tokens: the content without delimiters

        Token(int start, int end, TokenType type, String text) {
            this.start = start;
            this.end = end;
            this.type = type;
            this.text = text;
        }
    }

    /** Token of type {@link TokenType#Error} carrying a human-readable explanation. */
    static public class ErrorToken extends Token {
        final public String detail; // error detail

        public ErrorToken(int start, int end, String text, String detail) {
            super(start, end, TokenType.Error, text);
            this.detail = detail;
        }
    }

    /** Token of type {@link TokenType#Number} carrying the parsed numeric value. */
    static public class NumberToken extends Token {
        final public Number value; // a Long or a Double, see Scanner#next(boolean)

        public NumberToken(int start, int end, String text, Number value) {
            super(start, end, TokenType.Number, text);
            this.value = value;
        }
    }

    /** Token of type {@link TokenType#Regex}; {@code text} holds the pattern source. */
    static public class RegexToken extends Token {
        final public boolean caseInsensitive; // true when the literal was followed by an 'i' flag

        public RegexToken(int start, int end, String text, boolean caseInsensitive) {
            super(start, end, TokenType.Regex, text);
            this.caseInsensitive = caseInsensitive;
        }
    }

    protected String _text; // input text to tokenize
    protected int _index;   // index of the next character to process
    protected int _limit;   // process up to this index

    /**
     * Creates a scanner over the whole of {@code s}.
     *
     * @param s the text to tokenize; must not be null
     */
    public Scanner(String s) {
        this(s, 0, s.length());
    }

    /**
     * Creates a scanner over the range {@code [from, to)} of {@code s}.
     * The bounds are not validated here.
     *
     * @param s    the text to tokenize
     * @param from index of the first character to process
     * @param to   index at which scanning stops (exclusive)
     */
    public Scanner(String s, int from, int to) {
        _text = s;
        _index = from;
        _limit = to;
    }

    /**
     * @return the index of the next character that will be processed
     */
    public int getIndex() {
        return _index;
    }

    /**
     * Scans and returns the next token, skipping any leading whitespace.
     *
     * The regexPossible flag is used by the parser to hint the scanner what to do
     * when it encounters a slash. Since the divide operator / and the opening
     * delimiter of a regex literal are the same, but divide operators and regex
     * literals can't occur at the same place in an expression, this flag is a cheap
     * way to distinguish the two without having to look ahead.
     *
     * Number tokens carry a {@code Long} for plain integer literals and a
     * {@code Double} when the literal contains a decimal point, is directly
     * followed by {@code 'e'}, or is too large for a {@code long}.
     *
     * @param regexPossible true if a {@code /} at the current position should be
     *                      read as the start of a regex literal rather than as the
     *                      divide operator
     * @return the next token; an {@link ErrorToken} for an unterminated string or
     *         regex literal or an unrecognized symbol; {@code null} when the end
     *         of the input range has been reached
     */
    public Token next(boolean regexPossible) {
        // skip whitespace
        while (_index < _limit && Character.isWhitespace(_text.charAt(_index))) {
            _index++;
        }
        // ">=" rather than "==" so a cursor that is already past the limit can
        // never cause a read outside the range.
        if (_index >= _limit) {
            return null;
        }

        final char c = _text.charAt(_index);
        final int start = _index;

        if (Character.isDigit(c)) {
            return scanNumber(start);
        }
        if (c == '"' || c == '\'') {
            return scanString(start, c);
        }
        if (Character.isLetter(c) || c == '_') {
            return scanIdentifier(start);
        }
        if (c == '/' && regexPossible) {
            return scanRegex(start);
        }
        if ("+-*/.%".indexOf(c) >= 0) {
            return consume(start, 1, TokenType.Operator);
        }
        if ("()[],".indexOf(c) >= 0) {
            return consume(start, 1, TokenType.Delimiter);
        }
        if (c == '!' && nextCharIs('=')) {
            return consume(start, 2, TokenType.Operator); // !=
        }
        if (c == '<') {
            // <= and <> are two-character operators, otherwise plain <
            int length = (nextCharIs('=') || nextCharIs('>')) ? 2 : 1;
            return consume(start, length, TokenType.Operator);
        }
        if (c == '>' || c == '=') {
            // >= and == are two-character operators, otherwise plain > or =
            return consume(start, nextCharIs('=') ? 2 : 1, TokenType.Operator);
        }

        _index++; // step over the offending character so scanning can continue
        return error(start, "Unrecognized symbol");
    }

    /** Returns true if the character after the current one is inside the range and equals {@code expected}. */
    private boolean nextCharIs(char expected) {
        return _index + 1 < _limit && _text.charAt(_index + 1) == expected;
    }

    /** Advances the cursor by {@code length} and returns a token of {@code type} spanning from {@code start}. */
    private Token consume(int start, int length, TokenType type) {
        _index += length;
        return new Token(start, _index, type, _text.substring(start, _index));
    }

    /** Builds an error token covering everything from {@code start} up to the cursor. */
    private Token error(int start, String detail) {
        return new ErrorToken(start, _index, _text.substring(start, _index), detail);
    }

    /** Scans a number literal whose first digit is at the cursor. */
    private Token scanNumber(int start) {
        long integral = 0;
        double approximate = 0;     // same digits accumulated as a double, used only on long overflow
        boolean overflowed = false;
        char c = '\0';

        while (_index < _limit && Character.isDigit(c = _text.charAt(_index))) {
            // Character.isDigit accepts every Unicode decimal digit, so the value
            // must come from Character.digit rather than from (c - '0').
            int digit = Character.digit(c, 10);
            approximate = approximate * 10 + digit;
            if (!overflowed) {
                if (integral > (Long.MAX_VALUE - digit) / 10) {
                    overflowed = true; // integral * 10 + digit would wrap around
                } else {
                    integral = integral * 10 + digit;
                }
            }
            _index++;
        }

        if (_index < _limit && (c == '.' || c == 'e')) {
            double value = overflowed ? approximate : integral;
            if (c == '.') {
                _index++; // skip decimal point

                double scale = 1;
                while (_index < _limit && Character.isDigit(c = _text.charAt(_index))) {
                    value = value * 10 + Character.digit(c, 10);
                    scale *= 10;
                    _index++;
                }
                value /= scale;
            }

            // TODO: support exponent e notation
            // NOTE: an 'e' right after the integer digits is not consumed; it only
            // makes the value a Double. Kept as-is for compatibility.

            return new NumberToken(start, _index, _text.substring(start, _index), value);
        }

        if (overflowed) {
            // Too large for a long: report the double approximation instead of a
            // silently wrapped value.
            return new NumberToken(start, _index, _text.substring(start, _index), approximate);
        }
        return new NumberToken(start, _index, _text.substring(start, _index), integral);
    }

    /**
     * Scans a string literal whose opening {@code delimiter} is at the cursor.
     * The escapes \t, \n, \r and \\ are translated; a backslash before any other
     * character yields that character itself.
     */
    private Token scanString(int start, char delimiter) {
        StringBuilder content = new StringBuilder();

        _index++; // skip opening delimiter

        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c == delimiter) {
                _index++; // skip closing delimiter
                return new Token(start, _index, TokenType.String, content.toString());
            } else if (c == '\\') {
                _index++; // skip escaping marker
                if (_index >= _limit) {
                    // Lone backslash at the end of the input: stop here so the
                    // cursor is not pushed past the limit.
                    break;
                }
                char escaped = _text.charAt(_index);
                switch (escaped) {
                    case 't':
                        content.append('\t');
                        break;
                    case 'n':
                        content.append('\n');
                        break;
                    case 'r':
                        content.append('\r');
                        break;
                    default: // includes '\\' and the quote characters
                        content.append(escaped);
                        break;
                }
            } else {
                content.append(c);
            }
            _index++;
        }

        return error(start, "String not properly closed");
    }

    /** Scans an identifier (letters, digits and underscores) starting at the cursor. */
    private Token scanIdentifier(int start) {
        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c != '_' && !Character.isLetterOrDigit(c)) {
                break;
            }
            _index++;
        }
        return new Token(start, _index, TokenType.Identifier, _text.substring(start, _index));
    }

    /**
     * Scans a regex literal whose opening slash is at the cursor. Backslash
     * escapes are copied verbatim (backslash included) into the token text, and
     * an 'i' immediately after the closing slash marks the regex case-insensitive.
     */
    private Token scanRegex(int start) {
        StringBuilder pattern = new StringBuilder();

        _index++; // skip opening delimiter

        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c == '/') {
                _index++; // skip closing delimiter

                boolean caseInsensitive = false;
                if (_index < _limit && _text.charAt(_index) == 'i') {
                    caseInsensitive = true;
                    _index++;
                }

                return new RegexToken(start, _index, pattern.toString(), caseInsensitive);
            } else if (c == '\\') {
                pattern.append(c);

                _index++; // skip escaping marker
                if (_index >= _limit) {
                    // Lone backslash at the end of the input: stop here so the
                    // cursor is not pushed past the limit.
                    break;
                }
                pattern.append(_text.charAt(_index));
            } else {
                pattern.append(c);
            }
            _index++;
        }

        return error(start, "Regex not properly closed");
    }
}
|