RandomSec/main/java/com/metaweb/gridworks/gel/Scanner.java
Stefano Mazzocchi 365868564f create 'main' and 'server'
git-svn-id: http://google-refine.googlecode.com/svn/branches/split-refactor@905 7d457c2a-affb-35e4-300a-418c747d4874
2010-05-30 16:52:50 +00:00

305 lines
9.4 KiB
Java

package com.metaweb.gridworks.gel;
/**
 * A hand-written tokenizer for expression text. Call {@link #next(boolean)}
 * repeatedly to pull tokens out of the range of the input string given at
 * construction time; it returns {@code null} once that range is exhausted.
 */
public class Scanner {
    /** The kinds of token this scanner can produce. */
    public static enum TokenType {
        Error,
        Delimiter,
        Operator,
        Identifier,
        Number,
        String,
        Regex
    }

    /**
     * A token found in the input. {@code start} is the index of the token's
     * first character and {@code end} is the index just past its last
     * character. For String tokens {@code text} is the unescaped content
     * without the quotes; for Regex tokens it is the pattern between the
     * slashes with escape sequences kept as written; for every other token it
     * is the raw slice of the input.
     */
    public static class Token {
        public final int start;
        public final int end;
        public final TokenType type;
        public final String text;

        Token(int start, int end, TokenType type, String text) {
            this.start = start;
            this.end = end;
            this.type = type;
            this.text = text;
        }
    }

    /** A token reporting input that could not be scanned. */
    public static class ErrorToken extends Token {
        public final String detail; // error detail

        public ErrorToken(int start, int end, String text, String detail) {
            super(start, end, TokenType.Error, text);
            this.detail = detail;
        }
    }

    /** A number literal together with its numeric value. */
    public static class NumberToken extends Token {
        public final double value;

        public NumberToken(int start, int end, String text, double value) {
            super(start, end, TokenType.Number, text);
            this.value = value;
        }
    }

    /** A regex literal; {@code caseInsensitive} is true when it ended in {@code /i}. */
    public static class RegexToken extends Token {
        public final boolean caseInsensitive;

        public RegexToken(int start, int end, String text, boolean caseInsensitive) {
            super(start, end, TokenType.Regex, text);
            this.caseInsensitive = caseInsensitive;
        }
    }

    protected String _text;  // input text to tokenize
    protected int    _index; // index of the next character to process
    protected int    _limit; // process up to this index (exclusive)

    /**
     * Creates a scanner over the whole of {@code s}.
     *
     * @param s the text to tokenize
     */
    public Scanner(String s) {
        this(s, 0, s.length());
    }

    /**
     * Creates a scanner over the characters of {@code s} from index
     * {@code from} (inclusive) up to index {@code to} (exclusive).
     *
     * @param s    the text to tokenize
     * @param from index of the first character to scan
     * @param to   index just past the last character to scan
     */
    public Scanner(String s, int from, int to) {
        _text = s;
        _index = from;
        _limit = to;
    }

    /**
     * @return the index of the next character this scanner will process
     */
    public int getIndex() {
        return _index;
    }

    /**
     * Scans and returns the next token, skipping any leading whitespace.
     * <p>
     * The regexPossible flag is used by the parser to hint the scanner what to do
     * when it encounters a slash. Since the divide operator / and the opening
     * delimiter of a regex literal are the same, but divide operators and regex
     * literals can't occur at the same place in an expression, this flag is a cheap
     * way to distinguish the two without having to look ahead.
     *
     * @param regexPossible true if a slash at this position starts a regex
     *                      literal, false if it is the divide operator
     * @return the next token; an {@link ErrorToken} for an unterminated string
     *         or regex literal or an unrecognized symbol; or {@code null} when
     *         there is no more input to scan
     */
    public Token next(boolean regexPossible) {
        // skip whitespace
        while (_index < _limit && Character.isWhitespace(_text.charAt(_index))) {
            _index++;
        }

        // ">=" rather than "==" so that a scanner whose index is already past
        // its limit reports end of input instead of reading beyond the limit.
        if (_index >= _limit) {
            return null;
        }

        final char c = _text.charAt(_index);
        final int start = _index;

        if (Character.isDigit(c)) {
            return scanNumber(start);
        }
        if (c == '"' || c == '\'') {
            return scanString(start, c);
        }
        if (Character.isLetter(c) || c == '_') {
            return scanIdentifier(start);
        }
        if (c == '/' && regexPossible) {
            return scanRegex(start);
        }

        int operatorLength = operatorLength(c);
        if (operatorLength > 0) {
            _index += operatorLength;
            return slice(start, TokenType.Operator);
        }
        if ("()[],".indexOf(c) >= 0) {
            _index++;
            return slice(start, TokenType.Delimiter);
        }

        _index++; // consume the offending character so scanning can continue
        return error(start, "Unrecognized symbol");
    }

    /** Scans a number literal: digits, optionally followed by '.' and more digits. */
    private Token scanNumber(int start) {
        double value = 0;
        while (_index < _limit && Character.isDigit(_text.charAt(_index))) {
            // Character.isDigit accepts every Unicode decimal digit, so the value
            // must come from Character.digit; "c - '0'" is only right for ASCII.
            value = value * 10 + Character.digit(_text.charAt(_index), 10);
            _index++;
        }

        // A '.' directly after the integer part is consumed as part of the
        // number even when no fraction digits follow it.
        if (_index < _limit && _text.charAt(_index) == '.') {
            _index++;

            double division = 1;
            while (_index < _limit && Character.isDigit(_text.charAt(_index))) {
                value = value * 10 + Character.digit(_text.charAt(_index), 10);
                division *= 10;
                _index++;
            }
            value /= division;
        }

        // TODO: support exponent e notation

        return new NumberToken(start, _index, _text.substring(start, _index), value);
    }

    /** Scans a string literal opened by the given quote character, resolving escapes. */
    private Token scanString(int start, char delimiter) {
        StringBuilder content = new StringBuilder();

        _index++; // skip opening delimiter
        while (_index < _limit) {
            char c = _text.charAt(_index++);
            if (c == delimiter) {
                // _index is now just past the closing delimiter
                return new Token(start, _index, TokenType.String, content.toString());
            }
            if (c == '\\') {
                if (_index >= _limit) {
                    // Dangling escape marker at the end of the input: stop here
                    // so that _index never moves past _limit.
                    break;
                }
                c = unescape(_text.charAt(_index++));
            }
            content.append(c);
        }
        return error(start, "String not properly closed");
    }

    /** Maps the character following a backslash in a string literal to the character it denotes. */
    private static char unescape(char escaped) {
        switch (escaped) {
            case 't':
                return '\t';
            case 'n':
                return '\n';
            case 'r':
                return '\r';
            default:
                // covers '\\' as well as escaped quotes: the character stands for itself
                return escaped;
        }
    }

    /** Scans an identifier: a run of letters, digits and underscores. */
    private Token scanIdentifier(int start) {
        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c != '_' && !Character.isLetterOrDigit(c)) {
                break;
            }
            _index++;
        }
        return slice(start, TokenType.Identifier);
    }

    /** Scans a regex literal of the form /pattern/ or /pattern/i, keeping escapes verbatim. */
    private Token scanRegex(int start) {
        StringBuilder pattern = new StringBuilder();

        _index++; // skip opening delimiter
        while (_index < _limit) {
            char c = _text.charAt(_index++);
            if (c == '/') {
                // _index is now just past the closing delimiter
                boolean caseInsensitive = _index < _limit && _text.charAt(_index) == 'i';
                if (caseInsensitive) {
                    _index++;
                }
                return new RegexToken(start, _index, pattern.toString(), caseInsensitive);
            }

            pattern.append(c);
            // Keep the backslash and copy the escaped character along with it,
            // so an escaped slash does not close the literal. The bounds check
            // keeps _index from moving past _limit on a trailing backslash.
            if (c == '\\' && _index < _limit) {
                pattern.append(_text.charAt(_index++));
            }
        }
        return error(start, "Regex not properly closed");
    }

    /**
     * Returns the length (1 or 2) of the operator starting at the current
     * index, whose first character is {@code c}, or 0 if no operator starts here.
     */
    private int operatorLength(char c) {
        boolean hasNext = _index + 1 < _limit;
        char following = hasNext ? _text.charAt(_index + 1) : 0;

        switch (c) {
            case '+':
            case '-':
            case '*':
            case '/':
            case '.':
                return 1;
            case '!':
                // a lone '!' is not an operator; only "!=" is
                return (hasNext && following == '=') ? 2 : 0;
            case '<':
                // "<=", "<>" or plain "<"
                return (hasNext && (following == '=' || following == '>')) ? 2 : 1;
            case '>':
            case '=':
                // ">=", "==" or the single character
                return (hasNext && following == '=') ? 2 : 1;
            default:
                return 0;
        }
    }

    /** Builds a token of the given type whose text is the input from start to the current index. */
    private Token slice(int start, TokenType type) {
        return new Token(start, _index, type, _text.substring(start, _index));
    }

    /** Builds an error token covering the input from start to the current index. */
    private ErrorToken error(int start, String detail) {
        return new ErrorToken(start, _index, _text.substring(start, _index), detail);
    }
}