RandomSec/src/main/java/com/metaweb/gridworks/gel/Scanner.java
David Huynh 311d15f493 Re-organized column header popup menus and added a bunch of common facets and common cell edit transforms.
Added native syntax for regex in GEL and modified replace, split, partition, and rpartition functions to support regex. Removed function replaceRegex.


git-svn-id: http://google-refine.googlecode.com/svn/trunk@249 7d457c2a-affb-35e4-300a-418c747d4874
2010-03-09 06:57:08 +00:00

279 lines
8.6 KiB
Java

package com.metaweb.gridworks.gel;
/**
 * Hand-written tokenizer for GEL expressions.
 *
 * <p>A scanner walks the half-open character range {@code [from, to)} of a
 * source string and hands out one {@link Token} per call to
 * {@link #next(boolean)}. Lexical problems are never thrown; they are reported
 * as {@link ErrorToken}s so the caller can decide how to surface them.
 */
public class Scanner {
    /** Lexical categories a {@link Token} can belong to. */
    public enum TokenType {
        Error,
        Delimiter,
        Operator,
        Identifier,
        Number,
        String,
        Regex
    }

    /** A lexical unit together with the source range it was read from. */
    public static class Token {
        /** Index of the token's first character in the scanned text. */
        public final int start;
        /** Index just past the token's last character. */
        public final int end;
        public final TokenType type;
        /**
         * Token content. For string and regex literals this is the literal's
         * body without its delimiters; for every other token it is the raw
         * source slice {@code [start, end)}.
         */
        public final String text;

        Token(int start, int end, TokenType type, String text) {
            this.start = start;
            this.end = end;
            this.type = type;
            this.text = text;
        }
    }

    /** Token reporting input that could not be scanned. */
    public static class ErrorToken extends Token {
        /** Human-readable description of what went wrong. */
        public final String detail;

        public ErrorToken(int start, int end, String text, String detail) {
            super(start, end, TokenType.Error, text);
            this.detail = detail;
        }
    }

    /** Number literal token carrying its parsed value. */
    public static class NumberToken extends Token {
        public final double value;

        public NumberToken(int start, int end, String text, double value) {
            super(start, end, TokenType.Number, text);
            this.value = value;
        }
    }

    /** Regex literal token; {@code text} holds the pattern between the slashes. */
    public static class RegexToken extends Token {
        /** True when the literal was followed by the {@code i} flag. */
        public final boolean caseInsensitive;

        public RegexToken(int start, int end, String text, boolean caseInsensitive) {
            super(start, end, TokenType.Regex, text);
            this.caseInsensitive = caseInsensitive;
        }
    }

    /** The text being scanned. */
    protected String _text;
    /** Position of the next character to read. */
    protected int _index;
    /** Exclusive upper bound; characters at or beyond it are never read. */
    protected int _limit;

    /**
     * Creates a scanner over the whole of {@code s}.
     *
     * @param s text to scan
     */
    public Scanner(String s) {
        this(s, 0, s.length());
    }

    /**
     * Creates a scanner over the range {@code [from, to)} of {@code s}.
     *
     * @param s    text to scan
     * @param from index of the first character to scan
     * @param to   index just past the last character to scan
     */
    public Scanner(String s, int from, int to) {
        _text = s;
        _index = from;
        _limit = to;
    }

    /**
     * @return the current read position, i.e. where the next token will start
     *         (before any leading whitespace is skipped)
     */
    public int getIndex() {
        return _index;
    }

    /**
     * Skips leading whitespace and reads the next token.
     *
     * @param regexPossible whether a {@code /} at this position may open a
     *                      regex literal; when false, {@code /} is scanned as
     *                      an operator
     * @return the next token, an {@link ErrorToken} for an unterminated
     *         literal or an unrecognized symbol, or {@code null} once the end
     *         of the range has been reached
     */
    public Token next(boolean regexPossible) {
        while (_index < _limit && Character.isWhitespace(_text.charAt(_index))) {
            _index++;
        }
        // ">=" rather than "==" so a cursor that is already past the limit
        // can never lead to an out-of-range read.
        if (_index >= _limit) {
            return null;
        }

        final int start = _index;
        final char c = _text.charAt(_index);
        final String detail;

        if (Character.isDigit(c)) {
            return scanNumber(start);
        } else if (c == '"' || c == '\'') {
            Token literal = scanString(start, c);
            if (literal != null) {
                return literal;
            }
            detail = "String not properly closed";
        } else if (Character.isLetter(c)) {
            return scanIdentifier(start);
        } else if (c == '/' && regexPossible) {
            Token literal = scanRegex(start);
            if (literal != null) {
                return literal;
            }
            detail = "Regex not properly closed";
        } else if ("+-*/.".indexOf(c) >= 0) {
            return consume(start, 1, TokenType.Operator);
        } else if ("()[],".indexOf(c) >= 0) {
            return consume(start, 1, TokenType.Delimiter);
        } else if (c == '!' && nextCharIs('=')) {
            return consume(start, 2, TokenType.Operator);
        } else if (c == '<') {
            // "<=", "<>" or plain "<"
            boolean twoChars = nextCharIs('=') || nextCharIs('>');
            return consume(start, twoChars ? 2 : 1, TokenType.Operator);
        } else if (c == '>' || c == '=') {
            // ">=", "==" or the single-character form
            return consume(start, nextCharIs('=') ? 2 : 1, TokenType.Operator);
        } else {
            _index++;
            detail = "Unrecognized symbol";
        }

        return new ErrorToken(start, _index, _text.substring(start, _index), detail);
    }

    /** Reports whether the character after the current one exists and equals {@code expected}. */
    private boolean nextCharIs(char expected) {
        return _index + 1 < _limit && _text.charAt(_index + 1) == expected;
    }

    /** Advances the cursor {@code length} characters past {@code start} and wraps that slice in a token. */
    private Token consume(int start, int length, TokenType type) {
        _index = start + length;
        return new Token(start, _index, type, _text.substring(start, _index));
    }

    /** Scans a run of digits with an optional fractional part; the cursor must be on a digit. */
    private Token scanNumber(int start) {
        double value = 0;
        char c = '\0';

        while (_index < _limit && Character.isDigit(c = _text.charAt(_index))) {
            // Character.digit (not c - '0') so non-ASCII decimal digits,
            // which isDigit also accepts, contribute their real value.
            value = value * 10 + Character.digit(c, 10);
            _index++;
        }

        if (_index < _limit && c == '.') {
            _index++; // skip decimal point

            double division = 1;
            while (_index < _limit && Character.isDigit(c = _text.charAt(_index))) {
                value = value * 10 + Character.digit(c, 10);
                division *= 10;
                _index++;
            }
            value /= division;
        }

        // TODO: support exponent e notation

        return new NumberToken(start, _index, _text.substring(start, _index), value);
    }

    /** Scans a run of letters and digits; the cursor must be on a letter. */
    private Token scanIdentifier(int start) {
        while (_index < _limit && Character.isLetterOrDigit(_text.charAt(_index))) {
            _index++;
        }
        return new Token(start, _index, TokenType.Identifier, _text.substring(start, _index));
    }

    /**
     * Scans a quoted string whose opening quote is at the cursor. A backslash
     * makes the following character literal (it is copied as-is).
     *
     * @return the string token, or {@code null} if the range ended before the
     *         closing quote; the cursor is then left exactly at the limit
     */
    private Token scanString(int start, char delimiter) {
        StringBuilder content = new StringBuilder();

        _index++; // skip opening delimiter
        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c == delimiter) {
                _index++; // skip closing delimiter
                return new Token(start, _index, TokenType.String, content.toString());
            }
            if (c == '\\') {
                _index++; // skip escaping marker
                if (_index >= _limit) {
                    // Dangling backslash: stop here so the cursor does not
                    // run past the limit.
                    break;
                }
                content.append(_text.charAt(_index));
            } else {
                content.append(c);
            }
            _index++;
        }
        return null;
    }

    /**
     * Scans a {@code /pattern/} literal, optionally followed by {@code i},
     * whose opening slash is at the cursor. Escape sequences are kept verbatim
     * (backslash included) so the pattern text can be compiled unchanged.
     *
     * @return the regex token, or {@code null} if the range ended before the
     *         closing slash; the cursor is then left exactly at the limit
     */
    private Token scanRegex(int start) {
        StringBuilder pattern = new StringBuilder();

        _index++; // skip opening delimiter
        while (_index < _limit) {
            char c = _text.charAt(_index);
            if (c == '/') {
                _index++; // skip closing delimiter

                boolean caseInsensitive = false;
                if (_index < _limit && _text.charAt(_index) == 'i') {
                    caseInsensitive = true;
                    _index++;
                }
                return new RegexToken(start, _index, pattern.toString(), caseInsensitive);
            }
            pattern.append(c);
            if (c == '\\') {
                _index++; // move onto the escaped character
                if (_index >= _limit) {
                    // Dangling backslash: stop here so the cursor does not
                    // run past the limit.
                    break;
                }
                pattern.append(_text.charAt(_index));
            }
            _index++;
        }
        return null;
    }
}