- added an encoding guesser

- fixed a bunch of encoding issues
- added a function to reinterpret cell content in another encoding
- added a 'phonetic' function to the expression language that supports metaphone and soundex
- updated the COS library to the latest released version 
- added the IBM ICU4j library (that contains the encoding guesser)
- added examples with same content but different encodings


git-svn-id: http://google-refine.googlecode.com/svn/trunk@154 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-02-28 21:51:33 +00:00
parent d9e67ac806
commit 0c6590fe2c
19 changed files with 255 additions and 29 deletions

View File

@ -1,16 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src/main/java"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/servlet-api-2.5.jar" sourcepath="lib-src/servlet-api-2.5-sources.jar"/>
<classpathentry kind="lib" path="lib/jetty-6.1.22.jar" sourcepath="lib-src/jetty-6.1.22-sources.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-6.1.22.jar" sourcepath="lib-src/jetty-util-6.1.22-sources.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar" sourcepath="lib-src/log4j-1.2.15-sources.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar" sourcepath="lib-src/commons-codec-1.3-sources.jar"/>
<classpathentry kind="lib" path="lib/commons-lang-2.5.jar" sourcepath="lib-src/commons-lang-2.5-sources.jar"/>
<classpathentry kind="lib" path="lib/cos-05Nov2002.jar"/>
<classpathentry kind="lib" path="lib/json-20100208.jar" sourcepath="lib-src/json-20100208-sources.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
<classpathentry kind="output" path="build/classes"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src/main/java"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/servlet-api-2.5.jar" sourcepath="lib-src/servlet-api-2.5-sources.jar"/>
<classpathentry kind="lib" path="lib/jetty-6.1.22.jar" sourcepath="lib-src/jetty-6.1.22-sources.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-6.1.22.jar" sourcepath="lib-src/jetty-util-6.1.22-sources.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar" sourcepath="lib-src/log4j-1.2.15-sources.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar" sourcepath="lib-src/commons-codec-1.3-sources.jar"/>
<classpathentry kind="lib" path="lib/commons-lang-2.5.jar" sourcepath="lib-src/commons-lang-2.5-sources.jar"/>
<classpathentry kind="lib" path="lib/json-20100208.jar" sourcepath="lib-src/json-20100208-sources.jar"/>
<classpathentry kind="lib" path="lib/icu4j-4.2.1.jar" sourcepath="lib-src/icu4j-4.2.1-sources.jar"/>
<classpathentry kind="lib" path="lib/cos-20081226.jar" sourcepath="lib-src/cos-20081226-sources.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
<classpathentry kind="output" path="build/classes"/>
</classpath>

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/cos-20081226.jar Normal file

Binary file not shown.

BIN
lib/icu4j-4.2.1.jar Normal file

Binary file not shown.

View File

@ -1,5 +1,7 @@
package com.metaweb.gridworks.commands.edit;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@ -13,6 +15,10 @@ import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.log4j.Logger;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.metaweb.gridworks.ProjectManager;
import com.metaweb.gridworks.ProjectMetadata;
import com.metaweb.gridworks.commands.Command;
@ -27,7 +33,10 @@ import com.oreilly.servlet.multipart.ParamPart;
import com.oreilly.servlet.multipart.Part;
public class CreateProjectCommand extends Command {
@Override
private final static Logger logger = Logger.getLogger("gridworks");
@Override
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
@ -111,16 +120,19 @@ public class CreateProjectCommand extends Command {
if (part.isFile()) {
FilePart filePart = (FilePart) part;
Importer importer = guessImporter(
options, null, filePart.getFileName());
Importer importer = guessImporter(options, null, filePart.getFileName());
if (importer.takesReader()) {
Reader reader = new InputStreamReader(filePart.getInputStream());
try {
importer.read(reader, project, options, skip, limit);
} finally {
reader.close();
}
CharsetDetector detector = new CharsetDetector();
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
Reader reader = charsetMatch.getReader();
try {
importer.read(charsetMatch.getReader(), project, options, skip, limit);
} finally {
reader.close();
}
} else {
InputStream inputStream = filePart.getInputStream();
try {
@ -230,4 +242,28 @@ public class CreateProjectCommand extends Command {
return new TsvCsvImporter();
}
/*
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not
* all ServletInputStream implementations are marking, so we need do this memory-expensive wrapping to make
* it work. It's far from ideal but I don't have a more efficient solution.
*/
private static InputStream enforceMarking(InputStream input) throws IOException {
if (input.markSupported()) {
return input;
} else {
ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024);
byte[] buffer = new byte[1024 * 4];
long count = 0;
int n = 0;
while (-1 != (n = input.read(buffer))) {
output.write(buffer, 0, n);
count += n;
}
input.close();
return new ByteArrayInputStream(output.toByteArray());
}
}
}

View File

@ -22,6 +22,9 @@ public class GetExpressionLanguageInfoCommand extends Command {
throws ServletException, IOException {
try {
response.setCharacterEncoding("UTF-8");
response.setHeader("Content-Type", "application/json");
JSONWriter writer = new JSONWriter(response.getWriter());
Properties options = new Properties();

View File

@ -40,6 +40,9 @@ public class GuessTypesOfColumnCommand extends Command {
Project project = getProject(request);
String columnName = request.getParameter("columnName");
response.setCharacterEncoding("UTF-8");
response.setHeader("Content-Type", "application/json");
JSONWriter writer = new JSONWriter(response.getWriter());
writer.object();

View File

@ -40,6 +40,9 @@ public class PreviewExpressionCommand extends Command {
return;
}
response.setCharacterEncoding("UTF-8");
response.setHeader("Content-Type", "application/json");
JSONArray rowIndices = jsonStringToArray(rowIndicesString);
int length = rowIndices.length();

View File

@ -24,6 +24,9 @@ public class PreviewProtographCommand extends Command {
try {
Project project = getProject(request);
response.setCharacterEncoding("UTF-8");
response.setHeader("Content-Type", "application/json");
String jsonString = request.getParameter("protograph");
JSONObject json = jsonStringToObject(jsonString);
Protograph protograph = Protograph.reconstruct(json);

View File

@ -39,7 +39,9 @@ import com.metaweb.gridworks.expr.functions.strings.IndexOf;
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
import com.metaweb.gridworks.expr.functions.strings.MD5;
import com.metaweb.gridworks.expr.functions.strings.Partition;
import com.metaweb.gridworks.expr.functions.strings.Phonetic;
import com.metaweb.gridworks.expr.functions.strings.RPartition;
import com.metaweb.gridworks.expr.functions.strings.Reinterpret;
import com.metaweb.gridworks.expr.functions.strings.Replace;
import com.metaweb.gridworks.expr.functions.strings.ReplaceChars;
import com.metaweb.gridworks.expr.functions.strings.ReplaceRegexp;
@ -53,6 +55,7 @@ import com.metaweb.gridworks.expr.functions.strings.ToUppercase;
import com.metaweb.gridworks.expr.functions.strings.Trim;
import com.metaweb.gridworks.expr.functions.strings.Unescape;
import com.metaweb.gridworks.expr.functions.strings.Unicode;
import com.metaweb.gridworks.expr.functions.strings.UnicodeType;
import com.metaweb.gridworks.expr.functions.tests.IsBlank;
import com.metaweb.gridworks.expr.functions.tests.IsNotBlank;
import com.metaweb.gridworks.expr.functions.tests.IsNotNull;
@ -124,9 +127,12 @@ public class ControlFunctionRegistry {
registerFunction("sha1", new SHA1());
registerFunction("md5", new MD5());
registerFunction("unicode", new Unicode());
registerFunction("unicodeType", new UnicodeType());
registerFunction("diff", new Diff());
registerFunction("chomp", new Diff());
registerFunction("fingerprint", new Fingerprint());
registerFunction("phonetic", new Phonetic());
registerFunction("reinterpret", new Reinterpret());
registerFunction("indexOf", new IndexOf());
registerFunction("lastIndexOf", new LastIndexOf());

View File

@ -0,0 +1,51 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Properties;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.Soundex;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.expr.Function;
public class Phonetic implements Function {
private DoubleMetaphone metaphone2 = new DoubleMetaphone();
private Metaphone metaphone = new Metaphone();
private Soundex soundex = new Soundex();
public Object call(Properties bindings, Object[] args) {
if (args.length == 2) {
Object o1 = args[0];
Object o2 = args[1];
if (o1 != null && o2 != null && o2 instanceof String) {
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
String encoding = ((String) o2).toLowerCase();
if ("doublemetaphone".equals(encoding)) {
return metaphone2.doubleMetaphone(str);
} else if ("metaphone".equals(encoding)) {
return metaphone.metaphone(str);
} else if ("soundex".equals(encoding)) {
return soundex.soundex(str);
} else {
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
}
}
}
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 3 strings");
}
public void write(JSONWriter writer, Properties options)
throws JSONException {
writer.object();
writer.key("description"); writer.value("Returns the a phonetic encoding of s (optionally indicating which encoding to use')");
writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')");
writer.key("returns"); writer.value("string");
writer.endObject();
}
}

View File

@ -0,0 +1,45 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.io.UnsupportedEncodingException;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.expr.Function;
public class Reinterpret implements Function {
public Object call(Properties bindings, Object[] args) {
if (args.length == 2) {
Object o1 = args[0];
Object o2 = args[1];
if (o1 != null && o2 != null && o2 instanceof String) {
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
String decoder = (String) o2;
String reinterpreted = null;
try {
reinterpreted = new String(str.getBytes(decoder), "UTF8");
} catch (UnsupportedEncodingException e) {
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized.");
}
return reinterpreted;
}
}
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 2 arguments");
}
public void write(JSONWriter writer, Properties options)
throws JSONException {
writer.object();
writer.key("description"); writer.value("Returns s reinterpreted thru the given encoder.");
writer.key("params"); writer.value("string s, string encoder");
writer.key("returns"); writer.value("string");
writer.endObject();
}
}

View File

@ -5,9 +5,7 @@ import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
import com.metaweb.gridworks.expr.Function;
import com.metaweb.gridworks.expr.EvalError;
public class Unicode implements Function {
@ -21,7 +19,7 @@ public class Unicode implements Function {
}
return output;
}
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects an argument");
return null;
}
public void write(JSONWriter writer, Properties options)

View File

@ -0,0 +1,71 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.Function;
public class UnicodeType implements Function {
public Object call(Properties bindings, Object[] args) {
if (args.length == 1 && args[0] != null) {
Object o = args[0];
String s = (o instanceof String) ? (String) o : o.toString();
String[] output = new String[s.length()];
for (int i = 0; i < s.length(); i++) {
output[i] = translateType(Character.getType(s.codePointAt(i)));
}
return output;
}
return null;
}
private String translateType(int type) {
switch(type) {
case 0: return "unassigned";
case 1: return "uppercase letter";
case 2: return "lowercase letter";
case 3: return "titlecase letter";
case 4: return "modifier letter";
case 5: return "other letter";
case 6: return "non spacing mark";
case 7: return "enclosing mark";
case 8: return "combining spacing mark";
case 9: return "decimal digit number";
case 10: return "letter number";
case 11: return "other number";
case 12: return "space separator";
case 13: return "line separator";
case 14: return "paragraph separator";
case 15: return "control";
case 16: return "format";
// 17 does not seem to be used
case 18: return "private use";
case 19: return "surrogate";
case 20: return "dash punctuation";
case 21: return "start punctuation";
case 22: return "end punctuation";
case 23: return "connector punctuation";
case 24: return "other punctuation";
case 25: return "math symbol";
case 26: return "currency symbol";
case 27: return "modifier symbol";
case 28: return "other symbol";
case 29: return "initial quote punctuation";
case 30: return "final quote punctuation";
default: return "unknown";
}
}
public void write(JSONWriter writer, Properties options)
throws JSONException {
writer.object();
writer.key("description"); writer.value("Returns an array of strings describing each character of s in their full unicode notation");
writer.key("params"); writer.value("string s");
writer.key("returns"); writer.value("string");
writer.endObject();
}
}

View File

@ -1 +1 @@
<html> <head> <title>Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/index.js"></script> </head> <body> <div id="header"> <h1>Gridworks</h1> </div> <div id="body"> <div id="projects"></div> <h2>New Project</h2> <p>Create a new project by uploading a tab-separated value or comma-separated value file.</p> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload"> <table cellspacing="5"> <tr><td>Project Name:</td><td><input type="text" size="30" id="project-name-input" name="project-name" /></td></tr> <tr style="display: none;"><td>Project Password:</td><td><input type="password" size="30" id="project-password-input" name="project-password" /><br/>optional, not protected, so use some password you don't care to reveal</td></tr> <tr><td>Upload File:</td><td> <input type="file" id="project-file-input" name="project-file" size="50" /> </td></tr> <tr><td>Skip:</td><td> <input id="skip-input" name="skip" size="5" /> initial data rows </td></tr> <tr><td>Load up to:</td><td> <input id="limit-input" name="limit" size="5" /> data rows </td></tr> <tr><td></td><td id="submit-container"> <input type="submit" value="Create Project" id="upload-file-button" /> </td></tr> </table> </form> </div> </body> </html>
<html> <head> <title>Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/index.js"></script> </head> <body> <div id="header"> <h1>Gridworks</h1> </div> <div id="body"> <div id="projects"></div> <h2>New Project</h2> <p>Create a new project by uploading a tab-separated value or comma-separated value file.</p> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <table cellspacing="5"> <tr><td>Project Name:</td><td><input type="text" size="30" id="project-name-input" name="project-name" /></td></tr> <tr style="display: none;"><td>Project Password:</td><td><input type="password" size="30" id="project-password-input" name="project-password" /><br/>optional, not protected, so use some password you don't care to reveal</td></tr> <tr><td>Upload File:</td><td> <input type="file" id="project-file-input" name="project-file" size="50" /> </td></tr> <tr><td>Skip:</td><td> <input id="skip-input" name="skip" size="5" /> initial data rows </td></tr> <tr><td>Load up to:</td><td> <input id="limit-input" name="limit" size="5" /> data rows </td></tr> <tr><td></td><td id="submit-container"> <input type="submit" value="Create Project" id="upload-file-button" /> </td></tr> </table> </form> </div> </body> </html>

3
tests/example-latin1.tsv Normal file
View File

@ -0,0 +1,3 @@
Start End Country Location Type Sub_Type Name Killed Cost Affected Id
02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752
Can't render this file because it has a wrong number of fields in line 2.

3
tests/example-utf8.tsv Normal file
View File

@ -0,0 +1,3 @@
Start End Country Location Type Sub_Type Name Killed Cost Affected Id
02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752
Can't render this file because it has a wrong number of fields in line 2.