- added an encoding guesser
- fixed a bunch of encoding issues - added a function to reinterpret call content in another encoding - added a 'phonetic' function to the expression language that supports metaphone and soundex - updated the COS library to the latest released version - added the IBM ICU4j library (that contains the encoding guesser) - added examples with same content but different encodings git-svn-id: http://google-refine.googlecode.com/svn/trunk@154 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
d9e67ac806
commit
0c6590fe2c
@ -8,8 +8,9 @@
|
||||
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar" sourcepath="lib-src/log4j-1.2.15-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar" sourcepath="lib-src/commons-codec-1.3-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-lang-2.5.jar" sourcepath="lib-src/commons-lang-2.5-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/cos-05Nov2002.jar"/>
|
||||
<classpathentry kind="lib" path="lib/json-20100208.jar" sourcepath="lib-src/json-20100208-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/icu4j-4.2.1.jar" sourcepath="lib-src/icu4j-4.2.1-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/cos-20081226.jar" sourcepath="lib-src/cos-20081226-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
|
||||
<classpathentry kind="output" path="build/classes"/>
|
||||
|
BIN
lib-src/cos-20081226-sources.jar
Normal file
BIN
lib-src/cos-20081226-sources.jar
Normal file
Binary file not shown.
BIN
lib-src/icu4j-4.2.1-sources.jar
Normal file
BIN
lib-src/icu4j-4.2.1-sources.jar
Normal file
Binary file not shown.
Binary file not shown.
BIN
lib/cos-20081226.jar
Normal file
BIN
lib/cos-20081226.jar
Normal file
Binary file not shown.
BIN
lib/icu4j-4.2.1.jar
Normal file
BIN
lib/icu4j-4.2.1.jar
Normal file
Binary file not shown.
@ -1,5 +1,7 @@
|
||||
package com.metaweb.gridworks.commands.edit;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
@ -13,6 +15,10 @@ import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import com.metaweb.gridworks.ProjectManager;
|
||||
import com.metaweb.gridworks.ProjectMetadata;
|
||||
import com.metaweb.gridworks.commands.Command;
|
||||
@ -27,6 +33,9 @@ import com.oreilly.servlet.multipart.ParamPart;
|
||||
import com.oreilly.servlet.multipart.Part;
|
||||
|
||||
public class CreateProjectCommand extends Command {
|
||||
|
||||
private final static Logger logger = Logger.getLogger("gridworks");
|
||||
|
||||
@Override
|
||||
public void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws ServletException, IOException {
|
||||
@ -111,13 +120,16 @@ public class CreateProjectCommand extends Command {
|
||||
|
||||
if (part.isFile()) {
|
||||
FilePart filePart = (FilePart) part;
|
||||
Importer importer = guessImporter(
|
||||
options, null, filePart.getFileName());
|
||||
|
||||
Importer importer = guessImporter(options, null, filePart.getFileName());
|
||||
|
||||
if (importer.takesReader()) {
|
||||
Reader reader = new InputStreamReader(filePart.getInputStream());
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
|
||||
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
|
||||
Reader reader = charsetMatch.getReader();
|
||||
try {
|
||||
importer.read(reader, project, options, skip, limit);
|
||||
importer.read(charsetMatch.getReader(), project, options, skip, limit);
|
||||
} finally {
|
||||
reader.close();
|
||||
}
|
||||
@ -230,4 +242,28 @@ public class CreateProjectCommand extends Command {
|
||||
return new TsvCsvImporter();
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not
|
||||
* all ServletInputStream implementations are marking, so we need to do this memory-expensive wrapping to make
|
||||
* it work. It's far from ideal but I don't have a more efficient solution.
|
||||
*/
|
||||
private static InputStream enforceMarking(InputStream input) throws IOException {
|
||||
if (input.markSupported()) {
|
||||
return input;
|
||||
} else {
|
||||
ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024);
|
||||
|
||||
byte[] buffer = new byte[1024 * 4];
|
||||
long count = 0;
|
||||
int n = 0;
|
||||
while (-1 != (n = input.read(buffer))) {
|
||||
output.write(buffer, 0, n);
|
||||
count += n;
|
||||
}
|
||||
input.close();
|
||||
|
||||
return new ByteArrayInputStream(output.toByteArray());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -22,6 +22,9 @@ public class GetExpressionLanguageInfoCommand extends Command {
|
||||
throws ServletException, IOException {
|
||||
|
||||
try {
|
||||
response.setCharacterEncoding("UTF-8");
|
||||
response.setHeader("Content-Type", "application/json");
|
||||
|
||||
JSONWriter writer = new JSONWriter(response.getWriter());
|
||||
Properties options = new Properties();
|
||||
|
||||
|
@ -40,6 +40,9 @@ public class GuessTypesOfColumnCommand extends Command {
|
||||
Project project = getProject(request);
|
||||
String columnName = request.getParameter("columnName");
|
||||
|
||||
response.setCharacterEncoding("UTF-8");
|
||||
response.setHeader("Content-Type", "application/json");
|
||||
|
||||
JSONWriter writer = new JSONWriter(response.getWriter());
|
||||
writer.object();
|
||||
|
||||
|
@ -40,6 +40,9 @@ public class PreviewExpressionCommand extends Command {
|
||||
return;
|
||||
}
|
||||
|
||||
response.setCharacterEncoding("UTF-8");
|
||||
response.setHeader("Content-Type", "application/json");
|
||||
|
||||
JSONArray rowIndices = jsonStringToArray(rowIndicesString);
|
||||
int length = rowIndices.length();
|
||||
|
||||
|
@ -24,6 +24,9 @@ public class PreviewProtographCommand extends Command {
|
||||
try {
|
||||
Project project = getProject(request);
|
||||
|
||||
response.setCharacterEncoding("UTF-8");
|
||||
response.setHeader("Content-Type", "application/json");
|
||||
|
||||
String jsonString = request.getParameter("protograph");
|
||||
JSONObject json = jsonStringToObject(jsonString);
|
||||
Protograph protograph = Protograph.reconstruct(json);
|
||||
|
@ -39,7 +39,9 @@ import com.metaweb.gridworks.expr.functions.strings.IndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.MD5;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Partition;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Phonetic;
|
||||
import com.metaweb.gridworks.expr.functions.strings.RPartition;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Reinterpret;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Replace;
|
||||
import com.metaweb.gridworks.expr.functions.strings.ReplaceChars;
|
||||
import com.metaweb.gridworks.expr.functions.strings.ReplaceRegexp;
|
||||
@ -53,6 +55,7 @@ import com.metaweb.gridworks.expr.functions.strings.ToUppercase;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Trim;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Unescape;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Unicode;
|
||||
import com.metaweb.gridworks.expr.functions.strings.UnicodeType;
|
||||
import com.metaweb.gridworks.expr.functions.tests.IsBlank;
|
||||
import com.metaweb.gridworks.expr.functions.tests.IsNotBlank;
|
||||
import com.metaweb.gridworks.expr.functions.tests.IsNotNull;
|
||||
@ -124,9 +127,12 @@ public class ControlFunctionRegistry {
|
||||
registerFunction("sha1", new SHA1());
|
||||
registerFunction("md5", new MD5());
|
||||
registerFunction("unicode", new Unicode());
|
||||
registerFunction("unicodeType", new UnicodeType());
|
||||
registerFunction("diff", new Diff());
|
||||
registerFunction("chomp", new Diff());
|
||||
registerFunction("fingerprint", new Fingerprint());
|
||||
registerFunction("phonetic", new Phonetic());
|
||||
registerFunction("reinterpret", new Reinterpret());
|
||||
|
||||
registerFunction("indexOf", new IndexOf());
|
||||
registerFunction("lastIndexOf", new LastIndexOf());
|
||||
|
@ -0,0 +1,51 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
|
||||
public class Phonetic implements Function {
|
||||
|
||||
private DoubleMetaphone metaphone2 = new DoubleMetaphone();
|
||||
private Metaphone metaphone = new Metaphone();
|
||||
private Soundex soundex = new Soundex();
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 2) {
|
||||
Object o1 = args[0];
|
||||
Object o2 = args[1];
|
||||
if (o1 != null && o2 != null && o2 instanceof String) {
|
||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||
String encoding = ((String) o2).toLowerCase();
|
||||
if ("doublemetaphone".equals(encoding)) {
|
||||
return metaphone2.doubleMetaphone(str);
|
||||
} else if ("metaphone".equals(encoding)) {
|
||||
return metaphone.metaphone(str);
|
||||
} else if ("soundex".equals(encoding)) {
|
||||
return soundex.soundex(str);
|
||||
} else {
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
|
||||
}
|
||||
}
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 3 strings");
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns the a phonetic encoding of s (optionally indicating which encoding to use')");
|
||||
writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
|
||||
public class Reinterpret implements Function {
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 2) {
|
||||
Object o1 = args[0];
|
||||
Object o2 = args[1];
|
||||
if (o1 != null && o2 != null && o2 instanceof String) {
|
||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||
String decoder = (String) o2;
|
||||
String reinterpreted = null;
|
||||
|
||||
try {
|
||||
reinterpreted = new String(str.getBytes(decoder), "UTF8");
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized.");
|
||||
}
|
||||
|
||||
return reinterpreted;
|
||||
}
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 2 arguments");
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns s reinterpreted thru the given encoder.");
|
||||
writer.key("params"); writer.value("string s, string encoder");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
@ -5,9 +5,7 @@ import java.util.Properties;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
|
||||
public class Unicode implements Function {
|
||||
|
||||
@ -21,7 +19,7 @@ public class Unicode implements Function {
|
||||
}
|
||||
return output;
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects an argument");
|
||||
return null;
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
|
@ -0,0 +1,71 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
|
||||
public class UnicodeType implements Function {
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 1 && args[0] != null) {
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
String[] output = new String[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
output[i] = translateType(Character.getType(s.codePointAt(i)));
|
||||
}
|
||||
return output;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String translateType(int type) {
|
||||
switch(type) {
|
||||
case 0: return "unassigned";
|
||||
case 1: return "uppercase letter";
|
||||
case 2: return "lowercase letter";
|
||||
case 3: return "titlecase letter";
|
||||
case 4: return "modifier letter";
|
||||
case 5: return "other letter";
|
||||
case 6: return "non spacing mark";
|
||||
case 7: return "enclosing mark";
|
||||
case 8: return "combining spacing mark";
|
||||
case 9: return "decimal digit number";
|
||||
case 10: return "letter number";
|
||||
case 11: return "other number";
|
||||
case 12: return "space separator";
|
||||
case 13: return "line separator";
|
||||
case 14: return "paragraph separator";
|
||||
case 15: return "control";
|
||||
case 16: return "format";
|
||||
// 17 does not seem to be used
|
||||
case 18: return "private use";
|
||||
case 19: return "surrogate";
|
||||
case 20: return "dash punctuation";
|
||||
case 21: return "start punctuation";
|
||||
case 22: return "end punctuation";
|
||||
case 23: return "connector punctuation";
|
||||
case 24: return "other punctuation";
|
||||
case 25: return "math symbol";
|
||||
case 26: return "currency symbol";
|
||||
case 27: return "modifier symbol";
|
||||
case 28: return "other symbol";
|
||||
case 29: return "initial quote punctuation";
|
||||
case 30: return "final quote punctuation";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns an array of strings describing each character of s in their full unicode notation");
|
||||
writer.key("params"); writer.value("string s");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
@ -1 +1 @@
|
||||
<html>
<head>
<title>Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
</head>
<body>
<div id="header">
<h1>Gridworks</h1>
</div>
<div id="body">
<div id="projects"></div>
<h2>New Project</h2>
<p>Create a new project by uploading a tab-separated value or comma-separated value file.</p>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload">
<table cellspacing="5">
<tr><td>Project Name:</td><td><input type="text" size="30" id="project-name-input" name="project-name" /></td></tr>
<tr style="display: none;"><td>Project Password:</td><td><input type="password" size="30" id="project-password-input" name="project-password" /><br/>optional, not protected, so use some password you don't care to reveal</td></tr>
<tr><td>Upload File:</td><td>
<input type="file" id="project-file-input" name="project-file" size="50" />
</td></tr>
<tr><td>Skip:</td><td>
<input id="skip-input" name="skip" size="5" /> initial data rows
</td></tr>
<tr><td>Load up to:</td><td>
<input id="limit-input" name="limit" size="5" /> data rows
</td></tr>
<tr><td></td><td id="submit-container">
<input type="submit" value="Create Project" id="upload-file-button" />
</td></tr>
</table>
</form>
</div>
</body>
</html>
|
||||
<html>
<head>
<title>Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
</head>
<body>
<div id="header">
<h1>Gridworks</h1>
</div>
<div id="body">
<div id="projects"></div>
<h2>New Project</h2>
<p>Create a new project by uploading a tab-separated value or comma-separated value file.</p>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<table cellspacing="5">
<tr><td>Project Name:</td><td><input type="text" size="30" id="project-name-input" name="project-name" /></td></tr>
<tr style="display: none;"><td>Project Password:</td><td><input type="password" size="30" id="project-password-input" name="project-password" /><br/>optional, not protected, so use some password you don't care to reveal</td></tr>
<tr><td>Upload File:</td><td>
<input type="file" id="project-file-input" name="project-file" size="50" />
</td></tr>
<tr><td>Skip:</td><td>
<input id="skip-input" name="skip" size="5" /> initial data rows
</td></tr>
<tr><td>Load up to:</td><td>
<input id="limit-input" name="limit" size="5" /> data rows
</td></tr>
<tr><td></td><td id="submit-container">
<input type="submit" value="Create Project" id="upload-file-button" />
</td></tr>
</table>
</form>
</div>
</body>
</html>
|
3
tests/example-latin1.tsv
Normal file
3
tests/example-latin1.tsv
Normal file
@ -0,0 +1,3 @@
|
||||
Start End Country Location Type Sub_Type Name Killed Cost Affected Id
|
||||
02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
|
||||
07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752
|
Can't render this file because it has a wrong number of fields in line 2.
|
3
tests/example-utf8.tsv
Normal file
3
tests/example-utf8.tsv
Normal file
@ -0,0 +1,3 @@
|
||||
Start End Country Location Type Sub_Type Name Killed Cost Affected Id
|
||||
02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
|
||||
07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752
|
Can't render this file because it has a wrong number of fields in line 2.
|
Loading…
Reference in New Issue
Block a user