diff --git a/.classpath b/.classpath
index f49c826a8..48811b68a 100644
--- a/.classpath
+++ b/.classpath
@@ -1,16 +1,17 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib-src/cos-20081226-sources.jar b/lib-src/cos-20081226-sources.jar
new file mode 100644
index 000000000..1c5bdaf5b
Binary files /dev/null and b/lib-src/cos-20081226-sources.jar differ
diff --git a/lib-src/icu4j-4.2.1-sources.jar b/lib-src/icu4j-4.2.1-sources.jar
new file mode 100644
index 000000000..97c17e63c
Binary files /dev/null and b/lib-src/icu4j-4.2.1-sources.jar differ
diff --git a/lib/cos-05Nov2002.jar b/lib/cos-05Nov2002.jar
deleted file mode 100644
index 6a4a1ff5f..000000000
Binary files a/lib/cos-05Nov2002.jar and /dev/null differ
diff --git a/lib/cos-20081226.jar b/lib/cos-20081226.jar
new file mode 100644
index 000000000..ea39c9896
Binary files /dev/null and b/lib/cos-20081226.jar differ
diff --git a/lib/icu4j-4.2.1.jar b/lib/icu4j-4.2.1.jar
new file mode 100644
index 000000000..bf0d532cb
Binary files /dev/null and b/lib/icu4j-4.2.1.jar differ
diff --git a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java
index 289c148a8..468369042 100644
--- a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java
+++ b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java
@@ -1,5 +1,7 @@
package com.metaweb.gridworks.commands.edit;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -13,6 +15,10 @@ import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
+import org.apache.log4j.Logger;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
import com.metaweb.gridworks.ProjectManager;
import com.metaweb.gridworks.ProjectMetadata;
import com.metaweb.gridworks.commands.Command;
@@ -27,7 +33,10 @@ import com.oreilly.servlet.multipart.ParamPart;
import com.oreilly.servlet.multipart.Part;
public class CreateProjectCommand extends Command {
- @Override
+
+ private final static Logger logger = Logger.getLogger("gridworks");
+
+ @Override
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
@@ -111,16 +120,19 @@ public class CreateProjectCommand extends Command {
if (part.isFile()) {
FilePart filePart = (FilePart) part;
- Importer importer = guessImporter(
- options, null, filePart.getFileName());
+
+ Importer importer = guessImporter(options, null, filePart.getFileName());
if (importer.takesReader()) {
- Reader reader = new InputStreamReader(filePart.getInputStream());
- try {
- importer.read(reader, project, options, skip, limit);
- } finally {
- reader.close();
- }
+ CharsetDetector detector = new CharsetDetector();
+ CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
+ logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
+ Reader reader = charsetMatch.getReader();
+ try {
+ importer.read(charsetMatch.getReader(), project, options, skip, limit);
+ } finally {
+ reader.close();
+ }
} else {
InputStream inputStream = filePart.getInputStream();
try {
@@ -230,4 +242,28 @@ public class CreateProjectCommand extends Command {
return new TsvCsvImporter();
}
+    /*
+     * NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not
+     * all ServletInputStream implementations are marking, so we need to do this memory-expensive wrapping to make
+     * it work: the whole stream is copied into memory, and the original is closed even if that copy fails.
+     */
+    private static InputStream enforceMarking(InputStream input) throws IOException {
+        if (input.markSupported()) {
+            return input;
+        } else {
+            ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024);
+            byte[] buffer = new byte[1024 * 4];
+            int n = 0;
+            try {
+                while (-1 != (n = input.read(buffer))) {
+                    output.write(buffer, 0, n);
+                }
+            } finally {
+                // Release the upload stream even when reading it fails part-way.
+                input.close();
+            }
+            return new ByteArrayInputStream(output.toByteArray());
+        }
+    }
+
}
diff --git a/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java
index 456804323..bdd5eeb72 100644
--- a/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java
+++ b/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java
@@ -22,6 +22,9 @@ public class GetExpressionLanguageInfoCommand extends Command {
throws ServletException, IOException {
try {
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Content-Type", "application/json");
+
JSONWriter writer = new JSONWriter(response.getWriter());
Properties options = new Properties();
diff --git a/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java
index d7ced540d..a71a5740c 100644
--- a/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java
+++ b/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java
@@ -40,6 +40,9 @@ public class GuessTypesOfColumnCommand extends Command {
Project project = getProject(request);
String columnName = request.getParameter("columnName");
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Content-Type", "application/json");
+
JSONWriter writer = new JSONWriter(response.getWriter());
writer.object();
diff --git a/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java
index eb71d4f0a..546b6cfe3 100644
--- a/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java
+++ b/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java
@@ -40,6 +40,9 @@ public class PreviewExpressionCommand extends Command {
return;
}
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Content-Type", "application/json");
+
JSONArray rowIndices = jsonStringToArray(rowIndicesString);
int length = rowIndices.length();
diff --git a/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java
index 842d9c4fb..dd844ab61 100644
--- a/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java
+++ b/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java
@@ -24,6 +24,9 @@ public class PreviewProtographCommand extends Command {
try {
Project project = getProject(request);
+ response.setCharacterEncoding("UTF-8");
+ response.setHeader("Content-Type", "application/json");
+
String jsonString = request.getParameter("protograph");
JSONObject json = jsonStringToObject(jsonString);
Protograph protograph = Protograph.reconstruct(json);
diff --git a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
index d68262db4..545f40a77 100644
--- a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
+++ b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
@@ -39,7 +39,9 @@ import com.metaweb.gridworks.expr.functions.strings.IndexOf;
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
import com.metaweb.gridworks.expr.functions.strings.MD5;
import com.metaweb.gridworks.expr.functions.strings.Partition;
+import com.metaweb.gridworks.expr.functions.strings.Phonetic;
import com.metaweb.gridworks.expr.functions.strings.RPartition;
+import com.metaweb.gridworks.expr.functions.strings.Reinterpret;
import com.metaweb.gridworks.expr.functions.strings.Replace;
import com.metaweb.gridworks.expr.functions.strings.ReplaceChars;
import com.metaweb.gridworks.expr.functions.strings.ReplaceRegexp;
@@ -53,6 +55,7 @@ import com.metaweb.gridworks.expr.functions.strings.ToUppercase;
import com.metaweb.gridworks.expr.functions.strings.Trim;
import com.metaweb.gridworks.expr.functions.strings.Unescape;
import com.metaweb.gridworks.expr.functions.strings.Unicode;
+import com.metaweb.gridworks.expr.functions.strings.UnicodeType;
import com.metaweb.gridworks.expr.functions.tests.IsBlank;
import com.metaweb.gridworks.expr.functions.tests.IsNotBlank;
import com.metaweb.gridworks.expr.functions.tests.IsNotNull;
@@ -124,9 +127,12 @@ public class ControlFunctionRegistry {
registerFunction("sha1", new SHA1());
registerFunction("md5", new MD5());
registerFunction("unicode", new Unicode());
+ registerFunction("unicodeType", new UnicodeType());
registerFunction("diff", new Diff());
registerFunction("chomp", new Diff());
registerFunction("fingerprint", new Fingerprint());
+ registerFunction("phonetic", new Phonetic());
+ registerFunction("reinterpret", new Reinterpret());
registerFunction("indexOf", new IndexOf());
registerFunction("lastIndexOf", new LastIndexOf());
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java
new file mode 100644
index 000000000..7ccf39681
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java
@@ -0,0 +1,51 @@
+package com.metaweb.gridworks.expr.functions.strings;
+
+import java.util.Properties;
+
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Soundex;
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.ControlFunctionRegistry;
+import com.metaweb.gridworks.expr.EvalError;
+import com.metaweb.gridworks.expr.Function;
+
+/** Expression function returning a phonetic key (DoubleMetaphone, Metaphone or Soundex) for a value. */
+public class Phonetic implements Function {
+    private DoubleMetaphone metaphone2 = new DoubleMetaphone();
+    private Metaphone metaphone = new Metaphone();
+    private Soundex soundex = new Soundex();
+
+    /** args: the value to encode, then an optional case-insensitive encoding name; EvalError on misuse. */
+    public Object call(Properties bindings, Object[] args) {
+        if ((args.length == 1 || args.length == 2) && args[0] != null) {
+            Object o1 = args[0];
+            // The encoding is optional, as advertised by write(); DoubleMetaphone is the default.
+            Object o2 = (args.length == 2) ? args[1] : "doublemetaphone";
+            if (o2 instanceof String) {
+                String str = (o1 instanceof String) ? (String) o1 : o1.toString();
+                String encoding = ((String) o2).toLowerCase();
+                if ("doublemetaphone".equals(encoding)) {
+                    return metaphone2.doubleMetaphone(str);
+                } else if ("metaphone".equals(encoding)) {
+                    return metaphone.metaphone(str);
+                } else if ("soundex".equals(encoding)) {
+                    return soundex.soundex(str);
+                } else {
+                    return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
+                }
+            }
+        }
+        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string and an optional encoding name");
+    }
+
+    public void write(JSONWriter writer, Properties options) throws JSONException {
+        writer.object();
+        writer.key("description"); writer.value("Returns a phonetic encoding of s (optionally indicating which encoding to use)");
+        writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')");
+        writer.key("returns"); writer.value("string");
+        writer.endObject();
+    }
+}
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java
new file mode 100644
index 000000000..d413a0176
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java
@@ -0,0 +1,45 @@
+package com.metaweb.gridworks.expr.functions.strings;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Properties;
+
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.ControlFunctionRegistry;
+import com.metaweb.gridworks.expr.EvalError;
+import com.metaweb.gridworks.expr.Function;
+
+/** Expression function that takes a string's bytes in a named encoding and reads them back as UTF-8. */
+public class Reinterpret implements Function {
+    /**
+     * @param bindings evaluation bindings (not used here)
+     * @param args the value to reinterpret, followed by the name of the charset used to obtain its bytes
+     * @return the reinterpreted string, or an EvalError for bad arguments or an unknown charset
+     */
+    public Object call(Properties bindings, Object[] args) {
+        boolean usable = args.length == 2 && args[0] != null && args[1] instanceof String;
+        if (!usable) {
+            return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 2 arguments");
+        }
+        Object value = args[0];
+        String charsetName = (String) args[1];
+        String text = (value instanceof String) ? (String) value : value.toString();
+        try {
+            // Encode with the requested charset, then decode those same bytes as UTF-8.
+            byte[] raw = text.getBytes(charsetName);
+            return new String(raw, "UTF8");
+        } catch (UnsupportedEncodingException e) {
+            return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + charsetName + "' is not available or recognized.");
+        }
+    }
+
+    /** Emits the description, params and returns entries that document this function. */
+    public void write(JSONWriter writer, Properties options) throws JSONException {
+        writer.object();
+        writer.key("description"); writer.value("Returns s reinterpreted thru the given encoder.");
+        writer.key("params"); writer.value("string s, string encoder");
+        writer.key("returns"); writer.value("string");
+        writer.endObject();
+    }
+}
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
index 60cbdf4ea..0c14f3a65 100644
--- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
@@ -5,9 +5,7 @@ import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
-import com.metaweb.gridworks.expr.ControlFunctionRegistry;
import com.metaweb.gridworks.expr.Function;
-import com.metaweb.gridworks.expr.EvalError;
public class Unicode implements Function {
@@ -21,7 +19,7 @@ public class Unicode implements Function {
}
return output;
}
- return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects an argument");
+ return null;
}
public void write(JSONWriter writer, Properties options)
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java
new file mode 100644
index 000000000..9ccc73ca9
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java
@@ -0,0 +1,71 @@
+package com.metaweb.gridworks.expr.functions.strings;
+
+import java.util.Properties;
+
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.Function;
+
+/** Expression function that names the Unicode general category of each character of a string. */
+public class UnicodeType implements Function {
+    /** Returns one category name per code point of args[0] (stringified if needed), or null on misuse. */
+    public Object call(Properties bindings, Object[] args) {
+        if (args.length == 1 && args[0] != null) {
+            String s = (args[0] instanceof String) ? (String) args[0] : args[0].toString();
+            // Walk by code point, not by char, so a surrogate pair yields one entry instead of two.
+            String[] output = new String[s.codePointCount(0, s.length())];
+            for (int i = 0, j = 0; i < s.length(); i += Character.charCount(s.codePointAt(i))) {
+                output[j++] = translateType(Character.getType(s.codePointAt(i)));
+            }
+            return output;
+        }
+        return null;
+    }
+
+    /** Maps a general-category value, as returned by Character.getType, to a readable name. */
+    private String translateType(int type) {
+        switch(type) {
+            case 0: return "unassigned";
+            case 1: return "uppercase letter";
+            case 2: return "lowercase letter";
+            case 3: return "titlecase letter";
+            case 4: return "modifier letter";
+            case 5: return "other letter";
+            case 6: return "non spacing mark";
+            case 7: return "enclosing mark";
+            case 8: return "combining spacing mark";
+            case 9: return "decimal digit number";
+            case 10: return "letter number";
+            case 11: return "other number";
+            case 12: return "space separator";
+            case 13: return "line separator";
+            case 14: return "paragraph separator";
+            case 15: return "control";
+            case 16: return "format";
+            // 17 does not seem to be used (it falls through to "unknown" below)
+            case 18: return "private use";
+            case 19: return "surrogate";
+            case 20: return "dash punctuation";
+            case 21: return "start punctuation";
+            case 22: return "end punctuation";
+            case 23: return "connector punctuation";
+            case 24: return "other punctuation";
+            case 25: return "math symbol";
+            case 26: return "currency symbol";
+            case 27: return "modifier symbol";
+            case 28: return "other symbol";
+            case 29: return "initial quote punctuation";
+            case 30: return "final quote punctuation";
+            default: return "unknown";
+        }
+    }
+
+    public void write(JSONWriter writer, Properties options) throws JSONException {
+        writer.object();
+        writer.key("description"); writer.value("Returns an array of strings describing each character of s in their full unicode notation");
+        writer.key("params"); writer.value("string s");
+        writer.key("returns"); writer.value("array of strings");
+        writer.endObject();
+    }
+}
diff --git a/src/main/webapp/index.html b/src/main/webapp/index.html
index 3191d15b9..00afb05a5 100644
--- a/src/main/webapp/index.html
+++ b/src/main/webapp/index.html
@@ -1 +1 @@
-
Gridworks
New Project
Create a new project by uploading a tab-separated value or comma-separated value file.
\ No newline at end of file
+
Gridworks
New Project
Create a new project by uploading a tab-separated value or comma-separated value file.
\ No newline at end of file
diff --git a/tests/example-latin1.tsv b/tests/example-latin1.tsv
new file mode 100644
index 000000000..487066e49
--- /dev/null
+++ b/tests/example-latin1.tsv
@@ -0,0 +1,3 @@
+Start End Country Location Type Sub_Type Name Killed Cost Affected Id
+02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
+07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752
diff --git a/tests/example-utf8.tsv b/tests/example-utf8.tsv
new file mode 100644
index 000000000..723681f05
--- /dev/null
+++ b/tests/example-utf8.tsv
@@ -0,0 +1,3 @@
+Start End Country Location Type Sub_Type Name Killed Cost Affected Id
+02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160
+07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752