diff --git a/.classpath b/.classpath index f49c826a8..48811b68a 100644 --- a/.classpath +++ b/.classpath @@ -1,16 +1,17 @@ - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + diff --git a/lib-src/cos-20081226-sources.jar b/lib-src/cos-20081226-sources.jar new file mode 100644 index 000000000..1c5bdaf5b Binary files /dev/null and b/lib-src/cos-20081226-sources.jar differ diff --git a/lib-src/icu4j-4.2.1-sources.jar b/lib-src/icu4j-4.2.1-sources.jar new file mode 100644 index 000000000..97c17e63c Binary files /dev/null and b/lib-src/icu4j-4.2.1-sources.jar differ diff --git a/lib/cos-05Nov2002.jar b/lib/cos-05Nov2002.jar deleted file mode 100644 index 6a4a1ff5f..000000000 Binary files a/lib/cos-05Nov2002.jar and /dev/null differ diff --git a/lib/cos-20081226.jar b/lib/cos-20081226.jar new file mode 100644 index 000000000..ea39c9896 Binary files /dev/null and b/lib/cos-20081226.jar differ diff --git a/lib/icu4j-4.2.1.jar b/lib/icu4j-4.2.1.jar new file mode 100644 index 000000000..bf0d532cb Binary files /dev/null and b/lib/icu4j-4.2.1.jar differ diff --git a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java index 289c148a8..468369042 100644 --- a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java @@ -1,5 +1,7 @@ package com.metaweb.gridworks.commands.edit; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -13,6 +15,10 @@ import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.apache.log4j.Logger; + +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; import com.metaweb.gridworks.ProjectManager; import com.metaweb.gridworks.ProjectMetadata; import com.metaweb.gridworks.commands.Command; @@ -27,7 +33,10 @@ import com.oreilly.servlet.multipart.ParamPart; import com.oreilly.servlet.multipart.Part; public class CreateProjectCommand extends Command { - @Override + + private final static Logger logger = Logger.getLogger("gridworks"); + + @Override public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { @@ -111,16 +120,19 @@ public class CreateProjectCommand extends Command { if (part.isFile()) { FilePart filePart = (FilePart) part; - Importer importer = guessImporter( - options, null, filePart.getFileName()); + + Importer importer = guessImporter(options, null, filePart.getFileName()); if (importer.takesReader()) { - Reader reader = new InputStreamReader(filePart.getInputStream()); - try { - importer.read(reader, project, options, skip, limit); - } finally { - reader.close(); - } + CharsetDetector detector = new CharsetDetector(); + CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect(); + logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]"); + Reader reader = charsetMatch.getReader(); + try { + importer.read(charsetMatch.getReader(), project, options, skip, limit); + } finally { + reader.close(); + } } else { InputStream inputStream = filePart.getInputStream(); try { @@ -230,4 +242,28 @@ public class CreateProjectCommand extends Command { return new TsvCsvImporter(); } + /* + * NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not + * all ServletInputStream implementations are marking, so we need do this memory-expensive wrapping to make + * it work. It's far from ideal but I don't have a more efficient solution. + */ + private static InputStream enforceMarking(InputStream input) throws IOException { + if (input.markSupported()) { + return input; + } else { + ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024); + + byte[] buffer = new byte[1024 * 4]; + long count = 0; + int n = 0; + while (-1 != (n = input.read(buffer))) { + output.write(buffer, 0, n); + count += n; + } + input.close(); + + return new ByteArrayInputStream(output.toByteArray()); + } + } + } diff --git a/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java index 456804323..bdd5eeb72 100644 --- a/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/util/GetExpressionLanguageInfoCommand.java @@ -22,6 +22,9 @@ public class GetExpressionLanguageInfoCommand extends Command { throws ServletException, IOException { try { + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + JSONWriter writer = new JSONWriter(response.getWriter()); Properties options = new Properties(); diff --git a/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java index d7ced540d..a71a5740c 100644 --- a/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/util/GuessTypesOfColumnCommand.java @@ -40,6 +40,9 @@ public class GuessTypesOfColumnCommand extends Command { Project project = getProject(request); String columnName = request.getParameter("columnName"); + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + JSONWriter writer = new JSONWriter(response.getWriter()); writer.object(); diff --git a/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java index eb71d4f0a..546b6cfe3 100644 --- a/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/util/PreviewExpressionCommand.java @@ -40,6 +40,9 @@ public class PreviewExpressionCommand extends Command { return; } + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + JSONArray rowIndices = jsonStringToArray(rowIndicesString); int length = rowIndices.length(); diff --git a/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java b/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java index 842d9c4fb..dd844ab61 100644 --- a/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/util/PreviewProtographCommand.java @@ -24,6 +24,9 @@ public class PreviewProtographCommand extends Command { try { Project project = getProject(request); + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + String jsonString = request.getParameter("protograph"); JSONObject json = jsonStringToObject(jsonString); Protograph protograph = Protograph.reconstruct(json); diff --git a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java index d68262db4..545f40a77 100644 --- a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java +++ b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java @@ -39,7 +39,9 @@ import com.metaweb.gridworks.expr.functions.strings.IndexOf; import com.metaweb.gridworks.expr.functions.strings.LastIndexOf; import com.metaweb.gridworks.expr.functions.strings.MD5; import com.metaweb.gridworks.expr.functions.strings.Partition; +import com.metaweb.gridworks.expr.functions.strings.Phonetic; import com.metaweb.gridworks.expr.functions.strings.RPartition; +import com.metaweb.gridworks.expr.functions.strings.Reinterpret; import com.metaweb.gridworks.expr.functions.strings.Replace; import com.metaweb.gridworks.expr.functions.strings.ReplaceChars; import com.metaweb.gridworks.expr.functions.strings.ReplaceRegexp; @@ -53,6 +55,7 @@ import com.metaweb.gridworks.expr.functions.strings.ToUppercase; import com.metaweb.gridworks.expr.functions.strings.Trim; import com.metaweb.gridworks.expr.functions.strings.Unescape; import com.metaweb.gridworks.expr.functions.strings.Unicode; +import com.metaweb.gridworks.expr.functions.strings.UnicodeType; import com.metaweb.gridworks.expr.functions.tests.IsBlank; import com.metaweb.gridworks.expr.functions.tests.IsNotBlank; import com.metaweb.gridworks.expr.functions.tests.IsNotNull; @@ -124,9 +127,12 @@ public class ControlFunctionRegistry { registerFunction("sha1", new SHA1()); registerFunction("md5", new MD5()); registerFunction("unicode", new Unicode()); + registerFunction("unicodeType", new UnicodeType()); registerFunction("diff", new Diff()); registerFunction("chomp", new Diff()); registerFunction("fingerprint", new Fingerprint()); + registerFunction("phonetic", new Phonetic()); + registerFunction("reinterpret", new Reinterpret()); registerFunction("indexOf", new IndexOf()); registerFunction("lastIndexOf", new LastIndexOf()); diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java new file mode 100644 index 000000000..7ccf39681 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java @@ -0,0 +1,51 @@ +package com.metaweb.gridworks.expr.functions.strings; + +import java.util.Properties; + +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Soundex; +import org.json.JSONException; +import org.json.JSONWriter; + +import com.metaweb.gridworks.expr.ControlFunctionRegistry; +import com.metaweb.gridworks.expr.EvalError; +import com.metaweb.gridworks.expr.Function; + +public class Phonetic implements Function { + + private DoubleMetaphone metaphone2 = new DoubleMetaphone(); + private Metaphone metaphone = new Metaphone(); + private Soundex soundex = new Soundex(); + + public Object call(Properties bindings, Object[] args) { + if (args.length == 2) { + Object o1 = args[0]; + Object o2 = args[1]; + if (o1 != null && o2 != null && o2 instanceof String) { + String str = (o1 instanceof String) ? (String) o1 : o1.toString(); + String encoding = ((String) o2).toLowerCase(); + if ("doublemetaphone".equals(encoding)) { + return metaphone2.doubleMetaphone(str); + } else if ("metaphone".equals(encoding)) { + return metaphone.metaphone(str); + } else if ("soundex".equals(encoding)) { + return soundex.soundex(str); + } else { + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding."); + } + } + } + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 3 strings"); + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Returns the a phonetic encoding of s (optionally indicating which encoding to use')"); + writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')"); + writer.key("returns"); writer.value("string"); + writer.endObject(); + } +} diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java new file mode 100644 index 000000000..d413a0176 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Reinterpret.java @@ -0,0 +1,45 @@ +package com.metaweb.gridworks.expr.functions.strings; + +import java.io.UnsupportedEncodingException; +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; + +import com.metaweb.gridworks.expr.ControlFunctionRegistry; +import com.metaweb.gridworks.expr.EvalError; +import com.metaweb.gridworks.expr.Function; + +public class Reinterpret implements Function { + + public Object call(Properties bindings, Object[] args) { + if (args.length == 2) { + Object o1 = args[0]; + Object o2 = args[1]; + if (o1 != null && o2 != null && o2 instanceof String) { + String str = (o1 instanceof String) ? (String) o1 : o1.toString(); + String decoder = (String) o2; + String reinterpreted = null; + + try { + reinterpreted = new String(str.getBytes(decoder), "UTF8"); + } catch (UnsupportedEncodingException e) { + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized."); + } + + return reinterpreted; + } + } + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects 2 arguments"); + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Returns s reinterpreted thru the given encoder."); + writer.key("params"); writer.value("string s, string encoder"); + writer.key("returns"); writer.value("string"); + writer.endObject(); + } +} diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java index 60cbdf4ea..0c14f3a65 100644 --- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java @@ -5,9 +5,7 @@ import java.util.Properties; import org.json.JSONException; import org.json.JSONWriter; -import com.metaweb.gridworks.expr.ControlFunctionRegistry; import com.metaweb.gridworks.expr.Function; -import com.metaweb.gridworks.expr.EvalError; public class Unicode implements Function { @@ -21,7 +19,7 @@ public class Unicode implements Function { } return output; } - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects an argument"); + return null; } public void write(JSONWriter writer, Properties options) diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java new file mode 100644 index 000000000..9ccc73ca9 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/UnicodeType.java @@ -0,0 +1,71 @@ +package com.metaweb.gridworks.expr.functions.strings; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; + +import com.metaweb.gridworks.expr.Function; + +public class UnicodeType implements Function { + + public Object call(Properties bindings, Object[] args) { + if (args.length == 1 && args[0] != null) { + Object o = args[0]; + String s = (o instanceof String) ? (String) o : o.toString(); + String[] output = new String[s.length()]; + for (int i = 0; i < s.length(); i++) { + output[i] = translateType(Character.getType(s.codePointAt(i))); + } + return output; + } + return null; + } + + private String translateType(int type) { + switch(type) { + case 0: return "unassigned"; + case 1: return "uppercase letter"; + case 2: return "lowercase letter"; + case 3: return "titlecase letter"; + case 4: return "modifier letter"; + case 5: return "other letter"; + case 6: return "non spacing mark"; + case 7: return "enclosing mark"; + case 8: return "combining spacing mark"; + case 9: return "decimal digit number"; + case 10: return "letter number"; + case 11: return "other number"; + case 12: return "space separator"; + case 13: return "line separator"; + case 14: return "paragraph separator"; + case 15: return "control"; + case 16: return "format"; + // 17 does not seem to be used + case 18: return "private use"; + case 19: return "surrogate"; + case 20: return "dash punctuation"; + case 21: return "start punctuation"; + case 22: return "end punctuation"; + case 23: return "connector punctuation"; + case 24: return "other punctuation"; + case 25: return "math symbol"; + case 26: return "currency symbol"; + case 27: return "modifier symbol"; + case 28: return "other symbol"; + case 29: return "initial quote punctuation"; + case 30: return "final quote punctuation"; + default: return "unknown"; + } + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Returns an array of strings describing each character of s in their full unicode notation"); + writer.key("params"); writer.value("string s"); + writer.key("returns"); writer.value("string"); + writer.endObject(); + } +} diff --git a/src/main/webapp/index.html b/src/main/webapp/index.html index 3191d15b9..00afb05a5 100644 --- a/src/main/webapp/index.html +++ b/src/main/webapp/index.html @@ -1 +1 @@ - Gridworks

New Project

Create a new project by uploading a tab-separated value or comma-separated value file.

Project Name:
Project Password:
optional, not protected, so use some password you don't care to reveal
Upload File:
Skip: initial data rows
Load up to: data rows
\ No newline at end of file + Gridworks

New Project

Create a new project by uploading a tab-separated value or comma-separated value file.

Project Name:
Project Password:
optional, not protected, so use some password you don't care to reveal
Upload File:
Skip: initial data rows
Load up to: data rows
\ No newline at end of file diff --git a/tests/example-latin1.tsv b/tests/example-latin1.tsv new file mode 100644 index 000000000..487066e49 --- /dev/null +++ b/tests/example-latin1.tsv @@ -0,0 +1,3 @@ +Start End Country Location Type Sub_Type Name Killed Cost Affected Id +02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160 +07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752 diff --git a/tests/example-utf8.tsv b/tests/example-utf8.tsv new file mode 100644 index 000000000..723681f05 --- /dev/null +++ b/tests/example-utf8.tsv @@ -0,0 +1,3 @@ +Start End Country Location Type Sub_Type Name Killed Cost Affected Id +02042003 05042003 Algeria Tizi Ouzou, Béjaïa, (Kaby ... Flood General flood 15 50 2003-0160 +07122002 07122002 Algeria Near Béjaïa, Tizi Ouzou ( ... Flood 6 2002-0752