Added streaming JSON parser for faster re-loading of existing projects.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@470 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-04-13 23:57:03 +00:00
parent 8142b27ee4
commit 4a06c49a9a
13 changed files with 253 additions and 11 deletions

View File

@ -24,5 +24,6 @@
<classpathentry kind="lib" path="lib/jython-2.5.1.jar"/>
<classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/>
<classpathentry kind="lib" path="tests/java/lib/junit-4.8.1.jar" sourcepath="tests/java/lib-src/junit-4.8.1-sources.jar"/>
<classpathentry kind="lib" path="lib/jackson-core-asl-1.5.1.jar"/>
<classpathentry kind="output" path="src/main/webapp/WEB-INF/classes"/>
</classpath>

View File

@ -42,6 +42,7 @@ licenses/apache2.0.LICENSE.txt
commons-lang
commons-codec
commons-math
jackson
jdatapath
jetty
jetty-util

Binary file not shown.

Binary file not shown.

View File

@ -417,15 +417,19 @@ public class ProjectManager {
}
public void deleteProject(Project project) {
synchronized (this) {
if (_projectsMetadata.containsKey(project.id)) {
_projectsMetadata.remove(project.id);
}
if (_projects.containsKey(project.id)) {
_projects.remove(project.id);
deleteProject(project.id);
}
File dir = getProjectDir(project.id);
public void deleteProject(long projectID) {
synchronized (this) {
if (_projectsMetadata.containsKey(projectID)) {
_projectsMetadata.remove(projectID);
}
if (_projects.containsKey(projectID)) {
_projects.remove(projectID);
}
File dir = getProjectDir(projectID);
if (dir.exists()) {
dir.delete();
}
@ -433,7 +437,6 @@ public class ProjectManager {
saveWorkspace();
}
protected void load() {
if (loadFromFile(new File(_workspaceDir, "workspace.json"))) return;
if (loadFromFile(new File(_workspaceDir, "workspace.temp.json"))) return;

View File

@ -15,7 +15,9 @@ public class DeleteProjectCommand extends Command {
throws ServletException, IOException {
try {
ProjectManager.singleton.deleteProject(getProject(request));
long projectID = Long.parseLong(request.getParameter("project"));
ProjectManager.singleton.deleteProject(projectID);
respond(response, "{ \"code\" : \"ok\" }");

View File

@ -7,6 +7,8 @@ import java.util.Date;
import java.util.Map;
import java.util.Properties;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
@ -103,4 +105,53 @@ public class Cell implements HasFields, Jsonizable {
return new Cell(value, recon);
}
/**
 * Reads a single cell from a streaming JSON parser.
 * <p>
 * The parser must be positioned on the cell's first token. Any token other
 * than {@code START_OBJECT} (a JSON {@code null} included) yields
 * {@code null} and the parser is left where it is. Otherwise the object is
 * consumed up to and including its {@code END_OBJECT}. The recognised fields
 * are {@code "v"} (value), {@code "e"} (error text, wrapped in an
 * {@link EvalError}), {@code "t"} (value type; {@code "date"} triggers date
 * parsing of a string value) and {@code "r"} (recon).
 *
 * @param jp         parser positioned on the cell's first token
 * @param reconCache recon lookup passed through to {@link Recon#loadStreaming}
 * @return the cell, or {@code null} when the token is not an object or the
 *         object carries neither a value nor an error
 * @throws Exception on parser failures or date-parsing failures
 */
static public Cell loadStreaming(JsonParser jp, Map<Long, Recon> reconCache) throws Exception {
    if (jp.getCurrentToken() != JsonToken.START_OBJECT) {
        return null;
    }

    Serializable value = null;
    String type = null;
    Recon recon = null;

    while (jp.nextToken() != JsonToken.END_OBJECT) {
        String fieldName = jp.getCurrentName();
        JsonToken token = jp.nextToken();

        if ("r".equals(fieldName)) {
            if (token == JsonToken.START_OBJECT) {
                recon = Recon.loadStreaming(jp, reconCache);
            } else {
                // null or malformed recon: skip it whole so the loop stays aligned
                jp.skipChildren();
            }
        } else if ("e".equals(fieldName)) {
            value = new EvalError(jp.getText());
        } else if ("v".equals(fieldName)) {
            if (token == JsonToken.VALUE_STRING) {
                value = jp.getText();
            } else if (token == JsonToken.VALUE_NUMBER_INT) {
                // long, not int: getIntValue() rejects integers beyond 32 bits
                value = jp.getLongValue();
            } else if (token == JsonToken.VALUE_NUMBER_FLOAT) {
                // double, not float: getFloatValue() would drop precision
                value = jp.getDoubleValue();
            } else if (token == JsonToken.VALUE_TRUE) {
                value = true;
            } else if (token == JsonToken.VALUE_FALSE) {
                value = false;
            } else {
                // null, object or array: no usable value; step over any children
                jp.skipChildren();
            }
        } else if ("t".equals(fieldName)) {
            type = jp.getText();
        } else {
            // unknown field: ignore it, including any nested structure
            jp.skipChildren();
        }
    }

    if (value == null) {
        return null;
    }

    // "t" may appear before or after "v", so the conversion happens once the
    // whole object has been read; only string values can be parsed as dates.
    if ("date".equals(type) && value instanceof String) {
        value = ParsingUtilities.stringToDate((String) value);
    }
    return new Cell(value, recon);
}
}

View File

@ -177,6 +177,8 @@ public class Project {
}
static protected Project loadFromReader(LineNumberReader reader, long id) throws Exception {
long start = System.currentTimeMillis();
/* String version = */ reader.readLine();
Project project = new Project(id);
@ -210,6 +212,13 @@ public class Project {
}
project.columnModel.setMaxCellIndex(maxCellCount - 1);
Gridworks.log(
"Loaded project " + id + " from disk in " +
(System.currentTimeMillis() - start) / 1000 +
" sec(s)"
);
project.recomputeRowContextDependencies();
return project;

View File

@ -6,6 +6,8 @@ import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@ -235,4 +237,72 @@ public class Recon implements HasFields, Jsonizable {
return recon;
}
/**
 * Reads a recon record from a streaming JSON parser.
 * <p>
 * The parser must be positioned on the record's first token. Any token other
 * than {@code START_OBJECT} yields {@code null} without advancing the parser.
 * Otherwise the object is always consumed up to and including its
 * {@code END_OBJECT}, even when the content is malformed, so that the caller
 * can keep reading the enclosing structure.
 * <p>
 * When the {@code "id"} is already present in {@code reconCache} the cached
 * instance is returned; its match, features and candidates are left alone and
 * the serialized ones are only skipped over. The judgment field is applied in
 * both cases. Fields that appear before {@code "id"} cannot be attached to
 * anything and are skipped.
 * <p>
 * NOTE(review): a freshly created Recon is not added to {@code reconCache}
 * here; confirm whether the caller is expected to do that.
 *
 * @param jp         parser positioned on the recon's first token
 * @param reconCache previously loaded recons keyed by id
 * @return the recon, or {@code null} when the token is not an object, no id
 *         was found, or {@code "f"} / {@code "c"} is not an array
 * @throws Exception on parser failures
 */
static public Recon loadStreaming(JsonParser jp, Map<Long, Recon> reconCache) throws Exception {
    if (jp.getCurrentToken() != JsonToken.START_OBJECT) {
        return null;
    }

    Recon recon = null;
    boolean old = true;         // stays true unless a brand-new Recon is created below
    boolean malformed = false;  // remembered so the object can still be consumed fully

    while (jp.nextToken() != JsonToken.END_OBJECT) {
        String fieldName = jp.getCurrentName();
        JsonToken token = jp.nextToken();

        // Only a newly created recon receives match/features/candidates.
        boolean fresh = recon != null && !old;

        if ("id".equals(fieldName)) {
            long id = jp.getLongValue();
            if (reconCache.containsKey(id)) {
                recon = reconCache.get(id);
            } else {
                recon = new Recon(id);
                old = false;
            }
        } else if ("j".equals(fieldName)) {
            if (recon != null) {
                recon.judgment = stringToJudgment(jp.getText());
            }
        } else if ("m".equals(fieldName)) {
            if (token == JsonToken.START_OBJECT) {
                ReconCandidate match = ReconCandidate.loadStreaming(jp, reconCache);
                if (fresh) {
                    recon.match = match;
                }
            } else {
                jp.skipChildren();
            }
        } else if ("f".equals(fieldName)) {
            if (token != JsonToken.START_ARRAY) {
                // Returning right here would strand the parser inside this object.
                malformed = true;
                jp.skipChildren();
            } else {
                int feature = 0;
                while (jp.nextToken() != JsonToken.END_ARRAY) {
                    JsonToken element = jp.getCurrentToken();
                    if (fresh && feature < recon.features.length) {
                        if (element == JsonToken.VALUE_STRING) {
                            recon.features[feature++] = jp.getText();
                        } else if (element == JsonToken.VALUE_NUMBER_INT) {
                            // long, not int: avoids range errors on large integers
                            recon.features[feature++] = jp.getLongValue();
                        } else if (element == JsonToken.VALUE_NUMBER_FLOAT) {
                            // double, not float: keeps full precision
                            recon.features[feature++] = jp.getDoubleValue();
                        } else if (element == JsonToken.VALUE_FALSE) {
                            recon.features[feature++] = false;
                        } else if (element == JsonToken.VALUE_TRUE) {
                            recon.features[feature++] = true;
                        }
                    }
                    // No-op for scalars; steps over an unexpected nested container.
                    jp.skipChildren();
                }
            }
        } else if ("c".equals(fieldName)) {
            if (token != JsonToken.START_ARRAY) {
                malformed = true;
                jp.skipChildren();
            } else {
                while (jp.nextToken() != JsonToken.END_ARRAY) {
                    ReconCandidate rc = ReconCandidate.loadStreaming(jp, reconCache);
                    if (rc != null && fresh) {
                        recon.addCandidate(rc);
                    }
                    // No-op after a consumed object; skips a stray nested array.
                    jp.skipChildren();
                }
            }
        } else {
            // unknown field: ignore it, including any nested structure
            jp.skipChildren();
        }
    }

    return malformed ? null : recon;
}
}

View File

@ -1,7 +1,12 @@
package com.metaweb.gridworks.model;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
@ -78,4 +83,58 @@ public class ReconCandidate implements HasFields, Jsonizable {
);
return candidate;
}
/**
 * Reads a recon candidate from a streaming JSON parser.
 * <p>
 * The parser must be positioned on the candidate's first token. Any token
 * other than {@code START_OBJECT} yields {@code null} without advancing the
 * parser. Otherwise the object is always consumed up to and including its
 * {@code END_OBJECT}, so the caller's token loop stays aligned even when the
 * candidate turns out to be malformed.
 *
 * @param jp         parser positioned on the candidate's first token
 * @param reconCache not used by this method; kept for a uniform signature
 * @return the candidate, or {@code null} when the token is not an object or
 *         {@code "types"} is present but is not an array
 * @throws Exception on parser failures
 */
static public ReconCandidate loadStreaming(JsonParser jp, Map<Long, Recon> reconCache) throws Exception {
    if (jp.getCurrentToken() != JsonToken.START_OBJECT) {
        return null;
    }

    String id = null;
    String guid = null;
    String name = null;
    List<String> types = null;
    double score = 0;
    boolean malformed = false;  // remembered so the object can still be consumed fully

    while (jp.nextToken() != JsonToken.END_OBJECT) {
        String fieldName = jp.getCurrentName();
        JsonToken token = jp.nextToken();

        if ("id".equals(fieldName)) {
            id = jp.getText();
        } else if ("guid".equals(fieldName)) {
            guid = jp.getText();
        } else if ("name".equals(fieldName)) {
            name = jp.getText();
        } else if ("score".equals(fieldName)) {
            score = jp.getDoubleValue();
        } else if ("types".equals(fieldName)) {
            if (token != JsonToken.START_ARRAY) {
                // Returning right here would strand the parser inside this object.
                malformed = true;
                jp.skipChildren();
            } else {
                types = new ArrayList<String>();
                while (jp.nextToken() != JsonToken.END_ARRAY) {
                    types.add(jp.getText());
                }
            }
        } else {
            // unknown field: ignore it, including any nested structure
            jp.skipChildren();
        }
    }

    if (malformed) {
        return null;
    }

    String[] typesA = (types != null)
        ? types.toArray(new String[types.size()])
        : new String[0];

    return new ReconCandidate(id, guid, name, typesA, score);
}
}

View File

@ -7,6 +7,9 @@ import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@ -34,6 +37,12 @@ public class Row implements HasFields, Jsonizable {
cells = new ArrayList<Cell>(cellCount);
}
/**
 * Builds a row directly from already-loaded parts.
 * The given cell list is stored as-is; no copy is made.
 *
 * @param cells   the row's cells, adopted by reference
 * @param flagged initial value of the row's flagged marker
 * @param starred initial value of the row's starred marker
 */
protected Row(List<Cell> cells, boolean flagged, boolean starred) {
    this.starred = starred;
    this.flagged = flagged;
    this.cells = cells;
}
public Row dup() {
Row row = new Row(cells.size());
row.flagged = flagged;
@ -154,7 +163,9 @@ public class Row implements HasFields, Jsonizable {
}
static public Row load(String s, Map<Long, Recon> reconCache) throws Exception {
return s.length() == 0 ? null : load(ParsingUtilities.evaluateJsonStringToObject(s), reconCache);
return s.length() == 0 ? null :
//load(ParsingUtilities.evaluateJsonStringToObject(s), reconCache);
loadStreaming(s, reconCache);
}
static public Row load(JSONObject obj, Map<Long, Recon> reconCache) throws Exception {
@ -180,4 +191,39 @@ public class Row implements HasFields, Jsonizable {
return row;
}
/**
 * Parser factory shared by every {@link #loadStreaming} call. Building a
 * factory per row is wasted work on the project-loading hot path; Jackson
 * documents JsonFactory instances as reusable and thread-safe once configured.
 */
static private final JsonFactory JSON_FACTORY = new JsonFactory();

/**
 * Parses one serialized row using the streaming JSON parser.
 * <p>
 * Cells are read with {@link Cell#loadStreaming}; a {@code null} entry is
 * kept in the list for every cell that method reports as empty, so cell
 * positions are preserved.
 *
 * @param s          JSON text of a single row object
 * @param reconCache recon lookup passed through to the cell loader
 * @return the row, or {@code null} when the text is not a JSON object, its
 *         {@code "cells"} field is not an array, or no cells were read
 * @throws Exception on parser failures
 */
static public Row loadStreaming(String s, Map<Long, Recon> reconCache) throws Exception {
    JsonParser jp = JSON_FACTORY.createJsonParser(s);
    try {
        if (jp.nextToken() != JsonToken.START_OBJECT) {
            return null;
        }

        List<Cell> cells = new ArrayList<Cell>();
        boolean starred = false;
        boolean flagged = false;

        while (jp.nextToken() != JsonToken.END_OBJECT) {
            String fieldName = jp.getCurrentName();
            JsonToken token = jp.nextToken();

            if (STARRED.equals(fieldName)) {
                starred = jp.getBooleanValue();
            } else if (FLAGGED.equals(fieldName)) {
                flagged = jp.getBooleanValue();
            } else if ("cells".equals(fieldName)) {
                if (token != JsonToken.START_ARRAY) {
                    return null;
                }
                while (jp.nextToken() != JsonToken.END_ARRAY) {
                    cells.add(Cell.loadStreaming(jp, reconCache));
                    // No-op after a consumed cell or a null; steps over a stray
                    // nested array so its END_ARRAY cannot end this loop early.
                    jp.skipChildren();
                }
            } else {
                // unknown field: ignore it, including any nested structure
                jp.skipChildren();
            }
        }

        return (cells.size() > 0) ? new Row(cells, flagged, starred) : null;
    } finally {
        // Release the parser on every exit path, early returns included.
        jp.close();
    }
}
}

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html> <head> <title>Freebase Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/version.js"></script> <script type="text/javascript" src="scripts/index.js"></script> <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script> </head> <body> <div id="header"> <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a> <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div> </div> <div id="body"> <div id="body-empty"> <table><tr> <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-empty-create-project-panel-container"></td> </tr></table> </div> <div id="body-nonempty"> <table><tr> <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-nonempty-projects-container"> <div id="projects"></div> </td> <td id="body-nonempty-create-project-panel-container"></td> </tr></table> </div> </div> <div id="footer"> <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a> </div> <div id="body-template"> <div id="create-project-panel"> <h1>Upload Data File</h1> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <div class="grid-layout layout-tight"><table class="import-project-panel-layout"> <tr><td>Data File:</td> <td><input type="file" id="project-file-input" name="project-file" 
/></td></tr> <tr><td>Project Name:</td> <td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr> <tr><td>Column separator:</td> <td><input id="separator-input" name="separator" size="2" /> leave blank to guess comma or tab</td></tr> <tr><td>Guess Value Type:</td> <td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc.)</td></tr> <tr><td>Ignore:</td> <td><input id="ignore-input" name="ignore" size="5" value="0" /> initial non-blank lines</td></tr> <tr><td>Header lines:</td> <td><input id="header-lines-input" name="header-lines" size="5" value="1" /> (can be zero)</td></tr> <tr><td>Skip:</td> <td><input id="skip-input" name="skip" size="5" value="0" /> initial data rows</td></tr> <tr><td>Load up to:</td> <td><input id="limit-input" name="limit" size="5" /> data rows (leave blank to load all rows)</td></tr> <tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr> </table></div> </form> <h1>Import Existing Project</h1> <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8"> <table class="import-project-panel-layout"> <tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr> <tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr> <tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr> </table> </form> </div> </div> </body> </html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
  <title>Freebase Gridworks</title>
  <link rel="stylesheet" href="/styles/common.css" />
  <link rel="stylesheet" href="/styles/index.css" />
  <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
  <script type="text/javascript" src="externals/date.js"></script>
  <script type="text/javascript" src="scripts/util/string.js"></script>
  <script type="text/javascript" src="scripts/version.js"></script>
  <script type="text/javascript" src="scripts/index.js"></script>
  <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script>
</head>
<body>
  <div id="header">
    <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
    <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
  </div>
  <div id="body">
    <div id="body-empty">
      <table><tr>
        <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
        <td id="body-empty-create-project-panel-container"></td>
      </tr></table>
    </div>
    <div id="body-nonempty">
      <table><tr>
        <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
        <td id="body-nonempty-projects-container">
          <div id="projects"></div>
        </td>
        <td id="body-nonempty-create-project-panel-container"></td>
      </tr></table>
    </div>
  </div>
  <div id="footer">
    <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a>
  </div>
  <div id="body-template">
    <div id="create-project-panel">
      <h1>Upload Data File</h1>
      <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
        <div class="grid-layout layout-tight"><table class="import-project-panel-layout">
          <tr><td>Data File:</td>
            <td><input type="file" id="project-file-input" name="project-file" /></td></tr>
          <tr><td>Project Name:</td>
            <td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr>

          <!-- Options that apply to delimited text files only -->
          <tr><td></td><td><h3>Text File Options</h3></td></tr>
          <tr><td>Column separator:</td>
            <td><input id="separator-input" name="separator" size="2" /> leave blank to guess comma or tab</td></tr>
          <tr><td>Guess Value Type:</td>
            <td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc.)</td></tr>
          <!-- This row must carry the header-lines field, not a second copy of the separator input -->
          <tr><td>Header lines:</td>
            <td><input id="header-lines-input" name="header-lines" size="5" value="1" /> (can be zero)</td></tr>

          <!-- Options that apply to both text and Excel files -->
          <tr><td></td><td><h3>Text File and Excel File Options</h3></td></tr>
          <tr><td>Ignore:</td>
            <td><input id="ignore-input" name="ignore" size="5" value="0" /> initial non-blank lines</td></tr>
          <tr><td>Skip:</td>
            <td><input id="skip-input" name="skip" size="5" value="0" /> initial data rows</td></tr>
          <tr><td>Load up to:</td>
            <td><input id="limit-input" name="limit" size="5" /> data rows (leave blank to load all rows)</td></tr>
          <tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr>
        </table></div>
      </form>
      <h1>Import Existing Project</h1>
      <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8">
        <table class="import-project-panel-layout">
          <tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr>
          <tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr>
          <tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr>
        </table>
      </form>
    </div>
  </div>
</body>
</html>