Fixed up Rdf Triples importer, added a parser UI for it, and got its tests to pass.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@2283 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2011-10-06 21:28:20 +00:00
parent cf206c33bc
commit 18f32ed7e8
7 changed files with 198 additions and 25 deletions

View File

@ -57,6 +57,15 @@ abstract public class ImportingParserBase implements ImportingParser {
this.useInputStream = useInputStream;
}
/**
 * Builds the default option object handed to the parser UI.
 *
 * The only option set here is "includeFileSources", which is true when more
 * than one file record is being imported.
 *
 * @param job         the importing job (not read by this implementation)
 * @param fileRecords the file records being imported; only their count is used
 * @param format      the format identifier (not read by this implementation)
 * @return a new JSONObject holding the default options
 */
@Override
public JSONObject createParserUIInitializationData(ImportingJob job,
        List<JSONObject> fileRecords, String format) {
    final boolean multipleFiles = fileRecords.size() > 1;

    JSONObject defaults = new JSONObject();
    JSONUtilities.safePut(defaults, "includeFileSources", multipleFiles);
    return defaults;
}
@Override
public void parse(Project project, ProjectMetadata metadata,
final ImportingJob job, List<JSONObject> fileRecords, String format,

View File

@ -44,7 +44,6 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.NotImplementedException;
import org.jrdf.JRDFFactory;
import org.jrdf.SortedMemoryJRDFFactory;
import org.jrdf.collection.MemMapFactory;
@ -78,12 +77,6 @@ public class RdfTripleImporter extends ImportingParserBase {
_newMapFactory = new MemMapFactory();
}
/**
 * Not supported in this version of the method: every call throws
 * NotImplementedException and none of the arguments are read.
 */
@Override
public JSONObject createParserUIInitializationData(ImportingJob job,
List<JSONObject> fileRecords, String format) {
throw new NotImplementedException();
}
@Override
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, Reader reader, int limit,
@ -101,14 +94,13 @@ public class RdfTripleImporter extends ImportingParserBase {
return;
}
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
Column subjectColumn = new Column(0, "subject");
project.columnModel.columns.add(0, subjectColumn);
project.columnModel.setKeyColumnIndex(0);
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
try {
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
project.columnModel.addColumn(0, subjectColumn, false);
project.columnModel.setKeyColumnIndex(0);
for (Triple triple : triples) {
String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString();
@ -117,11 +109,7 @@ public class RdfTripleImporter extends ImportingParserBase {
Column column = project.columnModel.getColumnByName(predicate);
if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
try {
project.columnModel.addColumn(-1, column, true);
} catch (ModelException e) {
// ignore
}
project.columnModel.addColumn(-1, column, true);
}
int cellIndex = column.getCellIndex();
@ -156,6 +144,8 @@ public class RdfTripleImporter extends ImportingParserBase {
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
project.rows.addAll(entry.getValue());
}
} catch (ModelException e) {
exceptions.add(e);
} finally {
triples.iterator().close();
}

View File

@ -58,7 +58,7 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
@Override
public JSONObject createParserUIInitializationData(ImportingJob job,
List<JSONObject> fileRecords, String format) {
JSONObject options = new JSONObject();
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
JSONUtilities.safePut(options, "ignoreLines", -1); // number of blank lines at the beginning to ignore
JSONUtilities.safePut(options, "headerLines", 1); // number of header lines
@ -67,8 +67,6 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
JSONUtilities.safePut(options, "storeBlankRows", true);
JSONUtilities.safePut(options, "storeBlankCellsAsNulls", true);
JSONUtilities.safePut(options, "includeFileSources", fileRecords.size() > 1);
return options;
}

View File

@ -24,6 +24,7 @@ public class TextFormatGuesser implements FormatGuesser {
int closeBraces = 0;
int openAngleBrackets = 0;
int closeAngleBrackets = 0;
int trailingPeriods = 0;
char firstChar = ' ';
boolean foundFirstChar = false;
@ -36,6 +37,7 @@ public class TextFormatGuesser implements FormatGuesser {
closeBraces += countSubstrings(chunk, "}");
openAngleBrackets += countSubstrings(chunk, "<");
closeAngleBrackets += countSubstrings(chunk, ">");
trailingPeriods += countLineSuffix(chunk, ".");
if (!foundFirstChar) {
chunk = chunk.trim();
@ -51,9 +53,12 @@ public class TextFormatGuesser implements FormatGuesser {
if ((firstChar == '{' || firstChar == '[') &&
openBraces >= 5 && closeBraces >= 5) {
return "text/json";
} else if (firstChar == '<' &&
openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
return "text/xml";
} else if (openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
if (trailingPeriods > 0) {
return "text/rdf+n3";
} else if (firstChar == '<') {
return "text/xml";
}
}
}
return "text/line-based";
@ -82,4 +87,31 @@ public class TextFormatGuesser implements FormatGuesser {
}
return count;
}
/**
 * Counts the newline-terminated lines of {@code s} whose content, once trailing
 * whitespace is ignored, ends with {@code suffix}.
 *
 * Only lines terminated by '\n' are examined; any text after the last '\n' is
 * skipped (presumably because callers pass chunks that may stop mid-line --
 * confirm against the caller). The suffix must fit entirely inside the line
 * being examined, so it never matches across a line boundary. An empty suffix
 * matches every line that contains at least one non-whitespace character.
 *
 * @param s      the text to scan
 * @param suffix the text each line is expected to end with
 * @return the number of lines ending with {@code suffix}
 */
static public int countLineSuffix(String s, String suffix) {
    final int suffixLength = suffix.length();
    int count = 0;
    int from = 0;
    int lineEnd;
    while ((lineEnd = s.indexOf('\n', from)) >= 0) {
        // Walk back over trailing whitespace (this also covers the '\r' of a CRLF
        // ending), never stepping before the start of the current line.
        int last = lineEnd - 1;
        while (last >= from && Character.isWhitespace(s.charAt(last))) {
            last--;
        }

        // Compare only when the line has visible content and the suffix fits
        // within it; start + suffixLength - 1 == last, so a match ends exactly
        // at the last visible character.
        int start = last - suffixLength + 1;
        if (last >= from && start >= from && s.startsWith(suffix, start)) {
            count++;
        }

        from = lineEnd + 1;
    }
    return count;
}
}

View File

@ -193,6 +193,8 @@ function registerImporting() {
IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI",
new Packages.com.google.refine.importers.FixedWidthImporter());
IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter());
IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
@ -220,6 +222,8 @@ function registerImporting() {
IM.registerExtension(".xls", "binary/xls");
IM.registerExtension(".xlsx", "text/xml/xlsx");
IM.registerExtension(".n3", "text/rdf+n3");
IM.registerExtension(".marc", "text/marc");
IM.registerExtension(".mrc", "text/marc");
@ -233,6 +237,8 @@ function registerImporting() {
IM.registerMimeType("text/fixed-width", "text/line-based/fixed-width");
IM.registerMimeType("text/rdf+n3", "text/rdf+n3");
IM.registerMimeType("application/msexcel", "binary/xls");
IM.registerMimeType("application/x-msexcel", "binary/xls");
IM.registerMimeType("application/x-ms-excel", "binary/xls");
@ -316,7 +322,8 @@ function init() {
"scripts/index/parser-interfaces/fixed-width-parser-ui.js",
"scripts/index/parser-interfaces/excel-parser-ui.js",
"scripts/index/parser-interfaces/xml-parser-ui.js",
"scripts/index/parser-interfaces/json-parser-ui.js"
"scripts/index/parser-interfaces/json-parser-ui.js",
"scripts/index/parser-interfaces/rdf-triples-parser-ui.js"
]
);

View File

@ -0,0 +1,16 @@
<div class="grid-layout layout-loose layout-full"><table>
<tr>
<td><div class="grid-layout layout-tighter"><table>
<tr>
<td width="1%">Character&nbsp;encoding</td>
<td><input bind="encodingInput"></input></td>
</tr>
</table></div></td>
<td colspan="2"><div class="grid-layout layout-tighter layout-full"><table>
<tr>
<td style="text-align: right;">&nbsp;</td>
<td width="1%"><button class="button" bind="previewButton">Update&nbsp;Preview</button></td>
</tr>
</table></div></td>
</tr>
</table></div>

View File

@ -0,0 +1,121 @@
/*
Copyright 2011, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * Parser UI for the RDF triples importer.
 *
 * Stores the controller, job details and the three container elements on the
 * instance, then builds the option panel and requests a first preview.
 */
Refine.RdfTriplesParserUI = function(controller, jobID, job, format, config,
    dataContainerElmt, progressContainerElmt, optionContainerElmt) {
  // Container elements this UI works with.
  this._dataContainer = dataContainerElmt;
  this._progressContainer = progressContainerElmt;
  this._optionContainer = optionContainerElmt;

  // Job and format details supplied by the importing controller.
  this._controller = controller;
  this._jobID = jobID;
  this._job = job;
  this._format = format;
  this._config = config;

  // Handle of the pending delayed-preview timer, or null when none is scheduled.
  this._timerID = null;

  this._initialize();
  this._updatePreview();
};

// Register this UI under the name "RdfTriplesParserUI" in the controller's parser UI table.
Refine.DefaultImportingController.parserUIs.RdfTriplesParserUI = Refine.RdfTriplesParserUI;
/*
 * Always returns true: this UI performs no checks before project creation.
 */
Refine.RdfTriplesParserUI.prototype.confirmReadyToCreateProject = function() {
return true;
};
/*
 * Cancels the pending delayed preview update, if there is one, and clears the
 * stored timer handle.
 */
Refine.RdfTriplesParserUI.prototype.dispose = function() {
  var pendingTimer = this._timerID;
  if (pendingTimer == null) {
    return;
  }

  window.clearTimeout(pendingTimer);
  this._timerID = null;
};
/*
 * Reads the current parser options from the option panel.
 *
 * Returns an object with a single key, "encoding", holding the trimmed value
 * of the encoding input.
 */
Refine.RdfTriplesParserUI.prototype.getOptions = function() {
  var encodingField = this._optionContainerElmts.encodingInput[0];
  return {
    encoding: $.trim(encodingField.value)
  };
};
/*
 * Builds the option panel: loads the HTML template into the option container,
 * binds its named elements, and wires up the preview button, the encoding
 * input and the change handlers.
 */
Refine.RdfTriplesParserUI.prototype._initialize = function() {
  var self = this;
  var container = this._optionContainer;

  // Clear the container first, then fill it with the template markup.
  container.unbind().empty();
  container.html(
      DOM.loadHTML("core", "scripts/index/parser-interfaces/rdf-triples-parser-ui.html"));

  var elmts = DOM.bind(container);
  this._optionContainerElmts = elmts;

  // The button refreshes the preview immediately.
  elmts.previewButton.click(function() {
    self._updatePreview();
  });

  // Clicking the encoding input opens the encoding picker; the preview is
  // refreshed from the picker's callback.
  var openEncodingPicker = function() {
    Encoding.selectEncoding($(this), function() {
      self._updatePreview();
    });
  };
  elmts.encodingInput
      .attr('value', this._config.encoding || '')
      .click(openEncodingPicker);

  // A change on any input or select schedules a delayed preview refresh.
  var scheduleRefresh = function() {
    self._scheduleUpdatePreview();
  };
  var selectors = ["input", "select"];
  for (var i = 0; i < selectors.length; i++) {
    container.find(selectors[i]).bind("change", scheduleRefresh);
  }
};
/*
 * Schedules a preview refresh half a second from now. A refresh that is
 * already pending is cancelled first, so only the most recent request runs.
 */
Refine.RdfTriplesParserUI.prototype._scheduleUpdatePreview = function() {
  var ui = this;

  // Drop the previously scheduled refresh, if any.
  if (ui._timerID != null) {
    window.clearTimeout(ui._timerID);
    ui._timerID = null;
  }

  var runRefresh = function() {
    ui._timerID = null;
    ui._updatePreview();
  };
  ui._timerID = window.setTimeout(runRefresh, 500); // 0.5 second
};
/*
 * Sends the current options to the controller and, when the controller
 * reports status "ok", fetches the preview data and renders it into the data
 * container.
 *
 * The progress container is shown at the start and hidden only once preview
 * data arrives; for any other status nothing further happens here.
 */
Refine.RdfTriplesParserUI.prototype._updatePreview = function() {
  var ui = this;

  var renderPreview = function(projectData) {
    ui._progressContainer.hide();

    new Refine.PreviewTable(projectData, ui._dataContainer.unbind().empty());
  };

  var onOptionsUpdated = function(result) {
    if (result.status != "ok") {
      return;
    }
    ui._controller.getPreviewData(renderPreview);
  };

  this._progressContainer.show();
  this._controller.updateFormatAndOptions(this.getOptions(), onOptionsUpdated);
};