Fixed up Rdf Triples importer, added a parser UI for it, and got its tests to pass.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@2283 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
cf206c33bc
commit
18f32ed7e8
@ -57,6 +57,15 @@ abstract public class ImportingParserBase implements ImportingParser {
|
||||
this.useInputStream = useInputStream;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JSONObject createParserUIInitializationData(ImportingJob job,
|
||||
List<JSONObject> fileRecords, String format) {
|
||||
JSONObject options = new JSONObject();
|
||||
JSONUtilities.safePut(options, "includeFileSources", fileRecords.size() > 1);
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parse(Project project, ProjectMetadata metadata,
|
||||
final ImportingJob job, List<JSONObject> fileRecords, String format,
|
||||
|
@ -44,7 +44,6 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
import org.jrdf.JRDFFactory;
|
||||
import org.jrdf.SortedMemoryJRDFFactory;
|
||||
import org.jrdf.collection.MemMapFactory;
|
||||
@ -78,12 +77,6 @@ public class RdfTripleImporter extends ImportingParserBase {
|
||||
_newMapFactory = new MemMapFactory();
|
||||
}
|
||||
|
||||
@Override
|
||||
public JSONObject createParserUIInitializationData(ImportingJob job,
|
||||
List<JSONObject> fileRecords, String format) {
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parseOneFile(Project project, ProjectMetadata metadata,
|
||||
ImportingJob job, String fileSource, Reader reader, int limit,
|
||||
@ -101,14 +94,13 @@ public class RdfTripleImporter extends ImportingParserBase {
|
||||
return;
|
||||
}
|
||||
|
||||
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
|
||||
|
||||
Column subjectColumn = new Column(0, "subject");
|
||||
project.columnModel.columns.add(0, subjectColumn);
|
||||
project.columnModel.setKeyColumnIndex(0);
|
||||
|
||||
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
|
||||
try {
|
||||
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
|
||||
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
|
||||
project.columnModel.addColumn(0, subjectColumn, false);
|
||||
project.columnModel.setKeyColumnIndex(0);
|
||||
|
||||
for (Triple triple : triples) {
|
||||
String subject = triple.getSubject().toString();
|
||||
String predicate = triple.getPredicate().toString();
|
||||
@ -117,11 +109,7 @@ public class RdfTripleImporter extends ImportingParserBase {
|
||||
Column column = project.columnModel.getColumnByName(predicate);
|
||||
if (column == null) {
|
||||
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
|
||||
try {
|
||||
project.columnModel.addColumn(-1, column, true);
|
||||
} catch (ModelException e) {
|
||||
// ignore
|
||||
}
|
||||
project.columnModel.addColumn(-1, column, true);
|
||||
}
|
||||
|
||||
int cellIndex = column.getCellIndex();
|
||||
@ -156,6 +144,8 @@ public class RdfTripleImporter extends ImportingParserBase {
|
||||
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
|
||||
project.rows.addAll(entry.getValue());
|
||||
}
|
||||
} catch (ModelException e) {
|
||||
exceptions.add(e);
|
||||
} finally {
|
||||
triples.iterator().close();
|
||||
}
|
||||
|
@ -58,7 +58,7 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
|
||||
@Override
|
||||
public JSONObject createParserUIInitializationData(ImportingJob job,
|
||||
List<JSONObject> fileRecords, String format) {
|
||||
JSONObject options = new JSONObject();
|
||||
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
|
||||
|
||||
JSONUtilities.safePut(options, "ignoreLines", -1); // number of blank lines at the beginning to ignore
|
||||
JSONUtilities.safePut(options, "headerLines", 1); // number of header lines
|
||||
@ -67,8 +67,6 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
|
||||
JSONUtilities.safePut(options, "storeBlankRows", true);
|
||||
JSONUtilities.safePut(options, "storeBlankCellsAsNulls", true);
|
||||
|
||||
JSONUtilities.safePut(options, "includeFileSources", fileRecords.size() > 1);
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,7 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
int closeBraces = 0;
|
||||
int openAngleBrackets = 0;
|
||||
int closeAngleBrackets = 0;
|
||||
int trailingPeriods = 0;
|
||||
|
||||
char firstChar = ' ';
|
||||
boolean foundFirstChar = false;
|
||||
@ -36,6 +37,7 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
closeBraces += countSubstrings(chunk, "}");
|
||||
openAngleBrackets += countSubstrings(chunk, "<");
|
||||
closeAngleBrackets += countSubstrings(chunk, ">");
|
||||
trailingPeriods += countLineSuffix(chunk, ".");
|
||||
|
||||
if (!foundFirstChar) {
|
||||
chunk = chunk.trim();
|
||||
@ -51,9 +53,12 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
if ((firstChar == '{' || firstChar == '[') &&
|
||||
openBraces >= 5 && closeBraces >= 5) {
|
||||
return "text/json";
|
||||
} else if (firstChar == '<' &&
|
||||
openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
|
||||
return "text/xml";
|
||||
} else if (openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
|
||||
if (trailingPeriods > 0) {
|
||||
return "text/rdf+n3";
|
||||
} else if (firstChar == '<') {
|
||||
return "text/xml";
|
||||
}
|
||||
}
|
||||
}
|
||||
return "text/line-based";
|
||||
@ -82,4 +87,31 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static public int countLineSuffix(String s, String suffix) {
|
||||
int count = 0;
|
||||
int from = 0;
|
||||
while (from < s.length()) {
|
||||
int lineEnd = s.indexOf('\n', from);
|
||||
if (lineEnd < 0) {
|
||||
break;
|
||||
} else {
|
||||
int i = lineEnd - 1;
|
||||
while (i >= from + suffix.length() - 1) {
|
||||
if (Character.isWhitespace(s.charAt(i))) {
|
||||
i--;
|
||||
} else {
|
||||
String suffix2 = s.subSequence(i - suffix.length() + 1, i + 1).toString();
|
||||
if (suffix2.equals(suffix)) {
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
from = lineEnd + 1;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -193,6 +193,8 @@ function registerImporting() {
|
||||
IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI",
|
||||
new Packages.com.google.refine.importers.FixedWidthImporter());
|
||||
|
||||
IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
|
||||
|
||||
IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter());
|
||||
IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());
|
||||
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
|
||||
@ -220,6 +222,8 @@ function registerImporting() {
|
||||
IM.registerExtension(".xls", "binary/xls");
|
||||
IM.registerExtension(".xlsx", "text/xml/xlsx");
|
||||
|
||||
IM.registerExtension(".n3", "text/rdf+n3");
|
||||
|
||||
IM.registerExtension(".marc", "text/marc");
|
||||
IM.registerExtension(".mrc", "text/marc");
|
||||
|
||||
@ -233,6 +237,8 @@ function registerImporting() {
|
||||
|
||||
IM.registerMimeType("text/fixed-width", "text/line-based/fixed-width");
|
||||
|
||||
IM.registerMimeType("text/rdf+n3", "text/rdf+n3");
|
||||
|
||||
IM.registerMimeType("application/msexcel", "binary/xls");
|
||||
IM.registerMimeType("application/x-msexcel", "binary/xls");
|
||||
IM.registerMimeType("application/x-ms-excel", "binary/xls");
|
||||
@ -316,7 +322,8 @@ function init() {
|
||||
"scripts/index/parser-interfaces/fixed-width-parser-ui.js",
|
||||
"scripts/index/parser-interfaces/excel-parser-ui.js",
|
||||
"scripts/index/parser-interfaces/xml-parser-ui.js",
|
||||
"scripts/index/parser-interfaces/json-parser-ui.js"
|
||||
"scripts/index/parser-interfaces/json-parser-ui.js",
|
||||
"scripts/index/parser-interfaces/rdf-triples-parser-ui.js"
|
||||
]
|
||||
);
|
||||
|
||||
|
@ -0,0 +1,16 @@
|
||||
<div class="grid-layout layout-loose layout-full"><table>
|
||||
<tr>
|
||||
<td><div class="grid-layout layout-tighter"><table>
|
||||
<tr>
|
||||
<td width="1%">Character encoding</td>
|
||||
<td><input bind="encodingInput"></input></td>
|
||||
</tr>
|
||||
</table></div></td>
|
||||
<td colspan="2"><div class="grid-layout layout-tighter layout-full"><table>
|
||||
<tr>
|
||||
<td style="text-align: right;"> </td>
|
||||
<td width="1%"><button class="button" bind="previewButton">Update Preview</button></td>
|
||||
</tr>
|
||||
</table></div></td>
|
||||
</tr>
|
||||
</table></div>
|
@ -0,0 +1,121 @@
|
||||
/*
|
||||
|
||||
Copyright 2011, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/**
 * Parser UI for the RDF triples importer.
 *
 * Stores the controller, job details and the three container elements,
 * builds the options panel and immediately requests a first preview.
 *
 * @param controller            importing controller used to push options and fetch preview data
 * @param jobID                 identifier of the importing job
 * @param job                   the importing job
 * @param format                the selected format
 * @param config                initial options; config.encoding seeds the encoding field
 * @param dataContainerElmt     element that receives the preview table
 * @param progressContainerElmt element shown while a preview is being computed
 * @param optionContainerElmt   element that receives the options panel
 */
Refine.RdfTriplesParserUI = function(controller, jobID, job, format, config,
    dataContainerElmt, progressContainerElmt, optionContainerElmt) {

  // Containers handed in by the caller.
  this._optionContainer = optionContainerElmt;
  this._progressContainer = progressContainerElmt;
  this._dataContainer = dataContainerElmt;

  // Job context.
  this._config = config;
  this._format = format;
  this._job = job;
  this._jobID = jobID;
  this._controller = controller;

  // Handle of the pending debounced preview refresh, if any.
  this._timerID = null;

  this._initialize();
  this._updatePreview();
};

// Make this UI available to the importing controller under its registered name.
Refine.DefaultImportingController.parserUIs["RdfTriplesParserUI"] = Refine.RdfTriplesParserUI;
|
||||
|
||||
/**
 * Reports whether project creation may proceed.
 * This parser UI has no preconditions to check, so the answer is always true.
 *
 * @return {boolean} always true
 */
Refine.RdfTriplesParserUI.prototype.confirmReadyToCreateProject = function() {
  var ready = true;
  return ready;
};
|
||||
|
||||
/**
 * Releases resources held by this UI: cancels the pending preview
 * refresh timer, if one is scheduled.
 */
Refine.RdfTriplesParserUI.prototype.dispose = function() {
  if (this._timerID == null) {
    return; // nothing scheduled
  }

  window.clearTimeout(this._timerID);
  this._timerID = null;
};
|
||||
|
||||
/**
 * Collects the parser options currently entered in the options panel.
 *
 * @return {Object} an object with a single "encoding" property holding the
 *     trimmed text of the encoding input
 */
Refine.RdfTriplesParserUI.prototype.getOptions = function() {
  var encodingField = this._optionContainerElmts.encodingInput[0];
  return {
    encoding: $.trim(encodingField.value)
  };
};
|
||||
|
||||
/**
 * Builds the options panel and wires up its event handlers.
 *
 * The option container is cleared and filled with the markup loaded from
 * rdf-triples-parser-ui.html. The preview button refreshes the preview
 * immediately; clicking the encoding input hands it to
 * Encoding.selectEncoding with a callback that refreshes the preview; any
 * "change" on an input or select schedules a delayed refresh.
 */
Refine.RdfTriplesParserUI.prototype._initialize = function() {
  var self = this;

  var refreshNow = function() {
    self._updatePreview();
  };
  var refreshSoon = function() {
    self._scheduleUpdatePreview();
  };

  // Clear the container first, then load and insert the panel markup.
  var panel = this._optionContainer.unbind().empty();
  panel.html(
      DOM.loadHTML("core", "scripts/index/parser-interfaces/rdf-triples-parser-ui.html"));

  var elmts = DOM.bind(this._optionContainer);
  this._optionContainerElmts = elmts;

  elmts.previewButton.click(function() {
    refreshNow();
  });

  var initialEncoding = this._config.encoding || '';
  elmts.encodingInput.attr('value', initialEncoding);
  elmts.encodingInput.click(function() {
    // "this" is the clicked input element.
    Encoding.selectEncoding($(this), function() {
      refreshNow();
    });
  });

  var onChange = function() {
    refreshSoon();
  };
  this._optionContainer.find("input").bind("change", onChange);
  this._optionContainer.find("select").bind("change", onChange);
};
|
||||
|
||||
/**
 * Schedules a preview refresh half a second from now, replacing any
 * refresh that is already pending so that rapid changes trigger only one
 * update.
 */
Refine.RdfTriplesParserUI.prototype._scheduleUpdatePreview = function() {
  var self = this;

  // Drop the previously scheduled refresh, if there is one.
  if (self._timerID != null) {
    window.clearTimeout(self._timerID);
    self._timerID = null;
  }

  var fire = function() {
    self._timerID = null;
    self._updatePreview();
  };
  self._timerID = window.setTimeout(fire, 500); // 0.5 second
};
|
||||
|
||||
/**
 * Recomputes the preview for the current options.
 *
 * Shows the progress container, sends the current options to the
 * controller and, when the controller reports status "ok", fetches the
 * preview data, hides the progress container and renders a preview table
 * into the (emptied) data container. Nothing further happens here for any
 * other status.
 */
Refine.RdfTriplesParserUI.prototype._updatePreview = function() {
  var self = this;

  var renderPreview = function(projectData) {
    self._progressContainer.hide();

    var target = self._dataContainer.unbind().empty();
    new Refine.PreviewTable(projectData, target);
  };

  var onOptionsUpdated = function(result) {
    if (result.status != "ok") {
      return;
    }
    self._controller.getPreviewData(renderPreview);
  };

  this._progressContainer.show();
  this._controller.updateFormatAndOptions(this.getOptions(), onOptionsUpdated);
};
|
Loading…
Reference in New Issue
Block a user