2010-10-20 22:45:52 +02:00
|
|
|
/*

Copyright 2010, Google Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
    * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
|
|
|
|
|
2010-09-22 19:04:10 +02:00
|
|
|
package com.google.refine.importers;
|
2010-05-18 14:41:40 +02:00
|
|
|
|
2011-08-02 23:10:22 +02:00
|
|
|
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;

import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.jrdf.JRDFFactory;
import org.jrdf.SortedMemoryJRDFFactory;
import org.jrdf.collection.MemMapFactory;
import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple;
import org.jrdf.parser.line.GraphLineParser;
import org.jrdf.parser.line.LineHandler;
import org.jrdf.parser.ntriples.NTriplesParserFactory;
import org.jrdf.util.ClosableIterable;
import org.json.JSONObject;

import com.google.refine.ProjectMetadata;
import com.google.refine.expr.ExpressionUtils;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.ModelException;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.util.JSONUtilities;
|
2010-05-18 14:41:40 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
public class RdfTripleImporter extends ImportingParserBase {
|
2010-08-06 07:04:25 +02:00
|
|
|
private JRDFFactory _jrdfFactory;
|
|
|
|
private NTriplesParserFactory _nTriplesParserFactory;
|
|
|
|
private MemMapFactory _newMapFactory;
|
2010-05-18 14:41:40 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
public RdfTripleImporter() {
|
|
|
|
super(false);
|
2010-08-06 07:04:25 +02:00
|
|
|
_jrdfFactory = SortedMemoryJRDFFactory.getFactory();
|
|
|
|
_nTriplesParserFactory = new NTriplesParserFactory();
|
|
|
|
_newMapFactory = new MemMapFactory();
|
2010-05-18 14:41:40 +02:00
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
|
|
|
@Override
|
|
|
|
public void parseOneFile(Project project, ProjectMetadata metadata,
|
|
|
|
ImportingJob job, String fileSource, Reader reader, int limit,
|
|
|
|
JSONObject options, List<Exception> exceptions) {
|
|
|
|
|
|
|
|
String baseUrl = JSONUtilities.getString(options, "baseUrl", "");
|
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
Graph graph = _jrdfFactory.getNewGraph();
|
|
|
|
LineHandler lineHandler = _nTriplesParserFactory.createParser(graph, _newMapFactory);
|
2010-05-18 14:41:40 +02:00
|
|
|
GraphLineParser parser = new GraphLineParser(graph, lineHandler);
|
2010-08-06 07:04:25 +02:00
|
|
|
try {
|
|
|
|
parser.parse(reader, baseUrl); // fills JRDF graph
|
2011-08-02 05:34:47 +02:00
|
|
|
} catch (Exception e) {
|
|
|
|
exceptions.add(e);
|
|
|
|
return;
|
2010-08-06 07:04:25 +02:00
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
2010-05-18 14:41:40 +02:00
|
|
|
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
|
|
|
|
try {
|
2011-10-06 23:28:20 +02:00
|
|
|
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
|
|
|
|
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
|
|
|
|
project.columnModel.addColumn(0, subjectColumn, false);
|
|
|
|
project.columnModel.setKeyColumnIndex(0);
|
|
|
|
|
2010-05-18 14:41:40 +02:00
|
|
|
for (Triple triple : triples) {
|
|
|
|
String subject = triple.getSubject().toString();
|
|
|
|
String predicate = triple.getPredicate().toString();
|
|
|
|
String object = triple.getObject().toString();
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-19 06:22:45 +02:00
|
|
|
Column column = project.columnModel.getColumnByName(predicate);
|
|
|
|
if (column == null) {
|
2010-08-06 07:04:25 +02:00
|
|
|
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
|
2011-10-06 23:28:20 +02:00
|
|
|
project.columnModel.addColumn(-1, column, true);
|
2010-05-18 23:08:37 +02:00
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-19 06:22:45 +02:00
|
|
|
int cellIndex = column.getCellIndex();
|
|
|
|
if (subjectToRows.containsKey(subject)) {
|
2010-08-06 07:04:25 +02:00
|
|
|
List<Row> rows = subjectToRows.get(subject);
|
|
|
|
for (Row row : rows) {
|
|
|
|
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
|
|
|
|
row.setCell(cellIndex, new Cell(object, null));
|
|
|
|
object = null;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (object != null) {
|
|
|
|
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
|
|
|
|
rows.add(row);
|
|
|
|
|
|
|
|
row.setCell(cellIndex, new Cell(object, null));
|
|
|
|
}
|
2010-05-19 06:22:45 +02:00
|
|
|
} else {
|
2010-08-06 07:04:25 +02:00
|
|
|
List<Row> rows = new ArrayList<Row>();
|
|
|
|
subjectToRows.put(subject, rows);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
|
|
|
|
rows.add(row);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
|
|
|
|
row.setCell(cellIndex, new Cell(object, null));
|
2010-05-18 15:48:52 +02:00
|
|
|
}
|
2010-05-18 14:41:40 +02:00
|
|
|
}
|
|
|
|
|
2010-05-19 06:22:45 +02:00
|
|
|
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
|
2010-08-06 07:04:25 +02:00
|
|
|
project.rows.addAll(entry.getValue());
|
2010-05-19 06:22:45 +02:00
|
|
|
}
|
2011-10-06 23:28:20 +02:00
|
|
|
} catch (ModelException e) {
|
|
|
|
exceptions.add(e);
|
2010-05-18 14:41:40 +02:00
|
|
|
} finally {
|
|
|
|
triples.iterator().close();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|