Merge pull request #1563 from OpenRefine/jena-migrate

Migrate from the JRDF library to the Apache Jena library
This commit is contained in:
Jacky 2018-04-16 23:09:00 -04:00 committed by GitHub
commit 1a0d1fabd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 124 additions and 87 deletions

View File

@ -17,7 +17,6 @@
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.5.1-slim.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/dom4j-1.6.1.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/jcl-over-slf4j-1.5.6.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/jrdf-0.5.6.jar" sourcepath="main/webapp/WEB-INF/lib-src/jrdf-0.5.6-sources.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/lessen-trunk-r8.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/log4j-1.2.15.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/marc4j-2.4.jar"/>
@ -113,5 +112,20 @@
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-beanutils-1.9.3.jar"/>
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-auth-library-oauth2-http-0.9.0.jar"/>
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-oauth-client-jetty-1.23.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-arq-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-base-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-cmds-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-core-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-base-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-index-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-trans-data-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-transaction-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-iri-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-rdfconnection-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-shaded-guava-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-tdb-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-tdb2-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jsonld-java-0.11.1.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/libthrift-0.10.0.jar"/>
<classpathentry kind="output" path="main/webapp/WEB-INF/classes"/>
</classpath>

View File

@ -33,10 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.importers;
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
@ -44,10 +40,10 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple;
import org.jrdf.parser.RdfReader;
import org.jrdf.util.ClosableIterable;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.rdf.model.StmtIterator;
import org.json.JSONObject;
import com.google.refine.expr.ExpressionUtils;
@ -60,13 +56,13 @@ import com.google.refine.model.Row;
import com.google.refine.model.medadata.ProjectMetadata;
public class RdfTripleImporter extends ImportingParserBase {
private RdfReader rdfReader;
private Mode mode;
public enum Mode {
RDFXML,
NT,
N3
N3,
TTL
}
public RdfTripleImporter() {
@ -75,27 +71,25 @@ public class RdfTripleImporter extends ImportingParserBase {
public RdfTripleImporter(Mode mode) {
super(true);
rdfReader = new RdfReader();
this.mode = mode;
}
@Override
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, InputStream input, int limit,
JSONObject options, List<Exception> exceptions) {
Graph graph;
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource,
InputStream input, int limit, JSONObject options, List<Exception> exceptions) {
// create an empty model
Model model = ModelFactory.createDefaultModel();
try {
switch (mode) {
case NT:
graph = rdfReader.parseNTriples(input);
model.read(input, null, "NT");
break;
case N3:
graph = rdfReader.parseN3(input);
case TTL:
model.read(input, null, "TTL");
break;
case RDFXML:
graph = rdfReader.parseRdfXml(input);
model.read(input, null);
break;
default:
throw new IllegalArgumentException("Unknown parsing mode");
@ -104,63 +98,63 @@ public class RdfTripleImporter extends ImportingParserBase {
exceptions.add(e);
return;
}
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
try {
Map<String, List<Row>> subjectToRows = new LinkedHashMap<String, List<Row>>();
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
project.columnModel.addColumn(0, subjectColumn, false);
project.columnModel.setKeyColumnIndex(0);
for (Triple triple : triples) {
String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString();
String object = triple.getObject().toString();
Column column = project.columnModel.getColumnByName(predicate);
if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
project.columnModel.addColumn(-1, column, true);
}
StmtIterator triples = model.listStatements();
try {
Map<String, List<Row>> subjectToRows = new LinkedHashMap<String, List<Row>>();
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
project.columnModel.addColumn(0, subjectColumn, false);
project.columnModel.setKeyColumnIndex(0);
while (triples.hasNext()) {
Statement triple = triples.nextStatement();
String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString();
String object = triple.getObject().toString();
int cellIndex = column.getCellIndex();
if (subjectToRows.containsKey(subject)) {
List<Row> rows = subjectToRows.get(subject);
for (Row row : rows) {
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
Column column = project.columnModel.getColumnByName(predicate);
if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
project.columnModel.addColumn(-1, column, true);
}
if (object != null) {
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
int cellIndex = column.getCellIndex();
if (subjectToRows.containsKey(subject)) {
List<Row> rows = subjectToRows.get(subject);
for (Row row : rows) {
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
row.setCell(cellIndex, new Cell(object, null));
}
} else {
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
if (object != null) {
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
row.setCell(cellIndex, new Cell(object, null));
}
} else {
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
row.setCell(cellIndex, new Cell(object, null));
}
}
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
project.rows.addAll(entry.getValue());
}
} catch (ModelException e) {
exceptions.add(e);
} finally {
triples.iterator().close();
}
super.parseOneFile(project, metadata, job, fileSource, input, limit, options, exceptions);
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
row.setCell(cellIndex, new Cell(object, null));
}
}
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
project.rows.addAll(entry.getValue());
}
} catch (ModelException e) {
exceptions.add(e);
}
super.parseOneFile(project, metadata, job, fileSource, input, limit, options, exceptions);
}
}

View File

@ -106,19 +106,19 @@ public class RdfTripleImporterTests extends ImporterTest {
//row0
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
//row1
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
Assert.assertNull(project.rows.get(1).cells.get(0));
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky");
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
//row2
Assert.assertEquals(project.rows.get(2).cells.size(), 2);
Assert.assertNull(project.rows.get(2).cells.get(0));
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].cellIndex, 0);
}
@ -143,23 +143,23 @@ public class RdfTripleImporterTests extends ImporterTest {
//row0
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock");
//row1
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
Assert.assertNull(project.rows.get(1).cells.get(0));
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
}
@Test
public void canParseTripleWithValue() throws UnsupportedEncodingException {
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias> \"Robert Zimmerman\"@en.";
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias>\"Robert Zimmerman\"@en.";
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.NT);
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.N3);
parseOneFile(SUT, input);
Assert.assertEquals(project.columnModel.columns.size(), 2);
@ -168,10 +168,11 @@ public class RdfTripleImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.size(), 1);
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"Robert Zimmerman\"@en");
} @Test
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Robert Zimmerman@en");
}
public void parseRdfXml() throws UnsupportedEncodingException {
@Test
public void canParseRdfXml() throws UnsupportedEncodingException {
// From W3C spec http://www.w3.org/TR/REC-rdf-syntax/#example8
String sampleRdf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
@ -200,10 +201,37 @@ public class RdfTripleImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.size(), 5);
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://www.w3.org/TR/rdf-syntax-grammar");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"RDF/XML Syntax Specification (Revised)\"");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "RDF/XML Syntax Specification (Revised)@en-US");
Assert.assertEquals(project.rows.get(3).cells.size(), 3);
Assert.assertEquals(project.rows.get(3).cells.get(0).value, "http://example.org/buecher/baum");
Assert.assertEquals(project.rows.get(3).cells.get(1).value, "\"Der Baum\"@de");
Assert.assertEquals(project.rows.get(3).cells.get(2).value, "\"Das Buch ist außergewöhnlich\"@de");
Assert.assertEquals(project.rows.get(3).cells.get(1).value, "The Tree@en");
Assert.assertEquals(project.rows.get(3).cells.get(2).value, "Das Buch ist außergewöhnlich@de");
}
@Test
public void canParseN3() throws UnsupportedEncodingException {
    // One subject written with @prefix declarations and ';' predicate lists:
    // three predicates, each with a single object.
    String n3Document = "@prefix p: <http://www.example.org/personal_details#> .\n"
            + "@prefix m: <http://www.example.org/meeting_organization#> .\n\n"
            + "<http://www.example.org/people#fred>\n"
            + "p:GivenName \"Fred\";\n"
            + "p:hasEmail <mailto:fred@example.com>;\n"
            + "m:attending <http://meetings.example.com/cal#m1> .\n";
    InputStream n3Stream = new ByteArrayInputStream(n3Document.getBytes("UTF-8"));

    SUT = new RdfTripleImporter(RdfTripleImporter.Mode.N3);
    parseOneFile(SUT, n3Stream);

    // Expected header: the key "subject" column followed by one column per predicate,
    // in the order this test expects the importer to emit them.
    String[] expectedColumnNames = {
        "subject",
        "http://www.example.org/meeting_organization#attending",
        "http://www.example.org/personal_details#hasEmail",
        "http://www.example.org/personal_details#GivenName"
    };
    Assert.assertEquals(project.columnModel.columns.size(), 4);
    for (int i = 0; i < expectedColumnNames.length; i++) {
        Assert.assertEquals(project.columnModel.columns.get(i).getName(), expectedColumnNames[i]);
    }

    // All three statements share one subject, so they are expected to fold into a single row
    // whose cells line up with the columns above.
    String[] expectedCellValues = {
        "http://www.example.org/people#fred",
        "http://meetings.example.com/cal#m1",
        "mailto:fred@example.com",
        "Fred"
    };
    Assert.assertEquals(project.rows.size(), 1);
    Assert.assertEquals(project.rows.get(0).cells.size(), 4);
    for (int i = 0; i < expectedCellValues.length; i++) {
        Assert.assertEquals(project.rows.get(0).cells.get(i).value, expectedCellValues[i]);
    }
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -210,7 +210,8 @@ function registerImporting() {
IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI",
new Packages.com.google.refine.importers.FixedWidthImporter());
IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI",
new Packages.com.google.refine.importers.RdfTripleImporter(Packages.com.google.refine.importers.RdfTripleImporter.Mode.N3));
IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter());
IM.registerFormat("binary/text/xml/xls/xlsx", "Excel files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());