Merge pull request #1563 from OpenRefine/jena-migrate

Migrate from the JRDF library to the Jena library
This commit is contained in:
Jacky 2018-04-16 23:09:00 -04:00 committed by GitHub
commit 1a0d1fabd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 124 additions and 87 deletions

View File

@ -17,7 +17,6 @@
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.5.1-slim.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.5.1-slim.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/dom4j-1.6.1.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/dom4j-1.6.1.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/jcl-over-slf4j-1.5.6.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/jcl-over-slf4j-1.5.6.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/jrdf-0.5.6.jar" sourcepath="main/webapp/WEB-INF/lib-src/jrdf-0.5.6-sources.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/lessen-trunk-r8.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/lessen-trunk-r8.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/log4j-1.2.15.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/log4j-1.2.15.jar"/>
<classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/marc4j-2.4.jar"/> <classpathentry exported="true" kind="lib" path="main/webapp/WEB-INF/lib/marc4j-2.4.jar"/>
@ -113,5 +112,20 @@
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-beanutils-1.9.3.jar"/> <classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-beanutils-1.9.3.jar"/>
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-auth-library-oauth2-http-0.9.0.jar"/> <classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-auth-library-oauth2-http-0.9.0.jar"/>
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-oauth-client-jetty-1.23.0.jar"/> <classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-oauth-client-jetty-1.23.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-arq-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-base-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-cmds-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-core-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-base-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-index-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-trans-data-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-dboe-transaction-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-iri-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-rdfconnection-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-shaded-guava-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-tdb-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jena-tdb2-3.6.0.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jsonld-java-0.11.1.jar"/>
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/libthrift-0.10.0.jar"/>
<classpathentry kind="output" path="main/webapp/WEB-INF/classes"/> <classpathentry kind="output" path="main/webapp/WEB-INF/classes"/>
</classpath> </classpath>

View File

@ -33,10 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.importers; package com.google.refine.importers;
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
@ -44,10 +40,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.jrdf.graph.Graph; import org.apache.jena.rdf.model.Model;
import org.jrdf.graph.Triple; import org.apache.jena.rdf.model.ModelFactory;
import org.jrdf.parser.RdfReader; import org.apache.jena.rdf.model.Statement;
import org.jrdf.util.ClosableIterable; import org.apache.jena.rdf.model.StmtIterator;
import org.json.JSONObject; import org.json.JSONObject;
import com.google.refine.expr.ExpressionUtils; import com.google.refine.expr.ExpressionUtils;
@ -60,13 +56,13 @@ import com.google.refine.model.Row;
import com.google.refine.model.medadata.ProjectMetadata; import com.google.refine.model.medadata.ProjectMetadata;
public class RdfTripleImporter extends ImportingParserBase { public class RdfTripleImporter extends ImportingParserBase {
private RdfReader rdfReader;
private Mode mode; private Mode mode;
public enum Mode { public enum Mode {
RDFXML, RDFXML,
NT, NT,
N3 N3,
TTL
} }
public RdfTripleImporter() { public RdfTripleImporter() {
@ -75,27 +71,25 @@ public class RdfTripleImporter extends ImportingParserBase {
public RdfTripleImporter(Mode mode) { public RdfTripleImporter(Mode mode) {
super(true); super(true);
rdfReader = new RdfReader();
this.mode = mode; this.mode = mode;
} }
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource,
@Override InputStream input, int limit, JSONObject options, List<Exception> exceptions) {
public void parseOneFile(Project project, ProjectMetadata metadata, // create an empty model
ImportingJob job, String fileSource, InputStream input, int limit, Model model = ModelFactory.createDefaultModel();
JSONObject options, List<Exception> exceptions) {
Graph graph;
try { try {
switch (mode) { switch (mode) {
case NT: case NT:
graph = rdfReader.parseNTriples(input); model.read(input, null, "NT");
break; break;
case N3: case N3:
graph = rdfReader.parseN3(input); case TTL:
model.read(input, null, "TTL");
break; break;
case RDFXML: case RDFXML:
graph = rdfReader.parseRdfXml(input); model.read(input, null);
break; break;
default: default:
throw new IllegalArgumentException("Unknown parsing mode"); throw new IllegalArgumentException("Unknown parsing mode");
@ -104,63 +98,63 @@ public class RdfTripleImporter extends ImportingParserBase {
exceptions.add(e); exceptions.add(e);
return; return;
} }
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
try {
Map<String, List<Row>> subjectToRows = new LinkedHashMap<String, List<Row>>();
Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
project.columnModel.addColumn(0, subjectColumn, false);
project.columnModel.setKeyColumnIndex(0);
for (Triple triple : triples) {
String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString();
String object = triple.getObject().toString();
Column column = project.columnModel.getColumnByName(predicate); StmtIterator triples = model.listStatements();
if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate); try {
project.columnModel.addColumn(-1, column, true); Map<String, List<Row>> subjectToRows = new LinkedHashMap<String, List<Row>>();
} Column subjectColumn = new Column(project.columnModel.allocateNewCellIndex(), "subject");
project.columnModel.addColumn(0, subjectColumn, false);
project.columnModel.setKeyColumnIndex(0);
while (triples.hasNext()) {
Statement triple = triples.nextStatement();
String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString();
String object = triple.getObject().toString();
int cellIndex = column.getCellIndex(); Column column = project.columnModel.getColumnByName(predicate);
if (subjectToRows.containsKey(subject)) { if (column == null) {
List<Row> rows = subjectToRows.get(subject); column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
for (Row row : rows) { project.columnModel.addColumn(-1, column, true);
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) { }
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
if (object != null) { int cellIndex = column.getCellIndex();
Row row = new Row(project.columnModel.getMaxCellIndex() + 1); if (subjectToRows.containsKey(subject)) {
rows.add(row); List<Row> rows = subjectToRows.get(subject);
for (Row row : rows) {
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
row.setCell(cellIndex, new Cell(object, null)); if (object != null) {
} Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
} else { rows.add(row);
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
Row row = new Row(project.columnModel.getMaxCellIndex() + 1); row.setCell(cellIndex, new Cell(object, null));
rows.add(row); }
} else {
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null)); Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
row.setCell(cellIndex, new Cell(object, null)); rows.add(row);
}
}
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) { row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
project.rows.addAll(entry.getValue()); row.setCell(cellIndex, new Cell(object, null));
} }
} catch (ModelException e) { }
exceptions.add(e);
} finally { for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
triples.iterator().close(); project.rows.addAll(entry.getValue());
} }
} catch (ModelException e) {
super.parseOneFile(project, metadata, job, fileSource, input, limit, options, exceptions); exceptions.add(e);
}
super.parseOneFile(project, metadata, job, fileSource, input, limit, options, exceptions);
} }
} }

View File

@ -106,19 +106,19 @@ public class RdfTripleImporterTests extends ImporterTest {
//row0 //row0
Assert.assertEquals(project.rows.get(0).cells.size(), 2); Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
//row1 //row1
Assert.assertEquals(project.rows.get(1).cells.size(), 2); Assert.assertEquals(project.rows.get(1).cells.size(), 2);
Assert.assertNull(project.rows.get(1).cells.get(0)); Assert.assertNull(project.rows.get(1).cells.get(0));
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky");
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
//row2 //row2
Assert.assertEquals(project.rows.get(2).cells.size(), 2); Assert.assertEquals(project.rows.get(2).cells.size(), 2);
Assert.assertNull(project.rows.get(2).cells.get(0)); Assert.assertNull(project.rows.get(2).cells.get(0));
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].rowIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].cellIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].cellIndex, 0);
} }
@ -143,23 +143,23 @@ public class RdfTripleImporterTests extends ImporterTest {
//row0 //row0
Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock");
//row1 //row1
Assert.assertEquals(project.rows.get(1).cells.size(), 2); Assert.assertEquals(project.rows.get(1).cells.size(), 2);
Assert.assertNull(project.rows.get(1).cells.get(0)); Assert.assertNull(project.rows.get(1).cells.get(0));
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0); Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
} }
@Test @Test
public void canParseTripleWithValue() throws UnsupportedEncodingException { public void canParseTripleWithValue() throws UnsupportedEncodingException {
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias> \"Robert Zimmerman\"@en."; String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias>\"Robert Zimmerman\"@en.";
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8")); InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.NT); SUT = new RdfTripleImporter(RdfTripleImporter.Mode.N3);
parseOneFile(SUT, input); parseOneFile(SUT, input);
Assert.assertEquals(project.columnModel.columns.size(), 2); Assert.assertEquals(project.columnModel.columns.size(), 2);
@ -168,10 +168,11 @@ public class RdfTripleImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.size(), 1);
Assert.assertEquals(project.rows.get(0).cells.size(), 2); Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"Robert Zimmerman\"@en"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Robert Zimmerman@en");
} @Test }
public void parseRdfXml() throws UnsupportedEncodingException { @Test
public void canParseRdfXml() throws UnsupportedEncodingException {
// From W3C spec http://www.w3.org/TR/REC-rdf-syntax/#example8 // From W3C spec http://www.w3.org/TR/REC-rdf-syntax/#example8
String sampleRdf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" String sampleRdf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n" + "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
@ -200,10 +201,37 @@ public class RdfTripleImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.size(), 5); Assert.assertEquals(project.rows.size(), 5);
Assert.assertEquals(project.rows.get(0).cells.size(), 2); Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://www.w3.org/TR/rdf-syntax-grammar"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://www.w3.org/TR/rdf-syntax-grammar");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"RDF/XML Syntax Specification (Revised)\""); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "RDF/XML Syntax Specification (Revised)@en-US");
Assert.assertEquals(project.rows.get(3).cells.size(), 3); Assert.assertEquals(project.rows.get(3).cells.size(), 3);
Assert.assertEquals(project.rows.get(3).cells.get(0).value, "http://example.org/buecher/baum"); Assert.assertEquals(project.rows.get(3).cells.get(0).value, "http://example.org/buecher/baum");
Assert.assertEquals(project.rows.get(3).cells.get(1).value, "\"Der Baum\"@de"); Assert.assertEquals(project.rows.get(3).cells.get(1).value, "The Tree@en");
Assert.assertEquals(project.rows.get(3).cells.get(2).value, "\"Das Buch ist außergewöhnlich\"@de"); Assert.assertEquals(project.rows.get(3).cells.get(2).value, "Das Buch ist außergewöhnlich@de");
} }
/**
 * Imports a short N3 document in {@code Mode.N3} and verifies the resulting grid.
 *
 * The document declares two prefixes and describes a single subject with three
 * predicate/object pairs written in prefixed form. The assertions expect one
 * "subject" column followed by three columns named by the fully expanded
 * predicate IRIs, and exactly one row holding the subject and the three objects.
 *
 * @throws UnsupportedEncodingException declared because the sample text is
 *         encoded through {@code String.getBytes("UTF-8")}
 */
@Test
public void canParseN3() throws UnsupportedEncodingException {
    // Sample input: one subject, three statements, using the p: and m: prefixes.
    String n3Document =
            "@prefix p: <http://www.example.org/personal_details#> .\n"
            + "@prefix m: <http://www.example.org/meeting_organization#> .\n\n"
            + "<http://www.example.org/people#fred>\n"
            + "p:GivenName \"Fred\";\n"
            + "p:hasEmail <mailto:fred@example.com>;\n"
            + "m:attending <http://meetings.example.com/cal#m1> .\n";

    // Expected column names, in the column order the assertions require.
    String[] expectedColumnNames = {
        "subject",
        "http://www.example.org/meeting_organization#attending",
        "http://www.example.org/personal_details#hasEmail",
        "http://www.example.org/personal_details#GivenName"
    };
    // Expected cell values of the single row, aligned with the columns above.
    String[] expectedCellValues = {
        "http://www.example.org/people#fred",
        "http://meetings.example.com/cal#m1",
        "mailto:fred@example.com",
        "Fred"
    };

    InputStream n3Stream = new ByteArrayInputStream(n3Document.getBytes("UTF-8"));
    SUT = new RdfTripleImporter(RdfTripleImporter.Mode.N3);
    parseOneFile(SUT, n3Stream);

    // Column model: count first, then each name by position.
    Assert.assertEquals(project.columnModel.columns.size(), expectedColumnNames.length);
    for (int columnIndex = 0; columnIndex < expectedColumnNames.length; columnIndex++) {
        Assert.assertEquals(project.columnModel.columns.get(columnIndex).getName(),
                expectedColumnNames[columnIndex]);
    }

    // Row data: a single row whose cells line up with the columns.
    Assert.assertEquals(project.rows.size(), 1);
    Assert.assertEquals(project.rows.get(0).cells.size(), expectedCellValues.length);
    for (int cellIndex = 0; cellIndex < expectedCellValues.length; cellIndex++) {
        Assert.assertEquals(project.rows.get(0).cells.get(cellIndex).value,
                expectedCellValues[cellIndex]);
    }
}
} }

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -210,7 +210,8 @@ function registerImporting() {
IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI", IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI",
new Packages.com.google.refine.importers.FixedWidthImporter()); new Packages.com.google.refine.importers.FixedWidthImporter());
IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter()); IM.registerFormat("text/rdf+n3", "RDF/N3 files", "RdfTriplesParserUI",
new Packages.com.google.refine.importers.RdfTripleImporter(Packages.com.google.refine.importers.RdfTripleImporter.Mode.N3));
IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter()); IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter());
IM.registerFormat("binary/text/xml/xls/xlsx", "Excel files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter()); IM.registerFormat("binary/text/xml/xls/xlsx", "Excel files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());