Added a basic RDF triple importer plus unit tests. More work is required: it is not yet plugged into the client, and it creates a very sparse data structure (each triple becomes a new row). It uses the JRDF library (Apache 1.1 license).
git-svn-id: http://google-refine.googlecode.com/svn/trunk@813 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
c9303a884d
commit
25d3a9dfc1
@ -37,5 +37,6 @@
|
|||||||
<classpathentry kind="lib" path="lib/marc4j-2.4.jar"/>
|
<classpathentry kind="lib" path="lib/marc4j-2.4.jar"/>
|
||||||
<classpathentry kind="lib" path="tests/java/lib/mockito-all-1.8.4.jar" sourcepath="tests/java/lib-src/mockito-all-1.8.4-sources.jar"/>
|
<classpathentry kind="lib" path="tests/java/lib/mockito-all-1.8.4.jar" sourcepath="tests/java/lib-src/mockito-all-1.8.4-sources.jar"/>
|
||||||
<classpathentry kind="lib" path="tests/java/lib/testng-5.12.1.jar" sourcepath="tests/java/lib-src/testng-5.12.1-sources.jar"/>
|
<classpathentry kind="lib" path="tests/java/lib/testng-5.12.1.jar" sourcepath="tests/java/lib-src/testng-5.12.1-sources.jar"/>
|
||||||
|
<classpathentry kind="lib" path="lib/jrdf-0.5.6.jar"/>
|
||||||
<classpathentry kind="output" path="src/main/webapp/WEB-INF/classes"/>
|
<classpathentry kind="output" path="src/main/webapp/WEB-INF/classes"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
@ -57,6 +57,11 @@ licenses/apache2.0.LICENSE.txt
|
|||||||
signpost
|
signpost
|
||||||
opencsv
|
opencsv
|
||||||
testng
|
testng
|
||||||
|
|
||||||
|
Apache License 1.1
|
||||||
|
------------------
|
||||||
|
licenses/jrdf.LICENSE.txt
|
||||||
|
jrdf
|
||||||
|
|
||||||
LGPL
|
LGPL
|
||||||
----
|
----
|
||||||
|
57
licenses/jrdf.LICENSE.txt
Normal file
57
licenses/jrdf.LICENSE.txt
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* $Header$
|
||||||
|
* $Revision: 205 $
|
||||||
|
* $Date: 2005-06-22 21:16:18 +1000 (Wed, 22 Jun 2005) $
|
||||||
|
*
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2003-2005 The JRDF Project. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution, if
|
||||||
|
* any, must include the following acknowlegement:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* the JRDF Project (http://jrdf.sf.net/)."
|
||||||
|
* Alternately, this acknowlegement may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowlegements normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "The JRDF Project" and "JRDF" must not be used to endorse
|
||||||
|
* or promote products derived from this software without prior written
|
||||||
|
* permission. For written permission, please contact
|
||||||
|
* newmana@users.sourceforge.net.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "JRDF"
|
||||||
|
* nor may "JRDF" appear in their names without prior written
|
||||||
|
* permission of the JRDF Project.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED `AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the JRDF Project. For more
|
||||||
|
* information on JRDF, please see <http://jrdf.sourceforge.net/>.
|
||||||
|
*/
|
@ -0,0 +1,127 @@
|
|||||||
|
package com.metaweb.gridworks.importers;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.jrdf.JRDFFactory;
|
||||||
|
import org.jrdf.SortedMemoryJRDFFactory;
|
||||||
|
import org.jrdf.collection.MemMapFactory;
|
||||||
|
import org.jrdf.graph.Graph;
|
||||||
|
import org.jrdf.graph.Triple;
|
||||||
|
import org.jrdf.parser.line.GraphLineParser;
|
||||||
|
import org.jrdf.parser.line.LineParser;
|
||||||
|
import org.jrdf.parser.line.LineParserImpl;
|
||||||
|
import org.jrdf.parser.line.LineHandler;
|
||||||
|
import org.jrdf.parser.ntriples.NTriplesParser;
|
||||||
|
import org.jrdf.parser.ntriples.NTriplesParserFactory;
|
||||||
|
import org.jrdf.parser.ntriples.parser.BlankNodeParserImpl;
|
||||||
|
import org.jrdf.parser.ntriples.parser.LiteralParserImpl;
|
||||||
|
import org.jrdf.parser.ntriples.parser.NodeMaps;
|
||||||
|
import org.jrdf.parser.ntriples.parser.NodeMapsImpl;
|
||||||
|
import org.jrdf.parser.ntriples.parser.RegexTripleParser;
|
||||||
|
import org.jrdf.parser.ntriples.parser.TripleParser;
|
||||||
|
import org.jrdf.parser.ntriples.parser.TripleParserImpl;
|
||||||
|
import org.jrdf.parser.ntriples.parser.URIReferenceParserImpl;
|
||||||
|
import org.jrdf.util.ClosableIterable;
|
||||||
|
import org.jrdf.util.boundary.RegexMatcherFactory;
|
||||||
|
import org.jrdf.util.boundary.RegexMatcherFactoryImpl;
|
||||||
|
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
|
||||||
|
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
|
||||||
|
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.model.Cell;
|
||||||
|
import com.metaweb.gridworks.model.Column;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
import com.metaweb.gridworks.model.Row;
|
||||||
|
|
||||||
|
public class RdfTripleImporter implements Importer{
|
||||||
|
JRDFFactory JrdfFactory;
|
||||||
|
NTriplesParserFactory nTriplesParserFactory;
|
||||||
|
MemMapFactory newMapFactory;
|
||||||
|
|
||||||
|
public RdfTripleImporter(){
|
||||||
|
JrdfFactory = SortedMemoryJRDFFactory.getFactory();
|
||||||
|
nTriplesParserFactory = new NTriplesParserFactory();
|
||||||
|
newMapFactory = new MemMapFactory();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void read(Reader reader, Project project, Properties options) throws Exception {
|
||||||
|
String baseUrl = options.getProperty("base-url");
|
||||||
|
|
||||||
|
Graph graph = JrdfFactory.getNewGraph();
|
||||||
|
//System.out.println("--------------------------------");
|
||||||
|
//System.out.println("initial number of triples before parsing is : " + graph.getNumberOfTriples());
|
||||||
|
LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory);
|
||||||
|
GraphLineParser parser = new GraphLineParser(graph, lineHandler);
|
||||||
|
parser.parse(reader, baseUrl); //fills JRDF graph
|
||||||
|
//System.out.println("number of triples parsed is : " + graph.getNumberOfTriples());
|
||||||
|
|
||||||
|
//first column is subject
|
||||||
|
project.columnModel.columns.add(0, new Column(0, "subject"));
|
||||||
|
project.columnModel.update();
|
||||||
|
|
||||||
|
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
|
||||||
|
try {
|
||||||
|
int numberOfColumns = 1;
|
||||||
|
for (Triple triple : triples) {
|
||||||
|
|
||||||
|
//System.out.println("Triple : " + triple);
|
||||||
|
String subject = triple.getSubject().toString();
|
||||||
|
String predicate = triple.getPredicate().toString();
|
||||||
|
String object = triple.getObject().toString();
|
||||||
|
|
||||||
|
//System.out.println("subject : " + subject);
|
||||||
|
//System.out.println("predicate : " + predicate);
|
||||||
|
//System.out.println("object : " + object);
|
||||||
|
//System.out.println("predicate relates to column : " + project.columnModel.getColumnByName(predicate));
|
||||||
|
|
||||||
|
//creates new column for every predicate
|
||||||
|
if(project.columnModel.getColumnByName(predicate) == null){
|
||||||
|
//System.out.println("adding new column");
|
||||||
|
project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate));
|
||||||
|
project.columnModel.update();
|
||||||
|
numberOfColumns++;
|
||||||
|
//System.out.println("New total number of columns : " + numberOfColumns);
|
||||||
|
}
|
||||||
|
|
||||||
|
//FIXME - this is sparse (one row per triple), need to reconcile on subjects.
|
||||||
|
AddNewRow(project, subject, predicate, object);
|
||||||
|
}
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
triples.iterator().close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void AddNewRow(Project project, String subject, String predicate, String object){
|
||||||
|
int numberOfColumns = project.columnModel.columns.size();
|
||||||
|
|
||||||
|
//add subject
|
||||||
|
Row row = new Row(numberOfColumns);
|
||||||
|
row.setCell(0, new Cell(subject, null));
|
||||||
|
|
||||||
|
//add object to a row
|
||||||
|
int columnIndex = project.columnModel.getColumnIndexByName(predicate);
|
||||||
|
//System.out.println("predicate relates to columnIndex : " + columnIndex);
|
||||||
|
row.setCell(columnIndex, new Cell(object, null));
|
||||||
|
//System.out.println("Number of cells in new row : " + row.cells.size());
|
||||||
|
project.rows.add(row);
|
||||||
|
//System.out.println("New total number of rows : " + project.rows.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
||||||
|
// TODO
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean takesReader() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,103 @@
|
|||||||
|
package com.metaweb.gridworks.tests.importers;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeMethod;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.importers.RdfTripleImporter;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
|
||||||
|
|
||||||
|
public class RdfTripleImporterTests {
|
||||||
|
// logging
|
||||||
|
final static protected Logger logger = LoggerFactory.getLogger("RdfImporterTests");
|
||||||
|
|
||||||
|
//System Under Test
|
||||||
|
RdfTripleImporter SUT = null;
|
||||||
|
Project project = null;
|
||||||
|
Properties options = null;
|
||||||
|
|
||||||
|
@BeforeMethod
|
||||||
|
public void SetUp(){
|
||||||
|
SUT = new RdfTripleImporter();
|
||||||
|
project = new Project();
|
||||||
|
options = new Properties();
|
||||||
|
options.put("base-url", "http://rdf.freebase.com");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void CanParseSingleLineTriple(){
|
||||||
|
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.";
|
||||||
|
StringReader reader = new StringReader(sampleRdf);
|
||||||
|
|
||||||
|
try {
|
||||||
|
SUT.read(reader, project, options);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assert.fail();
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(project.columnModel.columns.size(), 2);
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
|
||||||
|
Assert.assertEquals(project.rows.size(), 1);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void CanParseMultiLineTriple(){
|
||||||
|
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
|
||||||
|
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.under_the_red_sky>.\n" +
|
||||||
|
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
|
||||||
|
StringReader reader = new StringReader(sampleRdf);
|
||||||
|
|
||||||
|
try {
|
||||||
|
SUT.read(reader, project, options);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assert.fail();
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(project.columnModel.columns.size(), 2);
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
|
||||||
|
Assert.assertEquals(project.rows.size(), 3);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
|
||||||
|
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); //NB triples aren't created in order they were input
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void CanParseMultiLineMultiPredicatesTriple(){
|
||||||
|
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
|
||||||
|
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.genre> <http://rdf.freebase.com/ns/en.folk_rock>.\n" +
|
||||||
|
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
|
||||||
|
StringReader reader = new StringReader(sampleRdf);
|
||||||
|
|
||||||
|
try {
|
||||||
|
SUT.read(reader, project, options);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assert.fail();
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://rdf.freebase.com/ns/music.artist.genre");
|
||||||
|
Assert.assertEquals(project.rows.size(), 3);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
|
||||||
|
Assert.assertEquals(project.rows.get(2).cells.size(), 3);
|
||||||
|
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); //NB triples aren't created in order they were input
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user