Added a basic RDF triple importer plus unit tests. Some more work is required: it is not yet plugged into the client, and it creates a very sparse data structure (each triple becomes a new row). It uses the JRDF library (Apache 1.1 license).

git-svn-id: http://google-refine.googlecode.com/svn/trunk@813 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-18 12:41:40 +00:00
parent c9303a884d
commit 25d3a9dfc1
5 changed files with 293 additions and 0 deletions

View File

@ -37,5 +37,6 @@
<classpathentry kind="lib" path="lib/marc4j-2.4.jar"/>
<classpathentry kind="lib" path="tests/java/lib/mockito-all-1.8.4.jar" sourcepath="tests/java/lib-src/mockito-all-1.8.4-sources.jar"/>
<classpathentry kind="lib" path="tests/java/lib/testng-5.12.1.jar" sourcepath="tests/java/lib-src/testng-5.12.1-sources.jar"/>
<classpathentry kind="lib" path="lib/jrdf-0.5.6.jar"/>
<classpathentry kind="output" path="src/main/webapp/WEB-INF/classes"/>
</classpath>

View File

@ -58,6 +58,11 @@ licenses/apache2.0.LICENSE.txt
opencsv
textng
Apache License 1.1
------------------
licenses/jrdf.LICENSE.txt
jrdf
LGPL
----

57
licenses/jrdf.LICENSE.txt Normal file
View File

@ -0,0 +1,57 @@
/*
* $Header$
* $Revision: 205 $
* $Date: 2005-06-22 21:16:18 +1000 (Wed, 22 Jun 2005) $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2003-2005 The JRDF Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* the JRDF Project (http://jrdf.sf.net/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The JRDF Project" and "JRDF" must not be used to endorse
* or promote products derived from this software without prior written
* permission. For written permission, please contact
* newmana@users.sourceforge.net.
*
* 5. Products derived from this software may not be called "JRDF"
* nor may "JRDF" appear in their names without prior written
* permission of the JRDF Project.
*
* THIS SOFTWARE IS PROVIDED `AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the JRDF Project. For more
* information on JRDF, please see <http://jrdf.sourceforge.net/>.
*/

View File

@ -0,0 +1,127 @@
package com.metaweb.gridworks.importers;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.jrdf.JRDFFactory;
import org.jrdf.SortedMemoryJRDFFactory;
import org.jrdf.collection.MemMapFactory;
import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple;
import org.jrdf.parser.line.GraphLineParser;
import org.jrdf.parser.line.LineParser;
import org.jrdf.parser.line.LineParserImpl;
import org.jrdf.parser.line.LineHandler;
import org.jrdf.parser.ntriples.NTriplesParser;
import org.jrdf.parser.ntriples.NTriplesParserFactory;
import org.jrdf.parser.ntriples.parser.BlankNodeParserImpl;
import org.jrdf.parser.ntriples.parser.LiteralParserImpl;
import org.jrdf.parser.ntriples.parser.NodeMaps;
import org.jrdf.parser.ntriples.parser.NodeMapsImpl;
import org.jrdf.parser.ntriples.parser.RegexTripleParser;
import org.jrdf.parser.ntriples.parser.TripleParser;
import org.jrdf.parser.ntriples.parser.TripleParserImpl;
import org.jrdf.parser.ntriples.parser.URIReferenceParserImpl;
import org.jrdf.util.ClosableIterable;
import org.jrdf.util.ClosableIterator;
import org.jrdf.util.boundary.RegexMatcherFactory;
import org.jrdf.util.boundary.RegexMatcherFactoryImpl;
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
/**
 * Importer that reads RDF triples through the JRDF N-Triples line parser and
 * writes them into a Gridworks {@link Project}.
 *
 * <p>The first column is named "subject"; one further column is created for
 * each distinct predicate, named after the predicate's string form. Every
 * triple currently becomes its own row, so the resulting table is sparse.
 */
public class RdfTripleImporter implements Importer{
    JRDFFactory JrdfFactory;
    NTriplesParserFactory nTriplesParserFactory;
    MemMapFactory newMapFactory;

    /**
     * Creates an importer with its own JRDF graph factory, N-Triples parser
     * factory and map factory.
     */
    public RdfTripleImporter(){
        JrdfFactory = SortedMemoryJRDFFactory.getFactory();
        nTriplesParserFactory = new NTriplesParserFactory();
        newMapFactory = new MemMapFactory();
    }

    /**
     * Parses the triples available from {@code reader} into a new JRDF graph
     * and then copies every triple of that graph into {@code project}.
     *
     * @param reader  source of the triple text
     * @param project project whose column model and rows are populated
     * @param options import options; the "base-url" property is passed to the
     *                parser as the base URL
     * @throws Exception if parsing or reading fails
     */
    @Override
    public void read(Reader reader, Project project, Properties options) throws Exception {
        String baseUrl = options.getProperty("base-url");

        Graph graph = JrdfFactory.getNewGraph();
        LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory);
        GraphLineParser parser = new GraphLineParser(graph, lineHandler);
        parser.parse(reader, baseUrl); // fills JRDF graph

        // first column is subject
        project.columnModel.columns.add(0, new Column(0, "subject"));
        project.columnModel.update();

        ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
        // Hold on to the one iterator that is actually consumed so that the
        // finally block closes it, rather than asking the iterable for a
        // second iterator and closing that one instead.
        ClosableIterator<Triple> tripleIterator = triples.iterator();
        try {
            // index 0 is taken by the subject column added above
            int numberOfColumns = 1;
            while (tripleIterator.hasNext()) {
                Triple triple = tripleIterator.next();
                String subject = triple.getSubject().toString();
                String predicate = triple.getPredicate().toString();
                String object = triple.getObject().toString();

                // creates new column for every predicate
                if (project.columnModel.getColumnByName(predicate) == null) {
                    project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate));
                    project.columnModel.update();
                    numberOfColumns++;
                }

                //FIXME - this is sparse (one row per triple), need to reconcile on subjects.
                AddNewRow(project, subject, predicate, object);
            }
        } finally {
            tripleIterator.close();
        }
    }

    /**
     * Appends one row to {@code project} holding {@code subject} in cell 0 and
     * {@code object} in the cell at the index of the column named
     * {@code predicate}.
     *
     * @param project   project receiving the new row
     * @param subject   value stored in the first cell
     * @param predicate name of the column that receives {@code object}; the
     *                  caller is expected to have created this column already
     * @param object    value stored under the predicate's column
     */
    protected void AddNewRow(Project project, String subject, String predicate, String object){
        int numberOfColumns = project.columnModel.columns.size();

        // add subject
        Row row = new Row(numberOfColumns);
        row.setCell(0, new Cell(subject, null));

        // add object to a row
        int columnIndex = project.columnModel.getColumnIndexByName(predicate);
        row.setCell(columnIndex, new Cell(object, null));

        project.rows.add(row);
    }

    /**
     * Stream-based import is not implemented.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void read(InputStream inputStream, Project project, Properties options) throws Exception {
        // TODO
        throw new UnsupportedOperationException();
    }

    /**
     * @return {@code true}; this importer consumes a {@link Reader}
     */
    @Override
    public boolean takesReader() {
        return true;
    }
}

View File

@ -0,0 +1,103 @@
package com.metaweb.gridworks.tests.importers;
import java.io.StringReader;
import java.util.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.metaweb.gridworks.importers.RdfTripleImporter;
import com.metaweb.gridworks.model.Project;
/**
 * Unit tests for {@link RdfTripleImporter}: each test feeds N-Triples text to
 * the reader-based import and checks the columns and rows created in the
 * project.
 */
public class RdfTripleImporterTests {
    // logging
    final static protected Logger logger = LoggerFactory.getLogger("RdfImporterTests");

    //System Under Test
    RdfTripleImporter SUT = null;
    Project project = null;
    Properties options = null;

    /** Builds a fresh importer, project and options before every test. */
    @BeforeMethod
    public void SetUp(){
        SUT = new RdfTripleImporter();
        project = new Project();
        options = new Properties();
        options.put("base-url", "http://rdf.freebase.com");
    }

    /** A single triple yields the subject column, one predicate column and one row. */
    @Test
    public void CanParseSingleLineTriple(){
        String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.";

        importTriples(sampleRdf);

        Assert.assertEquals(project.columnModel.columns.size(), 2);
        Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
        Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
        Assert.assertEquals(project.rows.size(), 1);
        Assert.assertEquals(project.rows.get(0).cells.size(), 2);
        Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
        Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
    }

    /** Three triples sharing one predicate yield one predicate column and three rows. */
    @Test
    public void CanParseMultiLineTriple(){
        String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
            "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.under_the_red_sky>.\n" +
            "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";

        importTriples(sampleRdf);

        Assert.assertEquals(project.columnModel.columns.size(), 2);
        Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
        Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
        Assert.assertEquals(project.rows.size(), 3);
        Assert.assertEquals(project.rows.get(0).cells.size(), 2);
        Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
        Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
        Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
        Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); //NB triples aren't created in order they were input
    }

    /** Triples with two distinct predicates yield two predicate columns. */
    @Test
    public void CanParseMultiLineMultiPredicatesTriple(){
        String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
            "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.genre> <http://rdf.freebase.com/ns/en.folk_rock>.\n" +
            "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";

        importTriples(sampleRdf);

        Assert.assertEquals(project.columnModel.columns.size(), 3);
        Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
        Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
        Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://rdf.freebase.com/ns/music.artist.genre");
        Assert.assertEquals(project.rows.size(), 3);
        Assert.assertEquals(project.rows.get(0).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created
        Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
        Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
        Assert.assertEquals(project.rows.get(1).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created
        Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
        Assert.assertEquals(project.rows.get(2).cells.size(), 3);
        Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); //NB triples aren't created in order they were input
    }

    /**
     * Runs the reader-based import on {@code sampleRdf} and fails the calling
     * test if the importer throws, attaching the exception as the cause so the
     * failure report shows what actually went wrong.
     */
    private void importTriples(String sampleRdf) {
        StringReader reader = new StringReader(sampleRdf);
        try {
            SUT.read(reader, project, options);
        } catch (Exception e) {
            Assert.fail("RdfTripleImporter.read threw an unexpected exception", e);
        }
    }
}