From 25d3a9dfc17fd0e1bae3d4937c8e5af498cfd363 Mon Sep 17 00:00:00 2001 From: Iain Sproat Date: Tue, 18 May 2010 12:41:40 +0000 Subject: [PATCH] Added a basic RDF triple importer plus unit tests. Some more work required - it's not plugged into the client and it creates a very sparse data structure (each triple is a new row). It uses JRDF library (Apache 1.1 license). git-svn-id: http://google-refine.googlecode.com/svn/trunk@813 7d457c2a-affb-35e4-300a-418c747d4874 --- .classpath | 1 + LICENSE.txt | 5 + licenses/jrdf.LICENSE.txt | 57 ++++++++ .../importers/RdfTripleImporter.java | 127 ++++++++++++++++++ .../importers/RdfTripleImporterTests.java | 103 ++++++++++++++ 5 files changed, 293 insertions(+) create mode 100644 licenses/jrdf.LICENSE.txt create mode 100644 src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java create mode 100644 tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java diff --git a/.classpath b/.classpath index f49ba9c66..f451f1089 100644 --- a/.classpath +++ b/.classpath @@ -37,5 +37,6 @@ + diff --git a/LICENSE.txt b/LICENSE.txt index a644bf5d3..f4d2c9fe4 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -57,6 +57,11 @@ licenses/apache2.0.LICENSE.txt signpost opencsv textng + +Apache License 1.1 +------------------ +licenses/jrdf.LICENSE.txt + jrdf LGPL ---- diff --git a/licenses/jrdf.LICENSE.txt b/licenses/jrdf.LICENSE.txt new file mode 100644 index 000000000..4c70a9c78 --- /dev/null +++ b/licenses/jrdf.LICENSE.txt @@ -0,0 +1,57 @@ +/* + * $Header$ + * $Revision: 205 $ + * $Date: 2005-06-22 21:16:18 +1000 (Wed, 22 Jun 2005) $ + * + * ==================================================================== + * + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2003-2005 The JRDF Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, if + * any, must include the following acknowlegement: + * "This product includes software developed by the + * the JRDF Project (http://jrdf.sf.net/)." + * Alternately, this acknowlegement may appear in the software itself, + * if and wherever such third-party acknowlegements normally appear. + * + * 4. The names "The JRDF Project" and "JRDF" must not be used to endorse + * or promote products derived from this software without prior written + * permission. For written permission, please contact + * newmana@users.sourceforge.net. + * + * 5. Products derived from this software may not be called "JRDF" + * nor may "JRDF" appear in their names without prior written + * permission of the JRDF Project. + * + * THIS SOFTWARE IS PROVIDED `AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the JRDF Project. For more + * information on JRDF, please see . + */ diff --git a/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java b/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java new file mode 100644 index 000000000..4e25f3138 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java @@ -0,0 +1,127 @@ +package com.metaweb.gridworks.importers; + +import java.io.InputStream; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.jrdf.JRDFFactory; +import org.jrdf.SortedMemoryJRDFFactory; +import org.jrdf.collection.MemMapFactory; +import org.jrdf.graph.Graph; +import org.jrdf.graph.Triple; +import org.jrdf.parser.line.GraphLineParser; +import org.jrdf.parser.line.LineParser; +import org.jrdf.parser.line.LineParserImpl; +import org.jrdf.parser.line.LineHandler; +import org.jrdf.parser.ntriples.NTriplesParser; +import org.jrdf.parser.ntriples.NTriplesParserFactory; +import org.jrdf.parser.ntriples.parser.BlankNodeParserImpl; +import org.jrdf.parser.ntriples.parser.LiteralParserImpl; +import org.jrdf.parser.ntriples.parser.NodeMaps; +import org.jrdf.parser.ntriples.parser.NodeMapsImpl; +import org.jrdf.parser.ntriples.parser.RegexTripleParser; +import org.jrdf.parser.ntriples.parser.TripleParser; +import org.jrdf.parser.ntriples.parser.TripleParserImpl; +import org.jrdf.parser.ntriples.parser.URIReferenceParserImpl; +import org.jrdf.util.ClosableIterable; +import org.jrdf.util.boundary.RegexMatcherFactory; +import org.jrdf.util.boundary.RegexMatcherFactoryImpl; +import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE; +import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE; +import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE; + +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + +public class RdfTripleImporter implements Importer{ + JRDFFactory JrdfFactory; + NTriplesParserFactory nTriplesParserFactory; + MemMapFactory newMapFactory; + + public RdfTripleImporter(){ + JrdfFactory = SortedMemoryJRDFFactory.getFactory(); + nTriplesParserFactory = new NTriplesParserFactory(); + newMapFactory = new MemMapFactory(); + } + + @Override + public void read(Reader reader, Project project, Properties options) throws Exception { + String baseUrl = options.getProperty("base-url"); + + Graph graph = JrdfFactory.getNewGraph(); + //System.out.println("--------------------------------"); + //System.out.println("initial number of triples before parsing is : " + graph.getNumberOfTriples()); + LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory); + GraphLineParser parser = new GraphLineParser(graph, lineHandler); + parser.parse(reader, baseUrl); //fills JRDF graph + //System.out.println("number of triples parsed is : " + graph.getNumberOfTriples()); + + //first column is subject + project.columnModel.columns.add(0, new Column(0, "subject")); + project.columnModel.update(); + + ClosableIterable triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE); + try { + int numberOfColumns = 1; + for (Triple triple : triples) { + + //System.out.println("Triple : " + triple); + String subject = triple.getSubject().toString(); + String predicate = triple.getPredicate().toString(); + String object = triple.getObject().toString(); + + //System.out.println("subject : " + subject); + //System.out.println("predicate : " + predicate); + //System.out.println("object : " + object); + //System.out.println("predicate relates to column : " + project.columnModel.getColumnByName(predicate)); + + //creates new column for every predicate + if(project.columnModel.getColumnByName(predicate) == null){ + //System.out.println("adding new column"); + project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate)); + project.columnModel.update(); + numberOfColumns++; + //System.out.println("New total number of columns : " + numberOfColumns); + } + + //FIXME - this is sparse (one row per triple), need to reconcile on subjects. + AddNewRow(project, subject, predicate, object); + } + + } finally { + triples.iterator().close(); + } + } + + protected void AddNewRow(Project project, String subject, String predicate, String object){ + int numberOfColumns = project.columnModel.columns.size(); + + //add subject + Row row = new Row(numberOfColumns); + row.setCell(0, new Cell(subject, null)); + + //add object to a row + int columnIndex = project.columnModel.getColumnIndexByName(predicate); + //System.out.println("predicate relates to columnIndex : " + columnIndex); + row.setCell(columnIndex, new Cell(object, null)); + //System.out.println("Number of cells in new row : " + row.cells.size()); + project.rows.add(row); + //System.out.println("New total number of rows : " + project.rows.size()); + } + + @Override + public void read(InputStream inputStream, Project project, Properties options) throws Exception { + // TODO + throw new UnsupportedOperationException(); + } + + @Override + public boolean takesReader() { + return true; + } + +} diff --git a/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java b/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java new file mode 100644 index 000000000..0ed5ce8df --- /dev/null +++ b/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java @@ -0,0 +1,103 @@ +package com.metaweb.gridworks.tests.importers; + +import java.io.StringReader; +import java.util.Properties; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import com.metaweb.gridworks.importers.RdfTripleImporter; +import com.metaweb.gridworks.model.Project; + + +public class RdfTripleImporterTests { + // logging + final static protected Logger logger = LoggerFactory.getLogger("RdfImporterTests"); + + //System Under Test + RdfTripleImporter SUT = null; + Project project = null; + Properties options = null; + + @BeforeMethod + public void SetUp(){ + SUT = new RdfTripleImporter(); + project = new Project(); + options = new Properties(); + options.put("base-url", "http://rdf.freebase.com"); + } + + @Test + public void CanParseSingleLineTriple(){ + String sampleRdf = " ."; + StringReader reader = new StringReader(sampleRdf); + + try { + SUT.read(reader, project, options); + } catch (Exception e) { + Assert.fail(); + } + + Assert.assertEquals(project.columnModel.columns.size(), 2); + Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject"); + Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album"); + Assert.assertEquals(project.rows.size(), 1); + Assert.assertEquals(project.rows.get(0).cells.size(), 2); + Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); + } + + @Test + public void CanParseMultiLineTriple(){ + String sampleRdf = " .\n" + + " .\n" + + " ."; + StringReader reader = new StringReader(sampleRdf); + + try { + SUT.read(reader, project, options); + } catch (Exception e) { + Assert.fail(); + } + + Assert.assertEquals(project.columnModel.columns.size(), 2); + Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject"); + Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album"); + Assert.assertEquals(project.rows.size(), 3); + Assert.assertEquals(project.rows.get(0).cells.size(), 2); + Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input + Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); //NB triples aren't created in order they were input + } + + @Test + public void CanParseMultiLineMultiPredicatesTriple(){ + String sampleRdf = " .\n" + + " .\n" + + " ."; + StringReader reader = new StringReader(sampleRdf); + + try { + SUT.read(reader, project, options); + } catch (Exception e) { + Assert.fail(); + } + + Assert.assertEquals(project.columnModel.columns.size(), 3); + Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject"); + Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album"); + Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://rdf.freebase.com/ns/music.artist.genre"); + Assert.assertEquals(project.rows.size(), 3); + Assert.assertEquals(project.rows.get(0).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created + Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); + Assert.assertEquals(project.rows.get(1).cells.size(), 2);//FIXME should the number of cells == 3? should be updated if a column is added after the row is created + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input + Assert.assertEquals(project.rows.get(2).cells.size(), 3); + Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); //NB triples aren't created in order they were input + } +}