From cfd0f2219e1d2bbcb3e3d27a0d2a9c708c83086f Mon Sep 17 00:00:00 2001 From: Iain Sproat Date: Tue, 18 May 2010 13:48:52 +0000 Subject: [PATCH] RdfTripleImporter does basic reconciliation on Rdf subject - data model is no longer sparse (now possible to have more than one data point per row) git-svn-id: http://google-refine.googlecode.com/svn/trunk@816 7d457c2a-affb-35e4-300a-418c747d4874 --- .../importers/RdfTripleImporter.java | 63 +++++++++++-------- .../importers/RdfTripleImporterTests.java | 9 +-- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java b/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java index 1de78798c..db03ca9dd 100644 --- a/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java +++ b/src/main/java/com/metaweb/gridworks/importers/RdfTripleImporter.java @@ -2,8 +2,6 @@ package com.metaweb.gridworks.importers; import java.io.InputStream; import java.io.Reader; -import java.util.ArrayList; -import java.util.List; import java.util.Properties; import org.jrdf.JRDFFactory; @@ -12,22 +10,9 @@ import org.jrdf.collection.MemMapFactory; import org.jrdf.graph.Graph; import org.jrdf.graph.Triple; import org.jrdf.parser.line.GraphLineParser; -import org.jrdf.parser.line.LineParser; -import org.jrdf.parser.line.LineParserImpl; import org.jrdf.parser.line.LineHandler; -import org.jrdf.parser.ntriples.NTriplesParser; import org.jrdf.parser.ntriples.NTriplesParserFactory; -import org.jrdf.parser.ntriples.parser.BlankNodeParserImpl; -import org.jrdf.parser.ntriples.parser.LiteralParserImpl; -import org.jrdf.parser.ntriples.parser.NodeMaps; -import org.jrdf.parser.ntriples.parser.NodeMapsImpl; -import org.jrdf.parser.ntriples.parser.RegexTripleParser; -import org.jrdf.parser.ntriples.parser.TripleParser; -import org.jrdf.parser.ntriples.parser.TripleParserImpl; -import org.jrdf.parser.ntriples.parser.URIReferenceParserImpl; import 
org.jrdf.util.ClosableIterable; -import org.jrdf.util.boundary.RegexMatcherFactory; -import org.jrdf.util.boundary.RegexMatcherFactoryImpl; import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE; import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE; import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE; @@ -78,33 +63,57 @@ public class RdfTripleImporter implements Importer{ //System.out.println("object : " + object); //System.out.println("predicate relates to column : " + project.columnModel.getColumnByName(predicate)); - //creates new column for every predicate - if(project.columnModel.getColumnByName(predicate) == null){ - AddNewColumn(project, predicate); - } + int candidateMergeRowIndex = -1; - //FIXME - this is sparse (one row per triple), need to reconcile on subjects. - AddNewRow(project, subject, predicate, object); + //creates new column for every predicate + int columnIndex = project.columnModel.getColumnIndexByName(predicate); + if(columnIndex == -1){ + candidateMergeRowIndex = AddNewColumn(project, predicate, subject); + } + columnIndex = project.columnModel.getColumnIndexByName(predicate); + + if(candidateMergeRowIndex > -1){ + if(project.rows.get(candidateMergeRowIndex).cells.get(columnIndex) == null){ + //empty, so merge in this value + MergeWithRow(project, candidateMergeRowIndex, columnIndex, object); + }else{ + //can't overwrite existing, so add new row + AddNewRow(project, subject, predicate, object); //TODO group to original row. 
+ } + }else{ + AddNewRow(project, subject, predicate, object); + } } } finally { triples.iterator().close(); } } - - protected void AddNewColumn(Project project, String predicate){ + + protected int AddNewColumn(Project project, String predicate, String subject){ //System.out.println("adding new column"); int numberOfColumns = project.columnModel.columns.size(); - + project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate)); project.columnModel.update(); + + int candidateMergeRowIndex = -1; //update existing rows with new column - for(Row r : project.rows){ - r.cells.add(numberOfColumns, null); + for(int i = 0; i < project.rows.size(); i++){ + project.rows.get(i).cells.add(numberOfColumns, null); + if(subject != null && subject.equals(project.rows.get(i).cells.get(0).value)){ /* compare the subject by value; == on a String only tests reference identity */ + candidateMergeRowIndex = i; + } } + //numberOfColumns = project.columnModel.columns.size(); //System.out.println("New total number of columns : " + numberOfColumns); - + + return candidateMergeRowIndex; + } + + protected void MergeWithRow(Project project, int candidateMergeRowIndex, int columnIndex, String object){ + project.rows.get(candidateMergeRowIndex).setCell(columnIndex, new Cell(object, null)); } protected void AddNewRow(Project project, String subject, String predicate, String object){ diff --git a/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java b/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java index 5a00e309d..78b2ead02 100644 --- a/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java +++ b/tests/java/src/com/metaweb/gridworks/tests/importers/RdfTripleImporterTests.java @@ -91,13 +91,14 @@ public class RdfTripleImporterTests { Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album"); Assert.assertEquals(project.columnModel.columns.get(2).getName(), 
"http://rdf.freebase.com/ns/music.artist.genre"); - Assert.assertEquals(project.rows.size(), 3); + Assert.assertEquals(project.rows.size(), 2); Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); + Assert.assertNull(project.rows.get(0).cells.get(2)); Assert.assertEquals(project.rows.get(1).cells.size(), 3); - Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input - Assert.assertEquals(project.rows.get(2).cells.size(), 3); - Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); //NB triples aren't created in order they were input + Assert.assertEquals(project.rows.get(1).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); + Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); } }