RdfTripleImporter does basic reconciliation on the RDF subject; the data model is no longer sparse (it is now possible to have more than one data point per row)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@816 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-18 13:48:52 +00:00
parent 043f46c562
commit cfd0f2219e
2 changed files with 41 additions and 31 deletions

View File

@ -2,8 +2,6 @@ package com.metaweb.gridworks.importers;
import java.io.InputStream; import java.io.InputStream;
import java.io.Reader; import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties; import java.util.Properties;
import org.jrdf.JRDFFactory; import org.jrdf.JRDFFactory;
@ -12,22 +10,9 @@ import org.jrdf.collection.MemMapFactory;
import org.jrdf.graph.Graph; import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple; import org.jrdf.graph.Triple;
import org.jrdf.parser.line.GraphLineParser; import org.jrdf.parser.line.GraphLineParser;
import org.jrdf.parser.line.LineParser;
import org.jrdf.parser.line.LineParserImpl;
import org.jrdf.parser.line.LineHandler; import org.jrdf.parser.line.LineHandler;
import org.jrdf.parser.ntriples.NTriplesParser;
import org.jrdf.parser.ntriples.NTriplesParserFactory; import org.jrdf.parser.ntriples.NTriplesParserFactory;
import org.jrdf.parser.ntriples.parser.BlankNodeParserImpl;
import org.jrdf.parser.ntriples.parser.LiteralParserImpl;
import org.jrdf.parser.ntriples.parser.NodeMaps;
import org.jrdf.parser.ntriples.parser.NodeMapsImpl;
import org.jrdf.parser.ntriples.parser.RegexTripleParser;
import org.jrdf.parser.ntriples.parser.TripleParser;
import org.jrdf.parser.ntriples.parser.TripleParserImpl;
import org.jrdf.parser.ntriples.parser.URIReferenceParserImpl;
import org.jrdf.util.ClosableIterable; import org.jrdf.util.ClosableIterable;
import org.jrdf.util.boundary.RegexMatcherFactory;
import org.jrdf.util.boundary.RegexMatcherFactoryImpl;
import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE; import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE; import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE; import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
@ -78,33 +63,57 @@ public class RdfTripleImporter implements Importer{
//System.out.println("object : " + object); //System.out.println("object : " + object);
//System.out.println("predicate relates to column : " + project.columnModel.getColumnByName(predicate)); //System.out.println("predicate relates to column : " + project.columnModel.getColumnByName(predicate));
//creates new column for every predicate int candidateMergeRowIndex = -1;
if(project.columnModel.getColumnByName(predicate) == null){
AddNewColumn(project, predicate);
}
//FIXME - this is sparse (one row per triple), need to reconcile on subjects. //creates new column for every predicate
AddNewRow(project, subject, predicate, object); int columnIndex = project.columnModel.getColumnIndexByName(predicate);
if(columnIndex == -1){
candidateMergeRowIndex = AddNewColumn(project, predicate, subject);
}
columnIndex = project.columnModel.getColumnIndexByName(predicate);
if(candidateMergeRowIndex > -1){
if(project.rows.get(candidateMergeRowIndex).cells.get(columnIndex) == null){
//empty, so merge in this value
MergeWithRow(project, candidateMergeRowIndex, columnIndex, object);
}else{
//can't overwrite existing, so add new row
AddNewRow(project, subject, predicate, object); //TODO group to original row.
}
}else{
AddNewRow(project, subject, predicate, object);
}
} }
} finally { } finally {
triples.iterator().close(); triples.iterator().close();
} }
} }
protected void AddNewColumn(Project project, String predicate){ protected int AddNewColumn(Project project, String predicate, String subject){
//System.out.println("adding new column"); //System.out.println("adding new column");
int numberOfColumns = project.columnModel.columns.size(); int numberOfColumns = project.columnModel.columns.size();
project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate)); project.columnModel.columns.add(numberOfColumns, new Column(numberOfColumns, predicate));
project.columnModel.update(); project.columnModel.update();
int candidateMergeRowIndex = -1;
//update existing rows with new column //update existing rows with new column
for(Row r : project.rows){ for(int i = 0; i < project.rows.size(); i++){
r.cells.add(numberOfColumns, null); project.rows.get(i).cells.add(numberOfColumns, null);
if(project.rows.get(i).cells.get(0).value == subject){
candidateMergeRowIndex = i;
}
} }
//numberOfColumns = project.columnModel.columns.size(); //numberOfColumns = project.columnModel.columns.size();
//System.out.println("New total number of columns : " + numberOfColumns); //System.out.println("New total number of columns : " + numberOfColumns);
return candidateMergeRowIndex;
}
protected void MergeWithRow(Project project, int candidateMergeRowIndex, int columnIndex, String object){
project.rows.get(candidateMergeRowIndex).setCell(columnIndex, new Cell(object, null));
} }
protected void AddNewRow(Project project, String subject, String predicate, String object){ protected void AddNewRow(Project project, String subject, String predicate, String object){

View File

@ -91,13 +91,14 @@ public class RdfTripleImporterTests {
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject"); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://rdf.freebase.com/ns/music.artist.album");
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://rdf.freebase.com/ns/music.artist.genre"); Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://rdf.freebase.com/ns/music.artist.genre");
Assert.assertEquals(project.rows.size(), 3); Assert.assertEquals(project.rows.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "http://rdf.freebase.com/ns/en.blood_on_the_tracks");
Assert.assertNull(project.rows.get(0).cells.get(2));
Assert.assertEquals(project.rows.get(1).cells.size(), 3); Assert.assertEquals(project.rows.get(1).cells.size(), 3);
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input Assert.assertEquals(project.rows.get(1).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(2).cells.size(), 3); Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock"); //NB triples aren't created in order they were input Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://rdf.freebase.com/ns/en.folk_rock");
} }
} }