FIXED - task 154: Can't import RDF/XML Data

http://code.google.com/p/google-refine/issues/detail?id=154

git-svn-id: http://google-refine.googlecode.com/svn/trunk@2526 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Tom Morris 2012-08-05 16:31:41 +00:00
parent 5881addac8
commit 4bf212c03d
4 changed files with 134 additions and 59 deletions

View File

@ -1,6 +1,6 @@
/*
Copyright 2010, Google Inc.
Copyright 2010,2012, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -37,21 +37,16 @@ import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
import java.io.Reader;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.jrdf.JRDFFactory;
import org.jrdf.SortedMemoryJRDFFactory;
import org.jrdf.collection.MemMapFactory;
import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple;
import org.jrdf.parser.line.GraphLineParser;
import org.jrdf.parser.line.LineHandler;
import org.jrdf.parser.ntriples.NTriplesParserFactory;
import org.jrdf.parser.RdfReader;
import org.jrdf.util.ClosableIterable;
import org.json.JSONObject;
@ -63,35 +58,46 @@ import com.google.refine.model.Column;
import com.google.refine.model.ModelException;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.util.JSONUtilities;
public class RdfTripleImporter extends ImportingParserBase {
private JRDFFactory _jrdfFactory;
private NTriplesParserFactory _nTriplesParserFactory;
private MemMapFactory _newMapFactory;
private RdfReader rdfReader;
private Mode mode;
/**
 * RDF serializations this importer can be configured to parse.
 *
 * <p>The constant chosen at construction time decides which parse method of
 * the RDF reader is invoked on the input stream.
 */
public enum Mode {
    /** RDF/XML input; parsed with the reader's {@code parseRdfXml} method. */
    RDFXML,

    /** N-Triples input; parsed with the reader's {@code parseNTriples} method. */
    NT,

    /** N3 input; parsed with the reader's {@code parseN3} method. */
    N3
}
public RdfTripleImporter() {
super(false);
_jrdfFactory = SortedMemoryJRDFFactory.getFactory();
_nTriplesParserFactory = new NTriplesParserFactory();
_newMapFactory = new MemMapFactory();
this(Mode.NT);
}
/**
 * Creates an importer that parses its input using the given RDF serialization.
 *
 * @param mode the serialization to expect; it is consulted each time a file is parsed
 */
public RdfTripleImporter(Mode mode) {
    // NOTE(review): the meaning of this flag is defined by ImportingParserBase;
    // presumably it requests an InputStream rather than a Reader - confirm.
    super(true);
    this.mode = mode;
    this.rdfReader = new RdfReader();
}
@Override
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, Reader reader, int limit,
ImportingJob job, String fileSource, InputStream input, int limit,
JSONObject options, List<Exception> exceptions) {
String baseUrl = JSONUtilities.getString(options, "baseUrl", "");
Graph graph = _jrdfFactory.getNewGraph();
LineHandler lineHandler = _nTriplesParserFactory.createParser(graph, _newMapFactory);
GraphLineParser parser = new GraphLineParser(graph, lineHandler);
try {
parser.parse(reader, baseUrl); // fills JRDF graph
} catch (Exception e) {
exceptions.add(e);
return;
Graph graph;
switch (mode) {
case NT:
graph = rdfReader.parseNTriples(input);
break;
case N3:
graph = rdfReader.parseN3(input);
break;
case RDFXML:
graph = rdfReader.parseRdfXml(input);
break;
default:
throw new IllegalArgumentException("Unknown parsing mode");
}
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);

View File

@ -0,0 +1,43 @@
/*
Copyright 2012, Thomas F. Morris
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.importers;
/**
 * RDF triple importer whose parsing mode is fixed to RDF/XML.
 *
 * Its only member is a no-argument constructor, so the importer can be
 * instantiated without passing a RdfTripleImporter.Mode explicitly.
 */
public class RdfXmlTripleImporter extends RdfTripleImporter {
/** Creates an importer that hands RdfTripleImporter.Mode.RDFXML to the superclass. */
public RdfXmlTripleImporter() {
super(RdfTripleImporter.Mode.RDFXML);
}
}

View File

@ -1,6 +1,6 @@
/*
Copyright 2010, Google Inc.
Copyright 2010,2012 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -33,7 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.tests.importers;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
@ -65,7 +68,7 @@ public class RdfTripleImporterTests extends ImporterTest {
}
@Test(enabled=false)
public void CanParseSingleLineTriple(){
public void canParseSingleLineTriple(){
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.";
StringReader reader = new StringReader(sampleRdf);
@ -85,17 +88,12 @@ public class RdfTripleImporterTests extends ImporterTest {
}
@Test
public void CanParseMultiLineTriple(){
public void canParseMultiLineTriple() throws UnsupportedEncodingException {
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.under_the_red_sky>.\n" +
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
StringReader reader = new StringReader(sampleRdf);
try {
parseOneFile(SUT, reader);
} catch (Exception e) {
Assert.fail();
}
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
parseOneFile(SUT, input);
//columns
Assert.assertEquals(project.columnModel.columns.size(), 2);
@ -113,30 +111,25 @@ public class RdfTripleImporterTests extends ImporterTest {
//row1
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
Assert.assertNull(project.rows.get(1).cells.get(0));
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky");
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
//row2
Assert.assertEquals(project.rows.get(2).cells.size(), 2);
Assert.assertNull(project.rows.get(2).cells.get(0));
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); //NB triples aren't created in order they were input
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].rowIndex, 0);
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].cellIndex, 0);
}
@Test
public void CanParseMultiLineMultiPredicatesTriple(){
public void canParseMultiLineMultiPredicatesTriple() throws UnsupportedEncodingException {
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.genre> <http://rdf.freebase.com/ns/en.folk_rock>.\n" +
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
StringReader reader = new StringReader(sampleRdf);
try {
parseOneFile(SUT, reader);
} catch (Exception e) {
Assert.fail();
}
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
parseOneFile(SUT, input);
//columns
Assert.assertEquals(project.columnModel.columns.size(), 3);
@ -162,15 +155,12 @@ public class RdfTripleImporterTests extends ImporterTest {
}
@Test
public void CanParseTripleWithValue(){
public void canParseTripleWithValue() throws UnsupportedEncodingException {
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias> \"Robert Zimmerman\"@en.";
StringReader reader = new StringReader(sampleRdf);
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
try {
parseOneFile(SUT, reader);
} catch (Exception e) {
Assert.fail();
}
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.NT);
parseOneFile(SUT, input);
Assert.assertEquals(project.columnModel.columns.size(), 2);
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
@ -179,5 +169,41 @@ public class RdfTripleImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"Robert Zimmerman\"@en");
} @Test
public void parseRdfXml() throws UnsupportedEncodingException {
// Feeds a small RDF/XML document to an importer created in RDFXML mode and
// checks the resulting project: column names, row count and selected cells.
// From W3C spec http://www.w3.org/TR/REC-rdf-syntax/#example8
String sampleRdf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
+ " xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n"
+ " <rdf:Description rdf:about=\"http://www.w3.org/TR/rdf-syntax-grammar\">\n"
+ " <dc:title>RDF/XML Syntax Specification (Revised)</dc:title>\n"
+ " <dc:title xml:lang=\"en\">RDF/XML Syntax Specification (Revised)</dc:title>\n"
+ " <dc:title xml:lang=\"en-US\">RDF/XML Syntax Specification (Revised)</dc:title>\n"
+ " </rdf:Description>\n"
+ "\n"
+ " <rdf:Description rdf:about=\"http://example.org/buecher/baum\" xml:lang=\"de\">\n"
+ " <dc:title>Der Baum</dc:title>\n"
+ " <dc:description>Das Buch ist außergewöhnlich</dc:description>\n"
+ " <dc:title xml:lang=\"en\">The Tree</dc:title>\n"
+ " </rdf:Description>\n"
+ "</rdf:RDF>\n";
// Encode as UTF-8 to match the encoding declared in the XML prolog; the
// sample contains non-ASCII characters.
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
// Point SUT at an importer configured for RDF/XML before parsing.
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.RDFXML);
parseOneFile(SUT, input);
// Expect a subject column followed by one column for each predicate used
// in the sample (dc:title, then dc:description).
Assert.assertEquals(project.columnModel.columns.size(), 3);
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://purl.org/dc/elements/1.1/title");
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://purl.org/dc/elements/1.1/description");
// NOTE(review): presumably three rows for the first resource's titles and
// two for the second resource - confirm against the importer's row layout.
Assert.assertEquals(project.rows.size(), 5);
// Row 0: first resource; only the subject and title cells are expected.
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://www.w3.org/TR/rdf-syntax-grammar");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"RDF/XML Syntax Specification (Revised)\"");
// Row 3: second resource; its literals are expected to carry the "@de"
// tag from xml:lang="de" on the enclosing Description element.
Assert.assertEquals(project.rows.get(3).cells.size(), 3);
Assert.assertEquals(project.rows.get(3).cells.get(0).value, "http://example.org/buecher/baum");
Assert.assertEquals(project.rows.get(3).cells.get(1).value, "\"Der Baum\"@de");
Assert.assertEquals(project.rows.get(3).cells.get(2).value, "\"Das Buch ist außergewöhnlich\"@de");
}
}

View File

@ -1,6 +1,6 @@
/*
Copyright 2010, Google Inc.
Copyright 2010,2012 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -204,7 +204,7 @@ function registerImporting() {
IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());
// TODO: Can we get away with just reusing Excel importer UI?
IM.registerFormat("text/xml/ods", "Open Document Format spreadsheets (.ods)", "ExcelParserUI", new Packages.com.google.refine.importers.OdsImporter());
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfXmlTripleImporter());
IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter());
IM.registerFormat("text/marc", "MARC files");