FIXED - task 154: Can't import RDF/XML Data
http://code.google.com/p/google-refine/issues/detail?id=154 git-svn-id: http://google-refine.googlecode.com/svn/trunk@2526 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
5881addac8
commit
4bf212c03d
@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
Copyright 2010,2012, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -37,21 +37,16 @@ import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE;
|
||||
import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE;
|
||||
import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.jrdf.JRDFFactory;
|
||||
import org.jrdf.SortedMemoryJRDFFactory;
|
||||
import org.jrdf.collection.MemMapFactory;
|
||||
import org.jrdf.graph.Graph;
|
||||
import org.jrdf.graph.Triple;
|
||||
import org.jrdf.parser.line.GraphLineParser;
|
||||
import org.jrdf.parser.line.LineHandler;
|
||||
import org.jrdf.parser.ntriples.NTriplesParserFactory;
|
||||
import org.jrdf.parser.RdfReader;
|
||||
import org.jrdf.util.ClosableIterable;
|
||||
import org.json.JSONObject;
|
||||
|
||||
@ -63,35 +58,46 @@ import com.google.refine.model.Column;
|
||||
import com.google.refine.model.ModelException;
|
||||
import com.google.refine.model.Project;
|
||||
import com.google.refine.model.Row;
|
||||
import com.google.refine.util.JSONUtilities;
|
||||
|
||||
public class RdfTripleImporter extends ImportingParserBase {
|
||||
private JRDFFactory _jrdfFactory;
|
||||
private NTriplesParserFactory _nTriplesParserFactory;
|
||||
private MemMapFactory _newMapFactory;
|
||||
private RdfReader rdfReader;
|
||||
private Mode mode;
|
||||
|
||||
/**
 * RDF serialization that an importer instance is configured to read.
 * parseOneFile switches on this value to pick the matching RdfReader
 * method: RDFXML -> parseRdfXml, NT -> parseNTriples, N3 -> parseN3.
 */
public enum Mode {
|
||||
RDFXML,
|
||||
NT,
|
||||
N3
|
||||
}
|
||||
|
||||
public RdfTripleImporter() {
|
||||
super(false);
|
||||
_jrdfFactory = SortedMemoryJRDFFactory.getFactory();
|
||||
_nTriplesParserFactory = new NTriplesParserFactory();
|
||||
_newMapFactory = new MemMapFactory();
|
||||
this(Mode.NT);
|
||||
}
|
||||
|
||||
/**
 * Creates an importer that parses its input using the given serialization mode.
 *
 * @param mode the RDF serialization (RDFXML, NT or N3) that parseOneFile will use
 */
public RdfTripleImporter(Mode mode) {
|
||||
// NOTE(review): the flag passed to ImportingParserBase presumably selects
// InputStream-based (rather than Reader-based) parsing, since parseOneFile
// now takes an InputStream -- confirm against ImportingParserBase.
super(true);
|
||||
rdfReader = new RdfReader();
|
||||
this.mode = mode;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void parseOneFile(Project project, ProjectMetadata metadata,
|
||||
ImportingJob job, String fileSource, Reader reader, int limit,
|
||||
ImportingJob job, String fileSource, InputStream input, int limit,
|
||||
JSONObject options, List<Exception> exceptions) {
|
||||
|
||||
String baseUrl = JSONUtilities.getString(options, "baseUrl", "");
|
||||
|
||||
Graph graph = _jrdfFactory.getNewGraph();
|
||||
LineHandler lineHandler = _nTriplesParserFactory.createParser(graph, _newMapFactory);
|
||||
GraphLineParser parser = new GraphLineParser(graph, lineHandler);
|
||||
try {
|
||||
parser.parse(reader, baseUrl); // fills JRDF graph
|
||||
} catch (Exception e) {
|
||||
exceptions.add(e);
|
||||
return;
|
||||
Graph graph;
|
||||
switch (mode) {
|
||||
case NT:
|
||||
graph = rdfReader.parseNTriples(input);
|
||||
break;
|
||||
case N3:
|
||||
graph = rdfReader.parseN3(input);
|
||||
break;
|
||||
case RDFXML:
|
||||
graph = rdfReader.parseRdfXml(input);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown parsing mode");
|
||||
}
|
||||
|
||||
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
|
||||
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
|
||||
Copyright 2012, Thomas F. Morris
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package com.google.refine.importers;
|
||||
|
||||
|
||||
/**
 * RdfTripleImporter preconfigured for RDF/XML input.
 *
 * Exists so that the RDF/XML format can be registered with a no-argument
 * constructor (see the "text/xml/rdf" registration in registerImporting).
 */
public class RdfXmlTripleImporter extends RdfTripleImporter {
|
||||
|
||||
/** Creates an importer fixed to Mode.RDFXML. */
public RdfXmlTripleImporter() {
|
||||
super(RdfTripleImporter.Mode.RDFXML);
|
||||
}
|
||||
|
||||
}
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
Copyright 2010,2012 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -33,7 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
package com.google.refine.tests.importers;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.testng.Assert;
|
||||
@ -65,7 +68,7 @@ public class RdfTripleImporterTests extends ImporterTest {
|
||||
}
|
||||
|
||||
@Test(enabled=false)
|
||||
public void CanParseSingleLineTriple(){
|
||||
public void canParseSingleLineTriple(){
|
||||
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.";
|
||||
StringReader reader = new StringReader(sampleRdf);
|
||||
|
||||
@ -85,17 +88,12 @@ public class RdfTripleImporterTests extends ImporterTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void CanParseMultiLineTriple(){
|
||||
public void canParseMultiLineTriple() throws UnsupportedEncodingException {
|
||||
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
|
||||
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.under_the_red_sky>.\n" +
|
||||
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
|
||||
StringReader reader = new StringReader(sampleRdf);
|
||||
|
||||
try {
|
||||
parseOneFile(SUT, reader);
|
||||
} catch (Exception e) {
|
||||
Assert.fail();
|
||||
}
|
||||
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
|
||||
parseOneFile(SUT, input);
|
||||
|
||||
//columns
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 2);
|
||||
@ -113,30 +111,25 @@ public class RdfTripleImporterTests extends ImporterTest {
|
||||
//row1
|
||||
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
|
||||
Assert.assertNull(project.rows.get(1).cells.get(0));
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home"); //NB triples aren't created in order they were input
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky");
|
||||
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].rowIndex, 0);
|
||||
Assert.assertEquals(project.recordModel.getRowDependency(1).cellDependencies[1].cellIndex, 0);
|
||||
|
||||
//row2
|
||||
Assert.assertEquals(project.rows.get(2).cells.size(), 2);
|
||||
Assert.assertNull(project.rows.get(2).cells.get(0));
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.under_the_red_sky"); //NB triples aren't created in order they were input
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(1).value, "http://rdf.freebase.com/ns/en.bringing_it_all_back_home");
|
||||
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].rowIndex, 0);
|
||||
Assert.assertEquals(project.recordModel.getRowDependency(2).cellDependencies[1].cellIndex, 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void CanParseMultiLineMultiPredicatesTriple(){
|
||||
public void canParseMultiLineMultiPredicatesTriple() throws UnsupportedEncodingException {
|
||||
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.blood_on_the_tracks>.\n" +
|
||||
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.genre> <http://rdf.freebase.com/ns/en.folk_rock>.\n" +
|
||||
"<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/music.artist.album> <http://rdf.freebase.com/ns/en.bringing_it_all_back_home>.";
|
||||
StringReader reader = new StringReader(sampleRdf);
|
||||
|
||||
try {
|
||||
parseOneFile(SUT, reader);
|
||||
} catch (Exception e) {
|
||||
Assert.fail();
|
||||
}
|
||||
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
|
||||
parseOneFile(SUT, input);
|
||||
|
||||
//columns
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
@ -162,15 +155,12 @@ public class RdfTripleImporterTests extends ImporterTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void CanParseTripleWithValue(){
|
||||
public void canParseTripleWithValue() throws UnsupportedEncodingException {
|
||||
String sampleRdf = "<http://rdf.freebase.com/ns/en.bob_dylan> <http://rdf.freebase.com/ns/common.topic.alias> \"Robert Zimmerman\"@en.";
|
||||
StringReader reader = new StringReader(sampleRdf);
|
||||
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
|
||||
|
||||
try {
|
||||
parseOneFile(SUT, reader);
|
||||
} catch (Exception e) {
|
||||
Assert.fail();
|
||||
}
|
||||
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.NT);
|
||||
parseOneFile(SUT, input);
|
||||
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 2);
|
||||
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
|
||||
@ -179,5 +169,41 @@ public class RdfTripleImporterTests extends ImporterTest {
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://rdf.freebase.com/ns/en.bob_dylan");
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"Robert Zimmerman\"@en");
|
||||
} @Test
|
||||
|
||||
/**
 * Parses a small RDF/XML document containing two rdf:Description resources
 * with an RdfTripleImporter in RDFXML mode, then checks the column names and
 * selected cells of the resulting project.
 *
 * @throws UnsupportedEncodingException declared because of getBytes("UTF-8")
 */
public void parseRdfXml() throws UnsupportedEncodingException {
|
||||
// From W3C spec http://www.w3.org/TR/REC-rdf-syntax/#example8
|
||||
String sampleRdf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
|
||||
+ "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
|
||||
+ " xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n"
|
||||
+ " <rdf:Description rdf:about=\"http://www.w3.org/TR/rdf-syntax-grammar\">\n"
|
||||
+ " <dc:title>RDF/XML Syntax Specification (Revised)</dc:title>\n"
|
||||
+ " <dc:title xml:lang=\"en\">RDF/XML Syntax Specification (Revised)</dc:title>\n"
|
||||
+ " <dc:title xml:lang=\"en-US\">RDF/XML Syntax Specification (Revised)</dc:title>\n"
|
||||
+ " </rdf:Description>\n"
|
||||
+ "\n"
|
||||
+ " <rdf:Description rdf:about=\"http://example.org/buecher/baum\" xml:lang=\"de\">\n"
|
||||
+ " <dc:title>Der Baum</dc:title>\n"
|
||||
+ " <dc:description>Das Buch ist außergewöhnlich</dc:description>\n"
|
||||
+ " <dc:title xml:lang=\"en\">The Tree</dc:title>\n"
|
||||
+ " </rdf:Description>\n"
|
||||
+ "</rdf:RDF>\n";
|
||||
|
||||
// The importer consumes bytes, so the sample is encoded as UTF-8 to match its XML declaration.
InputStream input = new ByteArrayInputStream(sampleRdf.getBytes("UTF-8"));
|
||||
SUT = new RdfTripleImporter(RdfTripleImporter.Mode.RDFXML);
|
||||
parseOneFile(SUT, input);
|
||||
|
||||
// Expected layout: a "subject" column plus one column per distinct predicate URI.
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "subject");
|
||||
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "http://purl.org/dc/elements/1.1/title");
|
||||
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "http://purl.org/dc/elements/1.1/description");
|
||||
// Five rows are expected in total; row 0 is asserted to start the first
// subject and row 3 to start the second subject.
Assert.assertEquals(project.rows.size(), 5);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 2);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "http://www.w3.org/TR/rdf-syntax-grammar");
|
||||
// A literal without a language tag is expected as the quoted text with no suffix.
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "\"RDF/XML Syntax Specification (Revised)\"");
|
||||
Assert.assertEquals(project.rows.get(3).cells.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(3).cells.get(0).value, "http://example.org/buecher/baum");
|
||||
// xml:lang="de" on the enclosing rdf:Description is expected to appear as an @de suffix on its literals.
Assert.assertEquals(project.rows.get(3).cells.get(1).value, "\"Der Baum\"@de");
|
||||
Assert.assertEquals(project.rows.get(3).cells.get(2).value, "\"Das Buch ist außergewöhnlich\"@de");
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
Copyright 2010,2012 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -204,7 +204,7 @@ function registerImporting() {
|
||||
IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());
|
||||
// TODO: Can we get away with just reusing Excel importer UI?
|
||||
IM.registerFormat("text/xml/ods", "Open Document Format spreadsheets (.ods)", "ExcelParserUI", new Packages.com.google.refine.importers.OdsImporter());
|
||||
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
|
||||
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfXmlTripleImporter());
|
||||
IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter());
|
||||
IM.registerFormat("text/marc", "MARC files");
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user