Add support for table headers
This commit is contained in:
parent
88aa6f113d
commit
e168c900e8
@ -1,20 +1,15 @@
|
|||||||
package com.google.refine.importers;
|
package com.google.refine.importers;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.LineNumberReader;
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.json.JSONException;
|
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import com.google.common.io.CharStreams;
|
import com.google.common.io.CharStreams;
|
||||||
import de.fau.cs.osr.ptk.common.AstVisitor;
|
import de.fau.cs.osr.ptk.common.AstVisitor;
|
||||||
import de.fau.cs.osr.ptk.common.ParserCommon;
|
|
||||||
|
|
||||||
import org.sweble.wikitext.parser.ParserConfig;
|
import org.sweble.wikitext.parser.ParserConfig;
|
||||||
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
|
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
|
||||||
@ -32,7 +27,6 @@ import org.sweble.wikitext.parser.nodes.WtTableRow;
|
|||||||
import org.sweble.wikitext.parser.nodes.WtTableCell;
|
import org.sweble.wikitext.parser.nodes.WtTableCell;
|
||||||
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
|
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
|
||||||
import org.sweble.wikitext.parser.nodes.WtBody;
|
import org.sweble.wikitext.parser.nodes.WtBody;
|
||||||
import org.sweble.wikitext.parser.parser.LinkTargetException;
|
|
||||||
|
|
||||||
import org.sweble.wikitext.parser.WikitextEncodingValidator;
|
import org.sweble.wikitext.parser.WikitextEncodingValidator;
|
||||||
import org.sweble.wikitext.parser.WikitextPreprocessor;
|
import org.sweble.wikitext.parser.WikitextPreprocessor;
|
||||||
@ -52,7 +46,7 @@ import com.google.refine.util.JSONUtilities;
|
|||||||
|
|
||||||
|
|
||||||
public class WikitextImporter extends TabularImportingParserBase {
|
public class WikitextImporter extends TabularImportingParserBase {
|
||||||
static final Logger logger = LoggerFactory.getLogger(WikitextImporter.class);
|
static final private Logger logger = LoggerFactory.getLogger(WikitextImporter.class);
|
||||||
|
|
||||||
public WikitextImporter() {
|
public WikitextImporter() {
|
||||||
super(false);
|
super(false);
|
||||||
@ -73,28 +67,25 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
public List<String> header;
|
public List<String> header;
|
||||||
public List<List<String>> rows;
|
public List<List<String>> rows;
|
||||||
private List<String> currentRow;
|
private List<String> currentRow;
|
||||||
private StringBuilder currentCellString;
|
private StringBuilder currentStringBuilder;
|
||||||
private String currentInternalLink;
|
private String currentInternalLink;
|
||||||
private String currentExternalLink;
|
private String currentExternalLink;
|
||||||
|
|
||||||
public WikitextTableVisitor() {
|
public WikitextTableVisitor() {
|
||||||
header = null;
|
header = new ArrayList<String>();
|
||||||
rows = new ArrayList<List<String>>();
|
rows = new ArrayList<List<String>>();
|
||||||
currentCellString = null;
|
currentStringBuilder = null;
|
||||||
currentInternalLink = null;
|
currentInternalLink = null;
|
||||||
currentExternalLink = null;
|
currentExternalLink = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean before(WtNode node) {
|
protected WtNode before(WtNode node) {
|
||||||
return super.before(node);
|
return super.before(node);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtNode e) {
|
public void visit(WtNode e) {
|
||||||
/*
|
// Ignore other nodes
|
||||||
System.out.println("ignoring node:");
|
|
||||||
System.out.println(e.getNodeTypeName());
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtParsedWikitextPage e) {
|
public void visit(WtParsedWikitextPage e) {
|
||||||
@ -110,10 +101,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtTableHeader e) {
|
public void visit(WtTableHeader e) {
|
||||||
currentRow = new ArrayList<String>();
|
header.add(renderAsString(e));
|
||||||
iterate(e);
|
|
||||||
header = currentRow;
|
|
||||||
currentRow = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtTableRow e)
|
public void visit(WtTableRow e)
|
||||||
@ -131,24 +119,30 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
public void visit(WtTableCell e)
|
public void visit(WtTableCell e)
|
||||||
{
|
{
|
||||||
if (currentRow != null) {
|
if (currentRow != null) {
|
||||||
currentCellString = new StringBuilder();
|
currentRow.add(renderAsString(e));
|
||||||
iterate(e);
|
|
||||||
String cellValue = currentCellString.toString().trim();
|
|
||||||
currentRow.add(cellValue);
|
|
||||||
currentCellString = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String renderAsString(WtNode e) {
|
||||||
|
currentStringBuilder = new StringBuilder();
|
||||||
|
iterate(e);
|
||||||
|
String value = currentStringBuilder.toString().trim();
|
||||||
|
currentStringBuilder = null;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void visit(WtText text) {
|
public void visit(WtText text) {
|
||||||
currentCellString.append(text.getContent());
|
if (currentStringBuilder != null) {
|
||||||
|
currentStringBuilder.append(text.getContent());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtNoLinkTitle e) {
|
public void visit(WtNoLinkTitle e) {
|
||||||
if (currentInternalLink != null) {
|
if (currentInternalLink != null) {
|
||||||
currentCellString.append(currentInternalLink);
|
currentStringBuilder.append(currentInternalLink);
|
||||||
} else if (currentExternalLink != null) {
|
} else if (currentExternalLink != null) {
|
||||||
currentCellString.append(currentExternalLink);
|
currentStringBuilder.append(currentExternalLink);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -175,6 +169,36 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
return rows;
|
return rows;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public class WikiTableDataReader implements TableDataReader {
|
||||||
|
private int currentRow = -1;
|
||||||
|
private WikitextTableVisitor visitor = null;
|
||||||
|
|
||||||
|
public WikiTableDataReader(WikitextTableVisitor visitor) {
|
||||||
|
this.visitor = visitor;
|
||||||
|
currentRow = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Object> getNextRowOfCells() throws IOException {
|
||||||
|
List<Object> row = null;
|
||||||
|
List<String> origRow = null;
|
||||||
|
if (currentRow == -1) {
|
||||||
|
origRow = this.visitor.header;
|
||||||
|
} else if(currentRow < this.visitor.rows.size()) {
|
||||||
|
origRow = this.visitor.rows.get(currentRow);
|
||||||
|
}
|
||||||
|
currentRow++;
|
||||||
|
|
||||||
|
if (origRow != null) {
|
||||||
|
row = new ArrayList<Object>();
|
||||||
|
for (int i = 0; i < origRow.size(); i++) {
|
||||||
|
row.add(origRow.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return row;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void parseOneFile(
|
public void parseOneFile(
|
||||||
@ -187,33 +211,6 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
JSONObject options,
|
JSONObject options,
|
||||||
List<Exception> exceptions
|
List<Exception> exceptions
|
||||||
) {
|
) {
|
||||||
/*
|
|
||||||
final List<Object> columnNames;
|
|
||||||
if (options.has("columnNames")) {
|
|
||||||
columnNames = new ArrayList<Object>();
|
|
||||||
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
|
|
||||||
for (String s : strings) {
|
|
||||||
columnNames.add(s);
|
|
||||||
}
|
|
||||||
JSONUtilities.safePut(options, "headerLines", 1);
|
|
||||||
} else {
|
|
||||||
columnNames = null;
|
|
||||||
JSONUtilities.safePut(options, "headerLines", 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
final LineNumberReader lnReader = new LineNumberReader(reader);
|
|
||||||
|
|
||||||
try {
|
|
||||||
int skip = JSONUtilities.getInt(options, "ignoreLines", -1);
|
|
||||||
while (skip > 0) {
|
|
||||||
lnReader.readLine();
|
|
||||||
skip--;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("Error reading line-based file", e);
|
|
||||||
}
|
|
||||||
JSONUtilities.safePut(options, "ignoreLines", -1); */
|
|
||||||
|
|
||||||
// Set-up a simple wiki configuration
|
// Set-up a simple wiki configuration
|
||||||
ParserConfig parserConfig = new SimpleParserConfig();
|
ParserConfig parserConfig = new SimpleParserConfig();
|
||||||
|
|
||||||
@ -227,14 +224,12 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
|
ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
|
||||||
|
|
||||||
// Pre-processing
|
// Pre-processing
|
||||||
|
|
||||||
WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
|
WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
|
||||||
|
|
||||||
WtPreproWikitextPage prepArticle =
|
WtPreproWikitextPage prepArticle =
|
||||||
(WtPreproWikitextPage) prep.parseArticle(validated, title, false);
|
(WtPreproWikitextPage) prep.parseArticle(validated, title, false);
|
||||||
|
|
||||||
// Parsing
|
// Parsing
|
||||||
|
|
||||||
PreprocessedWikitext ppw = PreprocessorToParserTransformer
|
PreprocessedWikitext ppw = PreprocessorToParserTransformer
|
||||||
.transform(prepArticle);
|
.transform(prepArticle);
|
||||||
|
|
||||||
@ -247,25 +242,9 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
final WikitextTableVisitor vs = new WikitextTableVisitor();
|
final WikitextTableVisitor vs = new WikitextTableVisitor();
|
||||||
vs.go(parsedArticle);
|
vs.go(parsedArticle);
|
||||||
|
|
||||||
TableDataReader dataReader = new TableDataReader() {
|
TableDataReader dataReader = new WikiTableDataReader(vs);
|
||||||
private int currentRow = 0;
|
|
||||||
@Override
|
|
||||||
public List<Object> getNextRowOfCells() throws IOException {
|
|
||||||
List<Object> row = null;
|
|
||||||
if(currentRow < vs.rows.size()) {
|
|
||||||
List<String> origRow = vs.rows.get(currentRow);
|
|
||||||
row = new ArrayList<Object>();
|
|
||||||
for (int i = 0; i < origRow.size(); i++) {
|
|
||||||
row.add(origRow.get(i));
|
|
||||||
}
|
|
||||||
currentRow++;
|
|
||||||
}
|
|
||||||
return row;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
int headerLines = vs.header != null ? 1 : 0;
|
|
||||||
|
|
||||||
JSONUtilities.safePut(options, "headerLines", headerLines);
|
JSONUtilities.safePut(options, "headerLines", 1);
|
||||||
|
|
||||||
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
|
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
|
||||||
} catch (IOException e1) {
|
} catch (IOException e1) {
|
||||||
|
@ -44,25 +44,20 @@ import org.testng.Assert;
|
|||||||
import org.testng.annotations.AfterMethod;
|
import org.testng.annotations.AfterMethod;
|
||||||
import org.testng.annotations.BeforeMethod;
|
import org.testng.annotations.BeforeMethod;
|
||||||
import org.testng.annotations.BeforeTest;
|
import org.testng.annotations.BeforeTest;
|
||||||
import org.testng.annotations.DataProvider;
|
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import com.google.refine.importers.WikitextImporter;
|
import com.google.refine.importers.WikitextImporter;
|
||||||
|
|
||||||
public class WikitextImporterTests extends ImporterTest {
|
public class WikitextImporterTests extends ImporterTest {
|
||||||
|
//System Under Test
|
||||||
|
private WikitextImporter importer = null;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@BeforeTest
|
@BeforeTest
|
||||||
public void init() {
|
public void init() {
|
||||||
logger = LoggerFactory.getLogger(this.getClass());
|
logger = LoggerFactory.getLogger(this.getClass());
|
||||||
}
|
}
|
||||||
|
|
||||||
//constants
|
|
||||||
String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water";
|
|
||||||
|
|
||||||
//System Under Test
|
|
||||||
WikitextImporter importer = null;
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@BeforeMethod
|
@BeforeMethod
|
||||||
public void setUp() {
|
public void setUp() {
|
||||||
@ -88,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
+ "|-\n"
|
+ "|-\n"
|
||||||
+ "|}\n";
|
+ "|}\n";
|
||||||
try {
|
try {
|
||||||
prepareOptions(0, 0, 0, 0, true);
|
prepareOptions(0, 0, 0, true);
|
||||||
parse(input);
|
parse(input);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
Assert.fail("Parsing failed", e);
|
Assert.fail("Parsing failed", e);
|
||||||
@ -110,12 +105,12 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
+"|-\n"
|
+"|-\n"
|
||||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
|
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
|
||||||
+"|-\n"
|
+"|-\n"
|
||||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/]\n"
|
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/ europa.eu]\n"
|
||||||
+"|-\n"
|
+"|-\n"
|
||||||
+"|}\n";
|
+"|}\n";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
prepareOptions(0, 0, 0, 0, true);
|
prepareOptions(0, 0, 0, true);
|
||||||
parse(input);
|
parse(input);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
Assert.fail("Parsing failed", e);
|
Assert.fail("Parsing failed", e);
|
||||||
@ -124,9 +119,11 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
Assert.assertEquals(project.rows.size(), 3);
|
Assert.assertEquals(project.rows.size(), 3);
|
||||||
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
||||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
|
||||||
|
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
|
||||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||||
|
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu");
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
@Test
|
@Test
|
||||||
public void readStyledTableWithHeader() {
|
public void readStyledTableWithHeader() {
|
||||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||||
@ -149,17 +146,19 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
+"|}\n";
|
+"|}\n";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
prepareOptions(0, 0, 0, 0, true);
|
prepareOptions(-1, 0, -1, true);
|
||||||
parse(input);
|
parse(input);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
Assert.fail("Parsing failed", e);
|
Assert.fail("Parsing failed", e);
|
||||||
}
|
}
|
||||||
Assert.assertEquals(project.columnModel.columns.size(), 7);
|
Assert.assertEquals(project.columnModel.columns.size(), 7);
|
||||||
Assert.assertEquals(project.rows.size(), 3);
|
|
||||||
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
|
|
||||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
|
||||||
|
|
||||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||||
}*/
|
}
|
||||||
|
|
||||||
//--helpers--
|
//--helpers--
|
||||||
|
|
||||||
@ -169,12 +168,12 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
|
|
||||||
private void prepareOptions(
|
private void prepareOptions(
|
||||||
int limit, int skip, int ignoreLines,
|
int limit, int skip, int ignoreLines,
|
||||||
int headerLines, boolean guessValueType) {
|
boolean guessValueType) {
|
||||||
|
|
||||||
whenGetIntegerOption("limit", options, limit);
|
whenGetIntegerOption("limit", options, limit);
|
||||||
whenGetIntegerOption("skipDataLines", options, skip);
|
whenGetIntegerOption("skipDataLines", options, skip);
|
||||||
whenGetIntegerOption("ignoreLines", options, ignoreLines);
|
whenGetIntegerOption("ignoreLines", options, ignoreLines);
|
||||||
whenGetIntegerOption("headerLines", options, headerLines);
|
whenGetIntegerOption("headerLines", options, 1);
|
||||||
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
||||||
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
||||||
}
|
}
|
||||||
@ -185,7 +184,6 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
verify(options, times(1)).getInt("limit");
|
verify(options, times(1)).getInt("limit");
|
||||||
verify(options, times(1)).getInt("skipDataLines");
|
verify(options, times(1)).getInt("skipDataLines");
|
||||||
verify(options, times(1)).getInt("ignoreLines");
|
verify(options, times(1)).getInt("ignoreLines");
|
||||||
verify(options, times(1)).getInt("headerLines");
|
|
||||||
verify(options, times(1)).getBoolean("guessCellValueTypes");
|
verify(options, times(1)).getBoolean("guessCellValueTypes");
|
||||||
verify(options, times(1)).getBoolean("processQuotes");
|
verify(options, times(1)).getBoolean("processQuotes");
|
||||||
verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");
|
verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");
|
||||||
|
Loading…
Reference in New Issue
Block a user