Add support for table headers
This commit is contained in:
parent
88aa6f113d
commit
e168c900e8
@ -1,20 +1,15 @@
|
||||
package com.google.refine.importers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import com.google.common.io.CharStreams;
|
||||
import de.fau.cs.osr.ptk.common.AstVisitor;
|
||||
import de.fau.cs.osr.ptk.common.ParserCommon;
|
||||
|
||||
import org.sweble.wikitext.parser.ParserConfig;
|
||||
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
|
||||
@ -32,7 +27,6 @@ import org.sweble.wikitext.parser.nodes.WtTableRow;
|
||||
import org.sweble.wikitext.parser.nodes.WtTableCell;
|
||||
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
|
||||
import org.sweble.wikitext.parser.nodes.WtBody;
|
||||
import org.sweble.wikitext.parser.parser.LinkTargetException;
|
||||
|
||||
import org.sweble.wikitext.parser.WikitextEncodingValidator;
|
||||
import org.sweble.wikitext.parser.WikitextPreprocessor;
|
||||
@ -52,7 +46,7 @@ import com.google.refine.util.JSONUtilities;
|
||||
|
||||
|
||||
public class WikitextImporter extends TabularImportingParserBase {
|
||||
static final Logger logger = LoggerFactory.getLogger(WikitextImporter.class);
|
||||
static final private Logger logger = LoggerFactory.getLogger(WikitextImporter.class);
|
||||
|
||||
public WikitextImporter() {
|
||||
super(false);
|
||||
@ -73,28 +67,25 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
public List<String> header;
|
||||
public List<List<String>> rows;
|
||||
private List<String> currentRow;
|
||||
private StringBuilder currentCellString;
|
||||
private StringBuilder currentStringBuilder;
|
||||
private String currentInternalLink;
|
||||
private String currentExternalLink;
|
||||
|
||||
public WikitextTableVisitor() {
|
||||
header = null;
|
||||
header = new ArrayList<String>();
|
||||
rows = new ArrayList<List<String>>();
|
||||
currentCellString = null;
|
||||
currentStringBuilder = null;
|
||||
currentInternalLink = null;
|
||||
currentExternalLink = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean before(WtNode node) {
|
||||
protected WtNode before(WtNode node) {
|
||||
return super.before(node);
|
||||
}
|
||||
|
||||
public void visit(WtNode e) {
|
||||
/*
|
||||
System.out.println("ignoring node:");
|
||||
System.out.println(e.getNodeTypeName());
|
||||
*/
|
||||
// Ignore other nodes
|
||||
}
|
||||
|
||||
public void visit(WtParsedWikitextPage e) {
|
||||
@ -110,10 +101,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
}
|
||||
|
||||
public void visit(WtTableHeader e) {
|
||||
currentRow = new ArrayList<String>();
|
||||
iterate(e);
|
||||
header = currentRow;
|
||||
currentRow = null;
|
||||
header.add(renderAsString(e));
|
||||
}
|
||||
|
||||
public void visit(WtTableRow e)
|
||||
@ -131,24 +119,30 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
public void visit(WtTableCell e)
|
||||
{
|
||||
if (currentRow != null) {
|
||||
currentCellString = new StringBuilder();
|
||||
iterate(e);
|
||||
String cellValue = currentCellString.toString().trim();
|
||||
currentRow.add(cellValue);
|
||||
currentCellString = null;
|
||||
currentRow.add(renderAsString(e));
|
||||
}
|
||||
}
|
||||
|
||||
public String renderAsString(WtNode e) {
|
||||
currentStringBuilder = new StringBuilder();
|
||||
iterate(e);
|
||||
String value = currentStringBuilder.toString().trim();
|
||||
currentStringBuilder = null;
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
public void visit(WtText text) {
|
||||
currentCellString.append(text.getContent());
|
||||
if (currentStringBuilder != null) {
|
||||
currentStringBuilder.append(text.getContent());
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtNoLinkTitle e) {
|
||||
if (currentInternalLink != null) {
|
||||
currentCellString.append(currentInternalLink);
|
||||
currentStringBuilder.append(currentInternalLink);
|
||||
} else if (currentExternalLink != null) {
|
||||
currentCellString.append(currentExternalLink);
|
||||
currentStringBuilder.append(currentExternalLink);
|
||||
}
|
||||
}
|
||||
|
||||
@ -175,6 +169,36 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
return rows;
|
||||
}
|
||||
}
|
||||
|
||||
public class WikiTableDataReader implements TableDataReader {
|
||||
private int currentRow = -1;
|
||||
private WikitextTableVisitor visitor = null;
|
||||
|
||||
public WikiTableDataReader(WikitextTableVisitor visitor) {
|
||||
this.visitor = visitor;
|
||||
currentRow = -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Object> getNextRowOfCells() throws IOException {
|
||||
List<Object> row = null;
|
||||
List<String> origRow = null;
|
||||
if (currentRow == -1) {
|
||||
origRow = this.visitor.header;
|
||||
} else if(currentRow < this.visitor.rows.size()) {
|
||||
origRow = this.visitor.rows.get(currentRow);
|
||||
}
|
||||
currentRow++;
|
||||
|
||||
if (origRow != null) {
|
||||
row = new ArrayList<Object>();
|
||||
for (int i = 0; i < origRow.size(); i++) {
|
||||
row.add(origRow.get(i));
|
||||
}
|
||||
}
|
||||
return row;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parseOneFile(
|
||||
@ -187,33 +211,6 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
JSONObject options,
|
||||
List<Exception> exceptions
|
||||
) {
|
||||
/*
|
||||
final List<Object> columnNames;
|
||||
if (options.has("columnNames")) {
|
||||
columnNames = new ArrayList<Object>();
|
||||
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
|
||||
for (String s : strings) {
|
||||
columnNames.add(s);
|
||||
}
|
||||
JSONUtilities.safePut(options, "headerLines", 1);
|
||||
} else {
|
||||
columnNames = null;
|
||||
JSONUtilities.safePut(options, "headerLines", 0);
|
||||
}
|
||||
|
||||
final LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
|
||||
try {
|
||||
int skip = JSONUtilities.getInt(options, "ignoreLines", -1);
|
||||
while (skip > 0) {
|
||||
lnReader.readLine();
|
||||
skip--;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("Error reading line-based file", e);
|
||||
}
|
||||
JSONUtilities.safePut(options, "ignoreLines", -1); */
|
||||
|
||||
// Set-up a simple wiki configuration
|
||||
ParserConfig parserConfig = new SimpleParserConfig();
|
||||
|
||||
@ -227,14 +224,12 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
|
||||
|
||||
// Pre-processing
|
||||
|
||||
WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
|
||||
|
||||
WtPreproWikitextPage prepArticle =
|
||||
(WtPreproWikitextPage) prep.parseArticle(validated, title, false);
|
||||
|
||||
// Parsing
|
||||
|
||||
PreprocessedWikitext ppw = PreprocessorToParserTransformer
|
||||
.transform(prepArticle);
|
||||
|
||||
@ -247,25 +242,9 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
final WikitextTableVisitor vs = new WikitextTableVisitor();
|
||||
vs.go(parsedArticle);
|
||||
|
||||
TableDataReader dataReader = new TableDataReader() {
|
||||
private int currentRow = 0;
|
||||
@Override
|
||||
public List<Object> getNextRowOfCells() throws IOException {
|
||||
List<Object> row = null;
|
||||
if(currentRow < vs.rows.size()) {
|
||||
List<String> origRow = vs.rows.get(currentRow);
|
||||
row = new ArrayList<Object>();
|
||||
for (int i = 0; i < origRow.size(); i++) {
|
||||
row.add(origRow.get(i));
|
||||
}
|
||||
currentRow++;
|
||||
}
|
||||
return row;
|
||||
}
|
||||
};
|
||||
int headerLines = vs.header != null ? 1 : 0;
|
||||
TableDataReader dataReader = new WikiTableDataReader(vs);
|
||||
|
||||
JSONUtilities.safePut(options, "headerLines", headerLines);
|
||||
JSONUtilities.safePut(options, "headerLines", 1);
|
||||
|
||||
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
|
||||
} catch (IOException e1) {
|
||||
|
@ -44,25 +44,20 @@ import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.refine.importers.WikitextImporter;
|
||||
|
||||
public class WikitextImporterTests extends ImporterTest {
|
||||
|
||||
//System Under Test
|
||||
private WikitextImporter importer = null;
|
||||
|
||||
@Override
|
||||
@BeforeTest
|
||||
public void init() {
|
||||
logger = LoggerFactory.getLogger(this.getClass());
|
||||
}
|
||||
|
||||
//constants
|
||||
String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water";
|
||||
|
||||
//System Under Test
|
||||
WikitextImporter importer = null;
|
||||
|
||||
@Override
|
||||
@BeforeMethod
|
||||
public void setUp() {
|
||||
@ -88,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
try {
|
||||
prepareOptions(0, 0, 0, 0, true);
|
||||
prepareOptions(0, 0, 0, true);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
@ -110,12 +105,12 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|-\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/]\n"
|
||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/ europa.eu]\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(0, 0, 0, 0, true);
|
||||
prepareOptions(0, 0, 0, true);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
@ -124,9 +119,11 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
Assert.assertEquals(project.rows.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu");
|
||||
}
|
||||
/*
|
||||
|
||||
@Test
|
||||
public void readStyledTableWithHeader() {
|
||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||
@ -149,17 +146,19 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(0, 0, 0, 0, true);
|
||||
prepareOptions(-1, 0, -1, true);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 7);
|
||||
Assert.assertEquals(project.rows.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
|
||||
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
|
||||
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
|
||||
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||
}*/
|
||||
}
|
||||
|
||||
//--helpers--
|
||||
|
||||
@ -169,12 +168,12 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
|
||||
private void prepareOptions(
|
||||
int limit, int skip, int ignoreLines,
|
||||
int headerLines, boolean guessValueType) {
|
||||
boolean guessValueType) {
|
||||
|
||||
whenGetIntegerOption("limit", options, limit);
|
||||
whenGetIntegerOption("skipDataLines", options, skip);
|
||||
whenGetIntegerOption("ignoreLines", options, ignoreLines);
|
||||
whenGetIntegerOption("headerLines", options, headerLines);
|
||||
whenGetIntegerOption("headerLines", options, 1);
|
||||
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
||||
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
||||
}
|
||||
@ -185,7 +184,6 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
verify(options, times(1)).getInt("limit");
|
||||
verify(options, times(1)).getInt("skipDataLines");
|
||||
verify(options, times(1)).getInt("ignoreLines");
|
||||
verify(options, times(1)).getInt("headerLines");
|
||||
verify(options, times(1)).getBoolean("guessCellValueTypes");
|
||||
verify(options, times(1)).getBoolean("processQuotes");
|
||||
verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");
|
||||
|
Loading…
Reference in New Issue
Block a user