Add support for table headers

This commit is contained in:
Antonin Delpeuch 2017-08-13 20:14:48 +01:00
parent 88aa6f113d
commit e168c900e8
2 changed files with 70 additions and 93 deletions

View File

@ -1,20 +1,15 @@
package com.google.refine.importers; package com.google.refine.importers;
import java.io.IOException; import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader; import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.io.FileUtils;
import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.io.CharStreams; import com.google.common.io.CharStreams;
import de.fau.cs.osr.ptk.common.AstVisitor; import de.fau.cs.osr.ptk.common.AstVisitor;
import de.fau.cs.osr.ptk.common.ParserCommon;
import org.sweble.wikitext.parser.ParserConfig; import org.sweble.wikitext.parser.ParserConfig;
import org.sweble.wikitext.parser.utils.SimpleParserConfig; import org.sweble.wikitext.parser.utils.SimpleParserConfig;
@ -32,7 +27,6 @@ import org.sweble.wikitext.parser.nodes.WtTableRow;
import org.sweble.wikitext.parser.nodes.WtTableCell; import org.sweble.wikitext.parser.nodes.WtTableCell;
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage; import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
import org.sweble.wikitext.parser.nodes.WtBody; import org.sweble.wikitext.parser.nodes.WtBody;
import org.sweble.wikitext.parser.parser.LinkTargetException;
import org.sweble.wikitext.parser.WikitextEncodingValidator; import org.sweble.wikitext.parser.WikitextEncodingValidator;
import org.sweble.wikitext.parser.WikitextPreprocessor; import org.sweble.wikitext.parser.WikitextPreprocessor;
@ -52,7 +46,7 @@ import com.google.refine.util.JSONUtilities;
public class WikitextImporter extends TabularImportingParserBase { public class WikitextImporter extends TabularImportingParserBase {
static final Logger logger = LoggerFactory.getLogger(WikitextImporter.class); static final private Logger logger = LoggerFactory.getLogger(WikitextImporter.class);
public WikitextImporter() { public WikitextImporter() {
super(false); super(false);
@ -73,28 +67,25 @@ public class WikitextImporter extends TabularImportingParserBase {
public List<String> header; public List<String> header;
public List<List<String>> rows; public List<List<String>> rows;
private List<String> currentRow; private List<String> currentRow;
private StringBuilder currentCellString; private StringBuilder currentStringBuilder;
private String currentInternalLink; private String currentInternalLink;
private String currentExternalLink; private String currentExternalLink;
public WikitextTableVisitor() { public WikitextTableVisitor() {
header = null; header = new ArrayList<String>();
rows = new ArrayList<List<String>>(); rows = new ArrayList<List<String>>();
currentCellString = null; currentStringBuilder = null;
currentInternalLink = null; currentInternalLink = null;
currentExternalLink = null; currentExternalLink = null;
} }
@Override @Override
protected boolean before(WtNode node) { protected WtNode before(WtNode node) {
return super.before(node); return super.before(node);
} }
public void visit(WtNode e) { public void visit(WtNode e) {
/* // Ignore other nodes
System.out.println("ignoring node:");
System.out.println(e.getNodeTypeName());
*/
} }
public void visit(WtParsedWikitextPage e) { public void visit(WtParsedWikitextPage e) {
@ -110,10 +101,7 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
public void visit(WtTableHeader e) { public void visit(WtTableHeader e) {
currentRow = new ArrayList<String>(); header.add(renderAsString(e));
iterate(e);
header = currentRow;
currentRow = null;
} }
public void visit(WtTableRow e) public void visit(WtTableRow e)
@ -131,24 +119,30 @@ public class WikitextImporter extends TabularImportingParserBase {
public void visit(WtTableCell e) public void visit(WtTableCell e)
{ {
if (currentRow != null) { if (currentRow != null) {
currentCellString = new StringBuilder(); currentRow.add(renderAsString(e));
iterate(e);
String cellValue = currentCellString.toString().trim();
currentRow.add(cellValue);
currentCellString = null;
} }
} }
/**
 * Renders the subtree rooted at the given node as plain text.
 *
 * <p>While the subtree is iterated, a fresh buffer is installed in
 * {@code currentStringBuilder} so that the text-collecting {@code visit}
 * methods append to it. The buffer that was active before the call is put
 * back afterwards, even if iteration throws. This keeps the method safe
 * when it is re-entered from within its own iteration: resetting the shared
 * field to {@code null} in an inner call would otherwise make the outer call
 * fail when it reads its buffer back.
 *
 * @param e the node whose content should be rendered
 * @return the collected text with leading and trailing whitespace trimmed
 */
public String renderAsString(WtNode e) {
    // Remember whichever buffer an enclosing render (if any) was filling.
    StringBuilder enclosingBuilder = currentStringBuilder;
    // Keep our own buffer in a local so a nested call cannot take it away.
    StringBuilder builder = new StringBuilder();
    currentStringBuilder = builder;
    try {
        iterate(e);
    } finally {
        // For a top-level call this is null, i.e. the same end state as before.
        currentStringBuilder = enclosingBuilder;
    }
    return builder.toString().trim();
}
public void visit(WtText text) { public void visit(WtText text) {
currentCellString.append(text.getContent()); if (currentStringBuilder != null) {
currentStringBuilder.append(text.getContent());
}
} }
public void visit(WtNoLinkTitle e) { public void visit(WtNoLinkTitle e) {
if (currentInternalLink != null) { if (currentInternalLink != null) {
currentCellString.append(currentInternalLink); currentStringBuilder.append(currentInternalLink);
} else if (currentExternalLink != null) { } else if (currentExternalLink != null) {
currentCellString.append(currentExternalLink); currentStringBuilder.append(currentExternalLink);
} }
} }
@ -176,6 +170,36 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
} }
/**
 * Hands the table gathered by a {@link WikitextTableVisitor} to the tabular
 * importer one row per call: the visitor's header first, then each of its
 * data rows in order.
 */
public class WikiTableDataReader implements TableDataReader {
    /** Source of the header and data rows. */
    private final WikitextTableVisitor visitor;
    /** Index of the next data row to serve; -1 means the header is served next. */
    private int position;

    /**
     * @param visitor the visitor whose {@code header} and {@code rows} are read
     */
    public WikiTableDataReader(WikitextTableVisitor visitor) {
        this.visitor = visitor;
        this.position = -1;
    }

    /**
     * Returns a fresh copy of the next row, or {@code null} when there is
     * nothing (more) to return.
     *
     * @return the next row's cells as a new list, or {@code null}
     * @throws IOException declared by the interface; not thrown directly here
     */
    @Override
    public List<Object> getNextRowOfCells() throws IOException {
        List<String> source = null;
        if (position == -1) {
            source = visitor.header;
        } else if (position < visitor.rows.size()) {
            source = visitor.rows.get(position);
        }
        // The cursor always advances, whether or not a row was found.
        position++;
        return source == null ? null : new ArrayList<Object>(source);
    }
}
@Override @Override
public void parseOneFile( public void parseOneFile(
Project project, Project project,
@ -187,33 +211,6 @@ public class WikitextImporter extends TabularImportingParserBase {
JSONObject options, JSONObject options,
List<Exception> exceptions List<Exception> exceptions
) { ) {
/*
final List<Object> columnNames;
if (options.has("columnNames")) {
columnNames = new ArrayList<Object>();
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
for (String s : strings) {
columnNames.add(s);
}
JSONUtilities.safePut(options, "headerLines", 1);
} else {
columnNames = null;
JSONUtilities.safePut(options, "headerLines", 0);
}
final LineNumberReader lnReader = new LineNumberReader(reader);
try {
int skip = JSONUtilities.getInt(options, "ignoreLines", -1);
while (skip > 0) {
lnReader.readLine();
skip--;
}
} catch (IOException e) {
logger.error("Error reading line-based file", e);
}
JSONUtilities.safePut(options, "ignoreLines", -1); */
// Set-up a simple wiki configuration // Set-up a simple wiki configuration
ParserConfig parserConfig = new SimpleParserConfig(); ParserConfig parserConfig = new SimpleParserConfig();
@ -227,14 +224,12 @@ public class WikitextImporter extends TabularImportingParserBase {
ValidatedWikitext validated = v.validate(parserConfig, wikitext, title); ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
// Pre-processing // Pre-processing
WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig); WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
WtPreproWikitextPage prepArticle = WtPreproWikitextPage prepArticle =
(WtPreproWikitextPage) prep.parseArticle(validated, title, false); (WtPreproWikitextPage) prep.parseArticle(validated, title, false);
// Parsing // Parsing
PreprocessedWikitext ppw = PreprocessorToParserTransformer PreprocessedWikitext ppw = PreprocessorToParserTransformer
.transform(prepArticle); .transform(prepArticle);
@ -247,25 +242,9 @@ public class WikitextImporter extends TabularImportingParserBase {
final WikitextTableVisitor vs = new WikitextTableVisitor(); final WikitextTableVisitor vs = new WikitextTableVisitor();
vs.go(parsedArticle); vs.go(parsedArticle);
TableDataReader dataReader = new TableDataReader() { TableDataReader dataReader = new WikiTableDataReader(vs);
private int currentRow = 0;
@Override
public List<Object> getNextRowOfCells() throws IOException {
List<Object> row = null;
if(currentRow < vs.rows.size()) {
List<String> origRow = vs.rows.get(currentRow);
row = new ArrayList<Object>();
for (int i = 0; i < origRow.size(); i++) {
row.add(origRow.get(i));
}
currentRow++;
}
return row;
}
};
int headerLines = vs.header != null ? 1 : 0;
JSONUtilities.safePut(options, "headerLines", headerLines); JSONUtilities.safePut(options, "headerLines", 1);
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
} catch (IOException e1) { } catch (IOException e1) {

View File

@ -44,12 +44,13 @@ import org.testng.Assert;
import org.testng.annotations.AfterMethod; import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
import org.testng.annotations.BeforeTest; import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import com.google.refine.importers.WikitextImporter; import com.google.refine.importers.WikitextImporter;
public class WikitextImporterTests extends ImporterTest { public class WikitextImporterTests extends ImporterTest {
//System Under Test
private WikitextImporter importer = null;
@Override @Override
@BeforeTest @BeforeTest
@ -57,12 +58,6 @@ public class WikitextImporterTests extends ImporterTest {
logger = LoggerFactory.getLogger(this.getClass()); logger = LoggerFactory.getLogger(this.getClass());
} }
//constants
String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water";
//System Under Test
WikitextImporter importer = null;
@Override @Override
@BeforeMethod @BeforeMethod
public void setUp() { public void setUp() {
@ -88,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
+ "|-\n" + "|-\n"
+ "|}\n"; + "|}\n";
try { try {
prepareOptions(0, 0, 0, 0, true); prepareOptions(0, 0, 0, true);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -110,12 +105,12 @@ public class WikitextImporterTests extends ImporterTest {
+"|-\n" +"|-\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n" +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
+"|-\n" +"|-\n"
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/]\n" +"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/ europa.eu]\n"
+"|-\n" +"|-\n"
+"|}\n"; +"|}\n";
try { try {
prepareOptions(0, 0, 0, 0, true); prepareOptions(0, 0, 0, true);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -124,9 +119,11 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.size(), 3); Assert.assertEquals(project.rows.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu");
} }
/*
@Test @Test
public void readStyledTableWithHeader() { public void readStyledTableWithHeader() {
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
@ -149,17 +146,19 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(0, 0, 0, 0, true); prepareOptions(-1, 0, -1, true);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
} }
Assert.assertEquals(project.columnModel.columns.size(), 7); Assert.assertEquals(project.columnModel.columns.size(), 7);
Assert.assertEquals(project.rows.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
}*/ }
//--helpers-- //--helpers--
@ -169,12 +168,12 @@ public class WikitextImporterTests extends ImporterTest {
private void prepareOptions( private void prepareOptions(
int limit, int skip, int ignoreLines, int limit, int skip, int ignoreLines,
int headerLines, boolean guessValueType) { boolean guessValueType) {
whenGetIntegerOption("limit", options, limit); whenGetIntegerOption("limit", options, limit);
whenGetIntegerOption("skipDataLines", options, skip); whenGetIntegerOption("skipDataLines", options, skip);
whenGetIntegerOption("ignoreLines", options, ignoreLines); whenGetIntegerOption("ignoreLines", options, ignoreLines);
whenGetIntegerOption("headerLines", options, headerLines); whenGetIntegerOption("headerLines", options, 1);
whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
whenGetBooleanOption("storeBlankCellsAsNulls", options, true); whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
} }
@ -185,7 +184,6 @@ public class WikitextImporterTests extends ImporterTest {
verify(options, times(1)).getInt("limit"); verify(options, times(1)).getInt("limit");
verify(options, times(1)).getInt("skipDataLines"); verify(options, times(1)).getInt("skipDataLines");
verify(options, times(1)).getInt("ignoreLines"); verify(options, times(1)).getInt("ignoreLines");
verify(options, times(1)).getInt("headerLines");
verify(options, times(1)).getBoolean("guessCellValueTypes"); verify(options, times(1)).getBoolean("guessCellValueTypes");
verify(options, times(1)).getBoolean("processQuotes"); verify(options, times(1)).getBoolean("processQuotes");
verify(options, times(1)).getBoolean("storeBlankCellsAsNulls"); verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");