diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index 08ce76e6a..a6a848380 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -1,20 +1,15 @@ package com.google.refine.importers; import java.io.IOException; -import java.io.LineNumberReader; import java.io.Reader; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; -import org.apache.commons.io.FileUtils; -import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.io.CharStreams; import de.fau.cs.osr.ptk.common.AstVisitor; -import de.fau.cs.osr.ptk.common.ParserCommon; import org.sweble.wikitext.parser.ParserConfig; import org.sweble.wikitext.parser.utils.SimpleParserConfig; @@ -32,7 +27,6 @@ import org.sweble.wikitext.parser.nodes.WtTableRow; import org.sweble.wikitext.parser.nodes.WtTableCell; import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage; import org.sweble.wikitext.parser.nodes.WtBody; -import org.sweble.wikitext.parser.parser.LinkTargetException; import org.sweble.wikitext.parser.WikitextEncodingValidator; import org.sweble.wikitext.parser.WikitextPreprocessor; @@ -52,7 +46,7 @@ import com.google.refine.util.JSONUtilities; public class WikitextImporter extends TabularImportingParserBase { - static final Logger logger = LoggerFactory.getLogger(WikitextImporter.class); + static final private Logger logger = LoggerFactory.getLogger(WikitextImporter.class); public WikitextImporter() { super(false); @@ -73,28 +67,25 @@ public class WikitextImporter extends TabularImportingParserBase { public List header; public List> rows; private List currentRow; - private StringBuilder currentCellString; + private StringBuilder currentStringBuilder; private String currentInternalLink; private String currentExternalLink; public WikitextTableVisitor() { - header = null; + header = new ArrayList(); rows = new ArrayList>(); - currentCellString = null; + currentStringBuilder = null; currentInternalLink = null; currentExternalLink = null; } @Override - protected boolean before(WtNode node) { + protected WtNode before(WtNode node) { return super.before(node); } public void visit(WtNode e) { - /* - System.out.println("ignoring node:"); - System.out.println(e.getNodeTypeName()); - */ + // Ignore other nodes } public void visit(WtParsedWikitextPage e) { @@ -110,10 +101,7 @@ public class WikitextImporter extends TabularImportingParserBase { } public void visit(WtTableHeader e) { - currentRow = new ArrayList(); - iterate(e); - header = currentRow; - currentRow = null; + header.add(renderAsString(e)); } public void visit(WtTableRow e) @@ -131,24 +119,30 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtTableCell e) { if (currentRow != null) { - currentCellString = new StringBuilder(); - iterate(e); - String cellValue = currentCellString.toString().trim(); - currentRow.add(cellValue); - currentCellString = null; + currentRow.add(renderAsString(e)); } } + public String renderAsString(WtNode e) { + currentStringBuilder = new StringBuilder(); + iterate(e); + String value = currentStringBuilder.toString().trim(); + currentStringBuilder = null; + return value; + } + public void visit(WtText text) { - currentCellString.append(text.getContent()); + if (currentStringBuilder != null) { + currentStringBuilder.append(text.getContent()); + } } public void visit(WtNoLinkTitle e) { if (currentInternalLink != null) { - currentCellString.append(currentInternalLink); + currentStringBuilder.append(currentInternalLink); } else if (currentExternalLink != null) { - currentCellString.append(currentExternalLink); + currentStringBuilder.append(currentExternalLink); } } @@ -175,6 +169,36 @@ public class WikitextImporter extends TabularImportingParserBase { return rows; } } + + public class WikiTableDataReader implements TableDataReader { + private int currentRow = -1; + private WikitextTableVisitor visitor = null; + + public WikiTableDataReader(WikitextTableVisitor visitor) { + this.visitor = visitor; + currentRow = -1; + } + + @Override + public List getNextRowOfCells() throws IOException { + List row = null; + List origRow = null; + if (currentRow == -1) { + origRow = this.visitor.header; + } else if(currentRow < this.visitor.rows.size()) { + origRow = this.visitor.rows.get(currentRow); + } + currentRow++; + + if (origRow != null) { + row = new ArrayList(); + for (int i = 0; i < origRow.size(); i++) { + row.add(origRow.get(i)); + } + } + return row; + } + } @Override public void parseOneFile( @@ -187,33 +211,6 @@ public class WikitextImporter extends TabularImportingParserBase { JSONObject options, List exceptions ) { - /* - final List columnNames; - if (options.has("columnNames")) { - columnNames = new ArrayList(); - String[] strings = JSONUtilities.getStringArray(options, "columnNames"); - for (String s : strings) { - columnNames.add(s); - } - JSONUtilities.safePut(options, "headerLines", 1); - } else { - columnNames = null; - JSONUtilities.safePut(options, "headerLines", 0); - } - - final LineNumberReader lnReader = new LineNumberReader(reader); - - try { - int skip = JSONUtilities.getInt(options, "ignoreLines", -1); - while (skip > 0) { - lnReader.readLine(); - skip--; - } - } catch (IOException e) { - logger.error("Error reading line-based file", e); - } - JSONUtilities.safePut(options, "ignoreLines", -1); */ - // Set-up a simple wiki configuration ParserConfig parserConfig = new SimpleParserConfig(); @@ -227,14 +224,12 @@ public class WikitextImporter extends TabularImportingParserBase { ValidatedWikitext validated = v.validate(parserConfig, wikitext, title); // Pre-processing - WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig); WtPreproWikitextPage prepArticle = (WtPreproWikitextPage) prep.parseArticle(validated, title, false); // Parsing - PreprocessedWikitext ppw = PreprocessorToParserTransformer .transform(prepArticle); @@ -247,25 +242,9 @@ public class WikitextImporter extends TabularImportingParserBase { final WikitextTableVisitor vs = new WikitextTableVisitor(); vs.go(parsedArticle); - TableDataReader dataReader = new TableDataReader() { - private int currentRow = 0; - @Override - public List getNextRowOfCells() throws IOException { - List row = null; - if(currentRow < vs.rows.size()) { - List origRow = vs.rows.get(currentRow); - row = new ArrayList(); - for (int i = 0; i < origRow.size(); i++) { - row.add(origRow.get(i)); - } - currentRow++; - } - return row; - } - }; - int headerLines = vs.header != null ? 1 : 0; + TableDataReader dataReader = new WikiTableDataReader(vs); - JSONUtilities.safePut(options, "headerLines", headerLines); + JSONUtilities.safePut(options, "headerLines", 1); TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); } catch (IOException e1) { diff --git a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java index 018b4a922..200fbf94b 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java @@ -44,25 +44,20 @@ import org.testng.Assert; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import com.google.refine.importers.WikitextImporter; public class WikitextImporterTests extends ImporterTest { - + //System Under Test + private WikitextImporter importer = null; + @Override @BeforeTest public void init() { logger = LoggerFactory.getLogger(this.getClass()); } - //constants - String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water"; - - //System Under Test - WikitextImporter importer = null; - @Override @BeforeMethod public void setUp() { @@ -88,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest { + "|-\n" + "|}\n"; try { - prepareOptions(0, 0, 0, 0, true); + prepareOptions(0, 0, 0, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -110,12 +105,12 @@ public class WikitextImporterTests extends ImporterTest { +"|-\n" +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n" +"|-\n" - +"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/]\n" + +"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/ europa.eu]\n" +"|-\n" +"|}\n"; try { - prepareOptions(0, 0, 0, 0, true); + prepareOptions(0, 0, 0, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -124,9 +119,11 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertEquals(project.rows.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop"); + Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); + Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu"); } -/* + @Test public void readStyledTableWithHeader() { // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit @@ -149,17 +146,19 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(0, 0, 0, 0, true); + prepareOptions(-1, 0, -1, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 7); - Assert.assertEquals(project.rows.size(), 3); - Assert.assertEquals(project.rows.get(0).cells.size(), 7); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung"); + Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name"); + Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen"); + Assert.assertEquals(project.rows.get(0).cells.size(), 7); + Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); - }*/ + } //--helpers-- @@ -169,12 +168,12 @@ public class WikitextImporterTests extends ImporterTest { private void prepareOptions( int limit, int skip, int ignoreLines, - int headerLines, boolean guessValueType) { + boolean guessValueType) { whenGetIntegerOption("limit", options, limit); whenGetIntegerOption("skipDataLines", options, skip); whenGetIntegerOption("ignoreLines", options, ignoreLines); - whenGetIntegerOption("headerLines", options, headerLines); + whenGetIntegerOption("headerLines", options, 1); whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("storeBlankCellsAsNulls", options, true); } @@ -185,7 +184,6 @@ public class WikitextImporterTests extends ImporterTest { verify(options, times(1)).getInt("limit"); verify(options, times(1)).getInt("skipDataLines"); verify(options, times(1)).getInt("ignoreLines"); - verify(options, times(1)).getInt("headerLines"); verify(options, times(1)).getBoolean("guessCellValueTypes"); verify(options, times(1)).getBoolean("processQuotes"); verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");