Fix the text format guesser so it doesn't inappropriately guess WikiText (#2924)
* Fix text guesser so it doesn't guess wikitext. Fixes #2850.
  - Add a simple magic-number detector for zip & gzip files to keep it from attempting to guess binary files.
  - Add a counter for C0 control characters for the same reason.
  - Tighten the wikitable counters to require the marker at the beginning of the line, per the specification.
  - Refactor to use Apache Commons instead of private counting methods.
  - Add tests for most TextGuesser formats.
* Remove misplaced duplicate test data file.
* Fix LGTM warning + minor cleanups.
* Use BoundedInputStream to prevent runaway lines.
This commit is contained in:
parent
fb9c8e5fef
commit
a3fab26cca
@ -26,66 +26,92 @@
|
||||
******************************************************************************/
|
||||
package com.google.refine.importers;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.commons.io.input.BoundedInputStream;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.refine.importing.FormatGuesser;
|
||||
|
||||
public class TextFormatGuesser implements FormatGuesser {
|
||||
|
||||
private static final int XML_BRACKETS_THRESHOLD = 5;
|
||||
private static final int JSON_BRACES_THRESHOLD = 5;
|
||||
private static final long CONTROLS_THRESHOLD = 10;
|
||||
|
||||
@Override
|
||||
public String guess(File file, String encoding, String seedFormat) {
|
||||
try {
|
||||
InputStream is = new FileInputStream(file);
|
||||
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
|
||||
try(InputStream fis = new FileInputStream(file)) {
|
||||
if (isCompressed(file)) {
|
||||
return "binary";
|
||||
};
|
||||
|
||||
try {
|
||||
int totalBytes = 0;
|
||||
int openBraces = 0;
|
||||
InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
|
||||
try (BufferedReader reader = new BufferedReader(
|
||||
encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
|
||||
int totalChars = 0;
|
||||
long openBraces = 0;
|
||||
int closeBraces = 0;
|
||||
int openAngleBrackets = 0;
|
||||
int closeAngleBrackets = 0;
|
||||
int wikiTableBegin = 0;
|
||||
int wikiTableEnd = 0;
|
||||
int wikiTableRow = 0;
|
||||
int trailingPeriods = 0;
|
||||
|
||||
int controls = 0;
|
||||
|
||||
char firstChar = ' ';
|
||||
boolean foundFirstChar = false;
|
||||
|
||||
char[] chars = new char[4096];
|
||||
int c;
|
||||
while (totalBytes < 64 * 1024 && (c = reader.read(chars)) > 0) {
|
||||
String chunk = String.valueOf(chars, 0, c);
|
||||
openBraces += countSubstrings(chunk, "{");
|
||||
closeBraces += countSubstrings(chunk, "}");
|
||||
openAngleBrackets += countSubstrings(chunk, "<");
|
||||
closeAngleBrackets += countSubstrings(chunk, ">");
|
||||
wikiTableBegin += countSubstrings(chunk, "{|");
|
||||
wikiTableRow += countSubstrings(chunk, "|-");
|
||||
trailingPeriods += countLineSuffix(chunk, ".");
|
||||
|
||||
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null && controls < CONTROLS_THRESHOLD) {
|
||||
line = line.trim();
|
||||
controls += CharMatcher.javaIsoControl().countIn(line);
|
||||
openBraces += line.chars().filter(ch -> ch == '{').count();
|
||||
closeBraces += StringUtils.countMatches(line, "}");
|
||||
openAngleBrackets += StringUtils.countMatches(line, "<");
|
||||
closeAngleBrackets += StringUtils.countMatches(line, ">");
|
||||
if (line.startsWith("{|")) {
|
||||
wikiTableBegin++;
|
||||
} else if (line.startsWith("|}")) {
|
||||
wikiTableEnd++;
|
||||
} else if (line.startsWith("|-")) {
|
||||
wikiTableRow++;
|
||||
}
|
||||
if (line.endsWith(".")) {
|
||||
trailingPeriods++;
|
||||
}
|
||||
|
||||
if (!foundFirstChar) {
|
||||
chunk = chunk.trim();
|
||||
if (chunk.length() > 0) {
|
||||
firstChar = chunk.charAt(0);
|
||||
if (line.length() > 0) {
|
||||
firstChar = line.charAt(0);
|
||||
foundFirstChar = true;
|
||||
}
|
||||
}
|
||||
totalBytes += c;
|
||||
totalChars += line.length();
|
||||
}
|
||||
|
||||
|
||||
// TODO: Make thresholds proportional to the amount of data read?
|
||||
if (controls >= CONTROLS_THRESHOLD) {
|
||||
return "binary";
|
||||
}
|
||||
|
||||
if (foundFirstChar) {
|
||||
if (wikiTableBegin >= 1 && wikiTableRow >= 2) {
|
||||
if (wikiTableBegin >= 1 && (wikiTableBegin - wikiTableEnd <= 1) && wikiTableRow >= 2) {
|
||||
return "text/wiki";
|
||||
} if ((firstChar == '{' || firstChar == '[') &&
|
||||
openBraces >= 5 && closeBraces >= 5) {
|
||||
openBraces >= JSON_BRACES_THRESHOLD && closeBraces >= JSON_BRACES_THRESHOLD) {
|
||||
return "text/json";
|
||||
} else if (openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
|
||||
} else if (openAngleBrackets >= XML_BRACKETS_THRESHOLD
|
||||
&& closeAngleBrackets >= XML_BRACKETS_THRESHOLD) {
|
||||
if (trailingPeriods > 0) {
|
||||
return "text/rdf/n3";
|
||||
} else if (firstChar == '<') {
|
||||
@ -94,9 +120,6 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
}
|
||||
}
|
||||
return "text/line-based";
|
||||
} finally {
|
||||
reader.close();
|
||||
is.close();
|
||||
}
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
@ -105,46 +128,20 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
static public int countSubstrings(String s, String sub) {
|
||||
int count = 0;
|
||||
int from = 0;
|
||||
while (from < s.length()) {
|
||||
int i = s.indexOf(sub, from);
|
||||
if (i < 0) {
|
||||
break;
|
||||
} else {
|
||||
from = i + sub.length();
|
||||
count++;
|
||||
|
||||
private boolean isCompressed(File file) throws IOException {
|
||||
// Check for common compressed file types to protect ourselves from binary data
|
||||
try(InputStream is = new FileInputStream(file)) {
|
||||
byte[] magic = new byte[4];
|
||||
int count = is.read(magic);
|
||||
if (count == 4 && Arrays.equals(magic, new byte[] {0x50,0x4B, 0x03, 0x04}) || // zip
|
||||
Arrays.equals(magic, new byte[] {0x50,0x4B, 0x07, 0x08}) ||
|
||||
(magic[0] == 0x1F && magic[1] == (byte)0x8B) // gzip
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static public int countLineSuffix(String s, String suffix) {
|
||||
int count = 0;
|
||||
int from = 0;
|
||||
while (from < s.length()) {
|
||||
int lineEnd = s.indexOf('\n', from);
|
||||
if (lineEnd < 0) {
|
||||
break;
|
||||
} else {
|
||||
int i = lineEnd - 1;
|
||||
while (i >= from + suffix.length() - 1) {
|
||||
if (Character.isWhitespace(s.charAt(i))) {
|
||||
i--;
|
||||
} else {
|
||||
String suffix2 = s.subSequence(i - suffix.length() + 1, i + 1).toString();
|
||||
if (suffix2.equals(suffix)) {
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
from = lineEnd + 1;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
BIN
main/tests/data/Colorado-Municipalities-small-xlsx.gz
Normal file
BIN
main/tests/data/Colorado-Municipalities-small-xlsx.gz
Normal file
Binary file not shown.
@ -0,0 +1,269 @@
|
||||
/*
|
||||
|
||||
Copyright 2020 OpenRefine committers
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package com.google.refine.importers;
|
||||
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.common.io.PatternFilenameFilter;
|
||||
import com.google.refine.importing.FormatGuesser;
|
||||
|
||||
|
||||
public class TextFormatGuesserTests extends ImporterTest {
|
||||
|
||||
FormatGuesser guesser;
|
||||
|
||||
@Override
|
||||
@BeforeTest
|
||||
public void init() {
|
||||
logger = LoggerFactory.getLogger(this.getClass());
|
||||
}
|
||||
|
||||
@Override
|
||||
@BeforeMethod
|
||||
public void setUp() {
|
||||
super.setUp();
|
||||
guesser = new TextFormatGuesser();
|
||||
}
|
||||
|
||||
@Override
|
||||
@AfterMethod
|
||||
public void tearDown(){
|
||||
guesser = null;
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xlsTextGuessTest() throws FileNotFoundException, IOException {
|
||||
String dir = ClassLoader.getSystemResource("Colorado-Municipalities-small-xlsx.gz").getPath();
|
||||
InputStream is = new GZIPInputStream(new FileInputStream(new File(dir)));
|
||||
File tmp = File.createTempFile("openrefinetests-textguesser", "");
|
||||
FileUtils.copyInputStreamToFile(is, tmp);
|
||||
String format = guesser.guess(tmp, "UTF-8", "text");
|
||||
assertEquals(format, "binary");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void csvGuesserTest() {
|
||||
extensionGuesserTests("csv", "text/line-based");
|
||||
}
|
||||
|
||||
@Test(enabled=false) // FIXME: Our JSON guesser doesn't work on small files
|
||||
public void jsonGuesserTest() {
|
||||
extensionGuesserTests("json", "text/json");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xmlGuesserTest() {
|
||||
extensionGuesserTests("xml", "text/xml");
|
||||
}
|
||||
|
||||
private void extensionGuesserTests(String extension, String expectedFormat) {
|
||||
String dir = ClassLoader.getSystemResource("food.csv").getPath();
|
||||
dir = dir.substring(0, dir.lastIndexOf('/'));
|
||||
File testDataDir = new File(dir);
|
||||
for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
|
||||
String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
|
||||
logger.info(format + " " + testFile);
|
||||
assertEquals(format, expectedFormat);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessWikiTable() throws IOException {
|
||||
String input = "\n"
|
||||
+ "{|\n"
|
||||
+ "|-\n"
|
||||
+ "| a || b<br/>2 || c \n"
|
||||
+ "|-\n"
|
||||
+ "| d || e || f<br>\n"
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
private void testWikiTableString(String input) throws IOException, FileNotFoundException {
|
||||
File tmp = File.createTempFile("openrefinetests-textguesser", "");
|
||||
OutputStreamWriter writer = new OutputStreamWriter(
|
||||
new FileOutputStream(tmp),
|
||||
Charset.forName("UTF-8").newEncoder()
|
||||
);
|
||||
writer.write(input);
|
||||
writer.close();
|
||||
String format = guesser.guess(tmp, "UTF-8", "text");
|
||||
assertEquals(format, "text/wiki");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessTableWithMisplacedHeaders() throws FileNotFoundException, IOException {
|
||||
String input = "\n"
|
||||
+ "{|\n"
|
||||
+ "|-\n"
|
||||
+ "| a || b<br/>2 || c \n"
|
||||
+ "|-\n"
|
||||
+ "| d\n"
|
||||
+ "! e\n"
|
||||
+ "| f<br>\n"
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessTableWithLinks() throws FileNotFoundException, IOException {
|
||||
|
||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||
String input = "\n"
|
||||
+"{|\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || http://www.cedefop.europa.eu/\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/ europa.eu]\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readStyledTableWithHeader() throws FileNotFoundException, IOException {
|
||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||
String input = "\n"
|
||||
+"==Agenturen==\n"
|
||||
+"{| class=\"wikitable sortable\"\n"
|
||||
+"! style=\"text-align:left; width: 60em\" | Offizieller Name\n"
|
||||
+"! style=\"text-align:left; width: 9em\" | Abkürzung\n"
|
||||
+"! style=\"text-align:left; width: 6em\" | Website\n"
|
||||
+"! style=\"text-align:left; width: 15em\" | Standort\n"
|
||||
+"! style=\"text-align:left; width: 18em\" | Staat\n"
|
||||
+"! style=\"text-align:left; width: 6em\" | Gründung\n"
|
||||
+"! style=\"text-align:left; width: 50em\" | Anmerkungen\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || '''Cedefop''' || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || ''EUROFOUND'' || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessTableWithSpanningCells() throws FileNotFoundException, IOException {
|
||||
// inspired from https://www.mediawiki.org/wiki/Help:Tables
|
||||
String input = "{| class=\"wikitable\"\n"
|
||||
+"!colspan=\"6\"|Shopping List\n"
|
||||
+"|-\n"
|
||||
+"|Bread & Butter\n"
|
||||
+"|Pie\n"
|
||||
+"|Buns\n"
|
||||
+"|rowspan=\"2\"|Danish\n"
|
||||
+"|colspan=\"2\"|Croissant\n"
|
||||
+"|-\n"
|
||||
+"|Cheese\n"
|
||||
+"|colspan=\"2\"|Ice cream\n"
|
||||
+"|Butter\n"
|
||||
+"|Yogurt\n"
|
||||
+"|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessTableWithReferences() throws FileNotFoundException, IOException {
|
||||
// inspired from https://www.mediawiki.org/wiki/Help:Tables
|
||||
String input = "{|\n"
|
||||
+"! price\n"
|
||||
+"! fruit\n"
|
||||
+"! merchant\n"
|
||||
+"|-\n"
|
||||
+"| a || b <ref name=\"myref\"> See [http://gnu.org here]</ref> || c <ref name=\"ms\"> or http://microsoft.com/ </ref>\n"
|
||||
+"|-\n"
|
||||
+"| d || e <ref name=\"ms\"/>|| f <ref name=\"myref\" />\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void guessTableWithReferencesTemplates() throws FileNotFoundException, IOException {
|
||||
// inspired from https://www.mediawiki.org/wiki/Help:Tables
|
||||
String input = "{|\n"
|
||||
+"! price\n"
|
||||
+"! fruit\n"
|
||||
+"! merchant\n"
|
||||
+"|-\n"
|
||||
+"| a || b <ref name=\"myref\">{{cite web|url=http://gnu.org|accessdate=2017-08-30}}</ref> || c <ref name=\"ms\"> or {{cite journal|url=http://microsoft.com/|title=BLah}} </ref>\n"
|
||||
+"|-\n"
|
||||
+"| d || e <ref name=\"ms\"/>|| f <ref name=\"myref\" />\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void guessTableWithTemplates() throws FileNotFoundException, IOException {
|
||||
String input = "\n"
|
||||
+ "{|\n"
|
||||
+ "|-\n"
|
||||
+ "| {{free to read}} || b || c \n"
|
||||
+ "|-\n"
|
||||
+ "| d\n"
|
||||
+ "| [[File:My logo.svg|70px]]\n"
|
||||
+ "| f<br>\n"
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
testWikiTableString(input);
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user