Don't count TABs as control characters - fixes #3061 (#3068)

* Don't count TABs as control characters - fixes #3061

* Add TSV test. Replace info logging with an assertion failure message
This commit is contained in:
Tom Morris 2020-08-16 04:35:25 -04:00 committed by GitHub
parent 62ae8ae946
commit fc21d58ed1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 8 additions and 3 deletions

View File

@ -74,7 +74,7 @@ public class TextFormatGuesser implements FormatGuesser {
String line; String line;
while ((line = reader.readLine()) != null && controls < CONTROLS_THRESHOLD) { while ((line = reader.readLine()) != null && controls < CONTROLS_THRESHOLD) {
line = line.trim(); line = line.trim();
controls += CharMatcher.javaIsoControl().countIn(line); controls += CharMatcher.javaIsoControl().and(CharMatcher.whitespace().negate()).countIn(line);
openBraces += line.chars().filter(ch -> ch == '{').count(); openBraces += line.chars().filter(ch -> ch == '{').count();
closeBraces += StringUtils.countMatches(line, "}"); closeBraces += StringUtils.countMatches(line, "}");
openAngleBrackets += StringUtils.countMatches(line, "<"); openAngleBrackets += StringUtils.countMatches(line, "<");

View File

@ -83,6 +83,7 @@ public class TextFormatGuesserTests extends ImporterTest {
@Test @Test
public void xlsTextGuessTest() throws FileNotFoundException, IOException { public void xlsTextGuessTest() throws FileNotFoundException, IOException {
// Test an XLSX file without the correct file extension
String dir = ClassLoader.getSystemResource("Colorado-Municipalities-small-xlsx.gz").getPath(); String dir = ClassLoader.getSystemResource("Colorado-Municipalities-small-xlsx.gz").getPath();
InputStream is = new GZIPInputStream(new FileInputStream(new File(dir))); InputStream is = new GZIPInputStream(new FileInputStream(new File(dir)));
File tmp = File.createTempFile("openrefinetests-textguesser", ""); File tmp = File.createTempFile("openrefinetests-textguesser", "");
@ -96,6 +97,11 @@ public class TextFormatGuesserTests extends ImporterTest {
extensionGuesserTests("csv", "text/line-based"); extensionGuesserTests("csv", "text/line-based");
} }
/**
 * Checks format guessing for tab-separated files by delegating to
 * {@code extensionGuesserTests} with the {@code "tsv"} extension and the
 * expected {@code "text/line-based"} format.
 *
 * NOTE(review): the helper presumably runs the guesser over every test
 * data file with the given extension -- confirm against its definition.
 */
@Test
public void tsvGuesserTest() {
    final String fileExtension = "tsv";
    final String expectedGuess = "text/line-based";
    extensionGuesserTests(fileExtension, expectedGuess);
}
@Test(enabled=false) // FIXME: Our JSON guesser doesn't work on small files @Test(enabled=false) // FIXME: Our JSON guesser doesn't work on small files
public void jsonGuesserTest() { public void jsonGuesserTest() {
extensionGuesserTests("json", "text/json"); extensionGuesserTests("json", "text/json");
@ -112,8 +118,7 @@ public class TextFormatGuesserTests extends ImporterTest {
File testDataDir = new File(dir); File testDataDir = new File(dir);
for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) { for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
String format = guesser.guess(new File(dir, testFile), "UTF-8", "text"); String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
logger.info(format + " " + testFile); assertEquals(format, expectedFormat, "Format guess failed for " + testFile);
assertEquals(format, expectedFormat);
} }
} }