* Don't count TABs as control characters - fixes #3061 * Add TSV test. Replace info logging with an assertion failure message
This commit is contained in:
parent
62ae8ae946
commit
fc21d58ed1
@ -74,7 +74,7 @@ public class TextFormatGuesser implements FormatGuesser {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null && controls < CONTROLS_THRESHOLD) {
|
||||
line = line.trim();
|
||||
controls += CharMatcher.javaIsoControl().countIn(line);
|
||||
controls += CharMatcher.javaIsoControl().and(CharMatcher.whitespace().negate()).countIn(line);
|
||||
openBraces += line.chars().filter(ch -> ch == '{').count();
|
||||
closeBraces += StringUtils.countMatches(line, "}");
|
||||
openAngleBrackets += StringUtils.countMatches(line, "<");
|
||||
|
@ -83,6 +83,7 @@ public class TextFormatGuesserTests extends ImporterTest {
|
||||
|
||||
@Test
|
||||
public void xlsTextGuessTest() throws FileNotFoundException, IOException {
|
||||
// Test an XLSX file without the correct file extension
|
||||
String dir = ClassLoader.getSystemResource("Colorado-Municipalities-small-xlsx.gz").getPath();
|
||||
InputStream is = new GZIPInputStream(new FileInputStream(new File(dir)));
|
||||
File tmp = File.createTempFile("openrefinetests-textguesser", "");
|
||||
@ -96,6 +97,11 @@ public class TextFormatGuesserTests extends ImporterTest {
|
||||
extensionGuesserTests("csv", "text/line-based");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared extension-based guesser check for the "tsv" extension,
 * expecting the guessed format to be "text/line-based".
 */
@Test
|
||||
public void tsvGuesserTest() {
|
||||
extensionGuesserTests("tsv", "text/line-based");
|
||||
}
|
||||
|
||||
@Test(enabled=false) // FIXME: Our JSON guesser doesn't work on small files
|
||||
public void jsonGuesserTest() {
|
||||
extensionGuesserTests("json", "text/json");
|
||||
@ -112,8 +118,7 @@ public class TextFormatGuesserTests extends ImporterTest {
|
||||
File testDataDir = new File(dir);
|
||||
for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
|
||||
String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
|
||||
logger.info(format + " " + testFile);
|
||||
assertEquals(format, expectedFormat);
|
||||
assertEquals(format, expectedFormat, "Format guess failed for " + testFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user