Merge pull request #1275 from OpenRefine/wikitext-url-fix

Forbid pipe characters in URL references to ease parsing.
This commit is contained in:
Antonin Delpeuch 2017-10-20 16:41:00 +02:00 committed by GitHub
commit 21f4d62474
2 changed files with 30 additions and 1 deletion

View File

@@ -156,7 +156,7 @@ public class WikitextImporter extends TabularImportingParserBase {
// NOTE(review): presumably the index of the cell currently being spanned; confirm
// against the usages outside this hunk.
private int spanningCellIdx;
// NOTE(review): presumably the internal wiki links collected for the current cell;
// confirm against the usages outside this hunk.
private List<String> internalLinksInCell;
// Previous declaration (the line removed by this commit): '|' was accepted in both
// character classes, so a URL match could run across a pipe character.
private final Pattern urlPattern = Pattern.compile("\\b(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]",
// Replacement declaration (the line added by this commit): identical except that '|'
// is dropped from both character classes, so a match now stops before a pipe.
// Matches http, https and ftp URLs starting at a word boundary. The final character
// class omits '?', '!', ':', ',', '.' and ';', so a match cannot end on one of them.
private final Pattern urlPattern = Pattern.compile("\\b(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_!:,.;]*[-a-zA-Z0-9+&@#/%=~_]",
// Shared continuation of the declaration: matching ignores letter case.
Pattern.CASE_INSENSITIVE);
public WikitextTableVisitor(boolean blankSpanningCells, boolean includeRawTemplates) {

View File

@@ -224,6 +224,35 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "http://gnu.org");
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/");
}
/**
 * Parses a table whose cells carry {@code <ref>} tags wrapping citation templates
 * and checks that the URL given in each template's {@code url=} argument ends up
 * as a cell value, both in the row where the reference is defined and in the row
 * that only reuses it by name.
 */
@Test
public void readTableWithReferencesTemplates() {
    // Table layout inspired by https://www.mediawiki.org/wiki/Help:Tables
    final String wikitext =
            // header row
            "{|\n" +
            "! price\n" +
            "! fruit\n" +
            "! merchant\n" +
            "|-\n" +
            // first data row: defines the references "myref" and "ms" through templates
            "| a " +
            "|| b <ref name=\"myref\">{{cite web|url=http://gnu.org|accessdate=2017-08-30}}</ref> " +
            "|| c <ref name=\"ms\"> or {{cite journal|url=http://microsoft.com/|title=BLah}} </ref>\n" +
            "|-\n" +
            // second data row: reuses both references by name, in swapped positions
            "| d " +
            "|| e <ref name=\"ms\"/>" +
            "|| f <ref name=\"myref\" />\n" +
            "|-\n" +
            "|}\n";

    try {
        prepareOptions(-1, true, true, null);
        parse(wikitext);
    } catch (Exception failure) {
        Assert.fail("Parsing failed", failure);
    }

    // The three source columns are expected to produce five project columns.
    Assert.assertEquals(project.columnModel.columns.size(), 5);

    // Row defining the references: plain text is kept, template URLs are extracted.
    Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b");
    Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://gnu.org");
    Assert.assertEquals(project.rows.get(0).cells.get(4).value, "http://microsoft.com/");

    // Row reusing the references by name: the same URLs show up, swapped.
    Assert.assertEquals(project.rows.get(1).cells.get(4).value, "http://gnu.org");
    Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/");
}
//--helpers--
private void parse(String wikitext) {