* Use standard text normalization - fixes #2898 Fixes #2898. Fixes #409. Refs #650 Replaces homegrown ISO Latin-1 only character substitution with standard Java Normalizer decomposition to NFD, followed by diacritic removal and a few custom character expansions/replacements. * Fix Mac build * Improve compatibility with previous code One intentional change is folding O with stroke to oe instead of o. - Use more powerful NFKD instead of NFD - strip punctuation after decomposition since it can generate new punctuation - Add compatibility test for old asciify() method - Add some graphically similar characters to substitution table * Add oe character/ligature & more long S forms * More tests for ligatures and Latin Extended * Add Latin-1 Supplement tests
This commit is contained in:
parent
ab2ec8d28f
commit
0562638ffa
@ -21,7 +21,7 @@ jobs:
|
||||
- brew services start mysql
|
||||
- brew services start postgresql
|
||||
- brew services start mariadb@10.3
|
||||
#- sleep 15 # wait for databases to start up
|
||||
- sleep 15 # wait for databases to start up
|
||||
# Homebrew postgres workaround - create expected user postgres
|
||||
- /usr/local/opt/postgres/bin/createuser -s postgres
|
||||
# FIXME this is duplicated from linux config, but don't know a better way to do it
|
||||
|
@ -33,27 +33,62 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
package com.google.refine.clustering.binning;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
public class FingerprintKeyer extends Keyer {
|
||||
|
||||
// Punctuation and control characters (except for TAB which we need for split to work)
|
||||
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");
|
||||
|
||||
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
|
||||
Pattern.UNICODE_CHARACTER_CLASS);
|
||||
|
||||
public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
|
||||
.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
|
||||
|
||||
// First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
|
||||
private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
|
||||
//Replace non-diacritics as their equivalent characters
|
||||
.put("ß", "ss")
|
||||
.put("æ", "ae")
|
||||
.put("ø", "oe")
|
||||
.put("å", "aa") // TODO: We'll never see this after decomposition
|
||||
.put("©", "c") // copyright character
|
||||
.put("\u00F0", "d") // Small letter Icelandic eth
|
||||
.put("\u0111", "d") // Small letter D with stroke
|
||||
.put("\u0256", "d") // Small letter African D
|
||||
.put("\u00FE", "th") // Lower case Icelandic thorn þ
|
||||
// Visually similar replacements from our private former asciify() method
|
||||
// (only need lower case forms since we're already downcased)
|
||||
.put("\u0127", "h") // small H with stroke
|
||||
.put("\u0131", "i") // dotless I
|
||||
.put("\u0138", "k") // small letter Kra
|
||||
.put("\u0142", "l") // Bialystock
|
||||
.put("\u014B", "n") // Small letter Eng
|
||||
.put("\u017F", "s") // long s
|
||||
.put("\u0167", "t") // small letter T with stroke
|
||||
// Additional characters following the same principle
|
||||
.put("œ", "oe")
|
||||
.put("ẜ", "s") // more long S forms
|
||||
.put("ẝ", "s")
|
||||
.build();
|
||||
|
||||
|
||||
@Override
|
||||
public String key(String s, Object... o) {
|
||||
if (s == null || o !=null && o.length > 0) {
|
||||
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
|
||||
}
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
s = asciify(s); // find ASCII equivalent to characters
|
||||
String[] frags = StringUtils.split(s); // split by whitespace
|
||||
s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
|
||||
s = normalize(s);
|
||||
s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
|
||||
String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
for (String ss : frags) {
|
||||
set.add(ss); // order fragments and dedupe
|
||||
@ -69,222 +104,32 @@ public class FingerprintKeyer extends Keyer {
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
protected String normalize(String s) {
|
||||
s = stripDiacritics(s);
|
||||
s = stripNonDiacritics(s);
|
||||
return s;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected String asciify(String s) {
|
||||
char[] c = s.toCharArray();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (char element : c) {
|
||||
b.append(translate(element));
|
||||
}
|
||||
return b.toString();
|
||||
return normalize(s);
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate the given unicode char in the closest ASCII representation
|
||||
* NOTE: this function deals only with latin-1 supplement and latin-1 extended code charts
|
||||
*/
|
||||
private char translate(char c) {
|
||||
switch(c) {
|
||||
case '\u00C0':
|
||||
case '\u00C1':
|
||||
case '\u00C2':
|
||||
case '\u00C3':
|
||||
case '\u00C4':
|
||||
case '\u00C5':
|
||||
case '\u00E0':
|
||||
case '\u00E1':
|
||||
case '\u00E2':
|
||||
case '\u00E3':
|
||||
case '\u00E4':
|
||||
case '\u00E5':
|
||||
case '\u0100':
|
||||
case '\u0101':
|
||||
case '\u0102':
|
||||
case '\u0103':
|
||||
case '\u0104':
|
||||
case '\u0105':
|
||||
return 'a';
|
||||
case '\u00C7':
|
||||
case '\u00E7':
|
||||
case '\u0106':
|
||||
case '\u0107':
|
||||
case '\u0108':
|
||||
case '\u0109':
|
||||
case '\u010A':
|
||||
case '\u010B':
|
||||
case '\u010C':
|
||||
case '\u010D':
|
||||
return 'c';
|
||||
case '\u00D0':
|
||||
case '\u00F0':
|
||||
case '\u010E':
|
||||
case '\u010F':
|
||||
case '\u0110':
|
||||
case '\u0111':
|
||||
return 'd';
|
||||
case '\u00C8':
|
||||
case '\u00C9':
|
||||
case '\u00CA':
|
||||
case '\u00CB':
|
||||
case '\u00E8':
|
||||
case '\u00E9':
|
||||
case '\u00EA':
|
||||
case '\u00EB':
|
||||
case '\u0112':
|
||||
case '\u0113':
|
||||
case '\u0114':
|
||||
case '\u0115':
|
||||
case '\u0116':
|
||||
case '\u0117':
|
||||
case '\u0118':
|
||||
case '\u0119':
|
||||
case '\u011A':
|
||||
case '\u011B':
|
||||
return 'e';
|
||||
case '\u011C':
|
||||
case '\u011D':
|
||||
case '\u011E':
|
||||
case '\u011F':
|
||||
case '\u0120':
|
||||
case '\u0121':
|
||||
case '\u0122':
|
||||
case '\u0123':
|
||||
return 'g';
|
||||
case '\u0124':
|
||||
case '\u0125':
|
||||
case '\u0126':
|
||||
case '\u0127':
|
||||
return 'h';
|
||||
case '\u00CC':
|
||||
case '\u00CD':
|
||||
case '\u00CE':
|
||||
case '\u00CF':
|
||||
case '\u00EC':
|
||||
case '\u00ED':
|
||||
case '\u00EE':
|
||||
case '\u00EF':
|
||||
case '\u0128':
|
||||
case '\u0129':
|
||||
case '\u012A':
|
||||
case '\u012B':
|
||||
case '\u012C':
|
||||
case '\u012D':
|
||||
case '\u012E':
|
||||
case '\u012F':
|
||||
case '\u0130':
|
||||
case '\u0131':
|
||||
return 'i';
|
||||
case '\u0134':
|
||||
case '\u0135':
|
||||
return 'j';
|
||||
case '\u0136':
|
||||
case '\u0137':
|
||||
case '\u0138':
|
||||
return 'k';
|
||||
case '\u0139':
|
||||
case '\u013A':
|
||||
case '\u013B':
|
||||
case '\u013C':
|
||||
case '\u013D':
|
||||
case '\u013E':
|
||||
case '\u013F':
|
||||
case '\u0140':
|
||||
case '\u0141':
|
||||
case '\u0142':
|
||||
return 'l';
|
||||
case '\u00D1':
|
||||
case '\u00F1':
|
||||
case '\u0143':
|
||||
case '\u0144':
|
||||
case '\u0145':
|
||||
case '\u0146':
|
||||
case '\u0147':
|
||||
case '\u0148':
|
||||
case '\u0149':
|
||||
case '\u014A':
|
||||
case '\u014B':
|
||||
return 'n';
|
||||
case '\u00D2':
|
||||
case '\u00D3':
|
||||
case '\u00D4':
|
||||
case '\u00D5':
|
||||
case '\u00D6':
|
||||
case '\u00D8':
|
||||
case '\u00F2':
|
||||
case '\u00F3':
|
||||
case '\u00F4':
|
||||
case '\u00F5':
|
||||
case '\u00F6':
|
||||
case '\u00F8':
|
||||
case '\u014C':
|
||||
case '\u014D':
|
||||
case '\u014E':
|
||||
case '\u014F':
|
||||
case '\u0150':
|
||||
case '\u0151':
|
||||
return 'o';
|
||||
case '\u0154':
|
||||
case '\u0155':
|
||||
case '\u0156':
|
||||
case '\u0157':
|
||||
case '\u0158':
|
||||
case '\u0159':
|
||||
return 'r';
|
||||
case '\u015A':
|
||||
case '\u015B':
|
||||
case '\u015C':
|
||||
case '\u015D':
|
||||
case '\u015E':
|
||||
case '\u015F':
|
||||
case '\u0160':
|
||||
case '\u0161':
|
||||
case '\u017F':
|
||||
return 's';
|
||||
case '\u0162':
|
||||
case '\u0163':
|
||||
case '\u0164':
|
||||
case '\u0165':
|
||||
case '\u0166':
|
||||
case '\u0167':
|
||||
return 't';
|
||||
case '\u00D9':
|
||||
case '\u00DA':
|
||||
case '\u00DB':
|
||||
case '\u00DC':
|
||||
case '\u00F9':
|
||||
case '\u00FA':
|
||||
case '\u00FB':
|
||||
case '\u00FC':
|
||||
case '\u0168':
|
||||
case '\u0169':
|
||||
case '\u016A':
|
||||
case '\u016B':
|
||||
case '\u016C':
|
||||
case '\u016D':
|
||||
case '\u016E':
|
||||
case '\u016F':
|
||||
case '\u0170':
|
||||
case '\u0171':
|
||||
case '\u0172':
|
||||
case '\u0173':
|
||||
return 'u';
|
||||
case '\u0174':
|
||||
case '\u0175':
|
||||
return 'w';
|
||||
case '\u00DD':
|
||||
case '\u00FD':
|
||||
case '\u00FF':
|
||||
case '\u0176':
|
||||
case '\u0177':
|
||||
case '\u0178':
|
||||
return 'y';
|
||||
case '\u0179':
|
||||
case '\u017A':
|
||||
case '\u017B':
|
||||
case '\u017C':
|
||||
case '\u017D':
|
||||
case '\u017E':
|
||||
return 'z';
|
||||
}
|
||||
return c;
|
||||
|
||||
protected static String stripDiacritics(String str) {
|
||||
str = Normalizer.normalize(str, Normalizer.Form.NFKD);
|
||||
str = DIACRITICS_AND_FRIENDS.matcher(str).replaceAll("");
|
||||
return str;
|
||||
}
|
||||
}
|
||||
|
||||
// Based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
|
||||
private static String stripNonDiacritics(String orig) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < orig.length(); i++) {
|
||||
String source = orig.substring(i, i + 1);
|
||||
String replace = NONDIACRITICS.get(source);
|
||||
result.append(replace == null ? String.valueOf(source) : replace);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
}
|
@ -55,11 +55,70 @@ public class KeyerTests extends RefineTest {
|
||||
{" d c b a ","a b c d"},
|
||||
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
|
||||
{"bbb\taaa","aaa bbb"},
|
||||
{"",""},
|
||||
{"",""},
|
||||
{"",""},
|
||||
// {"å","aa"}, // Requested by issue #650, but conflicts with diacritic folding
|
||||
{"æø","aeoe"}, // Norwegian replacements from #650
|
||||
{"©ß","css"}, // issue #409 esszet
|
||||
{"\u00D0\u00DE", "dth"}, // Icelandic eth and thorn
|
||||
{"fffiflffiſtst", "fffiflffistst"}, // ligatures
|
||||
// Test legacy replacements
|
||||
{"\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102\u0103\u0104\u0105", "aaaaaaaaaaaaaaaaaa"},
|
||||
{"\u00C7\u00E7\u0106\u0107\u0108\u0109\u010A\u010B\u010C\u010D", "cccccccccc"},
|
||||
{"\u00D0\u00F0\u010E\u010F\u0110\u0111", "dddddd"},
|
||||
{"\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118\u0119\u011A\u011B", "eeeeeeeeeeeeeeeeee"},
|
||||
{"\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123", "gggggggg"},
|
||||
{"\u0124\u0125", "hh"},
|
||||
{"\u0126\u0127", "hh"},
|
||||
{"\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F\u0130", "iiiiiiiiiiiiiiiii"},
|
||||
{"\u0131", "i"},
|
||||
{"\u0134\u0135", "jj"},
|
||||
{"\u0136\u0137", "kk"},
|
||||
{"\u0138", "k"},
|
||||
{"\u0139\u013A\u013B\u013C\u013D\u013E\u0141\u0142", "llllllll"},
|
||||
{"\u013F\u0140", "ll"},
|
||||
{"\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148", "nnnnnnnn"},
|
||||
{"\u0149", "n"}, // decomposed to 'n, but then punctuation is stripped
|
||||
{"\u014A\u014B", "nn"},
|
||||
{"\u00D2\u00D3\u00D4\u00D5\u00D6\u00F2\u00F3\u00F4\u00F5\u00F6\u014C\u014D\u014E\u014F\u0150\u0151", "oooooooooooooooo"},
|
||||
{"\u00D8\u00F8", "oeoe"}, // O with stroke we decompose to oe instead of o
|
||||
{"\u0154\u0155\u0156\u0157\u0158\u0159", "rrrrrr"},
|
||||
{"\u015A\u015B\u015C\u015D\u015E\u015F\u0160\u0161\u017F", "sssssssss"},
|
||||
{"\u0162\u0163\u0164\u0165\u0166\u0167", "tttttt"},
|
||||
{"\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u0168\u0169\u016A\u016B\u016C\u016D\u016E\u016F\u0170\u0171\u0172\u0173", "uuuuuuuuuuuuuuuuuuuu"},
|
||||
{"\u0174\u0175", "ww"},
|
||||
{"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
|
||||
{"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
|
||||
// Latin-1 Supplement
|
||||
{//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
|
||||
//"¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
|
||||
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ",
|
||||
"aaaaaaaeceeeeiiiidnooooooeuuuuythssaaaaaaaeceeeeiiiidnooooooeuuuuythy"},
|
||||
// Latin Extended A
|
||||
{"ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ",
|
||||
"aaaaaaccccccccddddeeeeeeeeeegggggggghhhhiiiiiiiiiiijijjjkkkllllllllllnnnnnnnnnoooooooeoerrrrrrssssssssttttttuuuuuuuuuuuuwwyyyzzzzzzs"},
|
||||
// Latin Extended B
|
||||
// TODO: These don't get folded to ASCII equivalents. Not sure if they should be
|
||||
// {"ƄƅƉƊƋƌƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝ",
|
||||
// "bbddddhiikkllmnnooooopprssssttttuuuuyyzz aaiioouuuuuuuuuux"},
|
||||
// {"ǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ",
|
||||
// "aaaaaeaeggggkkqqqqqssjdzdzggpnnaaaeaeooaaaeeeeeiiiioooorrrruuuusstt zzaaeeooooooooyy"},
|
||||
// Latin Extended Additional
|
||||
{"ḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗ",
|
||||
"aabbbbbbccddddddddddeeeeeeeeeeffgghhhhhhhhhhiiiikkkkkkllllllllmmmmmmnnnnnnnnoooooooopppp"},
|
||||
{"ṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕ",
|
||||
"rrrrrrrrssssssssssttttttttuuuuuuuuuuvvvvwwwwwwwwwwxxxxyyzzzzzz"},
|
||||
{"ẖẗẘẙẚẛẜẝẞẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệ",
|
||||
"htwyasssssaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeee"},
|
||||
// Latin Extended C - TODO: not supported yet
|
||||
// {"ⱠⱡⱢⱣⱤⱥⱦⱧⱨⱩⱪⱫⱬⱭⱮⱯⱰⱱⱲⱳⱴⱵⱶⱷⱸⱹⱺⱻⱼⱽⱾⱿ",
|
||||
// "lllprathhkkzzamaaavwwwv e ejvsz"},
|
||||
// Latin Extended D
|
||||
{"", ""},
|
||||
// Latin Extended E
|
||||
{"", ""},
|
||||
// TODO: Add tests for non-Western languages
|
||||
{"", ""},
|
||||
};
|
||||
|
||||
|
||||
private static final String[][] testNGramStrings = {
|
||||
{"abcdefg","abbccddeeffg"},
|
||||
{" a,b.c d\te!f?g ","abbccddeeffg"},
|
||||
@ -107,7 +166,5 @@ public class KeyerTests extends RefineTest {
|
||||
"Fingerprint for string: " + ss[0] + " failed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user