From 3d30897b3b5b28541582f527d20ef6f648c65377 Mon Sep 17 00:00:00 2001 From: Thad Guidry Date: Sun, 22 Nov 2020 14:00:48 -0600 Subject: [PATCH] Add Wynn to fingerprint to support Old English texts --- .../com/google/refine/clustering/binning/FingerprintKeyer.java | 3 ++- .../src/com/google/refine/clustering/binning/KeyerTests.java | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java index 728fa510c..c7757ea98 100644 --- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java @@ -72,6 +72,7 @@ public class FingerprintKeyer extends Keyer { .put("\u0111", "d") // Small letter D with stroke .put("\u0256", "d") // Small letter African D .put("\u00FE", "th") // Lower case Icelandic thorn þ + .put("ƿ","w") // Lower case Wynn from Old English modernly transliterated to w // Visually similar replacements from our private former asciify() method // (only need lower case forms since we're already downcased) .put("\u0127", "h") // small H with stroke @@ -141,4 +142,4 @@ public class FingerprintKeyer extends Keyer { return result.toString(); } -} \ No newline at end of file +} diff --git a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java index 1335ee463..d141fdf96 100644 --- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java @@ -55,7 +55,7 @@ public class KeyerTests extends RefineTest { // {"å","aa"}, // Requested by issue #650, but conflicts with diacritic folding {"æø","aeoe"}, // Norwegian replacements from #650 {"©ß","css"}, // issue #409 esszet - {"\u00D0\u00DE", "dth"}, // Icelandic eth and thorn + {"\u00D0\u00F0\u00DE\u00FEǷƿ", "ddththww"}, // eth, thorn, & wynn for Icelandic / Olde English {"fffiflffiſtst", "fffiflffistst"}, // ligatures // Test legacy replacements {"\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102\u0103\u0104\u0105", "aaaaaaaaaaaaaaaaaa"},