From 1c9012982929dafe4df072f3d76c89d759913a6f Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Sun, 30 Dec 2018 23:11:00 +0100 Subject: [PATCH] Add a phonetic clustering extension Closes #926. Closes #927. --- .../phonetic/module/MOD-INF/controller.js | 50 +++++++ .../phonetic/module/MOD-INF/module.properties | 4 + extensions/phonetic/pom.xml | 123 ++++++++++++++++++ .../phonetic/keyers/BeiderMorseKeyer.java | 21 +++ .../phonetic/keyers/DaitchMokotoffKeyer.java | 15 +++ .../phonetic/keyers/BeiderMorseKeyerTest.java | 24 ++++ .../keyers/DaitchMokotoffKeyerTest.java | 21 +++ extensions/pom.xml | 1 + 8 files changed, 259 insertions(+) create mode 100644 extensions/phonetic/module/MOD-INF/controller.js create mode 100644 extensions/phonetic/module/MOD-INF/module.properties create mode 100644 extensions/phonetic/pom.xml create mode 100644 extensions/phonetic/src/org/openrefine/phonetic/keyers/BeiderMorseKeyer.java create mode 100644 extensions/phonetic/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyer.java create mode 100644 extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/BeiderMorseKeyerTest.java create mode 100644 extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyerTest.java diff --git a/extensions/phonetic/module/MOD-INF/controller.js b/extensions/phonetic/module/MOD-INF/controller.js new file mode 100644 index 000000000..5328e5d41 --- /dev/null +++ b/extensions/phonetic/module/MOD-INF/controller.js @@ -0,0 +1,50 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + */ +/* NOTE(review): none of the three variables declared next is referenced by init() in this controller; presumably they were carried over from another extension's controller. Confirm nothing else reads them before removing. */ +var html = "text/html"; +var encoding = "UTF-8"; +var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager; + +/* + * Function invoked to initialize the extension. 
+ */ +function init() { + // Register the two phonetic keyers with KeyerFactory under the names "daitch-mokotoff" and "beider-morse" + Packages.com.google.refine.clustering.binning.KeyerFactory.put("daitch-mokotoff", new Packages.org.openrefine.phonetic.keyers.DaitchMokotoffKeyer()); + Packages.com.google.refine.clustering.binning.KeyerFactory.put("beider-morse", new Packages.org.openrefine.phonetic.keyers.BeiderMorseKeyer()); + + // Similarly, we could register new distances like this: + // Packages.com.google.refine.clustering.knn.DistanceFactory.put("my-distance", new Packages.org.openrefine.mydistances.MyDistance()); +} + + diff --git a/extensions/phonetic/module/MOD-INF/module.properties b/extensions/phonetic/module/MOD-INF/module.properties new file mode 100644 index 000000000..e207201d3 --- /dev/null +++ b/extensions/phonetic/module/MOD-INF/module.properties @@ -0,0 +1,4 @@ +name = phonetic +description = OpenRefine Phonetic Clustering extension +templating.macros = macros.vm +requires = core diff --git a/extensions/phonetic/pom.xml b/extensions/phonetic/pom.xml new file mode 100644 index 000000000..e173f5e10 --- /dev/null +++ b/extensions/phonetic/pom.xml @@ -0,0 +1,123 @@ + + 4.0.0 + + org.openrefine + phonetic + jar + 3.2-SNAPSHOT + + OpenRefine - Phonetic clustering extension + Adds a few advanced phonetic clustering methods + http://openrefine.org/ + + org.openrefine + extensions + 3.2-SNAPSHOT + + + + openrefine-sample + + + src + + + tests/src + module/MOD-INF/classes + + + org.codehaus.mojo + build-helper-maven-plugin + 1.8 + + + generate-sources + + add-source + + + + src + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.8 + 1.8 + UTF-8 + false + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + UTF-8 + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.1.1 + + + compile + + copy-dependencies + + + module/MOD-INF/lib + runtime + + + + + + org.apache.maven.plugins + maven-clean-plugin + 3.1.0 + + + + module/MOD-INF/lib + + + + + + + + + + ${project.groupId} + main + 
${project.version} + provided + + + javax.servlet + servlet-api + 2.5 + provided + + + + + + org.testng + testng + 6.9.10 + test + + + + + diff --git a/extensions/phonetic/src/org/openrefine/phonetic/keyers/BeiderMorseKeyer.java b/extensions/phonetic/src/org/openrefine/phonetic/keyers/BeiderMorseKeyer.java new file mode 100644 index 000000000..710b89fb4 --- /dev/null +++ b/extensions/phonetic/src/org/openrefine/phonetic/keyers/BeiderMorseKeyer.java @@ -0,0 +1,21 @@ +package org.openrefine.phonetic.keyers; + +import com.google.refine.clustering.binning.Keyer; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.language.bm.BeiderMorseEncoder; +/** Keyer that uses the Beider-Morse phonetic encoding of a string as its clustering key. */ +public class BeiderMorseKeyer extends Keyer { + /** Encoder built with its no-argument constructor and reused for every key() call. */ + protected BeiderMorseEncoder encoder = new BeiderMorseEncoder(); + /** Returns encoder.encode(string); if that throws EncoderException the input string itself is returned instead. The params argument is not used. */ + @Override + public String key(String string, Object... params) { + try { + return encoder.encode(string); + } catch (EncoderException e) { + return string; + } + } + +} diff --git a/extensions/phonetic/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyer.java b/extensions/phonetic/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyer.java new file mode 100644 index 000000000..9a4af5216 --- /dev/null +++ b/extensions/phonetic/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyer.java @@ -0,0 +1,15 @@ +package org.openrefine.phonetic.keyers; + +import com.google.refine.clustering.binning.Keyer; +import org.apache.commons.codec.language.DaitchMokotoffSoundex; +/** Keyer that uses the Daitch-Mokotoff soundex code of a string as its clustering key. */ +public class DaitchMokotoffKeyer extends Keyer { + /** Soundex encoder built with its no-argument constructor and reused for every key() call. */ + protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex(); + /** Returns encoder.encode(string) unchanged. The params argument is not used. */ + @Override + public String key(String string, Object... 
params) { + return encoder.encode(string); + } + +} diff --git a/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/BeiderMorseKeyerTest.java b/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/BeiderMorseKeyerTest.java new file mode 100644 index 000000000..5830cfe6e --- /dev/null +++ b/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/BeiderMorseKeyerTest.java @@ -0,0 +1,24 @@ +package org.openrefine.phonetic.keyers; + +import com.google.refine.clustering.binning.Keyer; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import org.testng.annotations.Test; +/** TestNG tests for BeiderMorseKeyer. */ +public class BeiderMorseKeyerTest { + /** Keyer under test, held through the Keyer base type. */ + Keyer keyer = new BeiderMorseKeyer(); + /** The key for "Alphonse" must contain "alponzi"; this is a substring check, not a match on the whole key. */ + @Test + public void testKey() { + assertTrue(keyer.key("Alphonse").contains("alponzi")); + } + /** The key for the accented input "Éléonore" must be exactly "ilionor|ilionori". */ + @Test + public void testAccents() { + assertEquals(keyer.key("Éléonore"), "ilionor|ilionori"); + } + +} diff --git a/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyerTest.java b/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyerTest.java new file mode 100644 index 000000000..d0251ab95 --- /dev/null +++ b/extensions/phonetic/tests/src/org/openrefine/phonetic/keyers/DaitchMokotoffKeyerTest.java @@ -0,0 +1,21 @@ +package org.openrefine.phonetic.keyers; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + +import com.google.refine.clustering.binning.Keyer; +/** TestNG tests for DaitchMokotoffKeyer. */ +public class DaitchMokotoffKeyerTest { + protected Keyer keyer = new DaitchMokotoffKeyer(); + /** The key for "Alphonse" must be exactly "087640". */ + @Test + public void testDaitchMokotoff() { + assertEquals(keyer.key("Alphonse"), "087640"); + } + /** The key for the accented input "Éléonore" must be exactly "086900". */ + @Test + public void testAccents() { + assertEquals(keyer.key("Éléonore"), "086900"); + } +} diff --git a/extensions/pom.xml b/extensions/pom.xml index 98574dd8e..ca1fb7de3 100644 --- a/extensions/pom.xml +++ b/extensions/pom.xml @@ -23,6 +23,7 @@ database gdata pc-axis + phonetic