Add a phonetic clustering extension

Closes #926. Closes #927.
This commit is contained in:
Antonin Delpeuch 2018-12-30 23:11:00 +01:00
parent 4984837c9f
commit 1c90129829
8 changed files with 259 additions and 0 deletions

View File

@ -0,0 +1,50 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Module-level values. None of them is referenced by init() in this file.
// NOTE(review): presumably carried over from an extension template - confirm
// nothing else relies on them before removing.
var html = "text/html";
var encoding = "UTF-8";
// Reference to the Java class com.google.refine.ClientSideResourceManager,
// obtained through the global Packages object.
var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager;
/*
 * Function invoked to initialize the extension.
 *
 * Registers the two phonetic keyers of this extension with the core
 * KeyerFactory, under the names "daitch-mokotoff" and "beider-morse".
 */
function init() {
  // Java handles reached through the global Packages object.
  var KeyerFactory = Packages.com.google.refine.clustering.binning.KeyerFactory;
  var keyers = Packages.org.openrefine.phonetic.keyers;

  // Register new keyers (same order as before: Daitch-Mokotoff, then Beider-Morse).
  KeyerFactory.put("daitch-mokotoff", new keyers.DaitchMokotoffKeyer());
  KeyerFactory.put("beider-morse", new keyers.BeiderMorseKeyer());

  // New distances could be registered in the same way, for example:
  // Packages.com.google.refine.clustering.knn.DistanceFactory.put("my-distance", new Packages.org.openrefine.mydistances.MyDistance());
}

View File

@ -0,0 +1,4 @@
name = phonetic
description = OpenRefine Phonetic Clustering extension
templating.macros = macros.vm
requires = core

123
extensions/phonetic/pom.xml Normal file
View File

@ -0,0 +1,123 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.openrefine</groupId>
<artifactId>phonetic</artifactId>
<packaging>jar</packaging>
<version>3.2-SNAPSHOT</version>
<name>OpenRefine - Phonetic clustering extension</name>
<description>Adds a few advanced phonetic clustering methods</description>
<url>http://openrefine.org/</url>
<parent>
<groupId>org.openrefine</groupId>
<artifactId>extensions</artifactId>
<version>3.2-SNAPSHOT</version>
</parent>
<build>
<!-- Base name of the artifact Maven builds for this module; aligned with the
     "phonetic" artifactId (it previously read "openrefine-sample"). -->
<finalName>openrefine-phonetic</finalName>
<resources>
<resource>
<directory>src</directory>
</resource>
</resources>
<testSourceDirectory>tests/src</testSourceDirectory>
<outputDirectory>module/MOD-INF/classes</outputDirectory>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>1.8</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<showDeprecation>false</showDeprecation>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- During the compile phase, copies the dependencies selected by
     includeScope=runtime into module/MOD-INF/lib. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.1.1</version>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>module/MOD-INF/lib</outputDirectory>
<includeScope>runtime</includeScope>
</configuration>
</execution>
</executions>
</plugin>
<!-- Adds module/MOD-INF/lib to the file sets deleted by the clean plugin;
     that directory lies outside Maven's default build directory. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<filesets>
<fileset>
<directory>module/MOD-INF/lib</directory>
</fileset>
</filesets>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- The "main" artifact of the same group and version as this module;
     "provided" scope, i.e. expected to be supplied by the host at runtime. -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>main</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
<version>2.5</version>
<scope>provided</scope>
</dependency>
<!-- Extension-specific dependencies go here.
     NOTE(review): the keyers in this module import org.apache.commons.codec,
     which is not declared in this POM - presumably it is resolved
     transitively through "main"; confirm, or declare it explicitly. -->
<!-- Test-only dependency used by the keyer tests. -->
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.9.10</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,21 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
/**
 * {@link Keyer} that produces its key with Apache Commons Codec's
 * {@link BeiderMorseEncoder}.
 */
public class BeiderMorseKeyer extends Keyer {

    /** Encoder every call to {@link #key(String, Object...)} delegates to. */
    protected BeiderMorseEncoder encoder = new BeiderMorseEncoder();

    /**
     * Encodes the given value with the Beider-Morse encoder.
     *
     * @param string the value to compute a key for
     * @param params not used by this keyer
     * @return the encoder's output, or {@code string} unchanged when the
     *         encoder throws an {@link EncoderException}
     */
    @Override
    public String key(String string, Object... params) {
        String result;
        try {
            result = encoder.encode(string);
        } catch (EncoderException ignored) {
            // Best effort: a value that cannot be encoded is used as its own key.
            result = string;
        }
        return result;
    }
}

View File

@ -0,0 +1,15 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
/**
 * {@link Keyer} that produces its key with Apache Commons Codec's
 * {@link DaitchMokotoffSoundex}.
 */
public class DaitchMokotoffKeyer extends Keyer {

    /** Encoder every call to {@link #key(String, Object...)} delegates to. */
    protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();

    /**
     * Encodes the given value with the Daitch-Mokotoff Soundex encoder.
     *
     * @param string the value to compute a key for
     * @param params not used by this keyer
     * @return whatever {@code encoder.encode(string)} returns
     */
    @Override
    public String key(String string, Object... params) {
        final String soundexCode = encoder.encode(string);
        return soundexCode;
    }
}

View File

@ -0,0 +1,24 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.Test;
/**
 * Tests for {@link BeiderMorseKeyer}, exercised through the {@link Keyer} API.
 */
public class BeiderMorseKeyerTest {

    Keyer keyer = new BeiderMorseKeyer();

    /** The key computed for "Alphonse" must contain the code "alponzi". */
    @Test
    public void testKey() {
        String encoded = keyer.key("Alphonse");
        boolean hasExpectedCode = encoded.contains("alponzi");
        assertTrue(hasExpectedCode);
    }

    /** An accented input must produce exactly the key "ilionor|ilionori". */
    @Test
    public void testAccents() {
        String encoded = keyer.key("Éléonore");
        assertEquals(encoded, "ilionor|ilionori");
    }
}

View File

@ -0,0 +1,21 @@
package org.openrefine.phonetic.keyers;
import static org.testng.Assert.assertEquals;
import org.testng.annotations.Test;
import com.google.refine.clustering.binning.Keyer;
/**
 * Tests for {@link DaitchMokotoffKeyer}, exercised through the {@link Keyer} API.
 */
public class DaitchMokotoffKeyerTest {

    protected Keyer keyer = new DaitchMokotoffKeyer();

    /** "Alphonse" must be keyed as "087640". */
    @Test
    public void testDaitchMokotoff() {
        String code = keyer.key("Alphonse");
        assertEquals(code, "087640");
    }

    /** An accented input must be keyed as "086900". */
    @Test
    public void testAccents() {
        String code = keyer.key("Éléonore");
        assertEquals(code, "086900");
    }
}

View File

@ -23,6 +23,7 @@
<module>database</module> <module>database</module>
<module>gdata</module> <module>gdata</module>
<module>pc-axis</module> <module>pc-axis</module>
<module>phonetic</module>
<!-- Add new extensions here --> <!-- Add new extensions here -->
</modules> </modules>