Merge pull request #1921 from OpenRefine/phonetic_clustering
Sample extension to add some clustering methods
This commit is contained in:
commit
1ed2da338c
1
extensions/phonetic/.gitignore
vendored
Normal file
1
extensions/phonetic/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
module/MOD-INF/classes/
|
60
extensions/phonetic/module/MOD-INF/controller.js
Normal file
60
extensions/phonetic/module/MOD-INF/controller.js
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
Copyright 2010, Google Inc.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
* Neither the name of Google Inc. nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Content-type and charset constants. Neither is referenced by the
// controller code visible in this file.
var html = "text/html";
var encoding = "UTF-8";

// Alias for the Java class com.google.refine.ClientSideResourceManager,
// reached through the `Packages` object supplied by the host script engine
// (presumably Rhino's Java bridge -- confirm).
var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager;
|
||||||
|
|
||||||
|
/*
 * Function invoked to initialize the extension.
 *
 * It does two things:
 *   1. registers the Daitch-Mokotoff and Beider-Morse keyers with the
 *      KeyerFactory, under the ids "daitch-mokotoff" and "beider-morse";
 *   2. adds scripts/load-language.js to the "project/scripts" resource bundle.
 */
function init() {
    // Short aliases for the Java factory class and the keyer package.
    var KeyerFactory = Packages.com.google.refine.clustering.binning.KeyerFactory;
    var keyers = Packages.org.openrefine.phonetic.keyers;

    // Register new keyers
    KeyerFactory.put("daitch-mokotoff", new keyers.DaitchMokotoffKeyer());
    KeyerFactory.put("beider-morse", new keyers.BeiderMorseKeyer());

    // Similarly, we could register new distances like this:
    // Packages.com.google.refine.clustering.knn.DistanceFactory.put("my-distance", new Packages.org.openrefine.mydistances.MyDistance());

    // Resources: just some javascript to load the localized strings.
    // `module` is not declared in this file; it is presumably injected by
    // the host framework -- confirm.
    var clientScripts = [
        "scripts/load-language.js"
    ];
    ClientSideResourceManager.addPaths("project/scripts", module, clientScripts);
}
|
4
extensions/phonetic/module/MOD-INF/module.properties
Normal file
4
extensions/phonetic/module/MOD-INF/module.properties
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
name = phonetic
|
||||||
|
description = OpenRefine Phonetic Clustering extension
|
||||||
|
templating.macros = macros.vm
|
||||||
|
requires = core
|
4
extensions/phonetic/module/langs/translation-en.json
Normal file
4
extensions/phonetic/module/langs/translation-en.json
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"clustering-keyers/beider-morse": "Beider-Morse",
|
||||||
|
"clustering-keyers/daitch-mokotoff": "Daitch-Mokotoff"
|
||||||
|
}
|
14
extensions/phonetic/module/macros.vm
Normal file
14
extensions/phonetic/module/macros.vm
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
#*
|
||||||
|
This file contains common velocity macros used in all .vt files.
|
||||||
|
For Velocity documentation, see:
|
||||||
|
|
||||||
|
http://velocity.apache.org/engine/releases/velocity-1.5/user-guide.html
|
||||||
|
*#
|
||||||
|
|
||||||
|
#macro( makeAList $list )
|
||||||
|
<ul>
|
||||||
|
#foreach($item in $list)
|
||||||
|
<li>$item</li>
|
||||||
|
#end
|
||||||
|
</ul>
|
||||||
|
#end
|
19
extensions/phonetic/module/scripts/load-language.js
Normal file
19
extensions/phonetic/module/scripts/load-language.js
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
// Load the localization file for the "phonetic" module and hand it to
// jquery.i18n. The request is synchronous (async: false) so that `dictionary`
// and `lang` are populated before the load() call below runs.
var dictionary = {};

var languageRequest = {
    url : "command/core/load-language?",
    type : "POST",
    async : false,
    data : {
        module : "phonetic"
    },
    success : function(response) {
        dictionary = response['dictionary'];
        // NOTE(review): `lang` is not declared in this script; it is
        // presumably a global owned by the host page -- confirm.
        lang = response['lang'];
    }
};

$.ajax(languageRequest);
$.i18n().load(dictionary, lang);
|
||||||
|
|
||||||
|
|
||||||
|
|
133
extensions/phonetic/pom.xml
Normal file
133
extensions/phonetic/pom.xml
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<groupId>org.openrefine</groupId>
|
||||||
|
<artifactId>phonetic</artifactId>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<version>3.2-SNAPSHOT</version>
|
||||||
|
|
||||||
|
<name>OpenRefine - Phonetic clustering extension</name>
|
||||||
|
<description>Adds a few advanced phonetic clustering methods</description>
|
||||||
|
<url>http://openrefine.org/</url>
|
||||||
|
<parent>
|
||||||
|
<groupId>org.openrefine</groupId>
|
||||||
|
<artifactId>extensions</artifactId>
|
||||||
|
<version>3.2-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<finalName>openrefine-sample</finalName>
|
||||||
|
<resources>
|
||||||
|
<resource>
|
||||||
|
<directory>src</directory>
|
||||||
|
</resource>
|
||||||
|
</resources>
|
||||||
|
<testSourceDirectory>tests/src</testSourceDirectory>
|
||||||
|
<outputDirectory>module/MOD-INF/classes</outputDirectory>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<version>3.1</version>
|
||||||
|
<configuration>
|
||||||
|
<source>1.8</source>
|
||||||
|
<target>1.8</target>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
<showDeprecation>false</showDeprecation>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-resources-plugin</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
<configuration>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-surefire-plugin</artifactId>
|
||||||
|
<version>2.22.0</version>
|
||||||
|
<configuration>
|
||||||
|
<suiteXmlFiles>
|
||||||
|
<suiteXmlFile>tests/conf/tests.xml</suiteXmlFile>
|
||||||
|
</suiteXmlFiles>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-dependency-plugin</artifactId>
|
||||||
|
<version>3.1.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>compile</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>copy-dependencies</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<outputDirectory>module/MOD-INF/lib</outputDirectory>
|
||||||
|
<includeScope>runtime</includeScope>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-clean-plugin</artifactId>
|
||||||
|
<version>3.1.0</version>
|
||||||
|
<configuration>
|
||||||
|
<filesets>
|
||||||
|
<fileset>
|
||||||
|
<directory>module/MOD-INF/lib</directory>
|
||||||
|
</fileset>
|
||||||
|
</filesets>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>${project.groupId}</groupId>
|
||||||
|
<artifactId>main</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<scope>provided</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>javax.servlet</groupId>
|
||||||
|
<artifactId>servlet-api</artifactId>
|
||||||
|
<version>2.5</version>
|
||||||
|
<scope>provided</scope>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- add here the dependencies of your extension -->
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.testng</groupId>
|
||||||
|
<artifactId>testng</artifactId>
|
||||||
|
<version>6.9.10</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
</project>
|
||||||
|
|
@ -0,0 +1,32 @@
|
|||||||
|
package org.openrefine.phonetic.keyers;
|
||||||
|
|
||||||
|
import com.google.refine.clustering.binning.Keyer;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.EncoderException;
|
||||||
|
import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
|
||||||
|
|
||||||
|
public class BeiderMorseKeyer extends Keyer {
|
||||||
|
|
||||||
|
protected BeiderMorseEncoder encoder = new BeiderMorseEncoder();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String key(String string, Object... params) {
|
||||||
|
try {
|
||||||
|
/*
|
||||||
|
* Beider Morse encoding can return multiple phonetic
|
||||||
|
* encodings, separated by |.
|
||||||
|
* Ideally the Keyer interface should be changed to allow
|
||||||
|
* for multiple values to be returned (and the clustering code
|
||||||
|
* should be adapted accourdingly).
|
||||||
|
*
|
||||||
|
* As a simple workaround we only return the first value.
|
||||||
|
* We could also return the entire list but it would make
|
||||||
|
* matching harder.
|
||||||
|
*/
|
||||||
|
return encoder.encode(string).split("\\|")[0];
|
||||||
|
} catch (EncoderException e) {
|
||||||
|
return string;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,15 @@
|
|||||||
|
package org.openrefine.phonetic.keyers;
|
||||||
|
|
||||||
|
import com.google.refine.clustering.binning.Keyer;
|
||||||
|
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
|
||||||
|
|
||||||
|
public class DaitchMokotoffKeyer extends Keyer {
|
||||||
|
|
||||||
|
protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String key(String string, Object... params) {
|
||||||
|
return encoder.encode(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
14
extensions/phonetic/tests/conf/tests.xml
Normal file
14
extensions/phonetic/tests/conf/tests.xml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<!DOCTYPE suite SYSTEM "http://testng.org/testng-1.0.dtd" >
|
||||||
|
|
||||||
|
<suite name="Wikidata extension">
|
||||||
|
<test name="tests">
|
||||||
|
<groups>
|
||||||
|
<run>
|
||||||
|
<exclude name="broken" />
|
||||||
|
</run>
|
||||||
|
</groups>
|
||||||
|
<packages>
|
||||||
|
<package name="org.openrefine.phonetic.*" />
|
||||||
|
</packages>
|
||||||
|
</test>
|
||||||
|
</suite>
|
@ -0,0 +1,28 @@
|
|||||||
|
package org.openrefine.phonetic.keyers;
|
||||||
|
|
||||||
|
import static org.testng.Assert.assertEquals;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import com.google.refine.clustering.binning.Keyer;
|
||||||
|
|
||||||
|
public class BeiderMorseKeyerTest {
|
||||||
|
|
||||||
|
Keyer keyer = new BeiderMorseKeyer();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKey() {
|
||||||
|
assertEquals(keyer.key("Alphonse"), "YlfYnzi");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAccents() {
|
||||||
|
assertEquals(keyer.key("Éléonore"), "ilionor");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEmpty() {
|
||||||
|
assertEquals(keyer.key(""), "");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,27 @@
|
|||||||
|
package org.openrefine.phonetic.keyers;
|
||||||
|
|
||||||
|
import static org.testng.Assert.assertEquals;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import com.google.refine.clustering.binning.Keyer;
|
||||||
|
|
||||||
|
public class DaitchMokotoffKeyerTest {
|
||||||
|
protected Keyer keyer = new DaitchMokotoffKeyer();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDaitchMokotoff() {
|
||||||
|
assertEquals(keyer.key("Alphonse"), "087640");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAccents() {
|
||||||
|
assertEquals(keyer.key("Éléonore"), "086900");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEmpty() {
|
||||||
|
assertEquals(keyer.key(""), "000000");
|
||||||
|
}
|
||||||
|
}
|
@ -23,6 +23,7 @@
|
|||||||
<module>database</module>
|
<module>database</module>
|
||||||
<module>gdata</module>
|
<module>gdata</module>
|
||||||
<module>pc-axis</module>
|
<module>pc-axis</module>
|
||||||
|
<module>phonetic</module>
|
||||||
<!-- Add new extensions here -->
|
<!-- Add new extensions here -->
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
|
@ -348,7 +348,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.poi</groupId>
|
<groupId>org.apache.poi</groupId>
|
||||||
<artifactId>poi</artifactId>
|
<artifactId>poi</artifactId>
|
||||||
<version>3.8</version> <!-- in classpath: 3.13-20150929 -->
|
<version>4.0.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.poi</groupId>
|
<groupId>org.apache.poi</groupId>
|
||||||
|
@ -39,7 +39,8 @@ import java.time.OffsetDateTime;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
import org.apache.poi.hssf.usermodel.HSSFHyperlink;
|
import org.apache.poi.common.usermodel.Hyperlink;
|
||||||
|
import org.apache.poi.common.usermodel.HyperlinkType;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
import org.apache.poi.ss.usermodel.Cell;
|
import org.apache.poi.ss.usermodel.Cell;
|
||||||
import org.apache.poi.ss.usermodel.CellStyle;
|
import org.apache.poi.ss.usermodel.CellStyle;
|
||||||
@ -126,7 +127,7 @@ public class XlsExporter implements StreamExporter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (cellData.link != null) {
|
if (cellData.link != null) {
|
||||||
HSSFHyperlink hl = new HSSFHyperlink(HSSFHyperlink.LINK_URL);
|
Hyperlink hl = wb.getCreationHelper().createHyperlink(HyperlinkType.URL);
|
||||||
hl.setLabel(cellData.text);
|
hl.setLabel(cellData.text);
|
||||||
hl.setAddress(cellData.link);
|
hl.setAddress(cellData.link);
|
||||||
}
|
}
|
||||||
|
@ -50,6 +50,7 @@ import org.apache.poi.common.usermodel.Hyperlink;
|
|||||||
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.ss.usermodel.CellType;
|
||||||
import org.apache.poi.ss.usermodel.Sheet;
|
import org.apache.poi.ss.usermodel.Sheet;
|
||||||
import org.apache.poi.ss.usermodel.Workbook;
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||||
@ -239,19 +240,19 @@ public class ExcelImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
||||||
int cellType = cell.getCellType();
|
CellType cellType = cell.getCellType();
|
||||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
|
if (cellType.equals(CellType.FORMULA)) {
|
||||||
cellType = cell.getCachedFormulaResultType();
|
cellType = cell.getCachedFormulaResultType();
|
||||||
}
|
}
|
||||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
|
if (cellType.equals(CellType.ERROR) ||
|
||||||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
|
cellType.equals(CellType.BLANK)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
Serializable value = null;
|
Serializable value = null;
|
||||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
|
if (cellType.equals(CellType.BOOLEAN)) {
|
||||||
value = cell.getBooleanCellValue();
|
value = cell.getBooleanCellValue();
|
||||||
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
|
} else if (cellType.equals(CellType.NUMERIC)) {
|
||||||
double d = cell.getNumericCellValue();
|
double d = cell.getNumericCellValue();
|
||||||
|
|
||||||
if (HSSFDateUtil.isCellDateFormatted(cell)) {
|
if (HSSFDateUtil.isCellDateFormatted(cell)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user