Merge pull request #1921 from OpenRefine/phonetic_clustering

Sample extension to add some clustering methods
This commit is contained in:
Antonin Delpeuch 2019-01-10 04:44:24 +01:00 committed by GitHub
commit 1ed2da338c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 363 additions and 9 deletions

1
extensions/phonetic/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
module/MOD-INF/classes/

View File

@ -0,0 +1,60 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
var html = "text/html";
var encoding = "UTF-8";
var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager;
/*
* Function invoked to initialize the extension.
*/
function init() {
    // Short aliases for the Java packages/classes reached through Rhino.
    var KeyerFactory = Packages.com.google.refine.clustering.binning.KeyerFactory;
    var phoneticKeyers = Packages.org.openrefine.phonetic.keyers;

    // Make the two phonetic keyers available under their clustering ids.
    KeyerFactory.put("daitch-mokotoff", new phoneticKeyers.DaitchMokotoffKeyer());
    KeyerFactory.put("beider-morse", new phoneticKeyers.BeiderMorseKeyer());

    // New distances could be registered the same way, for instance:
    // Packages.com.google.refine.clustering.knn.DistanceFactory.put("my-distance", new Packages.org.openrefine.mydistances.MyDistance());

    // Client-side resources: only the script that loads the localized strings.
    var projectScripts = ["scripts/load-language.js"];
    ClientSideResourceManager.addPaths("project/scripts", module, projectScripts);
}

View File

@ -0,0 +1,4 @@
name = phonetic
description = OpenRefine Phonetic Clustering extension
templating.macros = macros.vm
requires = core

View File

@ -0,0 +1,4 @@
{
"clustering-keyers/beider-morse": "Beider-Morse",
"clustering-keyers/daitch-mokotoff": "Daitch-Mokotoff"
}

View File

@ -0,0 +1,14 @@
#*
This file contains common velocity macros used in all .vt files.
For Velocity documentation, see:
http://velocity.apache.org/engine/releases/velocity-1.5/user-guide.html
*#
## makeAList: renders $list as an HTML unordered list, emitting one <li> per
## element. Each $item is inserted as-is; this macro does no escaping of its own.
#macro( makeAList $list )
<ul>
#foreach($item in $list)
<li>$item</li>
#end
</ul>
#end

View File

@ -0,0 +1,19 @@
// Fetch the localized strings of the "phonetic" module and hand them to
// jQuery.i18n. The request is synchronous (async: false), so `dictionary`
// and `lang` are already assigned when $.i18n().load() runs below.
var dictionary = {};
$.ajax({
    url : "command/core/load-language?",
    type : "POST",
    async : false,
    // No explicit `lang` is sent; the language reported in the response is used.
    data : { module : "phonetic" },
    success : function(response) {
        dictionary = response.dictionary;
        lang = response.lang;
    }
});
$.i18n().load(dictionary, lang);

133
extensions/phonetic/pom.xml Normal file
View File

@ -0,0 +1,133 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.openrefine</groupId>
<artifactId>phonetic</artifactId>
<packaging>jar</packaging>
<version>3.2-SNAPSHOT</version>
<name>OpenRefine - Phonetic clustering extension</name>
<description>Adds a few advanced phonetic clustering methods</description>
<url>http://openrefine.org/</url>
<parent>
<groupId>org.openrefine</groupId>
<artifactId>extensions</artifactId>
<version>3.2-SNAPSHOT</version>
</parent>
<build>
<finalName>openrefine-phonetic</finalName>
<resources>
<resource>
<directory>src</directory>
</resource>
</resources>
<testSourceDirectory>tests/src</testSourceDirectory>
<outputDirectory>module/MOD-INF/classes</outputDirectory>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>1.8</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<showDeprecation>false</showDeprecation>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
<configuration>
<suiteXmlFiles>
<suiteXmlFile>tests/conf/tests.xml</suiteXmlFile>
</suiteXmlFiles>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.1.1</version>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>module/MOD-INF/lib</outputDirectory>
<includeScope>runtime</includeScope>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<filesets>
<fileset>
<directory>module/MOD-INF/lib</directory>
</fileset>
</filesets>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>main</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
<version>2.5</version>
<scope>provided</scope>
</dependency>
<!-- add here the dependencies of your extension -->
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.9.10</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,32 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
/**
 * Keyer that bins strings by their Beider-Morse phonetic encoding.
 */
public class BeiderMorseKeyer extends Keyer {

    protected BeiderMorseEncoder encoder = new BeiderMorseEncoder();

    /**
     * Computes the clustering key of a string.
     *
     * <p>The Beider-Morse encoder can return several phonetic encodings,
     * separated by {@code |}. Ideally the Keyer interface would allow
     * multiple values to be returned (and the clustering code would be
     * adapted accordingly). As a simple workaround only the first encoding
     * is used as the key; returning the entire list would make matching
     * harder.
     *
     * @param string the value to encode
     * @param params unused by this keyer
     * @return the first phonetic encoding of {@code string}, or
     *         {@code string} itself when the encoder fails
     */
    @Override
    public String key(String string, Object... params) {
        String allEncodings;
        try {
            allEncodings = encoder.encode(string);
        } catch (EncoderException e) {
            // Encoding failed: fall back to the raw value as its own key.
            return string;
        }
        String[] alternatives = allEncodings.split("\\|");
        return alternatives[0];
    }
}

View File

@ -0,0 +1,15 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
/**
 * Keyer that bins strings by their Daitch-Mokotoff Soundex code.
 */
public class DaitchMokotoffKeyer extends Keyer {

    protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();

    /**
     * Computes the clustering key of a string.
     *
     * @param string the value to encode
     * @param params unused by this keyer
     * @return whatever {@code encoder.encode(string)} returns
     */
    @Override
    public String key(String string, Object... params) {
        final String soundexCode = encoder.encode(string);
        return soundexCode;
    }
}

View File

@ -0,0 +1,14 @@
<!DOCTYPE suite SYSTEM "http://testng.org/testng-1.0.dtd" >
<suite name="Phonetic extension">
<test name="tests">
<groups>
<run>
<exclude name="broken" />
</run>
</groups>
<packages>
<package name="org.openrefine.phonetic.*" />
</packages>
</test>
</suite>

View File

@ -0,0 +1,28 @@
package org.openrefine.phonetic.keyers;
import static org.testng.Assert.assertEquals;
import org.testng.annotations.Test;
import com.google.refine.clustering.binning.Keyer;
/**
 * Checks the keys produced by {@link BeiderMorseKeyer} on a plain name,
 * an accented name and the empty string.
 */
public class BeiderMorseKeyerTest {

    Keyer keyer = new BeiderMorseKeyer();

    /** A plain ASCII name is mapped to its first Beider-Morse encoding. */
    @Test
    public void testKey() {
        String actual = keyer.key("Alphonse");
        assertEquals(actual, "YlfYnzi");
    }

    /** Accented characters are handled by the encoder. */
    @Test
    public void testAccents() {
        String actual = keyer.key("Éléonore");
        assertEquals(actual, "ilionor");
    }

    /** The empty string is keyed to the empty string. */
    @Test
    public void testEmpty() {
        String actual = keyer.key("");
        assertEquals(actual, "");
    }
}

View File

@ -0,0 +1,27 @@
package org.openrefine.phonetic.keyers;
import static org.testng.Assert.assertEquals;
import org.testng.annotations.Test;
import com.google.refine.clustering.binning.Keyer;
/**
 * Checks the keys produced by {@link DaitchMokotoffKeyer} on a plain name,
 * an accented name and the empty string.
 */
public class DaitchMokotoffKeyerTest {

    protected Keyer keyer = new DaitchMokotoffKeyer();

    /** A plain ASCII name is mapped to its Daitch-Mokotoff code. */
    @Test
    public void testDaitchMokotoff() {
        String actual = keyer.key("Alphonse");
        assertEquals(actual, "087640");
    }

    /** Accented characters are handled by the encoder. */
    @Test
    public void testAccents() {
        String actual = keyer.key("Éléonore");
        assertEquals(actual, "086900");
    }

    /** The empty string is keyed to the all-zero code. */
    @Test
    public void testEmpty() {
        String actual = keyer.key("");
        assertEquals(actual, "000000");
    }
}

View File

@ -23,6 +23,7 @@
<module>database</module>
<module>gdata</module>
<module>pc-axis</module>
<module>phonetic</module>
<!-- Add new extensions here -->
</modules>

View File

@ -348,7 +348,7 @@
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version> <!-- in classpath: 3.13-20150929 -->
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>

View File

@ -39,7 +39,8 @@ import java.time.OffsetDateTime;
import java.util.List;
import java.util.Properties;
import org.apache.poi.hssf.usermodel.HSSFHyperlink;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
@ -126,7 +127,7 @@ public class XlsExporter implements StreamExporter {
}
if (cellData.link != null) {
HSSFHyperlink hl = new HSSFHyperlink(HSSFHyperlink.LINK_URL);
Hyperlink hl = wb.getCreationHelper().createHyperlink(HyperlinkType.URL);
hl.setLabel(cellData.text);
hl.setAddress(cellData.link);
}

View File

@ -50,6 +50,7 @@ import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
@ -239,19 +240,19 @@ public class ExcelImporter extends TabularImportingParserBase {
}
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
int cellType = cell.getCellType();
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
CellType cellType = cell.getCellType();
if (cellType.equals(CellType.FORMULA)) {
cellType = cell.getCachedFormulaResultType();
}
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
if (cellType.equals(CellType.ERROR) ||
cellType.equals(CellType.BLANK)) {
return null;
}
Serializable value = null;
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
if (cellType.equals(CellType.BOOLEAN)) {
value = cell.getBooleanCellValue();
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
} else if (cellType.equals(CellType.NUMERIC)) {
double d = cell.getNumericCellValue();
if (HSSFDateUtil.isCellDateFormatted(cell)) {