Merge pull request #1921 from OpenRefine/phonetic_clustering
Sample extension to add some clustering methods
This commit is contained in:
commit
1ed2da338c
1
extensions/phonetic/.gitignore
vendored
Normal file
1
extensions/phonetic/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
module/MOD-INF/classes/
|
60
extensions/phonetic/module/MOD-INF/controller.js
Normal file
60
extensions/phonetic/module/MOD-INF/controller.js
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
var html = "text/html";
|
||||
var encoding = "UTF-8";
|
||||
var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager;
|
||||
|
||||
/*
 * Entry point invoked to initialize the extension.
 *
 * Registers the two phonetic keyers under the names "daitch-mokotoff" and
 * "beider-morse", then adds the script that loads the localized strings to
 * the "project/scripts" client-side bundle.
 */
function init() {
  var KeyerFactory = Packages.com.google.refine.clustering.binning.KeyerFactory;
  var keyers = Packages.org.openrefine.phonetic.keyers;

  // Register new keyers
  KeyerFactory.put("daitch-mokotoff", new keyers.DaitchMokotoffKeyer());
  KeyerFactory.put("beider-morse", new keyers.BeiderMorseKeyer());

  // New distances could be registered in the same way, for instance:
  // Packages.com.google.refine.clustering.knn.DistanceFactory.put("my-distance", new Packages.org.openrefine.mydistances.MyDistance());

  // Resources: just some javascript to load the localized strings
  var scriptPaths = [
    "scripts/load-language.js",
  ];
  ClientSideResourceManager.addPaths("project/scripts", module, scriptPaths);
}
|
4
extensions/phonetic/module/MOD-INF/module.properties
Normal file
4
extensions/phonetic/module/MOD-INF/module.properties
Normal file
@ -0,0 +1,4 @@
|
||||
name = phonetic
|
||||
description = OpenRefine Phonetic Clustering extension
|
||||
templating.macros = macros.vm
|
||||
requires = core
|
4
extensions/phonetic/module/langs/translation-en.json
Normal file
4
extensions/phonetic/module/langs/translation-en.json
Normal file
@ -0,0 +1,4 @@
|
||||
{
|
||||
"clustering-keyers/beider-morse": "Beider-Morse",
|
||||
"clustering-keyers/daitch-mokotoff": "Daitch-Mokotoff"
|
||||
}
|
14
extensions/phonetic/module/macros.vm
Normal file
14
extensions/phonetic/module/macros.vm
Normal file
@ -0,0 +1,14 @@
|
||||
#*
|
||||
This file contains common velocity macros used in all .vt files.
|
||||
For Velocity documentation, see:
|
||||
|
||||
http://velocity.apache.org/engine/releases/velocity-1.5/user-guide.html
|
||||
*#
|
||||
|
||||
#macro( makeAList $list )
|
||||
<ul>
|
||||
#foreach($item in $list)
|
||||
<li>$item</li>
|
||||
#end
|
||||
</ul>
|
||||
#end
|
19
extensions/phonetic/module/scripts/load-language.js
Normal file
19
extensions/phonetic/module/scripts/load-language.js
Normal file
@ -0,0 +1,19 @@
|
||||
// Load the localization file for the "phonetic" module.
// The request is synchronous (async : false), so the success handler has
// already run by the time $.i18n().load() is called below.
var dictionary = {};
$.ajax({
  type : "POST",
  url : "command/core/load-language?",
  async : false,
  // Only the module name is sent; no language is requested explicitly.
  data : { module : "phonetic" },
  success : function(response) {
    dictionary = response.dictionary;
    // `lang` is not declared in this script: the assignment targets the
    // variable of that name in the enclosing (global) scope.
    lang = response.lang;
  }
});
$.i18n().load(dictionary, lang);
|
||||
|
||||
|
||||
|
133
extensions/phonetic/pom.xml
Normal file
133
extensions/phonetic/pom.xml
Normal file
@ -0,0 +1,133 @@
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>org.openrefine</groupId>
|
||||
<artifactId>phonetic</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<version>3.2-SNAPSHOT</version>
|
||||
|
||||
<name>OpenRefine - Phonetic clustering extension</name>
|
||||
<description>Adds a few advanced phonetic clustering methods</description>
|
||||
<url>http://openrefine.org/</url>
|
||||
<parent>
|
||||
<groupId>org.openrefine</groupId>
|
||||
<artifactId>extensions</artifactId>
|
||||
<version>3.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<build>
|
||||
<finalName>openrefine-sample</finalName>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
<testSourceDirectory>tests/src</testSourceDirectory>
|
||||
<outputDirectory>module/MOD-INF/classes</outputDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>1.8</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.1</version>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
<encoding>UTF-8</encoding>
|
||||
<showDeprecation>false</showDeprecation>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>2.6</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>2.22.0</version>
|
||||
<configuration>
|
||||
<suiteXmlFiles>
|
||||
<suiteXmlFile>tests/conf/tests.xml</suiteXmlFile>
|
||||
</suiteXmlFiles>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>3.1.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>compile</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>module/MOD-INF/lib</outputDirectory>
|
||||
<includeScope>runtime</includeScope>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
<configuration>
|
||||
<filesets>
|
||||
<fileset>
|
||||
<directory>module/MOD-INF/lib</directory>
|
||||
</fileset>
|
||||
</filesets>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>main</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.servlet</groupId>
|
||||
<artifactId>servlet-api</artifactId>
|
||||
<version>2.5</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- add here the dependencies of your extension -->
|
||||
|
||||
<dependency>
|
||||
<groupId>org.testng</groupId>
|
||||
<artifactId>testng</artifactId>
|
||||
<version>6.9.10</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
@ -0,0 +1,32 @@
|
||||
package org.openrefine.phonetic.keyers;
|
||||
|
||||
import com.google.refine.clustering.binning.Keyer;
|
||||
|
||||
import org.apache.commons.codec.EncoderException;
|
||||
import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
|
||||
|
||||
public class BeiderMorseKeyer extends Keyer {
|
||||
|
||||
protected BeiderMorseEncoder encoder = new BeiderMorseEncoder();
|
||||
|
||||
@Override
|
||||
public String key(String string, Object... params) {
|
||||
try {
|
||||
/*
|
||||
* Beider Morse encoding can return multiple phonetic
|
||||
* encodings, separated by |.
|
||||
* Ideally the Keyer interface should be changed to allow
|
||||
* for multiple values to be returned (and the clustering code
|
||||
* should be adapted accourdingly).
|
||||
*
|
||||
* As a simple workaround we only return the first value.
|
||||
* We could also return the entire list but it would make
|
||||
* matching harder.
|
||||
*/
|
||||
return encoder.encode(string).split("\\|")[0];
|
||||
} catch (EncoderException e) {
|
||||
return string;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
package org.openrefine.phonetic.keyers;
|
||||
|
||||
import com.google.refine.clustering.binning.Keyer;
|
||||
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
|
||||
|
||||
public class DaitchMokotoffKeyer extends Keyer {
|
||||
|
||||
protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();
|
||||
|
||||
@Override
|
||||
public String key(String string, Object... params) {
|
||||
return encoder.encode(string);
|
||||
}
|
||||
|
||||
}
|
14
extensions/phonetic/tests/conf/tests.xml
Normal file
14
extensions/phonetic/tests/conf/tests.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<!DOCTYPE suite SYSTEM "http://testng.org/testng-1.0.dtd" >
|
||||
|
||||
<suite name="Phonetic extension">
|
||||
<test name="tests">
|
||||
<groups>
|
||||
<run>
|
||||
<exclude name="broken" />
|
||||
</run>
|
||||
</groups>
|
||||
<packages>
|
||||
<package name="org.openrefine.phonetic.*" />
|
||||
</packages>
|
||||
</test>
|
||||
</suite>
|
@ -0,0 +1,28 @@
|
||||
package org.openrefine.phonetic.keyers;
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.refine.clustering.binning.Keyer;
|
||||
|
||||
public class BeiderMorseKeyerTest {
|
||||
|
||||
Keyer keyer = new BeiderMorseKeyer();
|
||||
|
||||
@Test
|
||||
public void testKey() {
|
||||
assertEquals(keyer.key("Alphonse"), "YlfYnzi");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAccents() {
|
||||
assertEquals(keyer.key("Éléonore"), "ilionor");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmpty() {
|
||||
assertEquals(keyer.key(""), "");
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
package org.openrefine.phonetic.keyers;
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.refine.clustering.binning.Keyer;
|
||||
|
||||
public class DaitchMokotoffKeyerTest {
|
||||
protected Keyer keyer = new DaitchMokotoffKeyer();
|
||||
|
||||
@Test
|
||||
public void testDaitchMokotoff() {
|
||||
assertEquals(keyer.key("Alphonse"), "087640");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAccents() {
|
||||
assertEquals(keyer.key("Éléonore"), "086900");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testEmpty() {
|
||||
assertEquals(keyer.key(""), "000000");
|
||||
}
|
||||
}
|
@ -23,6 +23,7 @@
|
||||
<module>database</module>
|
||||
<module>gdata</module>
|
||||
<module>pc-axis</module>
|
||||
<module>phonetic</module>
|
||||
<!-- Add new extensions here -->
|
||||
</modules>
|
||||
|
||||
|
@ -348,7 +348,7 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>3.8</version> <!-- in classpath: 3.13-20150929 -->
|
||||
<version>4.0.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
|
@ -39,7 +39,8 @@ import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.poi.hssf.usermodel.HSSFHyperlink;
|
||||
import org.apache.poi.common.usermodel.Hyperlink;
|
||||
import org.apache.poi.common.usermodel.HyperlinkType;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
import org.apache.poi.ss.usermodel.CellStyle;
|
||||
@ -126,7 +127,7 @@ public class XlsExporter implements StreamExporter {
|
||||
}
|
||||
|
||||
if (cellData.link != null) {
|
||||
HSSFHyperlink hl = new HSSFHyperlink(HSSFHyperlink.LINK_URL);
|
||||
Hyperlink hl = wb.getCreationHelper().createHyperlink(HyperlinkType.URL);
|
||||
hl.setLabel(cellData.text);
|
||||
hl.setAddress(cellData.link);
|
||||
}
|
||||
|
@ -50,6 +50,7 @@ import org.apache.poi.common.usermodel.Hyperlink;
|
||||
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.ss.usermodel.CellType;
|
||||
import org.apache.poi.ss.usermodel.Sheet;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
@ -239,19 +240,19 @@ public class ExcelImporter extends TabularImportingParserBase {
|
||||
}
|
||||
|
||||
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
||||
int cellType = cell.getCellType();
|
||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
|
||||
CellType cellType = cell.getCellType();
|
||||
if (cellType.equals(CellType.FORMULA)) {
|
||||
cellType = cell.getCachedFormulaResultType();
|
||||
}
|
||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
|
||||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
|
||||
if (cellType.equals(CellType.ERROR) ||
|
||||
cellType.equals(CellType.BLANK)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Serializable value = null;
|
||||
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
|
||||
if (cellType.equals(CellType.BOOLEAN)) {
|
||||
value = cell.getBooleanCellValue();
|
||||
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
|
||||
} else if (cellType.equals(CellType.NUMERIC)) {
|
||||
double d = cell.getNumericCellValue();
|
||||
|
||||
if (HSSFDateUtil.isCellDateFormatted(cell)) {
|
||||
|
Loading…
Reference in New Issue
Block a user