adding Metaphone3 algorithm
Many thanks to Lawrence Philips for donating the code to us under the BSD license. git-svn-id: http://google-refine.googlecode.com/svn/trunk@2029 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
87e7f9a7a4
commit
610de0d33a
117
.classpath
117
.classpath
@ -1,59 +1,58 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" path="main/src"/>
|
||||
<classpathentry kind="src" path="broker/core/src"/>
|
||||
<classpathentry kind="src" path="extensions/jython/src"/>
|
||||
<classpathentry kind="src" path="server/src"/>
|
||||
<classpathentry kind="src" path="extensions/freebase/src"/>
|
||||
<classpathentry kind="src" path="extensions/gdata/src"/>
|
||||
<classpathentry kind="src" path="extensions/sample/src"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/ant-tools-1.8.0.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/arithcode-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/butterfly-trunk.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.1.0.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-collections-3.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-io-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-lang-2.5.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/dom4j-1.6.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/httpclient-4.0.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/httpcore-4.0.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/icu4j-4.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jackson-core-asl-1.5.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jcl-over-slf4j-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jrdf-0.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/json-20100208.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/lessen-trunk-r8.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/marc4j-2.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/opencsv-2.2.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-ooxml-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-ooxml-schemas-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/rhino-1.7R2.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/secondstring-20100303.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/signpost-commonshttp4-1.2.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/signpost-core-1.2.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/slf4j-api-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/slf4j-log4j12-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/velocity-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/vicino-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/xmlbeans-2.3.0.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jdatapath-alpha2.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jetty-6.1.22.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jetty-util-6.1.22.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/servlet-api-2.5.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/slf4j-api-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/slf4j-log4j12-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="broker/appengine/WEB-INF/lib/slf4j-jdk14-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/jython/module/MOD-INF/lib/jython-2.5.1.jar"/>
|
||||
<classpathentry kind="lib" path="broker/core/module/MOD-INF/lib/bdb-je-4.0.103.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-core-1.0.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-spreadsheet-3.0.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-collect-1.0-rc1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jsoup-1.4.1.jar"/>
|
||||
<classpathentry kind="output" path="bin"/>
|
||||
</classpath>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" path="main/src"/>
|
||||
<classpathentry kind="src" path="extensions/jython/src"/>
|
||||
<classpathentry kind="src" path="server/src"/>
|
||||
<classpathentry kind="src" path="extensions/freebase/src"/>
|
||||
<classpathentry kind="src" path="extensions/gdata/src"/>
|
||||
<classpathentry kind="src" path="extensions/sample/src"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/ant-tools-1.8.0.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/arithcode-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/butterfly-trunk.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.1.0.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-collections-3.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-io-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-lang-2.5.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/dom4j-1.6.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/httpclient-4.0.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/httpcore-4.0.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/icu4j-4.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jackson-core-asl-1.5.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jcl-over-slf4j-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jrdf-0.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/json-20100208.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/lessen-trunk-r8.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/marc4j-2.4.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/opencsv-2.2.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-ooxml-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/poi-ooxml-schemas-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/rhino-1.7R2.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/secondstring-20100303.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/signpost-commonshttp4-1.2.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/signpost-core-1.2.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/slf4j-api-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/slf4j-log4j12-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/velocity-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/vicino-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/xmlbeans-2.3.0.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jdatapath-alpha2.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jetty-6.1.22.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/jetty-util-6.1.22.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/servlet-api-2.5.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/slf4j-api-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="server/lib/slf4j-log4j12-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="broker/appengine/WEB-INF/lib/slf4j-jdk14-1.5.6.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/jython/module/MOD-INF/lib/jython-2.5.1.jar"/>
|
||||
<classpathentry kind="lib" path="broker/core/module/MOD-INF/lib/bdb-je-4.0.103.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-core-1.0.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-spreadsheet-3.0.jar"/>
|
||||
<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/google-collect-1.0-rc1.jar"/>
|
||||
<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/jsoup-1.4.1.jar"/>
|
||||
<classpathentry kind="output" path="build"/>
|
||||
</classpath>
|
||||
|
@ -40,9 +40,9 @@ import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
@ -73,6 +73,7 @@ public class BinningClusterer extends Clusterer {
|
||||
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
|
||||
_keyers.put("metaphone", new MetaphoneKeyer());
|
||||
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
|
||||
_keyers.put("metaphone3", new Metaphone3Keyer());
|
||||
_keyers.put("soundex", new SoundexKeyer());
|
||||
}
|
||||
|
||||
|
7455
main/src/com/google/refine/clustering/binning/Metaphone3.java
Normal file
7455
main/src/com/google/refine/clustering/binning/Metaphone3.java
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package com.google.refine.clustering.binning;
|
||||
|
||||
/**
 * Keyer that turns a string into its Metaphone3 encoding.
 *
 * The class declares no fields; each call to {@link #key(String, Object...)}
 * builds its own {@code Metaphone3} encoder.
 */
public class Metaphone3Keyer extends Keyer {
|
||||
|
||||
/** Creates the keyer; there is nothing to initialise. */
public Metaphone3Keyer() {
|
||||
}
|
||||
|
||||
/**
 * Encodes {@code s} with Metaphone3.
 *
 * @param s the string handed to the encoder through {@code SetWord}
 * @param o extra parameters; not read by this implementation
 * @return whatever {@code GetMetaph()} reports after {@code Encode()} has run
 */
public String key(String s, Object... o) {
|
||||
// A new encoder is created on every call rather than being kept in a field.
Metaphone3 _metaphone3 = new Metaphone3();
|
||||
// Optional encoder switches, left disabled here, so the encoder runs with
// whatever settings a freshly constructed Metaphone3 has.
//_metaphone3.SetEncodeVowels(true);
|
||||
//_metaphone3.SetEncodeExact(true);
|
||||
// NOTE(review): s is passed through unchecked; how Metaphone3 treats a null
// word is not visible from here -- confirm against Metaphone3.SetWord.
_metaphone3.SetWord(s);
|
||||
_metaphone3.Encode();
|
||||
return _metaphone3.GetMetaph();
|
||||
}
|
||||
|
||||
}
|
@ -39,6 +39,7 @@ import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.google.refine.clustering.binning.DoubleMetaphoneKeyer;
|
||||
import com.google.refine.clustering.binning.Metaphone3Keyer;
|
||||
import com.google.refine.clustering.binning.MetaphoneKeyer;
|
||||
import com.google.refine.clustering.binning.SoundexKeyer;
|
||||
import com.google.refine.expr.EvalError;
|
||||
@ -47,6 +48,7 @@ import com.google.refine.grel.Function;
|
||||
|
||||
public class Phonetic implements Function {
|
||||
|
||||
static private Metaphone3Keyer metaphone3 = new Metaphone3Keyer();
|
||||
static private DoubleMetaphoneKeyer metaphone2 = new DoubleMetaphoneKeyer();
|
||||
static private MetaphoneKeyer metaphone = new MetaphoneKeyer();
|
||||
static private SoundexKeyer soundex = new SoundexKeyer();
|
||||
@ -58,11 +60,14 @@ public class Phonetic implements Function {
|
||||
if (o1 != null && o2 != null && o2 instanceof String) {
|
||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||
String encoding = ((String) o2).toLowerCase();
|
||||
if ("doublemetaphone".equals(encoding)) {
|
||||
if (encoding == null) encoding = "metaphone3";
|
||||
if ("doublemetaphone".equalsIgnoreCase(encoding)) {
|
||||
return metaphone2.key(str);
|
||||
} else if ("metaphone".equals(encoding)) {
|
||||
} else if ("metaphone3".equalsIgnoreCase(encoding)) {
|
||||
return metaphone3.key(str);
|
||||
} else if ("metaphone".equalsIgnoreCase(encoding)) {
|
||||
return metaphone.key(str);
|
||||
} else if ("soundex".equals(encoding)) {
|
||||
} else if ("soundex".equalsIgnoreCase(encoding)) {
|
||||
return soundex.key(str);
|
||||
} else {
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
|
||||
@ -77,7 +82,7 @@ public class Phonetic implements Function {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns the a phonetic encoding of s (optionally indicating which encoding to use')");
|
||||
writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')");
|
||||
writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'metaphone3')");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
|
@ -23,7 +23,7 @@
|
||||
<div class="binning-controls">Keying Function <select bind="keyingFunctionSelector">
|
||||
<option selected="true">fingerprint</option>
|
||||
<option>ngram-fingerprint</option>
|
||||
<option>double-metaphone</option>
|
||||
<option>metaphone3</option>
|
||||
</select></div>
|
||||
<div class="knn-controls hidden">Distance Function <select bind="distanceFunctionSelector">
|
||||
<option selected="true">levenshtein</option>
|
||||
|
Loading…
Reference in New Issue
Block a user