Merge pull request #1007 from lispc/master

Use new algorithm for levenshtein clustering
This commit is contained in:
Thad Guidry 2015-09-21 20:23:45 -05:00
commit 94e219042e
4 changed files with 65 additions and 8 deletions

View File

@ -71,6 +71,8 @@ import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance; import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance; import edu.mit.simile.vicino.distances.PPMDistance;
import edu.tsinghua.dbgroup.EditDistanceClusterer;
public class kNNClusterer extends Clusterer { public class kNNClusterer extends Clusterer {
private Distance _distance; private Distance _distance;
@ -148,7 +150,6 @@ public class kNNClusterer extends Clusterer {
int _blockingNgramSize = 6; int _blockingNgramSize = 6;
HashSet<String> _data; HashSet<String> _data;
NGramClusterer _clusterer; NGramClusterer _clusterer;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) { public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
_distance = d; _distance = d;
_config = o; _config = o;
@ -192,6 +193,48 @@ public class kNNClusterer extends Clusterer {
} }
} }
/**
 * Row visitor that feeds the string form of each non-empty cell of the
 * clustered column into an {@link EditDistanceClusterer}.
 *
 * The radius handed to the clusterer defaults to 2 and is overridden by the
 * integer at {@code params.radius} of the configuration object when present.
 */
class EditDistanceClusteringRowVisitor implements RowVisitor {

    int _radius = 2;
    EditDistanceClusterer _clusterer;

    /**
     * Reads the optional radius from the configuration and creates the
     * underlying clusterer with it.
     *
     * @param o clusterer configuration; a missing or unreadable
     *          {@code params.radius} entry leaves the default radius in place
     */
    public EditDistanceClusteringRowVisitor(JSONObject o) {
        try {
            _radius = o.getJSONObject("params").getInt("radius");
            logger.debug("Use radius: {}", _radius);
        } catch (JSONException e) {
            // Best effort: keep the default radius when the lookup fails.
            logger.debug("No parameters found, using defaults");
        }
        _clusterer = new EditDistanceClusterer(_radius);
    }

    /** No set-up is required before rows are visited. */
    @Override
    public void start(Project project) {
        // nothing to do
    }

    /** No clean-up is required after rows are visited. */
    @Override
    public void end(Project project) {
        // nothing to do
    }

    /**
     * Adds the value of the visited row's cell (column {@code _colindex}) to
     * the clusterer and passes it to {@code count}; rows whose cell or cell
     * value is {@code null} are ignored.
     *
     * @return always {@code false}
     */
    @Override
    public boolean visit(Project project, int rowIndex, Row row) {
        Cell cell = row.getCell(_colindex);
        if (cell == null || cell.value == null) {
            return false;
        }

        Object raw = cell.value;
        String text;
        if (raw instanceof String) {
            text = (String) raw;
        } else {
            // Non-string values are clustered by their interned string form.
            text = raw.toString().intern();
        }

        _clusterer.populate(text);
        count(text);
        return false;
    }

    /**
     * @return the clusters reported by the underlying
     *         {@link EditDistanceClusterer}
     */
    public List<Set<Serializable>> getClusters() {
        return _clusterer.getClusters();
    }
}
@Override @Override
public void initializeFromJSON(Project project, JSONObject o) throws Exception { public void initializeFromJSON(Project project, JSONObject o) throws Exception {
super.initializeFromJSON(project, o); super.initializeFromJSON(project, o);
@ -200,12 +243,21 @@ public class kNNClusterer extends Clusterer {
@Override @Override
public void computeClusters(Engine engine) { public void computeClusters(Engine engine) {
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config); if(_distance != _distances.get("levenshtein")) {
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config); //VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
FilteredRows filteredRows = engine.getAllFilteredRows(); BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
filteredRows.accept(_project, visitor); FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor);
_clusters = visitor.getClusters(); _clusters = visitor.getClusters();
} else {
EditDistanceClusteringRowVisitor visitor =
new EditDistanceClusteringRowVisitor(_config);
FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor);
_clusters = visitor.getClusters();
}
} }
public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable { public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable {

Binary file not shown.

View File

@ -33,7 +33,10 @@
<div id="ngram-fingerprint-params" class="function-params hidden"> <div id="ngram-fingerprint-params" class="function-params hidden">
<span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int"> <span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int">
</div> </div>
<div class="knn-controls hidden"> <div id="levenshtein-params" class="knn-controls function-params hidden">
<span style="margin-right: 1em"><span bind="or_dialog_distance"></span><input type="text" value="2" bind="editDistance" name="radius" size="2" class="param" datatype="int"></span>
</div>
<div id="PPM-params" class="knn-controls function-params hidden">
<span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span> <span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span>
<span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span> <span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span>
</div> </div>

View File

@ -66,6 +66,7 @@ ClusteringDialog.prototype._createDialog = function() {
this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]); this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]);
this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]); this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]);
this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]); this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]);
this._elmts.or_dialog_distance.html($.i18n._('core-dialogs')["ngram-radius"]);
this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]); this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]);
this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]); this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]);
this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]); this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]);
@ -118,6 +119,7 @@ ClusteringDialog.prototype._createDialog = function() {
this._elmts.ngramSize.change(params_changer); this._elmts.ngramSize.change(params_changer);
this._elmts.radius.change(params_changer); this._elmts.radius.change(params_changer);
this._elmts.ngramBlock.change(params_changer); this._elmts.ngramBlock.change(params_changer);
this._elmts.editDistance.change(params_changer);
this._elmts.selectAllButton.click(function() { self._selectAll(); }); this._elmts.selectAllButton.click(function() { self._selectAll(); });
this._elmts.deselectAllButton.click(function() { self._deselectAll(); }); this._elmts.deselectAllButton.click(function() { self._deselectAll(); });