Merge pull request #1007 from lispc/master

Use new algorithm for levenshtein clustering
This commit is contained in:
Thad Guidry 2015-09-21 20:23:45 -05:00
commit 94e219042e
4 changed files with 65 additions and 8 deletions

View File

@ -71,6 +71,8 @@ import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance;
import edu.tsinghua.dbgroup.EditDistanceClusterer;
public class kNNClusterer extends Clusterer {
private Distance _distance;
@ -148,7 +150,6 @@ public class kNNClusterer extends Clusterer {
int _blockingNgramSize = 6;
HashSet<String> _data;
NGramClusterer _clusterer;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
_distance = d;
_config = o;
@ -191,6 +192,48 @@ public class kNNClusterer extends Clusterer {
return _clusterer.getClusters(_radius);
}
}
/**
 * Row visitor that hands the string form of each non-empty cell in the
 * clustered column to an {@link EditDistanceClusterer}.
 *
 * The radius is read from {@code params.radius} of the supplied configuration
 * and stays at its default of 2 when that lookup throws a JSONException.
 */
class EditDistanceClusteringRowVisitor implements RowVisitor {

    int _radius = 2;
    EditDistanceClusterer _clusterer;

    /**
     * @param o clusterer configuration; an optional "params" object may carry
     *          an integer "radius"
     */
    public EditDistanceClusteringRowVisitor(JSONObject o) {
        try {
            _radius = o.getJSONObject("params").getInt("radius");
            logger.debug("Use radius: {}", _radius);
        } catch (JSONException e) {
            // Either "params" or "radius" could not be read: keep the default radius.
            logger.debug("No parameters found, using defaults");
        }
        _clusterer = new EditDistanceClusterer(_radius);
    }

    @Override
    public void start(Project project) {
        // no set-up required
    }

    @Override
    public void end(Project project) {
        // no tear-down required
    }

    /**
     * Passes the cell value of the clustered column, as a string, to the
     * clusterer and to {@code count}. Rows whose cell or cell value is null
     * are ignored.
     *
     * @return always {@code false}
     */
    @Override
    public boolean visit(Project project, int rowIndex, Row row) {
        Cell cell = row.getCell(_colindex);
        if (cell == null || cell.value == null) {
            return false;
        }

        Object raw = cell.value;
        String text;
        if (raw instanceof String) {
            text = (String) raw;
        } else {
            // Non-string values are converted and interned, as in the original code.
            text = raw.toString().intern();
        }

        _clusterer.populate(text);
        count(text);
        return false;
    }

    /**
     * @return whatever the underlying clusterer reports from {@code getClusters()}
     */
    public List<Set<Serializable>> getClusters() {
        return _clusterer.getClusters();
    }
}
@Override
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
@ -200,12 +243,21 @@ public class kNNClusterer extends Clusterer {
/**
 * Builds a clustering row visitor, passes it to the rows returned by
 * {@code engine.getAllFilteredRows()}, and stores the visitor's
 * {@code getClusters()} result in {@code _clusters}.
 *
 * @param engine source of the filtered rows to cluster
 */
@Override
public void computeClusters(Engine engine) {
// NOTE(review): the next five lines repeat the body of the first branch below and
// declare the same local names again; they look like the pre-change side of this
// diff hunk rather than live code -- confirm against the actual file.
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor);
_clusters = visitor.getClusters();
// Reference (identity) comparison: the else branch runs only when _distance is the
// very object stored under "levenshtein" in _distances.
if(_distance != _distances.get("levenshtein")) {
// Any other distance: use the blocking visitor, which receives the distance and the config.
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor);
_clusters = visitor.getClusters();
} else {
// Levenshtein: use the edit-distance visitor, which is built from the config alone.
EditDistanceClusteringRowVisitor visitor =
new EditDistanceClusteringRowVisitor(_config);
FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor);
_clusters = visitor.getClusters();
}
}
public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable {

Binary file not shown.

View File

@ -33,7 +33,10 @@
<div id="ngram-fingerprint-params" class="function-params hidden">
<span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int">
</div>
<div class="knn-controls hidden">
<div id="levenshtein-params" class="knn-controls function-params hidden">
<span style="margin-right: 1em"><span bind="or_dialog_distance"></span><input type="text" value="2" bind="editDistance" name="radius" size="2" class="param" datatype="int"></span>
</div>
<div id="PPM-params" class="knn-controls function-params hidden">
<span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span>
<span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span>
</div>

View File

@ -66,6 +66,7 @@ ClusteringDialog.prototype._createDialog = function() {
this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]);
this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]);
this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]);
this._elmts.or_dialog_distance.html($.i18n._('core-dialogs')["ngram-radius"]);
this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]);
this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]);
this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]);
@ -118,6 +119,7 @@ ClusteringDialog.prototype._createDialog = function() {
this._elmts.ngramSize.change(params_changer);
this._elmts.radius.change(params_changer);
this._elmts.ngramBlock.change(params_changer);
this._elmts.editDistance.change(params_changer);
this._elmts.selectAllButton.click(function() { self._selectAll(); });
this._elmts.deselectAllButton.click(function() { self._deselectAll(); });