Merge pull request #1007 from lispc/master
Use new algorithm for levenshtein clustering
This commit is contained in:
commit
94e219042e
@ -71,6 +71,8 @@ import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||
|
||||
import edu.tsinghua.dbgroup.EditDistanceClusterer;
|
||||
|
||||
public class kNNClusterer extends Clusterer {
|
||||
|
||||
private Distance _distance;
|
||||
@ -148,7 +150,6 @@ public class kNNClusterer extends Clusterer {
|
||||
int _blockingNgramSize = 6;
|
||||
HashSet<String> _data;
|
||||
NGramClusterer _clusterer;
|
||||
|
||||
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
|
||||
_distance = d;
|
||||
_config = o;
|
||||
@ -191,6 +192,48 @@ public class kNNClusterer extends Clusterer {
|
||||
return _clusterer.getClusters(_radius);
|
||||
}
|
||||
}
|
||||
|
||||
class EditDistanceClusteringRowVisitor implements RowVisitor {
|
||||
|
||||
int _radius = 2;
|
||||
EditDistanceClusterer _clusterer;
|
||||
public EditDistanceClusteringRowVisitor(JSONObject o) {
|
||||
try {
|
||||
JSONObject params = o.getJSONObject("params");
|
||||
_radius = params.getInt("radius");
|
||||
logger.debug("Use radius: {}", _radius);
|
||||
} catch (JSONException e) {
|
||||
logger.debug("No parameters found, using defaults");
|
||||
}
|
||||
_clusterer = new EditDistanceClusterer(_radius);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start(Project project) {
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end(Project project) {
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean visit(Project project, int rowIndex, Row row) {
|
||||
Cell cell = row.getCell(_colindex);
|
||||
if (cell != null && cell.value != null) {
|
||||
Object v = cell.value;
|
||||
String s = (v instanceof String) ? ((String) v) : v.toString().intern();
|
||||
_clusterer.populate(s);
|
||||
count(s);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public List<Set<Serializable>> getClusters() {
|
||||
return _clusterer.getClusters();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||
@ -200,12 +243,21 @@ public class kNNClusterer extends Clusterer {
|
||||
|
||||
@Override
|
||||
public void computeClusters(Engine engine) {
|
||||
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
|
||||
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows();
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
_clusters = visitor.getClusters();
|
||||
if(_distance != _distances.get("levenshtein")) {
|
||||
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
|
||||
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows();
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
_clusters = visitor.getClusters();
|
||||
} else {
|
||||
EditDistanceClusteringRowVisitor visitor =
|
||||
new EditDistanceClusteringRowVisitor(_config);
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows();
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
_clusters = visitor.getClusters();
|
||||
}
|
||||
}
|
||||
|
||||
public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable {
|
||||
|
BIN
main/webapp/WEB-INF/lib/EditDistanceJoiner.jar
Normal file
BIN
main/webapp/WEB-INF/lib/EditDistanceJoiner.jar
Normal file
Binary file not shown.
@ -33,7 +33,10 @@
|
||||
<div id="ngram-fingerprint-params" class="function-params hidden">
|
||||
<span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int">
|
||||
</div>
|
||||
<div class="knn-controls hidden">
|
||||
<div id="levenshtein-params" class="knn-controls function-params hidden">
|
||||
<span style="margin-right: 1em"><span bind="or_dialog_distance"></span><input type="text" value="2" bind="editDistance" name="radius" size="2" class="param" datatype="int"></span>
|
||||
</div>
|
||||
<div id="PPM-params" class="knn-controls function-params hidden">
|
||||
<span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span>
|
||||
<span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span>
|
||||
</div>
|
||||
|
@ -66,6 +66,7 @@ ClusteringDialog.prototype._createDialog = function() {
|
||||
this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]);
|
||||
this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]);
|
||||
this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]);
|
||||
this._elmts.or_dialog_distance.html($.i18n._('core-dialogs')["ngram-radius"]);
|
||||
this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]);
|
||||
this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]);
|
||||
this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]);
|
||||
@ -118,6 +119,7 @@ ClusteringDialog.prototype._createDialog = function() {
|
||||
this._elmts.ngramSize.change(params_changer);
|
||||
this._elmts.radius.change(params_changer);
|
||||
this._elmts.ngramBlock.change(params_changer);
|
||||
this._elmts.editDistance.change(params_changer);
|
||||
|
||||
this._elmts.selectAllButton.click(function() { self._selectAll(); });
|
||||
this._elmts.deselectAllButton.click(function() { self._deselectAll(); });
|
||||
|
Loading…
Reference in New Issue
Block a user