Merge pull request #1073 from OpenRefine/revert-1007-master
Revert " Use new algorithm for levenshtein clustering"
This commit is contained in:
commit
6c50974e4d
@ -71,8 +71,6 @@ import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
|||||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||||
|
|
||||||
import edu.tsinghua.dbgroup.EditDistanceClusterer;
|
|
||||||
|
|
||||||
public class kNNClusterer extends Clusterer {
|
public class kNNClusterer extends Clusterer {
|
||||||
|
|
||||||
private Distance _distance;
|
private Distance _distance;
|
||||||
@ -150,6 +148,7 @@ public class kNNClusterer extends Clusterer {
|
|||||||
int _blockingNgramSize = 6;
|
int _blockingNgramSize = 6;
|
||||||
HashSet<String> _data;
|
HashSet<String> _data;
|
||||||
NGramClusterer _clusterer;
|
NGramClusterer _clusterer;
|
||||||
|
|
||||||
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
|
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
|
||||||
_distance = d;
|
_distance = d;
|
||||||
_config = o;
|
_config = o;
|
||||||
@ -192,48 +191,6 @@ public class kNNClusterer extends Clusterer {
|
|||||||
return _clusterer.getClusters(_radius);
|
return _clusterer.getClusters(_radius);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class EditDistanceClusteringRowVisitor implements RowVisitor {
|
|
||||||
|
|
||||||
int _radius = 2;
|
|
||||||
EditDistanceClusterer _clusterer;
|
|
||||||
public EditDistanceClusteringRowVisitor(JSONObject o) {
|
|
||||||
try {
|
|
||||||
JSONObject params = o.getJSONObject("params");
|
|
||||||
_radius = params.getInt("radius");
|
|
||||||
logger.debug("Use radius: {}", _radius);
|
|
||||||
} catch (JSONException e) {
|
|
||||||
logger.debug("No parameters found, using defaults");
|
|
||||||
}
|
|
||||||
_clusterer = new EditDistanceClusterer(_radius);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void start(Project project) {
|
|
||||||
// nothing to do
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void end(Project project) {
|
|
||||||
// nothing to do
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean visit(Project project, int rowIndex, Row row) {
|
|
||||||
Cell cell = row.getCell(_colindex);
|
|
||||||
if (cell != null && cell.value != null) {
|
|
||||||
Object v = cell.value;
|
|
||||||
String s = (v instanceof String) ? ((String) v) : v.toString().intern();
|
|
||||||
_clusterer.populate(s);
|
|
||||||
count(s);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Set<Serializable>> getClusters() {
|
|
||||||
return _clusterer.getClusters();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||||
@ -243,21 +200,12 @@ public class kNNClusterer extends Clusterer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void computeClusters(Engine engine) {
|
public void computeClusters(Engine engine) {
|
||||||
if(_distance != _distances.get("levenshtein")) {
|
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
|
||||||
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
|
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
|
||||||
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config);
|
FilteredRows filteredRows = engine.getAllFilteredRows();
|
||||||
FilteredRows filteredRows = engine.getAllFilteredRows();
|
filteredRows.accept(_project, visitor);
|
||||||
filteredRows.accept(_project, visitor);
|
|
||||||
|
_clusters = visitor.getClusters();
|
||||||
_clusters = visitor.getClusters();
|
|
||||||
} else {
|
|
||||||
EditDistanceClusteringRowVisitor visitor =
|
|
||||||
new EditDistanceClusteringRowVisitor(_config);
|
|
||||||
FilteredRows filteredRows = engine.getAllFilteredRows();
|
|
||||||
filteredRows.accept(_project, visitor);
|
|
||||||
|
|
||||||
_clusters = visitor.getClusters();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable {
|
public static class ValuesComparator implements Comparator<Entry<Serializable,Integer>>, Serializable {
|
||||||
|
Binary file not shown.
@ -33,10 +33,7 @@
|
|||||||
<div id="ngram-fingerprint-params" class="function-params hidden">
|
<div id="ngram-fingerprint-params" class="function-params hidden">
|
||||||
<span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int">
|
<span bind="or_dialog_ngramSize"></span><input type="text" value="2" bind="ngramSize" name="ngram-size" size="2" class="param" datatype="int">
|
||||||
</div>
|
</div>
|
||||||
<div id="levenshtein-params" class="knn-controls function-params hidden">
|
<div class="knn-controls hidden">
|
||||||
<span style="margin-right: 1em"><span bind="or_dialog_distance"></span><input type="text" value="2" bind="editDistance" name="radius" size="2" class="param" datatype="int"></span>
|
|
||||||
</div>
|
|
||||||
<div id="PPM-params" class="knn-controls function-params hidden">
|
|
||||||
<span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span>
|
<span style="margin-right: 1em"><span bind="or_dialog_radius"></span><input type="text" value="1.0" bind="radius" name="radius" size="2" class="param" datatype="float"></span>
|
||||||
<span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span>
|
<span><span bind="or_dialog_blockChars"></span><input type="text" value="6" bind="ngramBlock" name="blocking-ngram-size" size="2" class="param" datatype="int"></span>
|
||||||
</div>
|
</div>
|
||||||
|
@ -66,7 +66,6 @@ ClusteringDialog.prototype._createDialog = function() {
|
|||||||
this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]);
|
this._elmts.or_dialog_ppm.html($.i18n._('core-dialogs')["ppm"]);
|
||||||
this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]);
|
this._elmts.or_dialog_ngramSize.html($.i18n._('core-dialogs')["ngram-size"]);
|
||||||
this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]);
|
this._elmts.or_dialog_radius.html($.i18n._('core-dialogs')["ngram-radius"]);
|
||||||
this._elmts.or_dialog_distance.html($.i18n._('core-dialogs')["ngram-radius"]);
|
|
||||||
this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]);
|
this._elmts.or_dialog_blockChars.html($.i18n._('core-dialogs')["block-chars"]);
|
||||||
this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]);
|
this._elmts.selectAllButton.html($.i18n._('core-buttons')["select-all"]);
|
||||||
this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]);
|
this._elmts.deselectAllButton.html($.i18n._('core-buttons')["unselect-all"]);
|
||||||
@ -119,7 +118,6 @@ ClusteringDialog.prototype._createDialog = function() {
|
|||||||
this._elmts.ngramSize.change(params_changer);
|
this._elmts.ngramSize.change(params_changer);
|
||||||
this._elmts.radius.change(params_changer);
|
this._elmts.radius.change(params_changer);
|
||||||
this._elmts.ngramBlock.change(params_changer);
|
this._elmts.ngramBlock.change(params_changer);
|
||||||
this._elmts.editDistance.change(params_changer);
|
|
||||||
|
|
||||||
this._elmts.selectAllButton.click(function() { self._selectAll(); });
|
this._elmts.selectAllButton.click(function() { self._selectAll(); });
|
||||||
this._elmts.deselectAllButton.click(function() { self._deselectAll(); });
|
this._elmts.deselectAllButton.click(function() { self._deselectAll(); });
|
||||||
|
Loading…
Reference in New Issue
Block a user