da7347e7b1
git-svn-id: http://google-refine.googlecode.com/svn/trunk@2183 7d457c2a-affb-35e4-300a-418c747d4874
267 lines
8.3 KiB
Java
267 lines
8.3 KiB
Java
/*
|
|
|
|
Copyright 2010, Google Inc.
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above
|
|
copyright notice, this list of conditions and the following disclaimer
|
|
in the documentation and/or other materials provided with the
|
|
distribution.
|
|
* Neither the name of Google Inc. nor the names of its
|
|
contributors may be used to endorse or promote products derived from
|
|
this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
package com.google.refine.browsing.util;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Calendar;
|
|
import java.util.Collection;
|
|
import java.util.Date;
|
|
import java.util.List;
|
|
import java.util.Properties;
|
|
|
|
import com.google.refine.expr.ExpressionUtils;
|
|
import com.google.refine.model.Project;
|
|
import com.google.refine.model.Row;
|
|
|
|
/**
|
|
* A utility class for computing the base bins that form the base histograms of
|
|
* temporal range facets. It evaluates an expression on all the rows of a project to
|
|
* get temporal values, determines how many bins to distribute those values in, and
|
|
* bins the rows accordingly.
|
|
*
|
|
* This class processes all rows rather than just the filtered rows because it
|
|
* needs to compute the base bins of a temporal range facet, which remain unchanged
|
|
* as the user interacts with the facet.
|
|
*/
|
|
abstract public class TimeBinIndex {
|
|
|
|
protected int _totalValueCount;
|
|
protected int _timeValueCount;
|
|
protected long _min;
|
|
protected long _max;
|
|
protected long _step;
|
|
protected int[] _bins;
|
|
|
|
protected int _timeRowCount;
|
|
protected int _nonTimeRowCount;
|
|
protected int _blankRowCount;
|
|
protected int _errorRowCount;
|
|
|
|
protected boolean _hasError = false;
|
|
protected boolean _hasNonTime = false;
|
|
protected boolean _hasTime = false;
|
|
protected boolean _hasBlank = false;
|
|
|
|
protected long[] steps = {
|
|
1, // msec
|
|
1000, // sec
|
|
1000*60, // min
|
|
1000*60*60, // hour
|
|
1000*60*60*24, // day
|
|
1000*60*60*24*7, // week
|
|
1000l*2629746l, // month (average Gregorian year / 12)
|
|
1000l*31556952l, // year (average Gregorian year)
|
|
1000l*31556952l*10l, // decade
|
|
1000l*31556952l*100l, // century
|
|
1000l*31556952l*1000l, // millennium
|
|
};
|
|
|
|
abstract protected void iterate(Project project, RowEvaluable rowEvaluable, List<Long> allValues);
|
|
|
|
public TimeBinIndex(Project project, RowEvaluable rowEvaluable) {
|
|
_min = Long.MAX_VALUE;
|
|
_max = Long.MIN_VALUE;
|
|
|
|
List<Long> allValues = new ArrayList<Long>();
|
|
|
|
iterate(project, rowEvaluable, allValues);
|
|
|
|
_timeValueCount = allValues.size();
|
|
|
|
if (_min >= _max) {
|
|
_step = 1;
|
|
_min = Math.min(_min, _max);
|
|
_max = _step;
|
|
_bins = new int[1];
|
|
|
|
return;
|
|
}
|
|
|
|
long diff = _max - _min;
|
|
|
|
for (long step : steps) {
|
|
_step = step;
|
|
if (diff / _step <= 100) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
_bins = new int[(int) (diff / _step) + 1];
|
|
for (long d : allValues) {
|
|
int bin = (int) Math.max((d - _min) / _step,0);
|
|
_bins[bin]++;
|
|
}
|
|
}
|
|
|
|
public boolean isTemporal() {
|
|
return _timeValueCount > _totalValueCount / 2;
|
|
}
|
|
|
|
public long getMin() {
|
|
return _min;
|
|
}
|
|
|
|
public long getMax() {
|
|
return _max;
|
|
}
|
|
|
|
public long getStep() {
|
|
return _step;
|
|
}
|
|
|
|
public int[] getBins() {
|
|
return _bins;
|
|
}
|
|
|
|
public int getTimeRowCount() {
|
|
return _timeRowCount;
|
|
}
|
|
|
|
public int getNonTimeRowCount() {
|
|
return _nonTimeRowCount;
|
|
}
|
|
|
|
public int getBlankRowCount() {
|
|
return _blankRowCount;
|
|
}
|
|
|
|
public int getErrorRowCount() {
|
|
return _errorRowCount;
|
|
}
|
|
|
|
protected void processRow(
|
|
Project project,
|
|
RowEvaluable rowEvaluable,
|
|
List<Long> allValues,
|
|
int rowIndex,
|
|
Row row,
|
|
Properties bindings
|
|
) {
|
|
Object value = rowEvaluable.eval(project, rowIndex, row, bindings);
|
|
|
|
if (ExpressionUtils.isError(value)) {
|
|
_hasError = true;
|
|
} else if (ExpressionUtils.isNonBlankData(value)) {
|
|
if (value.getClass().isArray()) {
|
|
Object[] a = (Object[]) value;
|
|
for (Object v : a) {
|
|
_totalValueCount++;
|
|
|
|
if (ExpressionUtils.isError(v)) {
|
|
_hasError = true;
|
|
} else if (ExpressionUtils.isNonBlankData(v)) {
|
|
if (v instanceof Calendar) {
|
|
v = ((Calendar) v).getTime();
|
|
}
|
|
|
|
if (v instanceof Date) {
|
|
_hasTime = true;
|
|
processValue(((Date) v).getTime(), allValues);
|
|
} else {
|
|
_hasNonTime = true;
|
|
}
|
|
} else {
|
|
_hasBlank = true;
|
|
}
|
|
}
|
|
} else if (value instanceof Collection<?>) {
|
|
for (Object v : ExpressionUtils.toObjectCollection(value)) {
|
|
_totalValueCount++;
|
|
|
|
if (ExpressionUtils.isError(v)) {
|
|
_hasError = true;
|
|
} else if (ExpressionUtils.isNonBlankData(v)) {
|
|
if (v instanceof Calendar) {
|
|
v = ((Calendar) v).getTime();
|
|
}
|
|
|
|
if (v instanceof Date) {
|
|
_hasTime = true;
|
|
processValue(((Date) v).getTime(), allValues);
|
|
} else {
|
|
_hasNonTime = true;
|
|
}
|
|
} else {
|
|
_hasBlank = true;
|
|
}
|
|
}
|
|
} else {
|
|
_totalValueCount++;
|
|
|
|
if (value instanceof Calendar) {
|
|
value = ((Calendar) value).getTime();
|
|
}
|
|
|
|
if (value instanceof Date) {
|
|
_hasTime = true;
|
|
processValue(((Date) value).getTime(), allValues);
|
|
} else {
|
|
_hasNonTime = true;
|
|
}
|
|
}
|
|
} else {
|
|
_hasBlank = true;
|
|
}
|
|
}
|
|
|
|
protected void preprocessing() {
|
|
_hasBlank = false;
|
|
_hasError = false;
|
|
_hasNonTime = false;
|
|
_hasTime = false;
|
|
}
|
|
|
|
protected void postprocessing() {
|
|
if (_hasError) {
|
|
_errorRowCount++;
|
|
}
|
|
if (_hasBlank) {
|
|
_blankRowCount++;
|
|
}
|
|
if (_hasTime) {
|
|
_timeRowCount++;
|
|
}
|
|
if (_hasNonTime) {
|
|
_nonTimeRowCount++;
|
|
}
|
|
}
|
|
|
|
protected void processValue(long v, List<Long> allValues) {
|
|
_min = Math.min(_min, v);
|
|
_max = Math.max(_max, v);
|
|
allValues.add(v);
|
|
}
|
|
|
|
}
|