Remove O(n^2) issue in tree importers - fixes #699

- Add sparse/based list implementation for ImportRecord
This commit is contained in:
Tom Morris 2013-03-23 12:02:51 -04:00
parent f78dfadcf3
commit 6b3592982e
2 changed files with 297 additions and 234 deletions

View File

@ -10,5 +10,66 @@ import com.google.refine.model.Cell;
*
*/
public class ImportRecord {
public List<List<Cell>> rows = new ArrayList<List<Cell>>();
public List<List<Cell>> rows = new BasedList<List<Cell>>();
/**
 * A List implementation to match the characteristics needed by the
 * import process. It's optimized for a relatively small number of
 * contiguous records at a potentially large offset from zero.
 * <p>
 * Only the window of indexes starting at {@link #offset} is physically
 * stored; logical index {@code i} lives at physical position
 * {@code i - offset} of the backing {@link ArrayList}. Only
 * {@link #get(int)} and {@link #set(int, Object)} translate indexes; every
 * other inherited method (including {@code size()} and {@code add()})
 * still works on physical positions.
 * <p>
 * {@code get} returns {@code null} for any index outside the stored window
 * instead of throwing, and {@code set} grows the window (downwards or
 * upwards) as needed, so callers can use "get, and set if null".
 *
 * @param <T> element type
 */
class BasedList<T> extends ArrayList<T> {
    private static final long serialVersionUID = 1L;

    /** Spare slots reserved below the requested index when the base is lowered. */
    private static final int REBASE_HEADROOM = 10;

    /**
     * Logical index of physical position 0, or {@code Integer.MAX_VALUE}
     * while nothing has been stored through {@link #set(int, Object)} yet.
     */
    int offset = Integer.MAX_VALUE;

    /**
     * Stores {@code element} at logical position {@code index}, growing the
     * stored window if the index lies outside it.
     *
     * @param index logical index, must not be negative
     * @param element value to store, may be {@code null}
     * @return the value previously stored at {@code index}, or {@code null}
     * @throws IndexOutOfBoundsException if {@code index} is negative
     */
    @Override
    public T set(int index, T element) {
        if (index < 0) {
            // A negative base would corrupt every later index translation.
            throw new IndexOutOfBoundsException("Index: " + index);
        }
        rebase(index);
        extend(index);
        return super.set(index - offset, element);
    }

    /**
     * Returns the element at logical position {@code index}.
     *
     * @param index logical index
     * @return the stored element, or {@code null} when the index is below
     *         or above the stored window or nothing has been stored yet
     */
    @Override
    public T get(int index) {
        if (offset == Integer.MAX_VALUE || index < offset || index - offset >= size()) {
            return null;
        }
        return super.get(index - offset);
    }

    /**
     * Lowers {@link #offset} so that {@code index} falls inside the window.
     * Existing elements keep their logical positions.
     */
    private void rebase(final int index) {
        if (index >= offset) {
            return;
        }
        if (offset == Integer.MAX_VALUE) {
            // First write: the window simply starts here.
            offset = index;
            return;
        }
        int newOffset = Math.max(0, index - REBASE_HEADROOM); // Leave some extra room
        int delta = offset - newOffset;
        // Prepending delta nulls shifts every stored element up by delta
        // physical slots, which is exactly the change in base.
        super.addAll(0, java.util.Collections.<T>nCopies(delta, null));
        offset = newOffset;
    }

    /** Pads the top of the window with nulls until {@code index} is addressable. */
    private void extend(final int index) {
        final int physical = index - offset;
        while (physical >= size()) {
            add(null);
        }
    }
}
}

View File

@ -1,233 +1,235 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.importers.tree;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.refine.importers.ImporterUtilities;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.Project;
public abstract class TreeImportUtilities {
final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities");
/**
 * Sorts the given candidates in place so that larger {@code count} values
 * come first.
 *
 * @param list candidates to reorder; the list itself is modified
 */
static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
    final Comparator<RecordElementCandidate> byCountDescending = new Comparator<RecordElementCandidate>() {
        @Override
        public int compare(RecordElementCandidate first, RecordElementCandidate second) {
            // Positive when the second candidate has the larger count.
            final int difference = second.count - first.count;
            return difference;
        }
    };
    Collections.sort(list, byCountDescending);
}
/**
 * Appends project columns for every import column of {@code columnGroup}
 * and, recursively, of its subgroups, then registers a column group over
 * the columns that were added.
 * <p>
 * Direct columns are ordered: populated on the first row before blank on
 * the first row, then higher {@code nonBlankCount} first, then shorter
 * name first. Subgroups are ordered by higher {@code nonBlankCount} first,
 * then shorter name first.
 *
 * @param project project whose column model receives the columns
 * @param columnGroup import column group to materialize
 */
static public void createColumnsFromImport(
        Project project,
        ImportColumnGroup columnGroup) {
    final int firstIndex = project.columnModel.columns.size();

    List<ImportColumn> ownColumns = new ArrayList<ImportColumn>(columnGroup.columns.values());
    Collections.sort(ownColumns, new Comparator<ImportColumn>() {
        @Override
        public int compare(ImportColumn left, ImportColumn right) {
            if (left.blankOnFirstRow == right.blankOnFirstRow) {
                int byPopulation = right.nonBlankCount - left.nonBlankCount;
                if (byPopulation != 0) {
                    return byPopulation;
                }
                return left.name.length() - right.name.length();
            }
            // Columns that are blank on the first row sort last.
            return left.blankOnFirstRow ? 1 : -1;
        }
    });
    for (ImportColumn importColumn : ownColumns) {
        project.columnModel.columns.add(new Column(importColumn.cellIndex, importColumn.name));
    }

    List<ImportColumnGroup> orderedSubgroups =
            new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
    Collections.sort(orderedSubgroups, new Comparator<ImportColumnGroup>() {
        @Override
        public int compare(ImportColumnGroup left, ImportColumnGroup right) {
            // TODO: We really want the column/group with the highest % of
            // records with at least one row populated, so popular optional
            // elements with multiple instances per record don't
            // outweigh mandatory elements with a single occurrence per record
            // TODO: From a human factors point of view, we probably want
            // to try to preserve the order that we found things in the XML

            // Sort by most populated first, then shortest name
            int byPopulation = right.nonBlankCount - left.nonBlankCount;
            if (byPopulation != 0) {
                return byPopulation;
            }
            return left.name.length() - right.name.length();
        }
    });
    for (int i = 0; i < orderedSubgroups.size(); i++) {
        createColumnsFromImport(project, orderedSubgroups.get(i));
    }

    final int lastIndex = project.columnModel.columns.size();
    final int span = lastIndex - firstIndex;
    // Only group two or more columns, and never a span covering every column.
    if (span > 1 && span < lastIndex) {
        // TODO: Only use "key column" if it's 100% populated?
        project.columnModel.addColumnGroup(firstIndex, span, firstIndex);
    }
}
/**
 * Adds a cell for {@code text}, always storing empty strings and always
 * guessing the data type.
 *
 * @deprecated use the overload that takes explicit
 *             {@code storeEmptyString} and {@code guessDataType} flags
 */
@Deprecated
static protected void addCell(
        Project project,
        ImportColumnGroup columnGroup,
        ImportRecord record,
        String columnLocalName,
        String text) {
    final boolean storeEmptyString = true;
    final boolean guessDataType = true;
    addCell(project, columnGroup, record, columnLocalName, text, storeEmptyString, guessDataType);
}
/**
 * Adds a cell for {@code text} to the named column of {@code record}.
 *
 * @param project project the column belongs to
 * @param columnGroup group in which the column is looked up or created
 * @param record record receiving the cell
 * @param columnLocalName local name of the target column
 * @param text raw cell text
 * @param storeEmptyString when {@code false}, null or empty text is skipped
 * @param guessDataType when {@code true}, the text is converted with
 *        {@code ImporterUtilities.parseCellValue} before being stored
 */
static protected void addCell(
        Project project,
        ImportColumnGroup columnGroup,
        ImportRecord record,
        String columnLocalName,
        String text,
        boolean storeEmptyString,
        boolean guessDataType) {
    final boolean blank = text == null || text.isEmpty();
    if (blank && !storeEmptyString) {
        return;
    }
    final Serializable value = guessDataType ? ImporterUtilities.parseCellValue(text) : text;
    addCell(project, columnGroup, record, columnLocalName, value);
}
/**
 * Stores {@code value} as a new cell in the named column of {@code record}.
 * The target row is the larger of the group's and the column's
 * {@code nextRowIndex}; rows and cells are padded until that position
 * exists. Afterwards the column's {@code nextRowIndex} points past the
 * written row and its {@code nonBlankCount} is incremented.
 */
protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
        String columnLocalName, Serializable value) {
    final ImportColumn column = getColumn(project, columnGroup, columnLocalName);
    final int targetCell = column.cellIndex;
    final int targetRow = Math.max(columnGroup.nextRowIndex, column.nextRowIndex);

    // Append empty rows until the target row exists.
    for (int rowCount = record.rows.size(); rowCount <= targetRow; rowCount++) {
        record.rows.add(new ArrayList<Cell>());
    }

    final List<Cell> cells = record.rows.get(targetRow);
    // Pad the row with nulls until the target cell exists.
    for (int width = cells.size(); width <= targetCell; width++) {
        cells.add(null);
    }
    cells.set(targetCell, new Cell(value, null));

    column.nextRowIndex = targetRow + 1;
    column.nonBlankCount++; // TODO: Only increment for first instance in record?
}
/**
 * Returns the column registered under {@code localName} in
 * {@code columnGroup}, creating and registering it first when the group
 * has no entry for that name.
 */
static protected ImportColumn getColumn(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    if (!columnGroup.columns.containsKey(localName)) {
        final ImportColumn created = createColumn(project, columnGroup, localName);
        columnGroup.columns.put(localName, created);
        return created;
    }
    return columnGroup.columns.get(localName);
}
/**
 * Builds a new import column for {@code localName} inside
 * {@code columnGroup}. The name is the local name (or {@code "Text"} when
 * it is null) for an unnamed group, otherwise the group name, followed by
 * {@code " - "} and the local name when one is given. A fresh cell index is
 * allocated from the project's column model and the column starts at the
 * group's {@code nextRowIndex}. The column is not registered in the group.
 */
static protected ImportColumn createColumn(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    final ImportColumn result = new ImportColumn();

    final String columnName;
    if (columnGroup.name.length() == 0) {
        columnName = (localName == null) ? "Text" : localName;
    } else if (localName == null) {
        columnName = columnGroup.name;
    } else {
        columnName = columnGroup.name + " - " + localName;
    }
    result.name = columnName;

    result.cellIndex = project.columnModel.allocateNewCellIndex();
    result.nextRowIndex = columnGroup.nextRowIndex;
    return result;
}
/**
 * Returns the subgroup registered under {@code localName} in
 * {@code columnGroup}, creating and registering it first when the group
 * has no entry for that name.
 */
static protected ImportColumnGroup getColumnGroup(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    if (!columnGroup.subgroups.containsKey(localName)) {
        final ImportColumnGroup created = createColumnGroup(project, columnGroup, localName);
        columnGroup.subgroups.put(localName, created);
        return created;
    }
    return columnGroup.subgroups.get(localName);
}
/**
 * Builds a new subgroup for {@code localName} under {@code columnGroup}.
 * The name is the local name (or {@code "Text"} when it is null) for an
 * unnamed parent, otherwise the parent name, followed by {@code " - "} and
 * the local name when one is given. The subgroup starts at the parent's
 * {@code nextRowIndex} and is not registered in the parent.
 */
static protected ImportColumnGroup createColumnGroup(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    final ImportColumnGroup result = new ImportColumnGroup();

    final String groupName;
    if (columnGroup.name.length() == 0) {
        groupName = (localName == null) ? "Text" : localName;
    } else if (localName == null) {
        groupName = columnGroup.name;
    } else {
        groupName = columnGroup.name + " - " + localName;
    }
    result.name = groupName;

    result.nextRowIndex = columnGroup.nextRowIndex;
    return result;
}
}
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.importers.tree;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.refine.importers.ImporterUtilities;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.Project;
public abstract class TreeImportUtilities {
final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities");
/**
 * Sorts the given candidates in place so that larger {@code count} values
 * come first.
 *
 * @param list candidates to reorder; the list itself is modified
 */
static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
    final Comparator<RecordElementCandidate> byCountDescending = new Comparator<RecordElementCandidate>() {
        @Override
        public int compare(RecordElementCandidate first, RecordElementCandidate second) {
            // Positive when the second candidate has the larger count.
            final int difference = second.count - first.count;
            return difference;
        }
    };
    Collections.sort(list, byCountDescending);
}
/**
 * Appends project columns for every import column of {@code columnGroup}
 * and, recursively, of its subgroups, then registers a column group over
 * the columns that were added.
 * <p>
 * Direct columns are ordered: populated on the first row before blank on
 * the first row, then higher {@code nonBlankCount} first, then shorter
 * name first. Subgroups are ordered by higher {@code nonBlankCount} first,
 * then shorter name first.
 *
 * @param project project whose column model receives the columns
 * @param columnGroup import column group to materialize
 */
static public void createColumnsFromImport(
        Project project,
        ImportColumnGroup columnGroup) {
    final int firstIndex = project.columnModel.columns.size();

    List<ImportColumn> ownColumns = new ArrayList<ImportColumn>(columnGroup.columns.values());
    Collections.sort(ownColumns, new Comparator<ImportColumn>() {
        @Override
        public int compare(ImportColumn left, ImportColumn right) {
            if (left.blankOnFirstRow == right.blankOnFirstRow) {
                int byPopulation = right.nonBlankCount - left.nonBlankCount;
                if (byPopulation != 0) {
                    return byPopulation;
                }
                return left.name.length() - right.name.length();
            }
            // Columns that are blank on the first row sort last.
            return left.blankOnFirstRow ? 1 : -1;
        }
    });
    for (ImportColumn importColumn : ownColumns) {
        project.columnModel.columns.add(new Column(importColumn.cellIndex, importColumn.name));
    }

    List<ImportColumnGroup> orderedSubgroups =
            new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
    Collections.sort(orderedSubgroups, new Comparator<ImportColumnGroup>() {
        @Override
        public int compare(ImportColumnGroup left, ImportColumnGroup right) {
            // TODO: We really want the column/group with the highest % of
            // records with at least one row populated, so popular optional
            // elements with multiple instances per record don't
            // outweigh mandatory elements with a single occurrence per record
            // TODO: From a human factors point of view, we probably want
            // to try to preserve the order that we found things in the XML

            // Sort by most populated first, then shortest name
            int byPopulation = right.nonBlankCount - left.nonBlankCount;
            if (byPopulation != 0) {
                return byPopulation;
            }
            return left.name.length() - right.name.length();
        }
    });
    for (int i = 0; i < orderedSubgroups.size(); i++) {
        createColumnsFromImport(project, orderedSubgroups.get(i));
    }

    final int lastIndex = project.columnModel.columns.size();
    final int span = lastIndex - firstIndex;
    // Only group two or more columns, and never a span covering every column.
    if (span > 1 && span < lastIndex) {
        // TODO: Only use "key column" if it's 100% populated?
        project.columnModel.addColumnGroup(firstIndex, span, firstIndex);
    }
}
/**
 * Adds a cell for {@code text}, always storing empty strings and always
 * guessing the data type.
 *
 * @deprecated use the overload that takes explicit
 *             {@code storeEmptyString} and {@code guessDataType} flags
 */
@Deprecated
static protected void addCell(
        Project project,
        ImportColumnGroup columnGroup,
        ImportRecord record,
        String columnLocalName,
        String text) {
    final boolean storeEmptyString = true;
    final boolean guessDataType = true;
    addCell(project, columnGroup, record, columnLocalName, text, storeEmptyString, guessDataType);
}
/**
 * Adds a cell for {@code text} to the named column of {@code record}.
 *
 * @param project project the column belongs to
 * @param columnGroup group in which the column is looked up or created
 * @param record record receiving the cell
 * @param columnLocalName local name of the target column
 * @param text raw cell text
 * @param storeEmptyString when {@code false}, null or empty text is skipped
 * @param guessDataType when {@code true}, the text is converted with
 *        {@code ImporterUtilities.parseCellValue} before being stored
 */
static protected void addCell(
        Project project,
        ImportColumnGroup columnGroup,
        ImportRecord record,
        String columnLocalName,
        String text,
        boolean storeEmptyString,
        boolean guessDataType) {
    final boolean blank = text == null || text.isEmpty();
    if (blank && !storeEmptyString) {
        return;
    }
    final Serializable value = guessDataType ? ImporterUtilities.parseCellValue(text) : text;
    addCell(project, columnGroup, record, columnLocalName, value);
}
/**
 * Stores {@code value} as a new cell in the named column of {@code record}.
 * The target row is the larger of the group's and the column's
 * {@code nextRowIndex}. A row that {@code record.rows.get} reports as
 * {@code null} is created and stored with {@code set}; the row is then
 * padded with nulls until the column's cell index exists. Afterwards the
 * column's {@code nextRowIndex} points past the written row and its
 * {@code nonBlankCount} is incremented.
 */
protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
        String columnLocalName, Serializable value) {
    final ImportColumn column = getColumn(project, columnGroup, columnLocalName);
    final int targetCell = column.cellIndex;
    final int targetRow = Math.max(columnGroup.nextRowIndex, column.nextRowIndex);

    List<Cell> cells = record.rows.get(targetRow);
    if (cells == null) {
        // No row at this index yet: create it and store it at that index.
        cells = new ArrayList<Cell>();
        record.rows.set(targetRow, cells);
    }

    // Pad the row with nulls until the target cell exists.
    for (int width = cells.size(); width <= targetCell; width++) {
        cells.add(null);
    }
    cells.set(targetCell, new Cell(value, null));

    column.nextRowIndex = targetRow + 1;
    column.nonBlankCount++; // TODO: Only increment for first instance in record?
}
/**
 * Returns the column registered under {@code localName} in
 * {@code columnGroup}, creating and registering it first when the group
 * has no entry for that name.
 */
static protected ImportColumn getColumn(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    if (!columnGroup.columns.containsKey(localName)) {
        final ImportColumn created = createColumn(project, columnGroup, localName);
        columnGroup.columns.put(localName, created);
        return created;
    }
    return columnGroup.columns.get(localName);
}
/**
 * Builds a new import column for {@code localName} inside
 * {@code columnGroup}. The name is the local name (or {@code "Text"} when
 * it is null) for an unnamed group, otherwise the group name, followed by
 * {@code " - "} and the local name when one is given. A fresh cell index is
 * allocated from the project's column model and the column starts at the
 * group's {@code nextRowIndex}. The column is not registered in the group.
 */
static protected ImportColumn createColumn(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    final ImportColumn result = new ImportColumn();

    final String columnName;
    if (columnGroup.name.length() == 0) {
        columnName = (localName == null) ? "Text" : localName;
    } else if (localName == null) {
        columnName = columnGroup.name;
    } else {
        columnName = columnGroup.name + " - " + localName;
    }
    result.name = columnName;

    result.cellIndex = project.columnModel.allocateNewCellIndex();
    result.nextRowIndex = columnGroup.nextRowIndex;
    return result;
}
/**
 * Returns the subgroup registered under {@code localName} in
 * {@code columnGroup}, creating and registering it first when the group
 * has no entry for that name.
 */
static protected ImportColumnGroup getColumnGroup(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    if (!columnGroup.subgroups.containsKey(localName)) {
        final ImportColumnGroup created = createColumnGroup(project, columnGroup, localName);
        columnGroup.subgroups.put(localName, created);
        return created;
    }
    return columnGroup.subgroups.get(localName);
}
/**
 * Builds a new subgroup for {@code localName} under {@code columnGroup}.
 * The name is the local name (or {@code "Text"} when it is null) for an
 * unnamed parent, otherwise the parent name, followed by {@code " - "} and
 * the local name when one is given. The subgroup starts at the parent's
 * {@code nextRowIndex} and is not registered in the parent.
 */
static protected ImportColumnGroup createColumnGroup(
        Project project,
        ImportColumnGroup columnGroup,
        String localName) {
    final ImportColumnGroup result = new ImportColumnGroup();

    final String groupName;
    if (columnGroup.name.length() == 0) {
        groupName = (localName == null) ? "Text" : localName;
    } else if (localName == null) {
        groupName = columnGroup.name;
    } else {
        groupName = columnGroup.name + " - " + localName;
    }
    result.name = groupName;

    result.nextRowIndex = columnGroup.nextRowIndex;
    return result;
}
}