Remove O(n^2) issue in tree importers - fixes #699

- Add sparse/based list implementation for ImportRecord
This commit is contained in:
Tom Morris 2013-03-23 12:02:51 -04:00
parent f78dfadcf3
commit 6b3592982e
2 changed files with 297 additions and 234 deletions

View File

@ -10,5 +10,66 @@ import com.google.refine.model.Cell;
* *
*/ */
public class ImportRecord { public class ImportRecord {
public List<List<Cell>> rows = new ArrayList<List<Cell>>(); public List<List<Cell>> rows = new BasedList<List<Cell>>();
/**
 * A List implementation to match the characteristics needed by the
 * import process. It's optimized for a relatively small number of
 * contiguous records at a potentially large offset from zero.
 * <p>
 * Only {@link #get(int)} and {@link #set(int, Object)} take <em>logical</em>
 * indices; they translate them to positions in the backing
 * {@code ArrayList} by subtracting {@link #offset}. Every other inherited
 * method ({@code size()}, {@code add}, iteration, ...) is not overridden
 * and therefore still works on the backing storage directly.
 * <p>
 * I suspect it's usually only a single row, but we support more, just
 * not as efficiently. Depending on the behavior of the ColumnGroups
 * this may not be necessary at all, but I don't fully understand what it
 * does, so we'll just put this hack in place for now.
 *
 * @param <T> the type of element stored in the list
 */
class BasedList<T> extends ArrayList<T> {

    private static final long serialVersionUID = 1L;

    /** Sentinel value of {@link #offset} meaning nothing has been stored yet. */
    private static final int UNSET = Integer.MAX_VALUE;

    /** Extra slots reserved below the requested index when the base is lowered. */
    private static final int REBASE_HEADROOM = 10;

    /** Logical index that maps to position 0 of the backing storage. */
    int offset = UNSET;

    /**
     * Stores {@code element} at the logical position {@code index}, lowering
     * the base offset and/or growing the backing storage as required.
     *
     * @param index   logical index, must not be negative
     * @param element value to store
     * @return the value previously held at {@code index}, or {@code null}
     * @throws IndexOutOfBoundsException if {@code index} is negative
     */
    @Override
    public T set(int index, T element) {
        if (index < 0) {
            throw new IndexOutOfBoundsException("Index: " + index);
        }
        rebase(index);
        extend(index);
        return super.set(index - offset, element);
    }

    /**
     * Returns the element at the logical position {@code index}.
     *
     * @param index logical index
     * @return the stored element, or {@code null} when nothing has been
     *         stored yet or {@code index} lies below or above the populated
     *         window
     */
    @Override
    public T get(int index) {
        // An index below the base used to reach ArrayList.get() with a
        // negative position and throw; it is simply "not present".
        if (offset == UNSET || index < offset || index - offset >= size()) {
            return null;
        }
        return super.get(index - offset);
    }

    /**
     * Makes sure {@code index} is not below the base offset. The first call
     * just adopts {@code index} as the base; later calls lower the base
     * (leaving some headroom) and shift existing elements up by inserting
     * null padding at the front of the backing storage.
     */
    private void rebase(final int index) {
        if (index >= offset) {
            return;
        }
        if (offset == UNSET) {
            offset = index;
            return;
        }
        int newOffset = Math.max(0, index - REBASE_HEADROOM);
        int delta = offset - newOffset;
        // addAll(int, Collection) is inherited unchanged, so it works on
        // physical positions and never goes through the overridden
        // get/set (which would apply the stale offset).
        addAll(0, java.util.Collections.<T>nCopies(delta, null));
        offset = newOffset;
    }

    /**
     * Pads the backing storage with nulls until the slot for {@code index}
     * exists. Must be called after {@link #rebase(int)}.
     */
    private void extend(final int index) {
        int required = index - offset + 1;
        while (size() < required) {
            add(null);
        }
    }
}
} }

View File

@ -1,233 +1,235 @@
/* /*
Copyright 2010, Google Inc. Copyright 2010, Google Inc.
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
met: met:
* Redistributions of source code must retain the above copyright * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the in the documentation and/or other materials provided with the
distribution. distribution.
* Neither the name of Google Inc. nor the names of its * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from contributors may be used to endorse or promote products derived from
this software without specific prior written permission. this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
package com.google.refine.importers.tree; package com.google.refine.importers.tree;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.refine.importers.ImporterUtilities; import com.google.refine.importers.ImporterUtilities;
import com.google.refine.model.Cell; import com.google.refine.model.Cell;
import com.google.refine.model.Column; import com.google.refine.model.Column;
import com.google.refine.model.Project; import com.google.refine.model.Project;
public abstract class TreeImportUtilities { public abstract class TreeImportUtilities {
final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities"); final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities");
static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) { static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
Collections.sort(list, new Comparator<RecordElementCandidate>() { Collections.sort(list, new Comparator<RecordElementCandidate>() {
@Override @Override
public int compare(RecordElementCandidate o1, RecordElementCandidate o2) { public int compare(RecordElementCandidate o1, RecordElementCandidate o2) {
return o2.count - o1.count; return o2.count - o1.count;
} }
}); });
} }
static public void createColumnsFromImport( static public void createColumnsFromImport(
Project project, Project project,
ImportColumnGroup columnGroup ImportColumnGroup columnGroup
) { ) {
int startColumnIndex = project.columnModel.columns.size(); int startColumnIndex = project.columnModel.columns.size();
List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values()); List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values());
Collections.sort(columns, new Comparator<ImportColumn>() { Collections.sort(columns, new Comparator<ImportColumn>() {
@Override @Override
public int compare(ImportColumn o1, ImportColumn o2) { public int compare(ImportColumn o1, ImportColumn o2) {
if (o1.blankOnFirstRow != o2.blankOnFirstRow) { if (o1.blankOnFirstRow != o2.blankOnFirstRow) {
return o1.blankOnFirstRow ? 1 : -1; return o1.blankOnFirstRow ? 1 : -1;
} }
int c = o2.nonBlankCount - o1.nonBlankCount; int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length()); return c != 0 ? c : (o1.name.length() - o2.name.length());
} }
}); });
for (int i = 0; i < columns.size(); i++) { for (int i = 0; i < columns.size(); i++) {
ImportColumn c = columns.get(i); ImportColumn c = columns.get(i);
Column column = new com.google.refine.model.Column(c.cellIndex, c.name); Column column = new com.google.refine.model.Column(c.cellIndex, c.name);
project.columnModel.columns.add(column); project.columnModel.columns.add(column);
} }
List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values()); List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
Collections.sort(subgroups, new Comparator<ImportColumnGroup>() { Collections.sort(subgroups, new Comparator<ImportColumnGroup>() {
@Override @Override
public int compare(ImportColumnGroup o1, ImportColumnGroup o2) { public int compare(ImportColumnGroup o1, ImportColumnGroup o2) {
// TODO: We really want the column/group with the highest % of // TODO: We really want the column/group with the highest % of
// records with at least one row populated, so popular optional // records with at least one row populated, so popular optional
// elements with multiple instances per record don't // elements with multiple instances per record don't
// outweigh mandatory elements with a single occurrence per record // outweigh mandatory elements with a single occurrence per record
// TODO: From a human factors point of view, we probably want // TODO: From a human factors point of view, we probably want
// to try to preserve the order that we found things in the XML // to try to preserve the order that we found things in the XML
// Sort by most populated first, then shortest name // Sort by most populated first, then shortest name
int c = o2.nonBlankCount - o1.nonBlankCount; int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length()); return c != 0 ? c : (o1.name.length() - o2.name.length());
} }
}); });
for (ImportColumnGroup g : subgroups) { for (ImportColumnGroup g : subgroups) {
createColumnsFromImport(project, g); createColumnsFromImport(project, g);
} }
int endColumnIndex = project.columnModel.columns.size(); int endColumnIndex = project.columnModel.columns.size();
int span = endColumnIndex - startColumnIndex; int span = endColumnIndex - startColumnIndex;
if (span > 1 && span < project.columnModel.columns.size()) { if (span > 1 && span < project.columnModel.columns.size()) {
// TODO: Only use "key column" if it's 100% populated? // TODO: Only use "key column" if it's 100% populated?
project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex); project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex);
} }
} }
@Deprecated @Deprecated
static protected void addCell( static protected void addCell(
Project project, Project project,
ImportColumnGroup columnGroup, ImportColumnGroup columnGroup,
ImportRecord record, ImportRecord record,
String columnLocalName, String columnLocalName,
String text String text
) { ) {
addCell(project, columnGroup, record, columnLocalName, text, true, true); addCell(project, columnGroup, record, columnLocalName, text, true, true);
} }
static protected void addCell( static protected void addCell(
Project project, Project project,
ImportColumnGroup columnGroup, ImportColumnGroup columnGroup,
ImportRecord record, ImportRecord record,
String columnLocalName, String columnLocalName,
String text, String text,
boolean storeEmptyString, boolean storeEmptyString,
boolean guessDataType boolean guessDataType
) { ) {
Serializable value = text; Serializable value = text;
if (!storeEmptyString && (text == null || (text).isEmpty())) { if (!storeEmptyString && (text == null || (text).isEmpty())) {
return; return;
} }
if (guessDataType) { if (guessDataType) {
value = ImporterUtilities.parseCellValue(text); value = ImporterUtilities.parseCellValue(text);
} }
addCell(project, columnGroup, record, columnLocalName, value); addCell(project, columnGroup, record, columnLocalName, value);
} }
protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record, protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
String columnLocalName, Serializable value) { String columnLocalName, Serializable value) {
ImportColumn column = getColumn(project, columnGroup, columnLocalName); ImportColumn column = getColumn(project, columnGroup, columnLocalName);
int cellIndex = column.cellIndex; int cellIndex = column.cellIndex;
int rowIndex = Math.max(columnGroup.nextRowIndex, column.nextRowIndex); int rowIndex = Math.max(columnGroup.nextRowIndex, column.nextRowIndex);
while (rowIndex >= record.rows.size()) {
record.rows.add(new ArrayList<Cell>()); List<Cell> row = record.rows.get(rowIndex);
} if (row == null) {
row = new ArrayList<Cell>();
List<Cell> row = record.rows.get(rowIndex); record.rows.set(rowIndex, row);
while (cellIndex >= row.size()) { }
row.add(null);
} while (cellIndex >= row.size()) {
row.add(null);
row.set(cellIndex, new Cell(value, null)); }
column.nextRowIndex = rowIndex + 1; row.set(cellIndex, new Cell(value, null));
column.nonBlankCount++; // TODO: Only increment for first instance in record?
} column.nextRowIndex = rowIndex + 1;
column.nonBlankCount++; // TODO: Only increment for first instance in record?
}
static protected ImportColumn getColumn(
Project project,
ImportColumnGroup columnGroup, static protected ImportColumn getColumn(
String localName Project project,
) { ImportColumnGroup columnGroup,
if (columnGroup.columns.containsKey(localName)) { String localName
return columnGroup.columns.get(localName); ) {
} if (columnGroup.columns.containsKey(localName)) {
return columnGroup.columns.get(localName);
ImportColumn column = createColumn(project, columnGroup, localName); }
columnGroup.columns.put(localName, column);
ImportColumn column = createColumn(project, columnGroup, localName);
return column; columnGroup.columns.put(localName, column);
}
return column;
static protected ImportColumn createColumn( }
Project project,
ImportColumnGroup columnGroup, static protected ImportColumn createColumn(
String localName Project project,
) { ImportColumnGroup columnGroup,
ImportColumn newColumn = new ImportColumn(); String localName
) {
newColumn.name = columnGroup.name.length() == 0 ? ImportColumn newColumn = new ImportColumn();
(localName == null ? "Text" : localName) :
(localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); newColumn.name = columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
newColumn.cellIndex = project.columnModel.allocateNewCellIndex(); (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
newColumn.nextRowIndex = columnGroup.nextRowIndex;
newColumn.cellIndex = project.columnModel.allocateNewCellIndex();
return newColumn; newColumn.nextRowIndex = columnGroup.nextRowIndex;
}
return newColumn;
static protected ImportColumnGroup getColumnGroup( }
Project project,
ImportColumnGroup columnGroup, static protected ImportColumnGroup getColumnGroup(
String localName Project project,
) { ImportColumnGroup columnGroup,
if (columnGroup.subgroups.containsKey(localName)) { String localName
return columnGroup.subgroups.get(localName); ) {
} if (columnGroup.subgroups.containsKey(localName)) {
return columnGroup.subgroups.get(localName);
ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName); }
columnGroup.subgroups.put(localName, subgroup);
ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName);
return subgroup; columnGroup.subgroups.put(localName, subgroup);
}
return subgroup;
static protected ImportColumnGroup createColumnGroup( }
Project project,
ImportColumnGroup columnGroup, static protected ImportColumnGroup createColumnGroup(
String localName Project project,
) { ImportColumnGroup columnGroup,
ImportColumnGroup newGroup = new ImportColumnGroup(); String localName
) {
newGroup.name = ImportColumnGroup newGroup = new ImportColumnGroup();
columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) : newGroup.name =
(localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
newGroup.nextRowIndex = columnGroup.nextRowIndex; (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
return newGroup; newGroup.nextRowIndex = columnGroup.nextRowIndex;
}
return newGroup;
} }
}