Remove O(n^2) issue in tree importers - fixes #699

- Add sparse/based list implementation for ImportRecord
2013-03-23 12:02:51 -04:00 · 2013-03-23 12:02:51 -04:00 · 6b3592982e
commit 6b3592982e
parent f78dfadcf3
2 changed files with 297 additions and 234 deletions
--- a/main/src/com/google/refine/importers/tree/ImportRecord.java
+++ b/main/src/com/google/refine/importers/tree/ImportRecord.java
@ -10,5 +10,66 @@ import com.google.refine.model.Cell;
 *
 */
 public class ImportRecord {
-    public List<List<Cell>> rows = new ArrayList<List<Cell>>();
+    public List<List<Cell>> rows = new BasedList<List<Cell>>();
+
+    
+    /**
+     * A sparse {@link List} matching the access pattern of the import
+     * process: it stores only a window of slots whose first slot sits at
+     * logical index {@code offset}, so a small number of contiguous rows at a
+     * potentially large row index need no padding all the way from zero.
+     * <p>
+     * Only {@link #get(int)} and {@link #set(int, Object)} take logical
+     * indices. {@code size()}, iteration and every other inherited method
+     * still operate on the backing window and may expose {@code null}
+     * padding slots.
+     *
+     * @param <T> element type
+     */
+    class BasedList<T> extends ArrayList<T> {
+        private static final long serialVersionUID = 1L;
+
+        /** Logical index of backing slot 0; MAX_VALUE until the first set(). */
+        int offset = Integer.MAX_VALUE;
+
+        /**
+         * Stores {@code element} at logical {@code index}, growing the window
+         * downwards or upwards (padding with {@code null}) as needed.
+         *
+         * @param index logical index to write
+         * @param element value to store
+         * @return the value previously held at that index, or {@code null}
+         */
+        @Override
+        public T set(int index, T element) {
+            rebase(index);
+            extend(index);
+            return super.set(index - offset, element);
+        }
+
+        /**
+         * Reads the element at logical {@code index}.
+         *
+         * @param index logical index to read
+         * @return the stored element, or {@code null} when the index lies
+         *         below or above the stored window
+         */
+        @Override
+        public T get(int index) {
+            // The first test also covers the empty list, where offset is
+            // still MAX_VALUE, and keeps the subtraction from going negative.
+            if (index < offset || index - offset >= size()) {
+                return null;
+            }
+            return super.get(index - offset);
+        }
+
+        /** Lowers {@code offset} so that {@code index} falls inside the window. */
+        private void rebase(final int index) {
+            if (offset == Integer.MAX_VALUE) {
+                offset = index; // the first element defines the base
+            } else if (index < offset) {
+                int newOffset = Math.max(0, index - 10); // leave some extra room
+                // Insert the padding with one front insertion. This must go
+                // through super: the overridden get()/set() translate indices
+                // by the old offset and would re-enter rebase().
+                super.addAll(0, java.util.Collections.<T>nCopies(offset - newOffset, null));
+                offset = newOffset;
+            }
+        }
+
+        /** Pads the top of the window with nulls until {@code index} fits. */
+        private void extend(final int index) {
+            int i = index - offset;
+            while (i >= size()) {
+                add(null);
+            }
+        }
+    }
 }
--- a/main/src/com/google/refine/importers/tree/TreeImportUtilities.java
+++ b/main/src/com/google/refine/importers/tree/TreeImportUtilities.java
@ -1,233 +1,235 @@
-/*
-
-Copyright 2010, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-    * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,           
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY           
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-package com.google.refine.importers.tree;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.refine.importers.ImporterUtilities;
-import com.google.refine.model.Cell;
-import com.google.refine.model.Column;
-import com.google.refine.model.Project;
-
-public abstract class TreeImportUtilities {
-    final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities");
-
-    static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
-        Collections.sort(list, new Comparator<RecordElementCandidate>() {
-            @Override
-            public int compare(RecordElementCandidate o1, RecordElementCandidate o2) {
-                return o2.count - o1.count;
-            }
-        });
-    }
-
-    static public void createColumnsFromImport(
-            Project project,
-            ImportColumnGroup columnGroup
-    ) {
-        int startColumnIndex = project.columnModel.columns.size();
-
-        List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values());
-        Collections.sort(columns, new Comparator<ImportColumn>() {
-            @Override
-            public int compare(ImportColumn o1, ImportColumn o2) {
-                if (o1.blankOnFirstRow != o2.blankOnFirstRow) {
-                    return o1.blankOnFirstRow ? 1 : -1;
-                }
-
-                int c = o2.nonBlankCount - o1.nonBlankCount;
-                return c != 0 ? c : (o1.name.length() - o2.name.length());
-            }
-        });
-
-        for (int i = 0; i < columns.size(); i++) {
-            ImportColumn c = columns.get(i);
-
-            Column column = new com.google.refine.model.Column(c.cellIndex, c.name);
-            project.columnModel.columns.add(column);
-        }
-
-        List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
-        Collections.sort(subgroups, new Comparator<ImportColumnGroup>() {
-            @Override
-            public int compare(ImportColumnGroup o1, ImportColumnGroup o2) {
-                // TODO: We really want the column/group with the highest % of 
-                // records with at least one row populated, so popular optional
-                // elements with multiple instances per record don't 
-                // outweigh mandatory elements with a single occurrence per record
-                // TODO: From a human factors point of view, we probably want
-                // to try to preserve the order that we found things in the XML
-                
-                // Sort by most populated first, then shortest name
-                int c = o2.nonBlankCount - o1.nonBlankCount;
-                return c != 0 ? c : (o1.name.length() - o2.name.length());
-            }
-        });
-
-        for (ImportColumnGroup g : subgroups) {
-            createColumnsFromImport(project, g);
-        }
-
-        int endColumnIndex = project.columnModel.columns.size();
-        int span = endColumnIndex - startColumnIndex;
-        if (span > 1 && span < project.columnModel.columns.size()) {
-            // TODO: Only use "key column" if it's 100% populated?
-            project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex);
-        }
-    }
-
-    @Deprecated
-    static protected void addCell(
-            Project project,
-            ImportColumnGroup columnGroup,
-            ImportRecord record,
-            String columnLocalName,
-            String text
-    ) {
-        addCell(project, columnGroup, record, columnLocalName, text, true, true);
-    }
-    
-    static protected void addCell(
-            Project project,
-            ImportColumnGroup columnGroup,
-            ImportRecord record,
-            String columnLocalName,
-            String text,
-            boolean storeEmptyString,
-            boolean guessDataType
-    ) {
-        Serializable value = text;
-        if (!storeEmptyString && (text == null || (text).isEmpty())) {
-            return;
-        }
-        if (guessDataType) {
-            value = ImporterUtilities.parseCellValue(text); 
-        }
-        addCell(project, columnGroup, record, columnLocalName, value);
-    }
-
-    protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
-            String columnLocalName, Serializable value) {
-        ImportColumn column = getColumn(project, columnGroup, columnLocalName);
-        int cellIndex = column.cellIndex;
-
-        int rowIndex = Math.max(columnGroup.nextRowIndex, column.nextRowIndex);
-        while (rowIndex >= record.rows.size()) {
-            record.rows.add(new ArrayList<Cell>());
-        }
-
-        List<Cell> row = record.rows.get(rowIndex);
-        while (cellIndex >= row.size()) {
-            row.add(null);
-        }
-
-        row.set(cellIndex, new Cell(value, null));
-
-        column.nextRowIndex = rowIndex + 1;
-        column.nonBlankCount++; // TODO: Only increment for first instance in record?
-    }
-
-
-    static protected ImportColumn getColumn(
-            Project project,
-            ImportColumnGroup columnGroup,
-            String localName
-    ) {
-        if (columnGroup.columns.containsKey(localName)) {
-            return columnGroup.columns.get(localName);
-        }
-
-        ImportColumn column = createColumn(project, columnGroup, localName);
-        columnGroup.columns.put(localName, column);
-
-        return column;
-    }
-
-    static protected ImportColumn createColumn(
-            Project project,
-            ImportColumnGroup columnGroup,
-            String localName
-    ) {
-        ImportColumn newColumn = new ImportColumn();
-
-        newColumn.name = columnGroup.name.length() == 0 ?
-                (localName == null ? "Text" : localName) :
-                    (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
-
-        newColumn.cellIndex = project.columnModel.allocateNewCellIndex();
-        newColumn.nextRowIndex = columnGroup.nextRowIndex;
-
-        return newColumn;
-    }
-
-    static protected ImportColumnGroup getColumnGroup(
-            Project project,
-            ImportColumnGroup columnGroup,
-            String localName
-    ) {
-        if (columnGroup.subgroups.containsKey(localName)) {
-            return columnGroup.subgroups.get(localName);
-        }
-
-        ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName);
-        columnGroup.subgroups.put(localName, subgroup);
-
-        return subgroup;
-    }
-
-    static protected ImportColumnGroup createColumnGroup(
-            Project project,
-            ImportColumnGroup columnGroup,
-            String localName
-    ) {
-        ImportColumnGroup newGroup = new ImportColumnGroup();
-
-        newGroup.name =
-            columnGroup.name.length() == 0 ?
-                    (localName == null ? "Text" : localName) :
-                        (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
-
-        newGroup.nextRowIndex = columnGroup.nextRowIndex;
-
-        return newGroup;
-    }
-        
-}
+/*
+
+Copyright 2010, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,           
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY           
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+package com.google.refine.importers.tree;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.refine.importers.ImporterUtilities;
+import com.google.refine.model.Cell;
+import com.google.refine.model.Column;
+import com.google.refine.model.Project;
+
+public abstract class TreeImportUtilities {
+    final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities");
+
+    /**
+     * Sorts the given candidates in place so that the one with the highest
+     * {@code count} comes first.
+     *
+     * @param list the candidates to reorder
+     */
+    static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
+        Comparator<RecordElementCandidate> byDescendingCount = new Comparator<RecordElementCandidate>() {
+            @Override
+            public int compare(RecordElementCandidate left, RecordElementCandidate right) {
+                return right.count - left.count;
+            }
+        };
+        Collections.sort(list, byDescendingCount);
+    }
+
+    /**
+     * Appends one project column per import column of {@code columnGroup},
+     * then recurses into each of its subgroups. If the columns added by this
+     * call number more than one but fewer than all columns in the project, a
+     * column group is registered over that span, passing the span's first
+     * index as the third argument of {@code addColumnGroup}.
+     *
+     * @param project the project whose column model is extended
+     * @param columnGroup the import column group to convert
+     */
+    static public void createColumnsFromImport(Project project, ImportColumnGroup columnGroup) {
+        final int firstNewColumn = project.columnModel.columns.size();
+
+        List<ImportColumn> orderedColumns = new ArrayList<ImportColumn>(columnGroup.columns.values());
+        Collections.sort(orderedColumns, new Comparator<ImportColumn>() {
+            @Override
+            public int compare(ImportColumn left, ImportColumn right) {
+                // Columns whose blankOnFirstRow flag is clear sort first ...
+                if (left.blankOnFirstRow != right.blankOnFirstRow) {
+                    return left.blankOnFirstRow ? 1 : -1;
+                }
+                // ... then higher nonBlankCount, then shorter name
+                int byCount = right.nonBlankCount - left.nonBlankCount;
+                if (byCount != 0) {
+                    return byCount;
+                }
+                return left.name.length() - right.name.length();
+            }
+        });
+        for (ImportColumn importColumn : orderedColumns) {
+            project.columnModel.columns.add(new Column(importColumn.cellIndex, importColumn.name));
+        }
+
+        List<ImportColumnGroup> orderedSubgroups =
+                new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
+        Collections.sort(orderedSubgroups, new Comparator<ImportColumnGroup>() {
+            @Override
+            public int compare(ImportColumnGroup left, ImportColumnGroup right) {
+                // TODO: We really want the column/group with the highest % of
+                // records with at least one row populated, so popular optional
+                // elements with multiple instances per record don't
+                // outweigh mandatory elements with a single occurrence per record
+                // TODO: From a human factors point of view, we probably want
+                // to try to preserve the order that we found things in the XML
+
+                // Most populated first, ties broken by shortest name
+                int byCount = right.nonBlankCount - left.nonBlankCount;
+                if (byCount != 0) {
+                    return byCount;
+                }
+                return left.name.length() - right.name.length();
+            }
+        });
+        for (ImportColumnGroup subgroup : orderedSubgroups) {
+            createColumnsFromImport(project, subgroup);
+        }
+
+        final int columnCount = project.columnModel.columns.size();
+        final int span = columnCount - firstNewColumn;
+        if (span > 1 && span < columnCount) {
+            // TODO: Only use "key column" if it's 100% populated?
+            project.columnModel.addColumnGroup(firstNewColumn, span, firstNewColumn);
+        }
+    }
+
+    /**
+     * Convenience overload that stores empty strings and guesses the data
+     * type of {@code text}.
+     *
+     * @deprecated delegates to the seven-argument overload with both flags
+     *             set to {@code true}; call that overload directly.
+     */
+    @Deprecated
+    static protected void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
+            String columnLocalName, String text) {
+        final boolean storeEmptyString = true;
+        final boolean guessDataType = true;
+        addCell(project, columnGroup, record, columnLocalName, text, storeEmptyString, guessDataType);
+    }
+    
+    /**
+     * Adds a cell for {@code text} to the record, optionally skipping null or
+     * empty text and optionally converting the text before it is stored.
+     *
+     * @param project the project being imported into
+     * @param columnGroup the column group that owns the target column
+     * @param record the record receiving the cell
+     * @param columnLocalName local name used to look up the target column
+     * @param text the raw cell text
+     * @param storeEmptyString when {@code false}, null or empty text is ignored
+     * @param guessDataType when {@code true}, the stored value is the result of
+     *        {@code ImporterUtilities.parseCellValue(text)} rather than the raw text
+     */
+    static protected void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
+            String columnLocalName, String text, boolean storeEmptyString, boolean guessDataType) {
+        boolean blank = text == null || text.isEmpty();
+        if (blank && !storeEmptyString) {
+            return;
+        }
+        // Declared as Serializable so the Serializable overload is selected below.
+        Serializable cellValue = text;
+        if (guessDataType) {
+            cellValue = ImporterUtilities.parseCellValue(text);
+        }
+        addCell(project, columnGroup, record, columnLocalName, cellValue);
+    }
+
+    /**
+     * Stores {@code value} as a new {@link Cell} in the record, in the column
+     * looked up (or created) under {@code columnLocalName}.
+     * <p>
+     * The row used is the larger of the group's and the column's
+     * {@code nextRowIndex}. The row's cell list is created on demand and
+     * padded with nulls up to the column's cell index. Afterwards the
+     * column's {@code nextRowIndex} points past that row and its
+     * {@code nonBlankCount} is incremented.
+     */
+    protected static void addCell(Project project, ImportColumnGroup columnGroup, ImportRecord record,
+            String columnLocalName, Serializable value) {
+        final ImportColumn targetColumn = getColumn(project, columnGroup, columnLocalName);
+        final int targetCell = targetColumn.cellIndex;
+        final int targetRow = Math.max(columnGroup.nextRowIndex, targetColumn.nextRowIndex);
+
+        List<Cell> cells = record.rows.get(targetRow);
+        if (null == cells) {
+            // Nothing stored for this row yet: create it and register it
+            cells = new ArrayList<Cell>();
+            record.rows.set(targetRow, cells);
+        }
+
+        // Pad with nulls so that index targetCell exists
+        while (cells.size() <= targetCell) {
+            cells.add(null);
+        }
+        cells.set(targetCell, new Cell(value, null));
+
+        targetColumn.nextRowIndex = targetRow + 1;
+        targetColumn.nonBlankCount++; // TODO: Only increment for first instance in record?
+    }
+
+
+    /**
+     * Returns the import column registered under {@code localName} in
+     * {@code columnGroup}, creating and registering it first when the group
+     * has no entry for that name.
+     */
+    static protected ImportColumn getColumn(Project project, ImportColumnGroup columnGroup, String localName) {
+        if (!columnGroup.columns.containsKey(localName)) {
+            ImportColumn created = createColumn(project, columnGroup, localName);
+            columnGroup.columns.put(localName, created);
+            return created;
+        }
+        return columnGroup.columns.get(localName);
+    }
+
+    /**
+     * Builds a new, unregistered import column for {@code columnGroup}.
+     * <p>
+     * The name is {@code localName} (or "Text" when it is null) for a group
+     * with an empty name; otherwise it is the group name, followed by
+     * " - " and {@code localName} when that is non-null. The column gets a
+     * freshly allocated cell index and starts at the group's
+     * {@code nextRowIndex}.
+     */
+    static protected ImportColumn createColumn(Project project, ImportColumnGroup columnGroup, String localName) {
+        ImportColumn created = new ImportColumn();
+
+        final String groupName = columnGroup.name;
+        if (groupName.length() == 0) {
+            created.name = (localName == null) ? "Text" : localName;
+        } else if (localName == null) {
+            created.name = groupName;
+        } else {
+            created.name = groupName + " - " + localName;
+        }
+
+        created.cellIndex = project.columnModel.allocateNewCellIndex();
+        created.nextRowIndex = columnGroup.nextRowIndex;
+        return created;
+    }
+
+    /**
+     * Returns the subgroup registered under {@code localName} in
+     * {@code columnGroup}, creating and registering it first when the group
+     * has no entry for that name.
+     */
+    static protected ImportColumnGroup getColumnGroup(Project project, ImportColumnGroup columnGroup,
+            String localName) {
+        if (!columnGroup.subgroups.containsKey(localName)) {
+            ImportColumnGroup created = createColumnGroup(project, columnGroup, localName);
+            columnGroup.subgroups.put(localName, created);
+            return created;
+        }
+        return columnGroup.subgroups.get(localName);
+    }
+
+    /**
+     * Builds a new, unregistered subgroup of {@code columnGroup}.
+     * <p>
+     * The name is {@code localName} (or "Text" when it is null) for a parent
+     * with an empty name; otherwise it is the parent name, followed by
+     * " - " and {@code localName} when that is non-null. The subgroup starts
+     * at the parent's {@code nextRowIndex}.
+     */
+    static protected ImportColumnGroup createColumnGroup(Project project, ImportColumnGroup columnGroup,
+            String localName) {
+        ImportColumnGroup created = new ImportColumnGroup();
+
+        final String parentName = columnGroup.name;
+        if (parentName.length() == 0) {
+            created.name = (localName == null) ? "Text" : localName;
+        } else if (localName == null) {
+            created.name = parentName;
+        } else {
+            created.name = parentName + " - " + localName;
+        }
+
+        created.nextRowIndex = columnGroup.nextRowIndex;
+        return created;
+    }
+        
+}