Got some work done on the plane:

- better detection of record XML elements in XML importer
- XML importer creates column groups and data table view renders them


git-svn-id: http://google-refine.googlecode.com/svn/trunk@356 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-03-27 05:23:09 +00:00
parent 2a9fbd7d81
commit 1d0e6abaf8
4 changed files with 261 additions and 71 deletions

View File

@ -17,6 +17,7 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader; import javax.xml.stream.XMLStreamReader;
import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.ColumnGroup;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Column;
@ -27,16 +28,38 @@ public class XmlImportUtilities {
int count; int count;
} }
static public class ImportColumnGroup { static protected abstract class ImportVertical {
public String name = ""; public String name = "";
public int nonBlankCount;
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>(); abstract void tabulate();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
} }
static public class ImportColumn { static public class ImportColumnGroup extends ImportVertical {
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
/**
 * Rolls the non-blank counts of this group's children up into this group.
 *
 * Every direct column and every subgroup is tabulated first, and this
 * group's nonBlankCount is then raised to the largest count seen among
 * them. The count is never lowered: a value already stored on this group
 * is kept if it exceeds every child's count.
 */
@Override
void tabulate() {
    for (ImportColumn column : columns.values()) {
        column.tabulate();
        if (column.nonBlankCount > nonBlankCount) {
            nonBlankCount = column.nonBlankCount;
        }
    }

    for (ImportColumnGroup subgroup : subgroups.values()) {
        // Recurse first so the subgroup's own count is up to date before it is compared.
        subgroup.tabulate();
        if (subgroup.nonBlankCount > nonBlankCount) {
            nonBlankCount = subgroup.nonBlankCount;
        }
    }
}
}
static public class ImportColumn extends ImportVertical {
public int cellIndex; public int cellIndex;
public String name; public boolean blankOnFirstRow;
/**
 * Intentionally does nothing for a single column.
 *
 * A column's nonBlankCount is incremented directly in addCell each time a
 * cell is stored for it, so there is nothing left to aggregate here.
 */
@Override
void tabulate() {
// already done the tabulation elsewhere
}
} }
static public class ImportRecord { static public class ImportRecord {
@ -44,6 +67,58 @@ public class XmlImportUtilities {
List<Integer> columnEmptyRowIndices = new ArrayList<Integer>(); List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
} }
/**
 * Finds the path of element names leading to the first element whose name
 * matches the given tag.
 *
 * The stream is parsed with StAX; at each start element the search is
 * delegated to detectRecordElement(XMLStreamReader, String), which descends
 * into that element.
 *
 * @param inputStream the XML content to scan; it is not closed by this method
 * @param tag the element name to look for, either a bare local name or a
 *            prefixed name of the form "prefix:local"
 * @return the local names from the outermost scanned element down to the
 *         matching element, or null if no element matches or the content
 *         cannot be parsed
 */
static public String[] detectPathFromTag(InputStream inputStream, String tag) {
    // NOTE(review): the factory is used with its default settings. If the
    // XML can come from untrusted sources, consider disabling external
    // entity resolution here and in the other parsers of this class.
    XMLStreamReader parser = null;
    try {
        parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);

        while (parser.hasNext()) {
            if (parser.next() == XMLStreamConstants.START_ELEMENT) {
                List<String> path = detectRecordElement(parser, tag);
                if (path != null) {
                    return path.toArray(new String[path.size()]);
                }
            }
        }
    } catch (Exception e) {
        // Deliberately best-effort: detection is allowed to fail quietly and
        // report "not found". Presumably callers may hand in only a prefix
        // of a document, in which case a parse error at the cut-off point
        // is expected -- verify against callers before tightening this.
    } finally {
        if (parser != null) {
            try {
                // Releases the reader's resources; per the StAX contract this
                // does not close the underlying input stream.
                parser.close();
            } catch (XMLStreamException ignored) {
                // Nothing useful can be done if releasing the reader fails.
            }
        }
    }
    return null;
}
/**
 * Searches the element the parser is currently positioned on, and then its
 * descendants, for an element whose name matches the given tag.
 *
 * The parser must be positioned on a START_ELEMENT event. When no match is
 * found, the parser has been advanced up to the END_ELEMENT event that
 * stopped the scan (or to the end of the stream).
 *
 * @param parser the reader, positioned on the start of the element to examine
 * @param tag the name to match, compared against both the local name and
 *            the prefixed name of each element
 * @return a mutable list of local names from the current element down to the
 *         matching one, or null if nothing under the current element matches
 * @throws XMLStreamException if the underlying parser fails
 */
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
    final String elementName = parser.getLocalName();
    final String qualifiedName = composeName(parser.getPrefix(), elementName);

    if (tag.equals(elementName) || tag.equals(qualifiedName)) {
        List<String> found = new LinkedList<String>();
        found.add(elementName);
        return found;
    }

    while (parser.hasNext()) {
        int event = parser.next();
        if (event == XMLStreamConstants.END_ELEMENT) {
            // The current element is closed without a match.
            return null;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }

        List<String> childPath = detectRecordElement(parser, tag);
        if (childPath == null) {
            continue;
        }

        // Prepend this element so the path reads from the outside in.
        childPath.add(0, elementName);
        return childPath;
    }
    return null;
}
static public String[] detectRecordElement(InputStream inputStream) { static public String[] detectRecordElement(InputStream inputStream) {
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>(); List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
@ -77,19 +152,30 @@ public class XmlImportUtilities {
} }
static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) { static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) {
List<RecordElementCandidate> candidateList = new ArrayList<RecordElementCandidate>(); List<RecordElementCandidate> descendantCandidates = new ArrayList<RecordElementCandidate>();
Map<String, Integer> candidates = new HashMap<String, Integer>(); Map<String, Integer> immediateChildCandidateMap = new HashMap<String, Integer>();
int textNodeCount = 0;
int childElementNodeCount = 0;
try { try {
while (parser.hasNext()) { while (parser.hasNext()) {
int eventType = parser.next(); int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) { if (eventType == XMLStreamConstants.END_ELEMENT) {
break; break;
} else if (eventType == XMLStreamConstants.CHARACTERS) {
if (parser.getText().trim().length() > 0) {
textNodeCount++;
}
} else if (eventType == XMLStreamConstants.START_ELEMENT) { } else if (eventType == XMLStreamConstants.START_ELEMENT) {
childElementNodeCount++;
String tagName = parser.getLocalName(); String tagName = parser.getLocalName();
candidates.put(tagName, candidates.containsKey(tagName) ? candidates.get(tagName) + 1 : 1); immediateChildCandidateMap.put(
tagName,
immediateChildCandidateMap.containsKey(tagName) ?
immediateChildCandidateMap.get(tagName) + 1 : 1);
String[] path2 = new String[path.length + 1]; String[] path2 = new String[path.length + 1];
System.arraycopy(path, 0, path2, 0, path.length); System.arraycopy(path, 0, path2, 0, path.length);
@ -97,7 +183,7 @@ public class XmlImportUtilities {
RecordElementCandidate c = detectRecordElement(parser, path2); RecordElementCandidate c = detectRecordElement(parser, path2);
if (c != null) { if (c != null) {
candidateList.add(c); descendantCandidates.add(c);
} }
} }
} }
@ -106,9 +192,14 @@ public class XmlImportUtilities {
// e.printStackTrace(); // e.printStackTrace();
} }
if (candidates.size() > 0) { if (textNodeCount > 0 && childElementNodeCount > 0) {
List<RecordElementCandidate> ourCandidateList = new ArrayList<RecordElementCandidate>(candidates.size()); // This is a mixed element
for (Entry<String, Integer> entry : candidates.entrySet()) { return null;
}
if (immediateChildCandidateMap.size() > 0) {
List<RecordElementCandidate> immediateChildCandidates = new ArrayList<RecordElementCandidate>(immediateChildCandidateMap.size());
for (Entry<String, Integer> entry : immediateChildCandidateMap.entrySet()) {
int count = entry.getValue(); int count = entry.getValue();
if (count > 1) { if (count > 1) {
String[] path2 = new String[path.length + 1]; String[] path2 = new String[path.length + 1];
@ -118,26 +209,32 @@ public class XmlImportUtilities {
RecordElementCandidate candidate = new RecordElementCandidate(); RecordElementCandidate candidate = new RecordElementCandidate();
candidate.path = path2; candidate.path = path2;
candidate.count = count; candidate.count = count;
ourCandidateList.add(candidate); immediateChildCandidates.add(candidate);
} }
} }
if (ourCandidateList.size() > 0) { if (immediateChildCandidates.size() > 0 && immediateChildCandidates.size() < 5) {
sortRecordElementCandidates(ourCandidateList); // There are some promising immediate child elements, but not many,
// that can serve as record elements.
RecordElementCandidate ourCandidate = ourCandidateList.get(0); sortRecordElementCandidates(immediateChildCandidates);
if (ourCandidate.count > 10) {
RecordElementCandidate ourCandidate = immediateChildCandidates.get(0);
if (ourCandidate.count / immediateChildCandidates.size() > 5) {
return ourCandidate; return ourCandidate;
} }
candidateList.add(ourCandidate); descendantCandidates.add(ourCandidate);
} }
} }
if (candidateList.size() > 0) { if (descendantCandidates.size() > 0) {
sortRecordElementCandidates(candidateList); sortRecordElementCandidates(descendantCandidates);
return candidateList.get(0); RecordElementCandidate candidate = descendantCandidates.get(0);
if (candidate.count / descendantCandidates.size() > 5) {
return candidate;
}
} }
return null; return null;
@ -175,14 +272,44 @@ public class XmlImportUtilities {
Project project, Project project,
ImportColumnGroup columnGroup ImportColumnGroup columnGroup
) { ) {
for (ImportColumn c : columnGroup.columns.values()) { int startColumnIndex = project.columnModel.columns.size();
List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values());
Collections.sort(columns, new Comparator<ImportColumn>() {
public int compare(ImportColumn o1, ImportColumn o2) {
if (o1.blankOnFirstRow != o2.blankOnFirstRow) {
return o1.blankOnFirstRow ? 1 : -1;
}
int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (int i = 0; i < columns.size(); i++) {
ImportColumn c = columns.get(i);
Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name); Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name);
project.columnModel.columns.add(column); project.columnModel.columns.add(column);
} }
for (ImportColumnGroup g : columnGroup.subgroups.values()) { List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
Collections.sort(subgroups, new Comparator<ImportColumnGroup>() {
public int compare(ImportColumnGroup o1, ImportColumnGroup o2) {
int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (ImportColumnGroup g : subgroups) {
createColumnsFromImport(project, g); createColumnsFromImport(project, g);
} }
int endColumnIndex = project.columnModel.columns.size();
int span = endColumnIndex - startColumnIndex;
if (span > 1 && span < project.columnModel.columns.size()) {
project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex);
}
} }
static protected void findRecord( static protected void findRecord(
@ -231,28 +358,39 @@ public class XmlImportUtilities {
processSubRecord(project, parser, rootColumnGroup, record); processSubRecord(project, parser, rootColumnGroup, record);
for (List<Cell> row : record.rows) { if (record.rows.size() > 0) {
Row realRow = new Row(row.size()); for (List<Cell> row : record.rows) {
Row realRow = new Row(row.size());
for (int c = 0; c < row.size(); c++) {
Cell cell = row.get(c); for (int c = 0; c < row.size(); c++) {
if (cell != null) { Cell cell = row.get(c);
realRow.setCell(c, cell); if (cell != null) {
realRow.setCell(c, cell);
}
} }
project.rows.add(realRow);
} }
project.rows.add(realRow);
} }
} }
/**
 * Builds a display name for an XML element or attribute.
 *
 * @param prefix the namespace prefix; may be null or empty
 * @param localName the local part of the name
 * @return "prefix:localName" when a non-empty prefix is given, otherwise
 *         localName unchanged
 */
static protected String composeName(String prefix, String localName) {
    if (prefix == null || prefix.length() == 0) {
        return localName;
    }
    return prefix + ":" + localName;
}
static protected void processSubRecord( static protected void processSubRecord(
Project project, Project project,
XMLStreamReader parser, XMLStreamReader parser,
ImportColumnGroup columnGroup, ImportColumnGroup columnGroup,
ImportRecord record ImportRecord record
) throws XMLStreamException { ) throws XMLStreamException {
ImportColumnGroup thisColumnGroup = getColumnGroup(
project,
columnGroup,
composeName(parser.getPrefix(), parser.getLocalName()));
int commonStartingRowIndex = 0; int commonStartingRowIndex = 0;
for (ImportColumn column : columnGroup.columns.values()) { for (ImportColumn column : thisColumnGroup.columns.values()) {
if (column.cellIndex < record.columnEmptyRowIndices.size()) { if (column.cellIndex < record.columnEmptyRowIndices.size()) {
commonStartingRowIndex = Math.max( commonStartingRowIndex = Math.max(
commonStartingRowIndex, commonStartingRowIndex,
@ -260,43 +398,58 @@ public class XmlImportUtilities {
} }
} }
int attributeCount = parser.getAttributeCount();
for (int i = 0; i < attributeCount; i++) {
String text = parser.getAttributeValue(i).trim();
if (text.length() > 0) {
addCell(
project,
thisColumnGroup,
record,
composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)),
text,
commonStartingRowIndex
);
}
}
while (parser.hasNext()) { while (parser.hasNext()) {
int eventType = parser.next(); int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) { if (eventType == XMLStreamConstants.START_ELEMENT) {
ImportColumnGroup thisColumnGroup = getColumnGroup(project, columnGroup, parser.getLocalName());
int attributeCount = parser.getAttributeCount();
for (int i = 0; i < attributeCount; i++) {
addCell(
project,
thisColumnGroup,
record,
parser.getAttributeLocalName(i),
parser.getAttributeValue(i),
commonStartingRowIndex
);
}
processSubRecord( processSubRecord(
project, project,
parser, parser,
thisColumnGroup, thisColumnGroup,
record record
); );
} else if (eventType == XMLStreamConstants.CDATA || } else if (//eventType == XMLStreamConstants.CDATA ||
eventType == XMLStreamConstants.CHARACTERS) { eventType == XMLStreamConstants.CHARACTERS) {
addCell( String text = parser.getText().trim();
project, if (text.length() > 0) {
columnGroup, addCell(
record, project,
null, thisColumnGroup,
parser.getText(), record,
commonStartingRowIndex null,
); parser.getText(),
commonStartingRowIndex
);
}
} else if (eventType == XMLStreamConstants.END_ELEMENT) { } else if (eventType == XMLStreamConstants.END_ELEMENT) {
break; break;
} }
} }
if (commonStartingRowIndex < record.rows.size()) {
List<Cell> startingRow = record.rows.get(commonStartingRowIndex);
for (ImportColumn c : thisColumnGroup.columns.values()) {
int cellIndex = c.cellIndex;
if (cellIndex >= startingRow.size() || startingRow.get(cellIndex) == null) {
c.blankOnFirstRow = true;
}
}
}
} }
static protected void addCell( static protected void addCell(
@ -304,13 +457,15 @@ public class XmlImportUtilities {
ImportColumnGroup columnGroup, ImportColumnGroup columnGroup,
ImportRecord record, ImportRecord record,
String columnLocalName, String columnLocalName,
Serializable value, String text,
int commonStaringRowIndex int commonStaringRowIndex
) { ) {
if (value == null || (value instanceof String && ((String) value).isEmpty())) { if (text == null || ((String) text).isEmpty()) {
return; return;
} }
Serializable value = ImporterUtilities.parseCellValue(text);
ImportColumn column = getColumn(project, columnGroup, columnLocalName); ImportColumn column = getColumn(project, columnGroup, columnLocalName);
int cellIndex = column.cellIndex; int cellIndex = column.cellIndex;
@ -331,6 +486,8 @@ public class XmlImportUtilities {
row.set(cellIndex, new Cell(value, null)); row.set(cellIndex, new Cell(value, null));
record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1); record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1);
column.nonBlankCount++;
} }
static protected ImportColumn getColumn( static protected ImportColumn getColumn(

View File

@ -23,9 +23,13 @@ public class XmlImporter implements Importer {
throw new NotImplementedException(); throw new NotImplementedException();
} }
public void read(InputStream inputStream, Project project, public void read(
Properties options, int skip, int limit) throws Exception { InputStream inputStream,
Project project,
Properties options,
int skip,
int limit
) throws Exception {
BufferedInputStream bis = new BufferedInputStream(inputStream); BufferedInputStream bis = new BufferedInputStream(inputStream);
String[] recordPath = null; String[] recordPath = null;
@ -36,14 +40,22 @@ public class XmlImporter implements Importer {
int c = bis.read(buffer); int c = bis.read(buffer);
bis.reset(); bis.reset();
recordPath = XmlImportUtilities.detectRecordElement( if (options.containsKey("importer-record-tag")) {
new ByteArrayInputStream(buffer, 0, c)); recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, c),
options.getProperty("importer-record-tag"));
} else {
recordPath = XmlImportUtilities.detectRecordElement(
new ByteArrayInputStream(buffer, 0, c));
}
} }
ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
XmlImportUtilities.importXml(bis, project, recordPath, rootColumnGroup); XmlImportUtilities.importXml(bis, project, recordPath, rootColumnGroup);
XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup);
project.columnModel.update();
} }
} }

View File

@ -53,9 +53,27 @@ public class ColumnModel implements Jsonizable {
public int getKeyColumnIndex() { public int getKeyColumnIndex() {
return _keyColumnIndex; return _keyColumnIndex;
} }
/**
 * Registers a column group covering the given range of columns.
 *
 * If a group with the same start index and span already exists and has the
 * same key column, nothing changes. If it exists with a different key
 * column, it is removed and a new group with the requested key column is
 * appended in its place.
 *
 * @param startColumnIndex index of the first column in the group
 * @param span number of columns the group covers
 * @param keyColumnIndex index of the group's key column
 */
public void addColumnGroup(int startColumnIndex, int span, int keyColumnIndex) {
    // Only the first group with a matching range is considered.
    ColumnGroup existing = null;
    for (ColumnGroup candidate : columnGroups) {
        if (candidate.startColumnIndex == startColumnIndex && candidate.columnSpan == span) {
            existing = candidate;
            break;
        }
    }

    if (existing != null) {
        if (existing.keyColumnIndex == keyColumnIndex) {
            return;
        }
        columnGroups.remove(existing);
    }

    columnGroups.add(new ColumnGroup(startColumnIndex, span, keyColumnIndex));
}
public void update() { public void update() {
generateMaps(); internalInitialize();
} }
public Column getColumnByName(String name) { public Column getColumnByName(String name) {

View File

@ -161,7 +161,9 @@ DataTableView.prototype._renderDataTable = function(table) {
c += (columnGroup.columnSpan - 1); c += (columnGroup.columnSpan - 1);
nextLayer = nextLayer.concat(columnGroup.subgroups); if ("subgroups" in columnGroup) {
nextLayer = nextLayer.concat(columnGroup.subgroups);
}
} }
} }
} }
@ -172,12 +174,13 @@ DataTableView.prototype._renderDataTable = function(table) {
renderColumnGroups(nextLayer, []); renderColumnGroups(nextLayer, []);
} }
}; };
/*
renderColumnGroups( if (columnGroups.length > 0) {
columnGroups, renderColumnGroups(
[ theProject.columnModel.keyCellIndex ] columnGroups,
); [ theProject.columnModel.keyCellIndex ]
*/ );
}
/*------------------------------------------------------------ /*------------------------------------------------------------
* Column Headers with Menus * Column Headers with Menus