Got some work done on the plane:

- better detection of record XML elements in XML importer
- XML importer creates column groups and data table view renders them


git-svn-id: http://google-refine.googlecode.com/svn/trunk@356 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-03-27 05:23:09 +00:00
parent 2a9fbd7d81
commit 1d0e6abaf8
4 changed files with 261 additions and 71 deletions

View File

@ -17,6 +17,7 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.ColumnGroup;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Column;
@ -27,16 +28,38 @@ public class XmlImportUtilities {
int count;
}
static public class ImportColumnGroup {
static protected abstract class ImportVertical {
public String name = "";
public int nonBlankCount;
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
abstract void tabulate();
}
static public class ImportColumn {
static public class ImportColumnGroup extends ImportVertical {
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
@Override
void tabulate() {
for (ImportColumn c : columns.values()) {
c.tabulate();
nonBlankCount = Math.max(nonBlankCount, c.nonBlankCount);
}
for (ImportColumnGroup g : subgroups.values()) {
g.tabulate();
nonBlankCount = Math.max(nonBlankCount, g.nonBlankCount);
}
}
}
static public class ImportColumn extends ImportVertical {
public int cellIndex;
public String name;
public boolean blankOnFirstRow;
@Override
void tabulate() {
// already done the tabulation elsewhere
}
}
static public class ImportRecord {
@ -44,6 +67,58 @@ public class XmlImportUtilities {
List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
}
static public String[] detectPathFromTag(InputStream inputStream, String tag) {
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
try {
XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
String[] path2 = new String[path.size()];
path.toArray(path2);
return path2;
}
}
}
} catch (Exception e) {
// silent
// e.printStackTrace();
}
return null;
}
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
String localName = parser.getLocalName();
String fullName = composeName(parser.getPrefix(), localName);
if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) {
List<String> path = new LinkedList<String>();
path.add(localName);
return path;
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) {
break;
} else if (eventType == XMLStreamConstants.START_ELEMENT) {
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
path.add(0, localName);
return path;
}
}
}
return null;
}
static public String[] detectRecordElement(InputStream inputStream) {
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
@ -77,19 +152,30 @@ public class XmlImportUtilities {
}
static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) {
List<RecordElementCandidate> candidateList = new ArrayList<RecordElementCandidate>();
List<RecordElementCandidate> descendantCandidates = new ArrayList<RecordElementCandidate>();
Map<String, Integer> candidates = new HashMap<String, Integer>();
Map<String, Integer> immediateChildCandidateMap = new HashMap<String, Integer>();
int textNodeCount = 0;
int childElementNodeCount = 0;
try {
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) {
break;
} else if (eventType == XMLStreamConstants.CHARACTERS) {
if (parser.getText().trim().length() > 0) {
textNodeCount++;
}
} else if (eventType == XMLStreamConstants.START_ELEMENT) {
childElementNodeCount++;
String tagName = parser.getLocalName();
candidates.put(tagName, candidates.containsKey(tagName) ? candidates.get(tagName) + 1 : 1);
immediateChildCandidateMap.put(
tagName,
immediateChildCandidateMap.containsKey(tagName) ?
immediateChildCandidateMap.get(tagName) + 1 : 1);
String[] path2 = new String[path.length + 1];
System.arraycopy(path, 0, path2, 0, path.length);
@ -97,7 +183,7 @@ public class XmlImportUtilities {
RecordElementCandidate c = detectRecordElement(parser, path2);
if (c != null) {
candidateList.add(c);
descendantCandidates.add(c);
}
}
}
@ -106,9 +192,14 @@ public class XmlImportUtilities {
// e.printStackTrace();
}
if (candidates.size() > 0) {
List<RecordElementCandidate> ourCandidateList = new ArrayList<RecordElementCandidate>(candidates.size());
for (Entry<String, Integer> entry : candidates.entrySet()) {
if (textNodeCount > 0 && childElementNodeCount > 0) {
// This is a mixed element
return null;
}
if (immediateChildCandidateMap.size() > 0) {
List<RecordElementCandidate> immediateChildCandidates = new ArrayList<RecordElementCandidate>(immediateChildCandidateMap.size());
for (Entry<String, Integer> entry : immediateChildCandidateMap.entrySet()) {
int count = entry.getValue();
if (count > 1) {
String[] path2 = new String[path.length + 1];
@ -118,26 +209,32 @@ public class XmlImportUtilities {
RecordElementCandidate candidate = new RecordElementCandidate();
candidate.path = path2;
candidate.count = count;
ourCandidateList.add(candidate);
immediateChildCandidates.add(candidate);
}
}
if (ourCandidateList.size() > 0) {
sortRecordElementCandidates(ourCandidateList);
if (immediateChildCandidates.size() > 0 && immediateChildCandidates.size() < 5) {
// There are some promising immediate child elements, but not many,
// that can serve as record elements.
RecordElementCandidate ourCandidate = ourCandidateList.get(0);
if (ourCandidate.count > 10) {
sortRecordElementCandidates(immediateChildCandidates);
RecordElementCandidate ourCandidate = immediateChildCandidates.get(0);
if (ourCandidate.count / immediateChildCandidates.size() > 5) {
return ourCandidate;
}
candidateList.add(ourCandidate);
descendantCandidates.add(ourCandidate);
}
}
if (candidateList.size() > 0) {
sortRecordElementCandidates(candidateList);
if (descendantCandidates.size() > 0) {
sortRecordElementCandidates(descendantCandidates);
return candidateList.get(0);
RecordElementCandidate candidate = descendantCandidates.get(0);
if (candidate.count / descendantCandidates.size() > 5) {
return candidate;
}
}
return null;
@ -175,14 +272,44 @@ public class XmlImportUtilities {
Project project,
ImportColumnGroup columnGroup
) {
for (ImportColumn c : columnGroup.columns.values()) {
int startColumnIndex = project.columnModel.columns.size();
List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values());
Collections.sort(columns, new Comparator<ImportColumn>() {
public int compare(ImportColumn o1, ImportColumn o2) {
if (o1.blankOnFirstRow != o2.blankOnFirstRow) {
return o1.blankOnFirstRow ? 1 : -1;
}
int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (int i = 0; i < columns.size(); i++) {
ImportColumn c = columns.get(i);
Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name);
project.columnModel.columns.add(column);
}
for (ImportColumnGroup g : columnGroup.subgroups.values()) {
List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
Collections.sort(subgroups, new Comparator<ImportColumnGroup>() {
public int compare(ImportColumnGroup o1, ImportColumnGroup o2) {
int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (ImportColumnGroup g : subgroups) {
createColumnsFromImport(project, g);
}
int endColumnIndex = project.columnModel.columns.size();
int span = endColumnIndex - startColumnIndex;
if (span > 1 && span < project.columnModel.columns.size()) {
project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex);
}
}
static protected void findRecord(
@ -231,6 +358,7 @@ public class XmlImportUtilities {
processSubRecord(project, parser, rootColumnGroup, record);
if (record.rows.size() > 0) {
for (List<Cell> row : record.rows) {
Row realRow = new Row(row.size());
@ -244,6 +372,11 @@ public class XmlImportUtilities {
project.rows.add(realRow);
}
}
}
static protected String composeName(String prefix, String localName) {
return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName;
}
static protected void processSubRecord(
Project project,
@ -251,8 +384,13 @@ public class XmlImportUtilities {
ImportColumnGroup columnGroup,
ImportRecord record
) throws XMLStreamException {
ImportColumnGroup thisColumnGroup = getColumnGroup(
project,
columnGroup,
composeName(parser.getPrefix(), parser.getLocalName()));
int commonStartingRowIndex = 0;
for (ImportColumn column : columnGroup.columns.values()) {
for (ImportColumn column : thisColumnGroup.columns.values()) {
if (column.cellIndex < record.columnEmptyRowIndices.size()) {
commonStartingRowIndex = Math.max(
commonStartingRowIndex,
@ -260,43 +398,58 @@ public class XmlImportUtilities {
}
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
ImportColumnGroup thisColumnGroup = getColumnGroup(project, columnGroup, parser.getLocalName());
int attributeCount = parser.getAttributeCount();
for (int i = 0; i < attributeCount; i++) {
String text = parser.getAttributeValue(i).trim();
if (text.length() > 0) {
addCell(
project,
thisColumnGroup,
record,
parser.getAttributeLocalName(i),
parser.getAttributeValue(i),
composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)),
text,
commonStartingRowIndex
);
}
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
processSubRecord(
project,
parser,
thisColumnGroup,
record
);
} else if (eventType == XMLStreamConstants.CDATA ||
} else if (//eventType == XMLStreamConstants.CDATA ||
eventType == XMLStreamConstants.CHARACTERS) {
String text = parser.getText().trim();
if (text.length() > 0) {
addCell(
project,
columnGroup,
thisColumnGroup,
record,
null,
parser.getText(),
commonStartingRowIndex
);
}
} else if (eventType == XMLStreamConstants.END_ELEMENT) {
break;
}
}
if (commonStartingRowIndex < record.rows.size()) {
List<Cell> startingRow = record.rows.get(commonStartingRowIndex);
for (ImportColumn c : thisColumnGroup.columns.values()) {
int cellIndex = c.cellIndex;
if (cellIndex >= startingRow.size() || startingRow.get(cellIndex) == null) {
c.blankOnFirstRow = true;
}
}
}
}
static protected void addCell(
@ -304,13 +457,15 @@ public class XmlImportUtilities {
ImportColumnGroup columnGroup,
ImportRecord record,
String columnLocalName,
Serializable value,
String text,
int commonStaringRowIndex
) {
if (value == null || (value instanceof String && ((String) value).isEmpty())) {
if (text == null || ((String) text).isEmpty()) {
return;
}
Serializable value = ImporterUtilities.parseCellValue(text);
ImportColumn column = getColumn(project, columnGroup, columnLocalName);
int cellIndex = column.cellIndex;
@ -331,6 +486,8 @@ public class XmlImportUtilities {
row.set(cellIndex, new Cell(value, null));
record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1);
column.nonBlankCount++;
}
static protected ImportColumn getColumn(

View File

@ -23,9 +23,13 @@ public class XmlImporter implements Importer {
throw new NotImplementedException();
}
public void read(InputStream inputStream, Project project,
Properties options, int skip, int limit) throws Exception {
public void read(
InputStream inputStream,
Project project,
Properties options,
int skip,
int limit
) throws Exception {
BufferedInputStream bis = new BufferedInputStream(inputStream);
String[] recordPath = null;
@ -36,14 +40,22 @@ public class XmlImporter implements Importer {
int c = bis.read(buffer);
bis.reset();
if (options.containsKey("importer-record-tag")) {
recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, c),
options.getProperty("importer-record-tag"));
} else {
recordPath = XmlImportUtilities.detectRecordElement(
new ByteArrayInputStream(buffer, 0, c));
}
}
ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
XmlImportUtilities.importXml(bis, project, recordPath, rootColumnGroup);
XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup);
project.columnModel.update();
}
}

View File

@ -54,8 +54,26 @@ public class ColumnModel implements Jsonizable {
return _keyColumnIndex;
}
public void addColumnGroup(int startColumnIndex, int span, int keyColumnIndex) {
for (ColumnGroup g : columnGroups) {
if (g.startColumnIndex == startColumnIndex && g.columnSpan == span) {
if (g.keyColumnIndex == keyColumnIndex) {
return;
} else {
columnGroups.remove(g);
break;
}
}
}
ColumnGroup cg = new ColumnGroup(startColumnIndex, span, keyColumnIndex);
columnGroups.add(cg);
}
public void update() {
generateMaps();
internalInitialize();
}
public Column getColumnByName(String name) {

View File

@ -161,10 +161,12 @@ DataTableView.prototype._renderDataTable = function(table) {
c += (columnGroup.columnSpan - 1);
if ("subgroups" in columnGroup) {
nextLayer = nextLayer.concat(columnGroup.subgroups);
}
}
}
}
renderColumnKeys(keys);
@ -172,12 +174,13 @@ DataTableView.prototype._renderDataTable = function(table) {
renderColumnGroups(nextLayer, []);
}
};
/*
if (columnGroups.length > 0) {
renderColumnGroups(
columnGroups,
[ theProject.columnModel.keyCellIndex ]
);
*/
}
/*------------------------------------------------------------
* Column Headers with Menus