2010-10-20 22:45:52 +02:00
|
|
|
/*
|
|
|
|
|
|
|
|
Copyright 2010, Google Inc.
|
|
|
|
All rights reserved.
|
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
|
|
modification, are permitted provided that the following conditions are
|
|
|
|
met:
|
|
|
|
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above
|
|
|
|
copyright notice, this list of conditions and the following disclaimer
|
|
|
|
in the documentation and/or other materials provided with the
|
|
|
|
distribution.
|
|
|
|
* Neither the name of Google Inc. nor the names of its
|
|
|
|
contributors may be used to endorse or promote products derived from
|
|
|
|
this software without specific prior written permission.
|
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
2010-09-22 19:04:10 +02:00
|
|
|
package com.google.refine.importers;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
import java.io.InputStream;
|
2010-05-05 01:24:48 +02:00
|
|
|
import java.io.Serializable;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Properties;
|
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
import org.json.JSONObject;
|
|
|
|
|
|
|
|
import com.google.refine.importing.ImportingJob;
|
|
|
|
import com.google.refine.importing.ImportingUtilities;
|
2010-09-22 19:04:10 +02:00
|
|
|
import com.google.refine.model.Column;
|
2011-08-02 05:34:47 +02:00
|
|
|
import com.google.refine.model.ModelException;
|
2010-09-22 19:04:10 +02:00
|
|
|
import com.google.refine.model.Project;
|
|
|
|
import com.google.refine.model.Row;
|
2011-08-02 05:34:47 +02:00
|
|
|
import com.google.refine.util.TrackingInputStream;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
|
|
|
public class ImporterUtilities {
|
|
|
|
|
|
|
|
static public Serializable parseCellValue(String text) {
|
|
|
|
if (text.length() > 0) {
|
|
|
|
if (text.length() > 1 && text.startsWith("\"") && text.endsWith("\"")) {
|
2010-05-17 07:40:46 +02:00
|
|
|
return text.substring(1, text.length() - 1);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-10-15 07:30:15 +02:00
|
|
|
String text2 = text.trim();
|
|
|
|
if (text2.length() > 0) {
|
|
|
|
try {
|
|
|
|
return Long.parseLong(text2);
|
|
|
|
} catch (NumberFormatException e) {
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
double d = Double.parseDouble(text2);
|
|
|
|
if (!Double.isInfinite(d) && !Double.isNaN(d)) {
|
|
|
|
return d;
|
|
|
|
}
|
|
|
|
} catch (NumberFormatException e) {
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return text;
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
static public int getIntegerOption(String name, Properties options, int def) {
|
|
|
|
int value = def;
|
|
|
|
if (options.containsKey(name)) {
|
|
|
|
String s = options.getProperty(name);
|
|
|
|
try {
|
|
|
|
value = Integer.parseInt(s);
|
|
|
|
} catch (Exception e) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return value;
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
static public boolean getBooleanOption(String name, Properties options, boolean def) {
|
|
|
|
boolean value = def;
|
|
|
|
if (options.containsKey(name)) {
|
|
|
|
String s = options.getProperty(name);
|
|
|
|
try {
|
2010-08-06 08:15:05 +02:00
|
|
|
value = s.equalsIgnoreCase("on") || s.equals("1") || Boolean.parseBoolean(s);
|
2010-05-05 01:24:48 +02:00
|
|
|
} catch (Exception e) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
|
|
|
static public void appendColumnName(List<String> columnNames, int index, String name) {
|
|
|
|
name = name.trim();
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
while (columnNames.size() <= index) {
|
|
|
|
columnNames.add("");
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
if (!name.isEmpty()) {
|
|
|
|
String oldName = columnNames.get(index);
|
|
|
|
if (!oldName.isEmpty()) {
|
|
|
|
name = oldName + " " + name;
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
columnNames.set(index, name);
|
|
|
|
}
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
static public void ensureColumnsInRowExist(List<String> columnNames, Row row) {
|
|
|
|
int count = row.cells.size();
|
|
|
|
while (count > columnNames.size()) {
|
|
|
|
columnNames.add("");
|
|
|
|
}
|
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
|
|
|
static public Column getOrAllocateColumn(Project project, List<String> currentFileColumnNames, int index) {
|
|
|
|
if (index < currentFileColumnNames.size()) {
|
|
|
|
return project.columnModel.getColumnByName(currentFileColumnNames.get(index));
|
|
|
|
} else if (index == currentFileColumnNames.size()) {
|
|
|
|
String prefix = "Column ";
|
|
|
|
int i = 1;
|
|
|
|
while (true) {
|
|
|
|
String columnName = prefix + i;
|
|
|
|
if (project.columnModel.getColumnByName(columnName) != null) {
|
|
|
|
// Already taken name
|
|
|
|
i++;
|
|
|
|
} else {
|
|
|
|
Column column = new Column(project.columnModel.allocateNewCellIndex(), columnName);
|
|
|
|
try {
|
|
|
|
project.columnModel.addColumn(project.columnModel.columns.size(), column, false);
|
|
|
|
} catch (ModelException e) {
|
|
|
|
// Ignore: shouldn't get in here since we just checked for duplicate names.
|
|
|
|
}
|
|
|
|
currentFileColumnNames.add(columnName);
|
|
|
|
return column;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
throw new RuntimeException("Unexpected code path");
|
|
|
|
}
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
static public void setupColumns(Project project, List<String> columnNames) {
|
|
|
|
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
|
|
|
|
for (int c = 0; c < columnNames.size(); c++) {
|
|
|
|
String cell = columnNames.get(c).trim();
|
|
|
|
if (cell.isEmpty()) {
|
|
|
|
cell = "Column";
|
|
|
|
} else if (cell.startsWith("\"") && cell.endsWith("\"")) {
|
2011-08-02 05:34:47 +02:00
|
|
|
// FIXME: is trimming quotation marks appropriate?
|
|
|
|
cell = cell.substring(1, cell.length() - 1).trim();
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
if (nameToIndex.containsKey(cell)) {
|
|
|
|
int index = nameToIndex.get(cell);
|
|
|
|
nameToIndex.put(cell, index + 1);
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
|
|
|
|
} else {
|
|
|
|
nameToIndex.put(cell, 2);
|
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
|
|
|
columnNames.set(c, cell);
|
|
|
|
if (project.columnModel.getColumnByName(cell) == null) {
|
|
|
|
Column column = new Column(project.columnModel.allocateNewCellIndex(), cell);
|
|
|
|
try {
|
|
|
|
project.columnModel.addColumn(project.columnModel.columns.size(), column, false);
|
|
|
|
} catch (ModelException e) {
|
|
|
|
// Ignore: shouldn't get in here since we just checked for duplicate names.
|
|
|
|
}
|
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
|
|
|
|
static public interface MultiFileReadingProgress {
|
|
|
|
public void startFile(String fileSource);
|
|
|
|
public void readingFile(String fileSource, long bytesRead);
|
|
|
|
public void endFile(String fileSource, long bytesRead);
|
|
|
|
}
|
|
|
|
|
|
|
|
static public MultiFileReadingProgress createMultiFileReadingProgress(
|
|
|
|
final ImportingJob job, List<JSONObject> fileRecords) {
|
|
|
|
long totalSize = 0;
|
|
|
|
for (JSONObject fileRecord : fileRecords) {
|
|
|
|
File file = ImportingUtilities.getFile(job, fileRecord);
|
|
|
|
totalSize += file.length();
|
|
|
|
}
|
|
|
|
|
|
|
|
final long totalSize2 = totalSize;
|
|
|
|
return new MultiFileReadingProgress() {
|
|
|
|
long totalBytesRead = 0;
|
|
|
|
|
|
|
|
void setProgress(String fileSource, long bytesRead) {
|
|
|
|
ImportingUtilities.setCreatingProjectProgress(
|
|
|
|
job,
|
|
|
|
"Reading " + fileSource,
|
2011-08-11 02:35:01 +02:00
|
|
|
totalSize2 == 0 ? -1 : (int) (100 * (totalBytesRead + bytesRead) / totalSize2));
|
2011-08-02 05:34:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void startFile(String fileSource) {
|
|
|
|
setProgress(fileSource, 0);
|
|
|
|
}
|
2010-05-16 20:42:52 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
@Override
|
|
|
|
public void readingFile(String fileSource, long bytesRead) {
|
|
|
|
setProgress(fileSource, bytesRead);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void endFile(String fileSource, long bytesRead) {
|
|
|
|
totalBytesRead += bytesRead;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
static public InputStream openAndTrackFile(
|
|
|
|
final String fileSource,
|
|
|
|
final File file,
|
|
|
|
final MultiFileReadingProgress progress) throws FileNotFoundException {
|
|
|
|
InputStream inputStream = new FileInputStream(file);
|
|
|
|
return progress == null ? inputStream : new TrackingInputStream(inputStream) {
|
|
|
|
@Override
|
|
|
|
protected long track(long bytesRead) {
|
|
|
|
long l = super.track(bytesRead);
|
|
|
|
|
|
|
|
progress.readingFile(fileSource, this.bytesRead);
|
|
|
|
|
|
|
|
return l;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|