remove Freebase reconciliation from Excel Importer (#2470)
This commit is contained in:
parent
f2b06418da
commit
72966af5b6
@ -38,15 +38,11 @@ import java.io.FileInputStream;
|
|||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.PushbackInputStream;
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.poi.ooxml.POIXMLException;
|
import org.apache.poi.ooxml.POIXMLException;
|
||||||
import org.apache.poi.common.usermodel.Hyperlink;
|
|
||||||
import org.apache.poi.ss.usermodel.DateUtil;
|
import org.apache.poi.ss.usermodel.DateUtil;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
@ -66,9 +62,6 @@ import com.google.refine.importing.ImportingJob;
|
|||||||
import com.google.refine.importing.ImportingUtilities;
|
import com.google.refine.importing.ImportingUtilities;
|
||||||
import com.google.refine.model.Cell;
|
import com.google.refine.model.Cell;
|
||||||
import com.google.refine.model.Project;
|
import com.google.refine.model.Project;
|
||||||
import com.google.refine.model.Recon;
|
|
||||||
import com.google.refine.model.Recon.Judgment;
|
|
||||||
import com.google.refine.model.ReconCandidate;
|
|
||||||
import com.google.refine.util.JSONUtilities;
|
import com.google.refine.util.JSONUtilities;
|
||||||
import com.google.refine.util.ParsingUtilities;
|
import com.google.refine.util.ParsingUtilities;
|
||||||
|
|
||||||
@ -201,7 +194,6 @@ public class ExcelImporter extends TabularImportingParserBase {
|
|||||||
|
|
||||||
TableDataReader dataReader = new TableDataReader() {
|
TableDataReader dataReader = new TableDataReader() {
|
||||||
int nextRow = 0;
|
int nextRow = 0;
|
||||||
Map<String, Recon> reconMap = new HashMap<String, Recon>();
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Object> getNextRowOfCells() throws IOException {
|
public List<Object> getNextRowOfCells() throws IOException {
|
||||||
@ -218,7 +210,7 @@ public class ExcelImporter extends TabularImportingParserBase {
|
|||||||
|
|
||||||
org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex);
|
org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex);
|
||||||
if (sourceCell != null) {
|
if (sourceCell != null) {
|
||||||
cell = extractCell(sourceCell, reconMap);
|
cell = extractCell(sourceCell);
|
||||||
}
|
}
|
||||||
cells.add(cell);
|
cells.add(cell);
|
||||||
}
|
}
|
||||||
@ -242,7 +234,7 @@ public class ExcelImporter extends TabularImportingParserBase {
|
|||||||
super.parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
|
super.parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
||||||
CellType cellType = cell.getCellType();
|
CellType cellType = cell.getCellType();
|
||||||
if (cellType.equals(CellType.FORMULA)) {
|
if (cellType.equals(CellType.FORMULA)) {
|
||||||
cellType = cell.getCachedFormulaResultType();
|
cellType = cell.getCachedFormulaResultType();
|
||||||
@ -276,61 +268,7 @@ public class ExcelImporter extends TabularImportingParserBase {
|
|||||||
value = text;
|
value = text;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return value;
|
return new Cell(value, null);
|
||||||
}
|
|
||||||
|
|
||||||
static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
|
|
||||||
Serializable value = extractCell(cell);
|
|
||||||
|
|
||||||
if (value != null) {
|
|
||||||
Recon recon = null;
|
|
||||||
|
|
||||||
Hyperlink hyperlink = cell.getHyperlink();
|
|
||||||
if (hyperlink != null) {
|
|
||||||
String url = hyperlink.getAddress();
|
|
||||||
|
|
||||||
if (url != null && (url.startsWith("http://") ||
|
|
||||||
url.startsWith("https://"))) {
|
|
||||||
|
|
||||||
final String sig = "freebase.com/view";
|
|
||||||
|
|
||||||
int i = url.indexOf(sig);
|
|
||||||
if (i > 0) {
|
|
||||||
String id = url.substring(i + sig.length());
|
|
||||||
|
|
||||||
int q = id.indexOf('?');
|
|
||||||
if (q > 0) {
|
|
||||||
id = id.substring(0, q);
|
|
||||||
}
|
|
||||||
int h = id.indexOf('#');
|
|
||||||
if (h > 0) {
|
|
||||||
id = id.substring(0, h);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (reconMap.containsKey(id)) {
|
|
||||||
recon = reconMap.get(id);
|
|
||||||
recon.judgmentBatchSize++;
|
|
||||||
} else {
|
|
||||||
recon = new Recon(0, null, null);
|
|
||||||
recon.service = "import";
|
|
||||||
recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
|
|
||||||
recon.matchRank = 0;
|
|
||||||
recon.judgment = Judgment.Matched;
|
|
||||||
recon.judgmentAction = "auto";
|
|
||||||
recon.judgmentBatchSize = 1;
|
|
||||||
recon.addCandidate(recon.match);
|
|
||||||
|
|
||||||
reconMap.put(id, recon);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Cell(value, recon);
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user