remove Freebase reconciliation from Excel Importer (#2470)

This commit is contained in:
Albin Larsson 2020-03-27 09:30:00 +01:00 committed by GitHub
parent f2b06418da
commit 72966af5b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -38,15 +38,11 @@ import java.io.FileInputStream;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import org.apache.poi.ooxml.POIXMLException; import org.apache.poi.ooxml.POIXMLException;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -66,9 +62,6 @@ import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingUtilities; import com.google.refine.importing.ImportingUtilities;
import com.google.refine.model.Cell; import com.google.refine.model.Cell;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.google.refine.model.Recon;
import com.google.refine.model.Recon.Judgment;
import com.google.refine.model.ReconCandidate;
import com.google.refine.util.JSONUtilities; import com.google.refine.util.JSONUtilities;
import com.google.refine.util.ParsingUtilities; import com.google.refine.util.ParsingUtilities;
@@ -201,7 +194,6 @@ public class ExcelImporter extends TabularImportingParserBase {
TableDataReader dataReader = new TableDataReader() { TableDataReader dataReader = new TableDataReader() {
int nextRow = 0; int nextRow = 0;
Map<String, Recon> reconMap = new HashMap<String, Recon>();
@Override @Override
public List<Object> getNextRowOfCells() throws IOException { public List<Object> getNextRowOfCells() throws IOException {
@@ -218,7 +210,7 @@ public class ExcelImporter extends TabularImportingParserBase {
org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex); org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex);
if (sourceCell != null) { if (sourceCell != null) {
cell = extractCell(sourceCell, reconMap); cell = extractCell(sourceCell);
} }
cells.add(cell); cells.add(cell);
} }
@@ -242,7 +234,7 @@ public class ExcelImporter extends TabularImportingParserBase {
super.parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions); super.parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
} }
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) { static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell) {
CellType cellType = cell.getCellType(); CellType cellType = cell.getCellType();
if (cellType.equals(CellType.FORMULA)) { if (cellType.equals(CellType.FORMULA)) {
cellType = cell.getCachedFormulaResultType(); cellType = cell.getCachedFormulaResultType();
@@ -276,61 +268,7 @@ public class ExcelImporter extends TabularImportingParserBase {
value = text; value = text;
} }
} }
return value; return new Cell(value, null);
}
static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
Serializable value = extractCell(cell);
if (value != null) {
Recon recon = null;
Hyperlink hyperlink = cell.getHyperlink();
if (hyperlink != null) {
String url = hyperlink.getAddress();
if (url != null && (url.startsWith("http://") ||
url.startsWith("https://"))) {
final String sig = "freebase.com/view";
int i = url.indexOf(sig);
if (i > 0) {
String id = url.substring(i + sig.length());
int q = id.indexOf('?');
if (q > 0) {
id = id.substring(0, q);
}
int h = id.indexOf('#');
if (h > 0) {
id = id.substring(0, h);
}
if (reconMap.containsKey(id)) {
recon = reconMap.get(id);
recon.judgmentBatchSize++;
} else {
recon = new Recon(0, null, null);
recon.service = "import";
recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match);
reconMap.put(id, recon);
}
}
}
}
return new Cell(value, recon);
} else {
return null;
}
} }
} }