Add caching in URL fetching

This commit is contained in:
Antonin Delpeuch 2017-03-07 20:21:27 +00:00
parent 5d8d372244
commit 782a2f5b48

View File

@ -43,6 +43,8 @@ import java.net.URLConnection;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Properties; import java.util.Properties;
import java.util.Map;
import java.util.HashMap;
import org.json.JSONException; import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
@ -170,6 +172,7 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
final protected Evaluable _eval; final protected Evaluable _eval;
final protected long _historyEntryID; final protected long _historyEntryID;
protected int _cellIndex; protected int _cellIndex;
protected Map<URL, Serializable> _urlCache;
public ColumnAdditionByFetchingURLsProcess( public ColumnAdditionByFetchingURLsProcess(
Project project, Project project,
@ -182,6 +185,7 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
_engine = engine; _engine = engine;
_eval = eval; _eval = eval;
_historyEntryID = HistoryEntry.allocateID(); _historyEntryID = HistoryEntry.allocateID();
_urlCache = new HashMap<URL, Serializable>();
} }
@Override @Override
@ -222,27 +226,21 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
List<CellAtRow> responseBodies = new ArrayList<CellAtRow>(urls.size()); List<CellAtRow> responseBodies = new ArrayList<CellAtRow>(urls.size());
for (int i = 0; i < urls.size(); i++) { for (int i = 0; i < urls.size(); i++) {
CellAtRow urlData = urls.get(i); CellAtRow urlData = urls.get(i);
long start = System.currentTimeMillis(); CellAtRow cellAtRow = cachedFetch(urlData);
CellAtRow cellAtRow = fetch(urlData);
if (cellAtRow != null) { if (cellAtRow != null) {
responseBodies.add(cellAtRow); responseBodies.add(cellAtRow);
} }
_progress = i * 100 / urls.size(); _progress = i * 100 / urls.size();
try {
long delay = _delay - (System.currentTimeMillis() - start);
if (delay > 0) {
Thread.sleep(delay);
}
} catch (InterruptedException e) {
if (_canceled) { if (_canceled) {
break; break;
} }
} }
}
_urlCache.clear();
if (!_canceled) { if (!_canceled) {
HistoryEntry historyEntry = new HistoryEntry( HistoryEntry historyEntry = new HistoryEntry(
_historyEntryID, _historyEntryID,
_project, _project,
@ -259,16 +257,45 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
} }
} }
CellAtRow fetch(CellAtRow urlData) { CellAtRow cachedFetch(CellAtRow urlData) {
String urlString = urlData.cell.value.toString(); String urlString = urlData.cell.value.toString();
URL url = null; URL url = null;
try { try {
url = new URL(urlString); url = new URL(urlString);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
return null; return null;
} }
Serializable cellResult = _urlCache.get(url);
if (cellResult == null) {
cellResult = fetch(url);
if (cellResult != null) {
_urlCache.put(url, cellResult);
}
try {
// Always sleep for the delay, no matter how long the
// request took. This is more responsible than subtracting
// the time spent requesting the URL, because it naturally
// slows us down if the server is busy and takes a long time
// to reply.
if (_delay > 0) {
Thread.sleep(_delay);
}
} catch (InterruptedException e) {
return null;
}
}
if (cellResult != null) {
return new CellAtRow(
urlData.row,
new Cell(cellResult, null));
}
return null;
}
Serializable fetch(URL url) {
try { try {
URLConnection urlConnection = url.openConnection(); URLConnection urlConnection = url.openConnection();
// urlConnection.setRequestProperty(_headerKey, _headerValue); // urlConnection.setRequestProperty(_headerKey, _headerValue);
@ -287,12 +314,8 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
} }
} }
} }
return new CellAtRow( return ParsingUtilities.inputStreamToString(
urlData.row, is, (encoding == null) || ( encoding.equalsIgnoreCase("\"UTF-8\"")) ? "UTF-8" : encoding);
new Cell(
ParsingUtilities.inputStreamToString(
is, (encoding == null) || ( encoding.equalsIgnoreCase("\"UTF-8\"")) ? "UTF-8" : encoding),
null));
} finally { } finally {
is.close(); is.close();
@ -313,11 +336,11 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
message = e.toString(); message = e.toString();
} }
return _onError == OnError.StoreError ? return _onError == OnError.StoreError ?
new CellAtRow(urlData.row, new Cell(new EvalError(message), null)) : null; new EvalError(message) : null;
} }
} catch (Exception e) { } catch (Exception e) {
return _onError == OnError.StoreError ? return _onError == OnError.StoreError ?
new CellAtRow(urlData.row, new Cell(new EvalError(e.getMessage()), null)) : null; new EvalError(e.getMessage()) : null;
} }
} }