Merge pull request #1178 from wetneb/url_caching
Add caching in URL fetching
This commit is contained in:
commit
912600f0bd
@ -54,6 +54,7 @@ public class AddColumnByFetchingURLsCommand extends EngineDependentCommand {
|
||||
int columnInsertIndex = Integer.parseInt(request.getParameter("columnInsertIndex"));
|
||||
int delay = Integer.parseInt(request.getParameter("delay"));
|
||||
String onError = request.getParameter("onError");
|
||||
boolean cacheResponses = Boolean.parseBoolean(request.getParameter("cacheResponses"));
|
||||
|
||||
return new ColumnAdditionByFetchingURLsOperation(
|
||||
engineConfig,
|
||||
@ -62,7 +63,8 @@ public class AddColumnByFetchingURLsCommand extends EngineDependentCommand {
|
||||
TextTransformOperation.stringToOnError(onError),
|
||||
newColumnName,
|
||||
columnInsertIndex,
|
||||
delay
|
||||
delay,
|
||||
cacheResponses
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -43,6 +43,8 @@ import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
@ -71,6 +73,9 @@ import com.google.refine.operations.cell.TextTransformOperation;
|
||||
import com.google.refine.process.LongRunningProcess;
|
||||
import com.google.refine.process.Process;
|
||||
import com.google.refine.util.ParsingUtilities;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.common.cache.LoadingCache;
|
||||
import com.google.common.cache.CacheLoader;
|
||||
|
||||
public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperation {
|
||||
final protected String _baseColumnName;
|
||||
@ -80,6 +85,7 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
final protected String _newColumnName;
|
||||
final protected int _columnInsertIndex;
|
||||
final protected int _delay;
|
||||
final protected boolean _cacheResponses;
|
||||
|
||||
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
|
||||
JSONObject engineConfig = obj.getJSONObject("engineConfig");
|
||||
@ -91,7 +97,8 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
TextTransformOperation.stringToOnError(obj.getString("onError")),
|
||||
obj.getString("newColumnName"),
|
||||
obj.getInt("columnInsertIndex"),
|
||||
obj.getInt("delay")
|
||||
obj.getInt("delay"),
|
||||
obj.optBoolean("cacheResponses", false) // false for retro-compatibility
|
||||
);
|
||||
}
|
||||
|
||||
@ -102,7 +109,8 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
OnError onError,
|
||||
String newColumnName,
|
||||
int columnInsertIndex,
|
||||
int delay
|
||||
int delay,
|
||||
boolean cacheResponses
|
||||
) {
|
||||
super(engineConfig);
|
||||
|
||||
@ -114,6 +122,7 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
_columnInsertIndex = columnInsertIndex;
|
||||
|
||||
_delay = delay;
|
||||
_cacheResponses = cacheResponses;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -130,6 +139,7 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
writer.key("urlExpression"); writer.value(_urlExpression);
|
||||
writer.key("onError"); writer.value(TextTransformOperation.onErrorToString(_onError));
|
||||
writer.key("delay"); writer.value(_delay);
|
||||
writer.key("cacheResponses"); writer.value(_cacheResponses);
|
||||
writer.endObject();
|
||||
}
|
||||
|
||||
@ -160,7 +170,8 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
project,
|
||||
engine,
|
||||
eval,
|
||||
getBriefDescription(null)
|
||||
getBriefDescription(null),
|
||||
_cacheResponses
|
||||
);
|
||||
}
|
||||
|
||||
@ -170,18 +181,45 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
final protected Evaluable _eval;
|
||||
final protected long _historyEntryID;
|
||||
protected int _cellIndex;
|
||||
protected LoadingCache<String, Serializable> _urlCache;
|
||||
|
||||
public ColumnAdditionByFetchingURLsProcess(
|
||||
Project project,
|
||||
Engine engine,
|
||||
Evaluable eval,
|
||||
String description
|
||||
String description,
|
||||
boolean cacheResponses
|
||||
) throws JSONException {
|
||||
super(description);
|
||||
_project = project;
|
||||
_engine = engine;
|
||||
_eval = eval;
|
||||
_historyEntryID = HistoryEntry.allocateID();
|
||||
_urlCache = null;
|
||||
if (cacheResponses) {
|
||||
_urlCache = CacheBuilder.newBuilder()
|
||||
.maximumSize(2048)
|
||||
.expireAfterWrite(10, TimeUnit.MINUTES)
|
||||
.build(
|
||||
new CacheLoader<String, Serializable>() {
|
||||
public Serializable load(String urlString) {
|
||||
Serializable result = fetch(urlString);
|
||||
try {
|
||||
// Always sleep for the delay, no matter how long the
|
||||
// request took. This is more responsible than substracting
|
||||
// the time spend requesting the URL, because it naturally
|
||||
// slows us down if the server is busy and takes a long time
|
||||
// to reply.
|
||||
if (_delay > 0) {
|
||||
Thread.sleep(_delay);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
return null;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -222,27 +260,31 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
List<CellAtRow> responseBodies = new ArrayList<CellAtRow>(urls.size());
|
||||
for (int i = 0; i < urls.size(); i++) {
|
||||
CellAtRow urlData = urls.get(i);
|
||||
long start = System.currentTimeMillis();
|
||||
CellAtRow cellAtRow = fetch(urlData);
|
||||
if (cellAtRow != null) {
|
||||
String urlString = urlData.cell.value.toString();
|
||||
|
||||
Serializable response = null;
|
||||
if (_urlCache != null) {
|
||||
response = cachedFetch(urlString);
|
||||
} else {
|
||||
response = fetch(urlString);
|
||||
}
|
||||
|
||||
if (response != null) {
|
||||
CellAtRow cellAtRow = new CellAtRow(
|
||||
urlData.row,
|
||||
new Cell(response, null));
|
||||
|
||||
responseBodies.add(cellAtRow);
|
||||
}
|
||||
|
||||
_progress = i * 100 / urls.size();
|
||||
try {
|
||||
long delay = _delay - (System.currentTimeMillis() - start);
|
||||
if (delay > 0) {
|
||||
Thread.sleep(delay);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
|
||||
if (_canceled) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!_canceled) {
|
||||
|
||||
HistoryEntry historyEntry = new HistoryEntry(
|
||||
_historyEntryID,
|
||||
_project,
|
||||
@ -259,10 +301,16 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
}
|
||||
}
|
||||
|
||||
CellAtRow fetch(CellAtRow urlData) {
|
||||
String urlString = urlData.cell.value.toString();
|
||||
URL url = null;
|
||||
Serializable cachedFetch(String urlString) {
|
||||
try {
|
||||
return _urlCache.get(urlString);
|
||||
} catch(ExecutionException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
Serializable fetch(String urlString) {
|
||||
URL url = null;
|
||||
try {
|
||||
url = new URL(urlString);
|
||||
} catch (MalformedURLException e) {
|
||||
@ -287,12 +335,8 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
}
|
||||
}
|
||||
}
|
||||
return new CellAtRow(
|
||||
urlData.row,
|
||||
new Cell(
|
||||
ParsingUtilities.inputStreamToString(
|
||||
is, (encoding == null) || ( encoding.equalsIgnoreCase("\"UTF-8\"")) ? "UTF-8" : encoding),
|
||||
null));
|
||||
return ParsingUtilities.inputStreamToString(
|
||||
is, (encoding == null) || ( encoding.equalsIgnoreCase("\"UTF-8\"")) ? "UTF-8" : encoding);
|
||||
|
||||
} finally {
|
||||
is.close();
|
||||
@ -313,11 +357,11 @@ public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperat
|
||||
message = e.toString();
|
||||
}
|
||||
return _onError == OnError.StoreError ?
|
||||
new CellAtRow(urlData.row, new Cell(new EvalError(message), null)) : null;
|
||||
new EvalError(message) : null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
return _onError == OnError.StoreError ?
|
||||
new CellAtRow(urlData.row, new Cell(new EvalError(e.getMessage()), null)) : null;
|
||||
new EvalError(e.getMessage()) : null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,164 @@
|
||||
/*
|
||||
|
||||
Copyright 2010, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package com.google.refine.tests.model;
|
||||
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.refine.ProjectManager;
|
||||
import com.google.refine.ProjectMetadata;
|
||||
import com.google.refine.browsing.Engine;
|
||||
import com.google.refine.browsing.RowVisitor;
|
||||
import com.google.refine.grel.Function;
|
||||
import com.google.refine.io.FileProjectManager;
|
||||
import com.google.refine.model.Cell;
|
||||
import com.google.refine.model.Column;
|
||||
import com.google.refine.model.ModelException;
|
||||
import com.google.refine.model.Project;
|
||||
import com.google.refine.model.Row;
|
||||
import com.google.refine.process.Process;
|
||||
import com.google.refine.process.ProcessManager;
|
||||
import com.google.refine.operations.OnError;
|
||||
import com.google.refine.operations.EngineDependentOperation;
|
||||
import com.google.refine.operations.column.ColumnAdditionByFetchingURLsOperation;
|
||||
import com.google.refine.tests.RefineTest;
|
||||
import com.google.refine.tests.util.TestUtils;
|
||||
|
||||
|
||||
public class UrlFetchingTests extends RefineTest {
|
||||
|
||||
static final String ENGINE_JSON_URLS = "{\"mode\":\"row-based\"}}";
|
||||
|
||||
@Override
|
||||
@BeforeTest
|
||||
public void init() {
|
||||
logger = LoggerFactory.getLogger(this.getClass());
|
||||
}
|
||||
|
||||
// dependencies
|
||||
Project project;
|
||||
Properties options;
|
||||
JSONObject engine_config;
|
||||
Engine engine;
|
||||
Properties bindings;
|
||||
|
||||
@BeforeMethod
|
||||
public void SetUp() throws JSONException, IOException, ModelException {
|
||||
File dir = TestUtils.createTempDirectory("openrefine-test-workspace-dir");
|
||||
FileProjectManager.initialize(dir);
|
||||
project = new Project();
|
||||
ProjectMetadata pm = new ProjectMetadata();
|
||||
pm.setName("URL Fetching Test Project");
|
||||
ProjectManager.singleton.registerProject(project, pm);
|
||||
|
||||
int index = project.columnModel.allocateNewCellIndex();
|
||||
Column column = new Column(index,"fruits");
|
||||
project.columnModel.addColumn(index, column, true);
|
||||
|
||||
options = mock(Properties.class);
|
||||
engine = new Engine(project);
|
||||
engine_config = new JSONObject(ENGINE_JSON_URLS);
|
||||
engine.initializeFromJSON(engine_config);
|
||||
engine.setMode(Engine.Mode.RowBased);
|
||||
|
||||
bindings = new Properties();
|
||||
bindings.put("project", project);
|
||||
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void TearDown() {
|
||||
project = null;
|
||||
options = null;
|
||||
engine = null;
|
||||
bindings = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test for caching
|
||||
*/
|
||||
|
||||
@Test
|
||||
public void testUrlCaching() throws Exception {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Row row = new Row(2);
|
||||
row.setCell(0, new Cell(i < 5 ? "apple":"orange", null));
|
||||
project.rows.add(row);
|
||||
}
|
||||
EngineDependentOperation op = new ColumnAdditionByFetchingURLsOperation(engine_config,
|
||||
"fruits",
|
||||
"\"https://www.random.org/integers/?num=1&min=1&max=100&col=1&base=10&format=plain&rnd=new&city=\"+value",
|
||||
OnError.SetToBlank,
|
||||
"rand",
|
||||
1,
|
||||
500,
|
||||
true);
|
||||
ProcessManager pm = project.getProcessManager();
|
||||
Process process = op.createProcess(project, options);
|
||||
process.startPerforming(pm);
|
||||
Assert.assertTrue(process.isRunning());
|
||||
try {
|
||||
// We have 100 rows and 500 ms per row but only two distinct
|
||||
// values so we should not wait more than ~2000 ms to get the
|
||||
// results. Just to make sure the test passes with plenty of
|
||||
// net latency we sleep for longer (but still less than
|
||||
// 50,000ms).
|
||||
Thread.sleep(5000);
|
||||
} catch (InterruptedException e) {
|
||||
Assert.fail("Test interrupted");
|
||||
}
|
||||
Assert.assertFalse(process.isRunning());
|
||||
|
||||
// Inspect rows
|
||||
String ref_val = (String)project.rows.get(0).getCellValue(1);
|
||||
Assert.assertTrue(ref_val != "apple"); // just to make sure I picked the right column
|
||||
for (int i = 1; i < 4; i++) {
|
||||
// all random values should be equal due to caching
|
||||
Assert.assertEquals(project.rows.get(i).getCellValue(1), ref_val);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -500,6 +500,7 @@
|
||||
"on-error": "On error",
|
||||
"set-blank": "set to blank",
|
||||
"store-err": "store error",
|
||||
"cache-responses": "Cache responses",
|
||||
"copy-val": "copy value from original column",
|
||||
"warning-col-name": "You must enter a column name.",
|
||||
"add-col-fetch": "Add column by fetching URLs based on column",
|
||||
|
@ -500,6 +500,7 @@
|
||||
"on-error": "On error",
|
||||
"set-blank": "set to blank",
|
||||
"store-err": "store error",
|
||||
"cache-responses": "Cache responses",
|
||||
"copy-val": "copy value from original column",
|
||||
"warning-col-name": "You must enter a column name.",
|
||||
"add-col-fetch": "Add column by fetching URLs based on column",
|
||||
|
@ -500,6 +500,7 @@
|
||||
"on-error": "En cas d’erreur",
|
||||
"set-blank": "vider la cellule",
|
||||
"store-err": "conserver l’erreur",
|
||||
"cache-responses": "Mettre les réponses en cache",
|
||||
"copy-val": "copier la valeur depuis la colonne originale",
|
||||
"warning-col-name": "Vous devez indiquer un nom de colonne.",
|
||||
"add-col-fetch": "Ajouter une colonne en moissonnant les données depuis les URL d’une colonne",
|
||||
|
@ -12,11 +12,14 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<td width="1%" style="white-space: pre;" bind="or_views_onErr"></td>
|
||||
<td colspan="3">
|
||||
<td>
|
||||
<input type="radio" name="dialog-onerror-choice" value="set-to-blank" checked id="$add-column-error-set-to-blank"/>
|
||||
<label for="$add-column-error-set-to-blank" bind="or_views_setBlank"></label>
|
||||
<input type="radio" name="dialog-onerror-choice" value="store-error" id="$add-column-error-store-error" />
|
||||
<label for="$add-column-error-store-error" bind="or_views_storeErr"></label></td>
|
||||
<td colspan="2">
|
||||
<input type="checkbox" name="dialog-cache-responses" id="$add-column-cache-responses" checked="checked" />
|
||||
<label for="$add-column-cache-responses" bind="or_views_cacheResponses"></label></td>
|
||||
</tr>
|
||||
<tr><td colspan="4"><h3><span bind="or_views_urlFetch"></span></h3></td></tr>
|
||||
<tr><td colspan="4">$EXPRESSION_PREVIEW_WIDGET$</td></tr>
|
||||
|
@ -103,6 +103,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) {
|
||||
elmts.or_views_onErr.text($.i18n._('core-views')["on-error"]);
|
||||
elmts.or_views_setBlank.text($.i18n._('core-views')["set-blank"]);
|
||||
elmts.or_views_storeErr.text($.i18n._('core-views')["store-err"]);
|
||||
elmts.or_views_cacheResponses.text($.i18n._('core-views')["cache-responses"]);
|
||||
elmts.or_views_urlFetch.text($.i18n._('core-views')["url-fetch"]);
|
||||
elmts.okButton.html($.i18n._('core-buttons')["ok"]);
|
||||
elmts.cancelButton.text($.i18n._('core-buttons')["cancel"]);
|
||||
@ -135,7 +136,8 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) {
|
||||
newColumnName: columnName,
|
||||
columnInsertIndex: columnIndex + 1,
|
||||
delay: elmts.throttleDelayInput[0].value,
|
||||
onError: $('input[name="dialog-onerror-choice"]:checked')[0].value
|
||||
onError: $('input[name="dialog-onerror-choice"]:checked')[0].value,
|
||||
cacheResponses: $('input[name="dialog-cache-responses"]')[0].checked,
|
||||
},
|
||||
null,
|
||||
{ modelsChanged: true }
|
||||
|
Loading…
Reference in New Issue
Block a user