finished adding multiple sentences, fixed memory leak in search result retrieval, added separate index refresh operation

rjawor 2015-08-20 20:12:18 +02:00
parent 823c1fbdb2
commit 9eae5aa5b1
11 changed files with 173 additions and 3 deletions

View File

@@ -57,6 +57,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                 }
             }
             _indexController->addSentences(jsonWriter, sourceSentences, targetSentences, tmIds);
+        } else if (operation == REFRESH_INDEX_OP) {
+            _indexController->refreshIndexFromRAM(jsonWriter);
         } else if (operation == SIMPLE_SEARCH_OP) {
             std::string pattern = d[PATTERN_PARAM].GetString();
             _searcherController->simpleSearch(jsonWriter, pattern);
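For reference, the JSON requests this dispatch handles look like the ones below. This is only a sketch pieced together from the test scripts added later in this commit; the URL is the one hardcoded in those scripts, and the example sentences and tmId value are placeholders.

    import json
    import urllib2

    def send(payload):
        # http://localhost mirrors the URL hardcoded in the test scripts
        req = urllib2.Request('http://localhost')
        req.add_header('Content-Type', 'application/json')
        return json.loads(urllib2.urlopen(req, json.dumps(payload)).read())

    # batch insert: every entry is [tmId, sourceSentence, targetSentence]
    send({'operation': 'addSentences',
          'sentences': [[1, 'example source', 'example target']]})

    # rebuild the index once all batches have been sent (new in this commit)
    send({'operation': 'refreshIndex'})

    # lookup
    send({'operation': 'simpleSearch', 'pattern': 'example source'})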

View File

@@ -17,6 +17,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
+#define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"

View File

@@ -17,6 +17,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
+#define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"

View File

@@ -45,7 +45,6 @@ void IndexController::addSentences(
         std::vector<TokenizedSentence> tokenizedSentences = _concordia->tokenizeAll(sourceSentences);
         std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmIds);
         _concordia->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
-        _concordia->refreshSAfromRAM();
         jsonWriter.StartObject();
         jsonWriter.String("status");
@@ -58,3 +57,20 @@ void IndexController::addSentences(
     }
 }
+void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter) {
+    try {
+        _concordia->refreshSAfromRAM();
+        jsonWriter.StartObject();
+        jsonWriter.String("status");
+        jsonWriter.String("success");
+        jsonWriter.EndObject();
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "concordia error: " << e.what();
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
+    }
+}
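Moving refreshSAfromRAM out of addSentences means a large import no longer rebuilds the suffix array after every batch; clients are now expected to call the refreshIndex operation once at the end (tests/addFile.py below does exactly that). The success path emits {"status": "success"}, while errors go through JsonGenerator::signalError. A minimal client-side check, reusing the hypothetical send() helper sketched above, might be:

    response = send({'operation': 'refreshIndex'})
    if response.get('status') != 'success':
        # the exact error payload depends on JsonGenerator::signalError
        raise RuntimeError('refreshIndex failed: %s' % response)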

View File

@@ -30,6 +30,8 @@ public:
         const std::vector<std::string> & targetSentences,
         const std::vector<int> & tmIds);
+    void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter);
 private:
     boost::shared_ptr<Concordia> _concordia;

View File

@@ -64,6 +64,9 @@ std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<Matc
                             connection.getStringValue(result,0,1),
                             connection.getStringValue(result,0,2)));
         connection.clearResult(result);
+        BOOST_FOREACH (QueryParam * param, params) {
+            delete param;
+        }
     }
     connection.endTransaction();
     return results;

View File

@@ -1,13 +1,13 @@
 #!/bin/sh
 echo "Recreating database schema..."
-sudo -u concordia psql concordia_server -f concordia_server.sql
+psql -W -U concordia concordia_server -f concordia_server.sql
 echo "Inserting initial data..."
 for initFile in `ls init/*`
 do
     echo "Init file:" $initFile
-    sudo -u concordia psql concordia_server -f $initFile
+    psql -W -U concordia concordia_server -f $initFile
 done
 echo "Concordia server database recreation complete!"

tests/addFile.py (new executable file, 73 lines)

@@ -0,0 +1,73 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time


def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


def add_data(data):
    req = urllib2.Request('http://localhost')
    req.add_header('Content-Type', 'application/json')
    urllib2.urlopen(req, json.dumps(data)).read()


sourceFile = sys.argv[1]
targetFile = sys.argv[2]
tmId = int(sys.argv[3])

totalLines = file_len(sourceFile)
if file_len(targetFile) != totalLines:
    print "File lengths do not match"
    sys.exit(1)

BUFFER_SIZE = 1000

data = {
    'operation': 'addSentences'
}

sentences = []
start = time.time()
with open(sys.argv[1]) as sourceSentences:
    with open(sys.argv[2]) as targetSentences:
        lineNumber = 0
        for sourceSentence in sourceSentences:
            lineNumber += 1
            targetSentence = targetSentences.readline()
            sentences.append([tmId, sourceSentence, targetSentence])
            if lineNumber % BUFFER_SIZE == 0:
                data['sentences'] = sentences
                sentences = []
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % (lineNumber, totalLines, mark-start, lineNumber/(mark-start))

if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (lineNumber, end-start, lineNumber/(end-start))

print "Generating index..."
start = time.time()
data = {
    'operation': 'refreshIndex'
}
req = urllib2.Request('http://localhost')
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)

tests/addSentence.py (new executable file, 27 lines)

@@ -0,0 +1,27 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time

data = {
    'operation': 'addSentence',
    'sourceSentence': sys.argv[1],
    'targetSentence': sys.argv[2],
    'tmId': sys.argv[3]
}
start = time.time()
req = urllib2.Request('http://localhost')
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()
print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
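A hypothetical invocation: ./addSentence.py "example source" "example target" 1. Note that tmId is forwarded as the raw command-line string here, whereas addFile.py converts it to an int before building the request.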

tests/addingError.txt (new file, 20 lines)

@@ -0,0 +1,20 @@
Added 1101000 of 1254468 sentences. Time elapsed: 984.7707 s, current speed: 1118.0268 sentences/second
Traceback (most recent call last):
  File "./addFile.py", line 48, in <module>
    add_data(data)
  File "./addFile.py", line 19, in add_data
    urllib2.urlopen(req, json.dumps(data)).read()
  File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 410, in open
    response = meth(req, response)
  File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python2.7/urllib2.py", line 448, in error
    return self._call_chain(*args)
  File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 413: Request Entity Too Large
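The 413 response comes from the HTTP server sitting in front of concordia-server, not from the request handler itself; if that front end is nginx, raising client_max_body_size would be the server-side fix (an assumption, since the deployment is not part of this commit). A client-side alternative is to flush batches by serialized payload size instead of a fixed sentence count. The sketch below is an untested illustration that reuses the [tmId, source, target] layout and an add_data() helper like the one in tests/addFile.py (json is assumed to be imported, as it is there); the 512 kB cap is a placeholder.

    MAX_PAYLOAD_BYTES = 512 * 1024  # placeholder limit, tune to the real server config

    def send_in_size_limited_batches(sentence_triples, add_data):
        # sentence_triples yields [tmId, sourceSentence, targetSentence] entries
        batch, batch_bytes = [], 0
        for triple in sentence_triples:
            triple_bytes = len(json.dumps(triple))  # rough per-entry size estimate
            if batch and batch_bytes + triple_bytes >= MAX_PAYLOAD_BYTES:
                add_data({'operation': 'addSentences', 'sentences': batch})
                batch, batch_bytes = [], 0
            batch.append(triple)
            batch_bytes += triple_bytes
        if batch:
            add_data({'operation': 'addSentences', 'sentences': batch})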

tests/simpleSearch.py (new executable file, 25 lines)

@@ -0,0 +1,25 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time

data = {
    'operation': 'simpleSearch',
    'pattern': sys.argv[1]
}
start = time.time()
req = urllib2.Request('http://localhost')
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()
print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
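As with the other test scripts, the server URL is hardcoded to http://localhost; a hypothetical invocation is ./simpleSearch.py "example pattern", which prints the execution time and the parsed JSON result.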