limits control

Former-commit-id: 83d90cb63b3f1447938d16010e66f4345dfe0617
This commit is contained in:
rjawor 2014-03-14 11:30:17 +01:00
parent 655087582e
commit 4b921decae
12 changed files with 97 additions and 12 deletions

View File

@ -9,10 +9,12 @@ set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in SA # Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int") set (INDEX_CHARACTER_TYPE "unsigned int")
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
# The above allows for (roughly) 2^32 - 1 = 4 294 967 295 words in corpus. # The above allows for (roughly) 2^32 - 1 = 4 294 967 295 words in corpus.
# Suffix markers # Suffix markers
set (SUFFIX_MARKER_TYPE "unsigned int") set (SUFFIX_MARKER_TYPE "unsigned int")
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
set (SUFFIX_MARKER_DIVISOR 256) set (SUFFIX_MARKER_DIVISOR 256)
# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset. # The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
# This allows to store 2^24 = 16 777 216 sentences no longer than 256 words. # This allows to store 2^24 = 16 777 216 sentences no longer than 256 words.

View File

@ -18,6 +18,13 @@
#define LEXICON_FIELD_SEPARATOR "\t" #define LEXICON_FIELD_SEPARATOR "\t"
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE; typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
#define INDEX_CHARACTER_TYPE_MAX_VALUE @INDEX_CHARACTER_TYPE_MAX_VALUE@
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@ #define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
//Max sentence size is determined by suffix marker divisor.
//The last bits in a sentence marker denote offset whose maximum value
//is the sentence size minus 2.
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@

View File

@ -1,10 +1,10 @@
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
#include "concordia/common/config.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <iostream> #include <iostream>
#include <climits>
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath, ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
const string & markersFilePath) const string & markersFilePath)
@ -102,11 +102,11 @@ void ConcordiaIndex::_addSingleExample(
} }
// append sentence boundary marker // append sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX; INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX; SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA); Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA);
} }

View File

@ -1,9 +1,14 @@
#include "concordia/example.hpp" #include "concordia/example.hpp"
/*!
 * Creates an example from a sentence and its numeric id.
 *
 * \param sentence text of the example
 * \param id identifier of the example
 * \throws ConcordiaException when the id is too large to fit in the part
 *         of a suffix marker that is reserved for it
 */
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
                                         throw (ConcordiaException):
    _sentence(sentence),
    _id(id) {
    // Check if the example id exceeds the space reserved for it in the
    // suffix marker. The largest admissible id is computed as
    // (MAX - (DIVISOR - 1)) / DIVISOR, which equals
    // (MAX + 1) / DIVISOR - 1 but never evaluates MAX + 1. That sum wraps
    // around to 0 when MAX is already the largest value of its type,
    // which would make every id look too large.
    if (id > (SUFFIX_MARKER_TYPE_MAX_VALUE - (SUFFIX_MARKER_DIVISOR - 1))
                 / SUFFIX_MARKER_DIVISOR) {
        throw ConcordiaException("Example id too large.");
    }
}
Example::~Example() { Example::~Example() {

View File

@ -2,6 +2,7 @@
#define EXAMPLE_HDR #define EXAMPLE_HDR
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp"
#include <string> #include <string>
/*! /*!
@ -13,7 +14,7 @@ using namespace std;
class Example { class Example {
public: public:
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id); explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) throw (ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */

View File

@ -21,12 +21,15 @@ HashGenerator::~HashGenerator() {
} }
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash( boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
const string & sentence) { const string & sentence) throw(ConcordiaException) {
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
result(new vector<INDEX_CHARACTER_TYPE>()); result(new vector<INDEX_CHARACTER_TYPE>());
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>()); boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
boost::split(*tokenTexts, sentence, boost::is_any_of(" ")); boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
throw ConcordiaException("Trying to add to long sentence.");
}
for (vector<string>::iterator it = tokenTexts->begin(); for (vector<string>::iterator it = tokenTexts->begin();
it != tokenTexts->end(); ++it) { it != tokenTexts->end(); ++it) {
string token = *it; string token = *it;

View File

@ -5,6 +5,7 @@
#include <map> #include <map>
#include <vector> #include <vector>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp" #include "concordia/word_map.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
@ -27,7 +28,8 @@ public:
virtual ~HashGenerator(); virtual ~HashGenerator();
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
generateHash(const string & sentence); generateHash(const string & sentence)
throw(ConcordiaException);
void serializeWordMap(); void serializeWordMap();

View File

@ -1,4 +1,5 @@
add_library(concordia-tests add_library(concordia-tests
test_example.cpp
test_tm_matches.cpp test_tm_matches.cpp
test_interval.cpp test_interval.cpp
test_logging.cpp test_logging.cpp

View File

@ -0,0 +1,29 @@
#include <boost/algorithm/string/predicate.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include "concordia/example.hpp"
using namespace std;
BOOST_AUTO_TEST_SUITE(exampleTest)
/*!
 * Checks the id limit enforced by the Example constructor: the id 16777215
 * has to be accepted, while 16777216 has to be rejected with a
 * ConcordiaException whose message starts with "Example id too large".
 */
BOOST_AUTO_TEST_CASE( ExceedingId )
{
    // This construction is deliberately left outside of any try block:
    // an exception here is reported by the test framework as an error.
    Example acceptedExample("Test", 16777215);

    string errorText = "";
    bool caught = false;
    try {
        // One past the accepted id - the constructor is expected to throw.
        Example rejectedExample("Test", 16777216);
    } catch (ConcordiaException & ex) {
        errorText = ex.what();
        caught = true;
    }

    BOOST_CHECK_EQUAL(caught, true);
    BOOST_CHECK_EQUAL(boost::starts_with(errorText, "Example id too large"), true);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -1,6 +1,7 @@
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include <string> #include <string>
#include <sstream>
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
@ -28,6 +29,35 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end()); BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
} }
/*!
 * Checks that HashGenerator::generateHash refuses a sentence that has more
 * than MAX_SENTENCE_SIZE words and reports it with a ConcordiaException.
 */
BOOST_AUTO_TEST_CASE( TooLongHashTest )
{
    // Start from a clean state: remove any word map left by earlier tests.
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
        boost::filesystem::remove(TEST_WORD_MAP_PATH);
    }

    HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);

    // Build exactly MAX_SENTENCE_SIZE + 1 words separated by single spaces.
    // No trailing separator is written, so the sentence is too long because
    // of its real words and not because of an empty token produced by
    // splitting on a space at the end of the string.
    stringstream ss;
    for (int i = 0; i <= MAX_SENTENCE_SIZE; i++) {
        if (i > 0) {
            ss << " ";
        }
        ss << "a" << i;
    }
    string longSentence = ss.str();

    bool exceptionThrown = false;
    string message = "";
    try {
        boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash(longSentence);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }

    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Trying to add to long sentence"), true);
}
BOOST_AUTO_TEST_CASE( HashSerializationTest ) BOOST_AUTO_TEST_CASE( HashSerializationTest )
{ {
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {

View File

@ -8,8 +8,12 @@ WordMap::WordMap() throw(ConcordiaException) {
WordMap::~WordMap() { WordMap::~WordMap() {
} }
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) { INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word)
throw(ConcordiaException) {
if (_map.find(word) == _map.end()) { if (_map.find(word) == _map.end()) {
if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
throw ConcordiaException("Word map capacity limit reached!");
}
INDEX_CHARACTER_TYPE newCode = _nextFree; INDEX_CHARACTER_TYPE newCode = _nextFree;
_map[word] = newCode; _map[word] = newCode;
_nextFree++; _nextFree++;

View File

@ -24,7 +24,8 @@ public:
*/ */
virtual ~WordMap(); virtual ~WordMap();
INDEX_CHARACTER_TYPE getWordCode(const string & word); INDEX_CHARACTER_TYPE getWordCode(const string & word)
throw(ConcordiaException);
private: private:
friend class boost::serialization::access; friend class boost::serialization::access;