limits control
Former-commit-id: 83d90cb63b3f1447938d16010e66f4345dfe0617
This commit is contained in:
parent
655087582e
commit
4b921decae
@ -9,10 +9,12 @@ set (CONCORDIA_VERSION_MINOR 1)
|
||||
# Type of the characters in SA
|
||||
|
||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
|
||||
# The above allows for (roughly) 2^32 = 4 294 967 296 words in corpus.
|
||||
|
||||
# Suffix markers
|
||||
set (SUFFIX_MARKER_TYPE "unsigned int")
|
||||
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
|
||||
set (SUFFIX_MARKER_DIVISOR 256)
|
||||
# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
|
||||
# This allows storing 2^24 = 16 777 216 sentences, each no longer than 256 words.
|
||||
|
@ -18,6 +18,13 @@
|
||||
#define LEXICON_FIELD_SEPARATOR "\t"
|
||||
|
||||
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
||||
#define INDEX_CHARACTER_TYPE_MAX_VALUE @INDEX_CHARACTER_TYPE_MAX_VALUE@
|
||||
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
||||
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
|
||||
|
||||
|
||||
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
|
||||
//Max sentence size is determined by suffix marker divisor.
|
||||
//The last bits in a sentence marker denote offset whose maximum value
|
||||
//is the sentence size minus 2.
|
||||
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
|
||||
|
@ -1,10 +1,10 @@
|
||||
#include "concordia/concordia_index.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
#include <iostream>
|
||||
#include <climits>
|
||||
|
||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
||||
const string & markersFilePath)
|
||||
@ -102,11 +102,11 @@ void ConcordiaIndex::_addSingleExample(
|
||||
}
|
||||
|
||||
// append sentence boundary marker
|
||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
|
||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||
markers->push_back(sentenceBoundaryMA);
|
||||
}
|
||||
|
@ -1,9 +1,14 @@
|
||||
#include "concordia/example.hpp"
|
||||
|
||||
|
||||
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id):
|
||||
_sentence(sentence),
|
||||
_id(id) {
|
||||
/*!
  Creates an example from its sentence text and numeric id.

  \param sentence text of the example sentence
  \param id identifier of the example; it has to fit in the part of the
            suffix marker that is reserved for the example id
  \throws ConcordiaException when the id is too large to be stored
          in a suffix marker
*/
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
                                         throw (ConcordiaException):
                                         _sentence(sentence),
                                         _id(id) {
    // Check if the example id exceeds the space reserved for it in the
    // suffix marker. The bound is written as MAX / DIVISOR (and not as
    // (MAX + 1) / DIVISOR) so that no value above the configured maximum
    // is ever computed: MAX + 1 could wrap around to 0 if the maximum is
    // the largest value of its type, which would reject every id.
    // For the usual power-of-two settings both forms give the same limit.
    if (id > SUFFIX_MARKER_TYPE_MAX_VALUE / SUFFIX_MARKER_DIVISOR) {
        throw ConcordiaException("Example id too large.");
    }
}
|
||||
|
||||
Example::~Example() {
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define EXAMPLE_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
@ -13,7 +14,7 @@ using namespace std;
|
||||
|
||||
class Example {
|
||||
public:
|
||||
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id);
|
||||
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) throw (ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
|
@ -21,12 +21,15 @@ HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
||||
const string & sentence) {
|
||||
const string & sentence) throw(ConcordiaException) {
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
result(new vector<INDEX_CHARACTER_TYPE>());
|
||||
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
||||
boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
|
||||
|
||||
if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
|
||||
throw ConcordiaException("Trying to add to long sentence.");
|
||||
}
|
||||
for (vector<string>::iterator it = tokenTexts->begin();
|
||||
it != tokenTexts->end(); ++it) {
|
||||
string token = *it;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include "concordia/word_map.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
@ -27,7 +28,8 @@ public:
|
||||
virtual ~HashGenerator();
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
generateHash(const string & sentence);
|
||||
generateHash(const string & sentence)
|
||||
throw(ConcordiaException);
|
||||
|
||||
void serializeWordMap();
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_example.cpp
|
||||
test_tm_matches.cpp
|
||||
test_interval.cpp
|
||||
test_logging.cpp
|
||||
|
29
concordia/t/test_example.cpp
Normal file
29
concordia/t/test_example.cpp
Normal file
@ -0,0 +1,29 @@
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include <string>
|
||||
|
||||
#include "concordia/example.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(exampleTest)
|
||||
|
||||
// Checks the limit imposed on example ids: the largest id that still fits
// in a suffix marker is accepted, the next one is rejected with
// a ConcordiaException.
BOOST_AUTO_TEST_CASE( ExceedingId )
{
    // The boundary is derived from the build configuration instead of being
    // hard-coded, so the test keeps working when the marker type or
    // the divisor is changed.
    const SUFFIX_MARKER_TYPE maxValidId =
        SUFFIX_MARKER_TYPE_MAX_VALUE / SUFFIX_MARKER_DIVISOR;

    // The largest valid id must be accepted.
    BOOST_CHECK_NO_THROW(Example("Test", maxValidId));

    // The first id beyond the limit must be rejected.
    bool exceptionThrown = false;
    string message = "";
    try {
        Example tooLargeIdExample("Test", maxValidId + 1);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }
    BOOST_CHECK(exceptionThrown);
    BOOST_CHECK(boost::starts_with(message, "Example id too large"));
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -1,6 +1,7 @@
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
@ -28,6 +29,35 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
||||
}
|
||||
|
||||
// Checks that HashGenerator::generateHash refuses a sentence that has more
// words than MAX_SENTENCE_SIZE and reports it with a ConcordiaException.
BOOST_AUTO_TEST_CASE( TooLongHashTest )
{
    // Start from a clean state: remove the word map left by earlier tests.
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
        boost::filesystem::remove(TEST_WORD_MAP_PATH);
    }

    HashGenerator hashGenerator(TEST_WORD_MAP_PATH);

    // Build a sentence of exactly MAX_SENTENCE_SIZE + 1 words, separated by
    // single spaces and without a trailing separator. This way the sentence
    // is too long because of its real words, not because of an empty token
    // produced by a trailing space.
    stringstream ss;
    for (int i = 0; i <= MAX_SENTENCE_SIZE; i++) {
        if (i > 0) {
            ss << " ";
        }
        ss << "a" << i;
    }

    string longSentence = ss.str();

    bool exceptionThrown = false;
    string message = "";
    try {
        boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
            hashGenerator.generateHash(longSentence);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }
    BOOST_CHECK(exceptionThrown);
    BOOST_CHECK(boost::starts_with(message, "Trying to add to long sentence"));
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
{
|
||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||
|
@ -8,8 +8,12 @@ WordMap::WordMap() throw(ConcordiaException) {
|
||||
WordMap::~WordMap() {
|
||||
}
|
||||
|
||||
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
|
||||
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word)
|
||||
throw(ConcordiaException) {
|
||||
if (_map.find(word) == _map.end()) {
|
||||
if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
|
||||
throw ConcordiaException("Word map capacity limit reached!");
|
||||
}
|
||||
INDEX_CHARACTER_TYPE newCode = _nextFree;
|
||||
_map[word] = newCode;
|
||||
_nextFree++;
|
||||
|
@ -24,7 +24,8 @@ public:
|
||||
*/
|
||||
virtual ~WordMap();
|
||||
|
||||
INDEX_CHARACTER_TYPE getWordCode(const string & word);
|
||||
INDEX_CHARACTER_TYPE getWordCode(const string & word)
|
||||
throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
friend class boost::serialization::access;
|
||||
|
Loading…
Reference in New Issue
Block a user