limits control
Former-commit-id: 83d90cb63b3f1447938d16010e66f4345dfe0617
This commit is contained in:
parent
655087582e
commit
4b921decae
@ -9,10 +9,12 @@ set (CONCORDIA_VERSION_MINOR 1)
|
|||||||
# Type of the characters in SA
|
# Type of the characters in SA
|
||||||
|
|
||||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||||
|
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
|
||||||
# The above allows for (roughly) 2^32 - 1 = 4 294 967 295 words in corpus.
|
# The above allows for (roughly) 2^32 - 1 = 4 294 967 295 words in corpus.
|
||||||
|
|
||||||
# Suffix markers
|
# Suffix markers
|
||||||
set (SUFFIX_MARKER_TYPE "unsigned int")
|
set (SUFFIX_MARKER_TYPE "unsigned int")
|
||||||
|
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
|
||||||
set (SUFFIX_MARKER_DIVISOR 256)
|
set (SUFFIX_MARKER_DIVISOR 256)
|
||||||
# The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
|
# The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
|
||||||
# This allows storing 2^24 = 16 777 216 sentences no longer than 256 words.
|
# This allows storing 2^24 = 16 777 216 sentences no longer than 256 words.
|
||||||
|
@ -18,6 +18,13 @@
|
|||||||
#define LEXICON_FIELD_SEPARATOR "\t"
|
#define LEXICON_FIELD_SEPARATOR "\t"
|
||||||
|
|
||||||
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
||||||
|
#define INDEX_CHARACTER_TYPE_MAX_VALUE @INDEX_CHARACTER_TYPE_MAX_VALUE@
|
||||||
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
||||||
|
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
|
||||||
|
|
||||||
|
|
||||||
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
|
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
|
||||||
|
//Max sentence size is determined by suffix marker divisor.
|
||||||
|
//The last bits in a sentence marker denote offset whose maximum value
|
||||||
|
//is the sentence size minus 2.
|
||||||
|
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
|
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
||||||
const string & markersFilePath)
|
const string & markersFilePath)
|
||||||
@ -102,11 +102,11 @@ void ConcordiaIndex::_addSingleExample(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// append sentence boundary marker
|
// append sentence boundary marker
|
||||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
|
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||||
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||||
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
markers->push_back(sentenceBoundaryMA);
|
markers->push_back(sentenceBoundaryMA);
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,14 @@
|
|||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
|
|
||||||
|
|
||||||
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id):
|
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
||||||
_sentence(sentence),
|
throw (ConcordiaException):
|
||||||
_id(id) {
|
_sentence(sentence),
|
||||||
|
_id(id) {
|
||||||
|
//check if the example id exceeds space reserved for it in the suffix marker
|
||||||
|
if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
|
||||||
|
throw ConcordiaException("Example id too large.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Example::~Example() {
|
Example::~Example() {
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#define EXAMPLE_HDR
|
#define EXAMPLE_HDR
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -13,7 +14,7 @@ using namespace std;
|
|||||||
|
|
||||||
class Example {
|
class Example {
|
||||||
public:
|
public:
|
||||||
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id);
|
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) throw (ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
|
@ -21,12 +21,15 @@ HashGenerator::~HashGenerator() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
||||||
const string & sentence) {
|
const string & sentence) throw(ConcordiaException) {
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||||
result(new vector<INDEX_CHARACTER_TYPE>());
|
result(new vector<INDEX_CHARACTER_TYPE>());
|
||||||
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
||||||
boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
|
boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
|
||||||
|
|
||||||
|
if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
|
||||||
|
throw ConcordiaException("Trying to add to long sentence.");
|
||||||
|
}
|
||||||
for (vector<string>::iterator it = tokenTexts->begin();
|
for (vector<string>::iterator it = tokenTexts->begin();
|
||||||
it != tokenTexts->end(); ++it) {
|
it != tokenTexts->end(); ++it) {
|
||||||
string token = *it;
|
string token = *it;
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <map>
|
#include <map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
@ -27,7 +28,8 @@ public:
|
|||||||
virtual ~HashGenerator();
|
virtual ~HashGenerator();
|
||||||
|
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||||
generateHash(const string & sentence);
|
generateHash(const string & sentence)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
void serializeWordMap();
|
void serializeWordMap();
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
|
test_example.cpp
|
||||||
test_tm_matches.cpp
|
test_tm_matches.cpp
|
||||||
test_interval.cpp
|
test_interval.cpp
|
||||||
test_logging.cpp
|
test_logging.cpp
|
||||||
|
29
concordia/t/test_example.cpp
Normal file
29
concordia/t/test_example.cpp
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "concordia/example.hpp"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(exampleTest)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( ExceedingId )
|
||||||
|
{
|
||||||
|
Example example1("Test", 16777215);
|
||||||
|
|
||||||
|
bool exceptionThrown = false;
|
||||||
|
string message = "";
|
||||||
|
try {
|
||||||
|
Example example2("Test", 16777216);
|
||||||
|
} catch (ConcordiaException & e) {
|
||||||
|
exceptionThrown = true;
|
||||||
|
message = e.what();
|
||||||
|
}
|
||||||
|
BOOST_CHECK_EQUAL(exceptionThrown, true);
|
||||||
|
BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -1,6 +1,7 @@
|
|||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
@ -28,6 +29,35 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||||
|
{
|
||||||
|
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||||
|
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||||
|
|
||||||
|
stringstream ss;
|
||||||
|
for (int i=0;i<256;i++) {
|
||||||
|
ss << "a" << i << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
string longSentence = ss.str();
|
||||||
|
|
||||||
|
bool exceptionThrown = false;
|
||||||
|
string message = "";
|
||||||
|
try {
|
||||||
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash(longSentence);
|
||||||
|
} catch (ConcordiaException & e) {
|
||||||
|
exceptionThrown = true;
|
||||||
|
message = e.what();
|
||||||
|
}
|
||||||
|
BOOST_CHECK_EQUAL(exceptionThrown, true);
|
||||||
|
BOOST_CHECK_EQUAL(boost::starts_with(message, "Trying to add to long sentence"), true);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||||
{
|
{
|
||||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||||
|
@ -8,8 +8,12 @@ WordMap::WordMap() throw(ConcordiaException) {
|
|||||||
WordMap::~WordMap() {
|
WordMap::~WordMap() {
|
||||||
}
|
}
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
|
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word)
|
||||||
|
throw(ConcordiaException) {
|
||||||
if (_map.find(word) == _map.end()) {
|
if (_map.find(word) == _map.end()) {
|
||||||
|
if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
|
||||||
|
throw ConcordiaException("Word map capacity limit reached!");
|
||||||
|
}
|
||||||
INDEX_CHARACTER_TYPE newCode = _nextFree;
|
INDEX_CHARACTER_TYPE newCode = _nextFree;
|
||||||
_map[word] = newCode;
|
_map[word] = newCode;
|
||||||
_nextFree++;
|
_nextFree++;
|
||||||
|
@ -24,7 +24,8 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~WordMap();
|
virtual ~WordMap();
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE getWordCode(const string & word);
|
INDEX_CHARACTER_TYPE getWordCode(const string & word)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class boost::serialization::access;
|
friend class boost::serialization::access;
|
||||||
|
Loading…
Reference in New Issue
Block a user