diff --git a/CMakeLists.txt b/CMakeLists.txt index 862b1b1..9b39b11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,10 +9,12 @@ set (CONCORDIA_VERSION_MINOR 1) # Type of the characters in SA set (INDEX_CHARACTER_TYPE "unsigned int") +set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295) # The above allows for (roughly) 2^32 = 4 294 967 295 words in corpus. # Suffix markers set (SUFFIX_MARKER_TYPE "unsigned int") +set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295) set (SUFFIX_MARKER_DIVISOR 256) # The above settings assign 3 bytes to sentence id and 1 byte for suffix offset. # This allows to store 2^24 = 16 777 216 sentences no longer than 256 words. diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index cfbd01e..c4c43cb 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -18,6 +18,13 @@ #define LEXICON_FIELD_SEPARATOR "\t" typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE; +#define INDEX_CHARACTER_TYPE_MAX_VALUE @INDEX_CHARACTER_TYPE_MAX_VALUE@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; +#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@ + #define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@ +//Max sentence size is determined by suffix marker divisor. +//The last bits in a sentence marker denote offset whose maximum value +//is the sentence size minus 2. 
+#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@ diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 174e986..6f6ac42 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -1,10 +1,10 @@ #include "concordia/concordia_index.hpp" #include "concordia/common/utils.hpp" +#include "concordia/common/config.hpp" #include #include #include -#include ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath, const string & markersFilePath) @@ -102,11 +102,11 @@ void ConcordiaIndex::_addSingleExample( } // append sentence boundary marker - INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX; + INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); - SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX; + SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; Utils::writeMarker(markersFile, sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA); } diff --git a/concordia/example.cpp b/concordia/example.cpp index 58cee52..6ab973a 100644 --- a/concordia/example.cpp +++ b/concordia/example.cpp @@ -1,9 +1,14 @@ #include "concordia/example.hpp" -Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id): - _sentence(sentence), - _id(id) { +Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) + throw (ConcordiaException): + _sentence(sentence), + _id(id) { + //check if the example id exceeds space reserved for it in the suffix marker (divide first: MAX_VALUE+1 can overflow) + if (id > SUFFIX_MARKER_TYPE_MAX_VALUE / SUFFIX_MARKER_DIVISOR) { + throw ConcordiaException("Example id too large."); + } } Example::~Example() { diff --git a/concordia/example.hpp b/concordia/example.hpp index 5e28c56..cbbfc7a 100644 --- a/concordia/example.hpp +++ b/concordia/example.hpp @@ -2,6 +2,7 @@ #define EXAMPLE_HDR #include "concordia/common/config.hpp" +#include 
"concordia/concordia_exception.hpp" #include /*! @@ -13,7 +14,7 @@ using namespace std; class Example { public: - explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id); + explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) throw (ConcordiaException); /*! Destructor. */ diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index fb545f2..fc736f3 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -21,12 +21,15 @@ HashGenerator::~HashGenerator() { } boost::shared_ptr > HashGenerator::generateHash( - const string & sentence) { + const string & sentence) throw(ConcordiaException) { boost::shared_ptr > result(new vector()); boost::shared_ptr > tokenTexts(new vector()); boost::split(*tokenTexts, sentence, boost::is_any_of(" ")); - + + if (tokenTexts->size() > MAX_SENTENCE_SIZE) { + throw ConcordiaException("Trying to add to long sentence."); + } for (vector::iterator it = tokenTexts->begin(); it != tokenTexts->end(); ++it) { string token = *it; diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index c458343..450ce6d 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "concordia/word_map.hpp" #include "concordia/common/config.hpp" #include "concordia/concordia_exception.hpp" @@ -27,7 +28,8 @@ public: virtual ~HashGenerator(); boost::shared_ptr > - generateHash(const string & sentence); + generateHash(const string & sentence) + throw(ConcordiaException); void serializeWordMap(); diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 867112f..bdd3c92 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(concordia-tests + test_example.cpp test_tm_matches.cpp test_interval.cpp test_logging.cpp diff --git a/concordia/t/test_example.cpp b/concordia/t/test_example.cpp new file mode 100644 index 0000000..7ee3b13 --- 
/dev/null +++ b/concordia/t/test_example.cpp @@ -0,0 +1,29 @@ +#include +#include "tests/unit-tests/unit_tests_globals.hpp" +#include + +#include "concordia/example.hpp" + +using namespace std; + +BOOST_AUTO_TEST_SUITE(exampleTest) + +BOOST_AUTO_TEST_CASE( ExceedingId ) +{ + Example example1("Test", 16777215); + + bool exceptionThrown = false; + string message = ""; + try { + Example example2("Test", 16777216); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + BOOST_CHECK_EQUAL(exceptionThrown, true); + BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true); + +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index ccabd86..6df7cfe 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -1,6 +1,7 @@ #include #include "tests/unit-tests/unit_tests_globals.hpp" #include +#include #include "concordia/common/config.hpp" #include "concordia/hash_generator.hpp" @@ -28,6 +29,35 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end()); } +BOOST_AUTO_TEST_CASE( TooLongHashTest ) +{ + if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { + boost::filesystem::remove(TEST_WORD_MAP_PATH); + } + + HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); + + stringstream ss; + for (int i=0;i<=MAX_SENTENCE_SIZE;i++) { //one real word more than the limit, independent of the trailing separator + ss << "a" << i << " "; + } + + string longSentence = ss.str(); + + bool exceptionThrown = false; + string message = ""; + try { + boost::shared_ptr > hash = hashGenerator.generateHash(longSentence); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + BOOST_CHECK_EQUAL(exceptionThrown, true); + BOOST_CHECK_EQUAL(boost::starts_with(message, "Trying to add to long sentence"), true); + + +} + BOOST_AUTO_TEST_CASE( HashSerializationTest ) { if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { 
diff --git a/concordia/word_map.cpp b/concordia/word_map.cpp index b5d9f0a..ba74d3f 100644 --- a/concordia/word_map.cpp +++ b/concordia/word_map.cpp @@ -8,8 +8,12 @@ WordMap::WordMap() throw(ConcordiaException) { WordMap::~WordMap() { } -INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) { +INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) + throw(ConcordiaException) { if (_map.find(word) == _map.end()) { + if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) { + throw ConcordiaException("Word map capacity limit reached!"); + } INDEX_CHARACTER_TYPE newCode = _nextFree; _map[word] = newCode; _nextFree++; diff --git a/concordia/word_map.hpp b/concordia/word_map.hpp index 5a92d5e..f395b49 100644 --- a/concordia/word_map.hpp +++ b/concordia/word_map.hpp @@ -24,7 +24,8 @@ public: */ virtual ~WordMap(); - INDEX_CHARACTER_TYPE getWordCode(const string & word); + INDEX_CHARACTER_TYPE getWordCode(const string & word) + throw(ConcordiaException); private: friend class boost::serialization::access;