limits control

Former-commit-id: 83d90cb63b3f1447938d16010e66f4345dfe0617
2014-03-14 11:30:17 +01:00 · 2014-03-14 11:30:17 +01:00 · 4b921decae
commit 4b921decae
parent 655087582e
12 changed files with 97 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,10 +9,12 @@ set (CONCORDIA_VERSION_MINOR 1)
 # Type of the characters in SA
 set (INDEX_CHARACTER_TYPE "unsigned int")
 set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
 # The above allows for (roughly) 2^32 - 1 = 4 294 967 295 words in corpus.
 # Suffix markers
 set (SUFFIX_MARKER_TYPE "unsigned int")
 set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
 set (SUFFIX_MARKER_DIVISOR 256)
 # The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
 # This allows storing 2^24 = 16 777 216 sentences of at most 256 words each.
--- a/concordia/common/config.hpp.in
+++ b/concordia/common/config.hpp.in
@ -18,6 +18,13 @@
 #define LEXICON_FIELD_SEPARATOR "\t"
 typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
 #define INDEX_CHARACTER_TYPE_MAX_VALUE @INDEX_CHARACTER_TYPE_MAX_VALUE@
 typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
 #define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
 #define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
 //Max sentence size is determined by suffix marker divisor.
 //The last bits in a sentence marker denote offset whose maximum value
 //is the sentence size minus 2.
 #define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@ -1,10 +1,10 @@
 #include "concordia/concordia_index.hpp"
 #include "concordia/common/utils.hpp"
 #include "concordia/common/config.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/foreach.hpp>
 #include <iostream>
 #include <climits>
 ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
                               const string & markersFilePath)
@ -102,11 +102,11 @@ void ConcordiaIndex::_addSingleExample(
    }
    // append sentence boundary marker
-    INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
+    INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
    Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
    Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
-    SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
+    SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
    Utils::writeMarker(markersFile, sentenceBoundaryMA);
    markers->push_back(sentenceBoundaryMA);
 }
--- a/concordia/example.cpp
+++ b/concordia/example.cpp
@ -1,9 +1,14 @@
 #include "concordia/example.hpp"
-Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id):
+Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
-                        _sentence(sentence),
+                                              throw (ConcordiaException):
-                        _id(id) {
+                                              _sentence(sentence),
                                              _id(id) {
    // Stores the sentence and its id, rejecting ids that do not fit into the
    // part of a suffix marker reserved for them. Throws ConcordiaException
    // ("Example id too large.") in that case.
    //
    // The low part of a marker (SUFFIX_MARKER_DIVISOR distinct values) is
    // reserved for the suffix offset, so the largest usable id is
    // (MAX - (DIVISOR - 1)) / DIVISOR. This is equivalent to requiring
    // id < (MAX + 1) / DIVISOR, but it never evaluates MAX + 1, which would
    // wrap around to 0 when MAX is the largest value of its arithmetic type.
    const SUFFIX_MARKER_TYPE maxExampleId =
        (static_cast<SUFFIX_MARKER_TYPE>(SUFFIX_MARKER_TYPE_MAX_VALUE)
            - (SUFFIX_MARKER_DIVISOR - 1)) / SUFFIX_MARKER_DIVISOR;
    if (id > maxExampleId) {
        throw ConcordiaException("Example id too large.");
    }
 }
 Example::~Example() {
--- a/concordia/example.hpp
+++ b/concordia/example.hpp
@ -2,6 +2,7 @@
 #define EXAMPLE_HDR
 #include "concordia/common/config.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <string>
 /*!
@ -13,7 +14,7 @@ using namespace std;
 class Example {
 public:
-    explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id);
+    explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id) throw (ConcordiaException);
    /*! Destructor.
    */
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -21,12 +21,15 @@ HashGenerator::~HashGenerator() {
 }
 boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
-                                        const string & sentence) {
+                         const string & sentence) throw(ConcordiaException) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
                                  result(new vector<INDEX_CHARACTER_TYPE>());
    boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
    boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
-
+    
    if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
        throw ConcordiaException("Trying to add to long sentence.");
    }
    for (vector<string>::iterator it = tokenTexts->begin();
                                it != tokenTexts->end(); ++it) {
        string token = *it;
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@ -5,6 +5,7 @@
 #include <map>
 #include <vector>
 #include <boost/shared_ptr.hpp>
 #include <boost/algorithm/string/predicate.hpp>
 #include "concordia/word_map.hpp"
 #include "concordia/common/config.hpp"
 #include "concordia/concordia_exception.hpp"
@ -27,7 +28,8 @@ public:
    virtual ~HashGenerator();
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
-                           generateHash(const string & sentence);
+                    generateHash(const string & sentence)
                                throw(ConcordiaException);
    void serializeWordMap();
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,4 +1,5 @@
 add_library(concordia-tests
  test_example.cpp
  test_tm_matches.cpp
  test_interval.cpp
  test_logging.cpp
--- a/concordia/t/test_example.cpp
+++ b/concordia/t/test_example.cpp
@ -0,0 +1,29 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include <string>
 #include "concordia/example.hpp"
 using namespace std;
 BOOST_AUTO_TEST_SUITE(exampleTest)
 BOOST_AUTO_TEST_CASE( ExceedingId )
 {
    // With the default configuration 16777215 is the last id that is still
    // accepted; constructing this example must not throw.
    Example withinLimit("Test", 16777215);

    // One past that limit has to be rejected with a ConcordiaException.
    bool caught = false;
    string what;
    try {
        Example pastLimit("Test", 16777216);
    } catch (ConcordiaException & e) {
        what = e.what();
        caught = true;
    }

    BOOST_CHECK(caught);
    BOOST_CHECK(boost::starts_with(what, "Example id too large"));
 }
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@ -1,6 +1,7 @@
 #include <boost/filesystem.hpp>
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include <string>
 #include <sstream>
 #include "concordia/common/config.hpp"
 #include "concordia/hash_generator.hpp"
@ -28,6 +29,35 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
    BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
 }
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {
    // Remove any word map file left on disk before creating the generator.
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
        boost::filesystem::remove(TEST_WORD_MAP_PATH);
    }

    HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);

    // Build a sentence of exactly MAX_SENTENCE_SIZE + 1 words, separated by
    // single spaces and without a trailing separator. generateHash splits on
    // spaces and rejects more than MAX_SENTENCE_SIZE tokens, so the sentence
    // must exceed the limit with real words rather than relying on the empty
    // token that a trailing space would produce.
    stringstream ss;
    for (int i = 0; i <= MAX_SENTENCE_SIZE; i++) {
        if (i > 0) {
            ss << " ";
        }
        ss << "a" << i;
    }
    string longSentence = ss.str();

    bool exceptionThrown = false;
    string message = "";
    try {
        boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash(longSentence);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }
    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Trying to add to long sentence"), true);
 }
 BOOST_AUTO_TEST_CASE( HashSerializationTest )
 {
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
--- a/concordia/word_map.cpp
+++ b/concordia/word_map.cpp
@ -8,8 +8,12 @@ WordMap::WordMap() throw(ConcordiaException) {
 WordMap::~WordMap() {
 }
-INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
+INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word)
                                    throw(ConcordiaException) {
    if (_map.find(word) == _map.end()) {
        if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
             throw ConcordiaException("Word map capacity limit reached!"); 
        }
        INDEX_CHARACTER_TYPE newCode = _nextFree;
        _map[word] = newCode;
        _nextFree++;
--- a/concordia/word_map.hpp
+++ b/concordia/word_map.hpp
@ -24,7 +24,8 @@ public:
    */
    virtual ~WordMap();
-    INDEX_CHARACTER_TYPE getWordCode(const string & word);
+    INDEX_CHARACTER_TYPE getWordCode(const string & word)
                                throw(ConcordiaException);
 private:
    friend class boost::serialization::access;