From d3cccff65466dae48998a84c163c5cee70040708 Mon Sep 17 00:00:00 2001 From: rjawor Date: Wed, 20 Nov 2013 17:43:29 +0100 Subject: [PATCH] concordia index --- concordia-console/concordia-console.cpp | 2 + concordia/CMakeLists.txt | 1 + concordia/concordia_index.cpp | 55 +++++++++++++++- concordia/concordia_index.hpp | 9 +++ concordia/hash_generator.cpp | 6 +- concordia/hash_generator.hpp | 5 +- concordia/t/test_concordia_index.cpp | 59 ++++++++++++++++++ concordia/t/test_hash_generator.cpp | 12 ++-- concordia/word_map.cpp | 4 +- concordia/word_map.hpp | 9 ++- .../concordia-index/mock_word_map.bin | Bin 116 -> 104 bytes 11 files changed, 145 insertions(+), 17 deletions(-) diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index f613200..b48d3b3 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 384eeff..87a31ce 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -24,6 +24,7 @@ install(FILES concordia.hpp DESTINATION include/concordia/) target_link_libraries(concordia log4cpp) target_link_libraries(concordia ${LIBSTEMMER_LIB}) target_link_libraries(concordia ${Boost_LIBRARIES}) +target_link_libraries(concordia divsufsort) if (WITH_RE2) target_link_libraries(concordia re2) diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 62dfd28..acf8836 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -8,7 +8,12 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath, throw(ConcordiaException) { if (boost::filesystem::exists(wordMapFilepath)) { if (boost::filesystem::exists(hashedIndexFilepath)) { - // create hashed index file for appending + _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out | + ios::app | ios::binary); + if (!_hashedIndexFile.is_open()) { + throw ConcordiaException("E03: Failed to open hashed index " + "file for appending."); + } } else { throw ConcordiaException("E01: Word map file exists " "but hashed index file absent."); @@ -18,9 +23,21 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath, throw ConcordiaException("E02: Hashed index file exists " "but word map file absent."); } else { - // create hashed index file for writing + _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out | + ios::binary); + if (!_hashedIndexFile.is_open()) { + throw ConcordiaException("E04: Failed to open hashed index " + "file for writing."); + } } } + + _suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out | + ios::binary); + if (!_hashedIndexFile.is_open()) { + throw ConcordiaException("E05: Failed to open suffix array " + "file for writing."); + } _hashGenerator = boost::shared_ptr( new HashGenerator(wordMapFilepath)); } @@ -33,8 +50,42 @@ void ConcordiaIndex::serializeWordMap() { } void ConcordiaIndex::generateSuffixArray() { + /* Get the file size. */ + long n = _hashedIndexFile.tellg(); + + /* Allocate 5blocksize bytes of memory. */ + sauchar_t *T; + saidx_t *SA; + + T = reinterpret_cast (malloc((size_t)n * sizeof(sauchar_t))); + SA = reinterpret_cast (malloc((size_t)n * sizeof(saidx_t))); + if ((T == NULL) || (SA == NULL)) { + throw ConcordiaException("Cannot allocate memory."); + } + + /* Read n bytes of data. */ + + _hashedIndexFile.seekg(0, ios::beg); + _hashedIndexFile.read(reinterpret_cast (T), (size_t)n); + + /* Construct the suffix array. */ + if (divsufsort(T, SA, (saidx_t)n) != 0) { + throw ConcordiaException("Error creating suffix array."); + } + + /* Write the suffix array. */ + _suffixArrayFile << *SA; + + /* Deallocate memory. */ + free(SA); + free(T); } void ConcordiaIndex::addSentence(const string & sentence) { + vector hash = _hashGenerator->generateHash(sentence); + for (vector::iterator it = hash.begin(); + it != hash.end(); ++it) { + _hashedIndexFile << *it; + } } diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index c55f473..c92a066 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -1,7 +1,12 @@ #ifndef CONCORDIA_INDEX_HDR #define CONCORDIA_INDEX_HDR +#include #include +#include +#include +#include + #include "concordia/hash_generator.hpp" #include "concordia/concordia_exception.hpp" @@ -31,6 +36,10 @@ public: private: boost::shared_ptr _hashGenerator; + + fstream _hashedIndexFile; + + ofstream _suffixArrayFile; }; #endif diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 0a0c2fd..859c4a8 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -20,15 +20,15 @@ HashGenerator::HashGenerator(const string & wordMapFilePath) HashGenerator::~HashGenerator() { } -vector HashGenerator::generateHash(const string & sentence) { - vector result; +vector HashGenerator::generateHash(const string & sentence) { + vector result; vector tokenTexts; boost::split(tokenTexts, sentence, boost::is_any_of(" ")); for (vector::iterator it = tokenTexts.begin(); it != tokenTexts.end(); ++it) { string token = *it; - int code = _wordMap->getWordCode(token); + sauchar_t code = _wordMap->getWordCode(token); result.push_back(code); } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 5c1f101..d86d34d 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -8,6 +8,9 @@ #include "concordia/word_map.hpp" #include "concordia/concordia_exception.hpp" +#include + + /*! Class for generating a sentence hash. @@ -24,7 +27,7 @@ public: */ virtual ~HashGenerator(); - vector generateHash(const string & sentence); + vector generateHash(const string & sentence); void serializeWordMap(); diff --git a/concordia/t/test_concordia_index.cpp b/concordia/t/test_concordia_index.cpp index fa2355a..bcc9398 100644 --- a/concordia/t/test_concordia_index.cpp +++ b/concordia/t/test_concordia_index.cpp @@ -1,8 +1,12 @@ #include "tests/unit-tests/unit_tests_globals.hpp" #include "concordia/concordia_index.hpp" +#include "concordia/concordia_exception.hpp" #include "tests/common/test_resources_manager.hpp" +#include +#include + using namespace std; BOOST_AUTO_TEST_SUITE(concordia_index) @@ -15,4 +19,59 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 ) } + +BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 ) +{ + bool exceptionThrown = false; + string message = ""; + + try { + ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"), + TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"), + TestResourcesManager::getTestSuffixArrayFilePath()); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + + BOOST_CHECK(exceptionThrown); + BOOST_CHECK_EQUAL(boost::starts_with(message, "E01"), true); +} + +BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 ) +{ + bool exceptionThrown = false; + string message = ""; + + try { + ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"), + TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"), + TestResourcesManager::getTestSuffixArrayFilePath()); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + + BOOST_CHECK(exceptionThrown); + BOOST_CHECK_EQUAL(boost::starts_with(message, "E02"), true); +} + +BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) +{ + ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"), + TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"), + TestResourcesManager::getTestSuffixArrayFilePath()); + index.addSentence("Ala ma kota"); + index.generateSuffixArray(); + index.serializeWordMap(); + + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath())); + + boost::filesystem::remove(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")); + boost::filesystem::remove(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")); + boost::filesystem::remove(TestResourcesManager::getTestSuffixArrayFilePath()); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index aedd368..98dd320 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -18,8 +18,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); - vector hash = hashGenerator.generateHash("Ala ma kota"); - vector expected; + vector hash = hashGenerator.generateHash("Ala ma kota"); + vector expected; expected.push_back(0); expected.push_back(1); expected.push_back(2); @@ -34,8 +34,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) } HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH); - vector hash1 = hashGenerator1.generateHash("Ala ma kota"); - vector expected1; + vector hash1 = hashGenerator1.generateHash("Ala ma kota"); + vector expected1; expected1.push_back(0); expected1.push_back(1); expected1.push_back(2); @@ -44,8 +44,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) hashGenerator1.serializeWordMap(); HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH); - vector hash2 = hashGenerator2.generateHash("Ala ma psa"); - vector expected2; + vector hash2 = hashGenerator2.generateHash("Ala ma psa"); + vector expected2; expected2.push_back(0); expected2.push_back(1); expected2.push_back(3); diff --git a/concordia/word_map.cpp b/concordia/word_map.cpp index 9484d69..4a70964 100644 --- a/concordia/word_map.cpp +++ b/concordia/word_map.cpp @@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) { WordMap::~WordMap() { } -int WordMap::getWordCode(const string & word) { +sauchar_t WordMap::getWordCode(const string & word) { if (_map.find(word) == _map.end()) { - int newCode = _nextFree; + sauchar_t newCode = _nextFree; _map[word] = newCode; _nextFree++; return newCode; diff --git a/concordia/word_map.hpp b/concordia/word_map.hpp index 0d9b1e8..9d63c81 100644 --- a/concordia/word_map.hpp +++ b/concordia/word_map.hpp @@ -8,6 +8,9 @@ #include #include +#include + + /*! Class representing dictionary for word to int encoding. @@ -24,7 +27,7 @@ public: */ virtual ~WordMap(); - int getWordCode(const string & word); + sauchar_t getWordCode(const string & word); private: friend class boost::serialization::access; @@ -36,9 +39,9 @@ private: ar & _nextFree; } - map _map; + map _map; - int _nextFree; + sauchar_t _nextFree; }; #endif diff --git a/tests/resources/concordia-index/mock_word_map.bin b/tests/resources/concordia-index/mock_word_map.bin index 3da4c0450d75c8dd160870b141e0e8c174afe7f6..3ccc9be64fe20bb1a391cb800315bc94638a91dc 100644 GIT binary patch delta 30 bcmXTPnBXtT!TZQM3eJ delta 42 kcmc~Onc%O=z`(!)#2}EJUy{fKq(K-2auXSWBr^~L0DZUwY5)KL