concordia index

This commit is contained in:
rjawor 2013-11-20 17:43:29 +01:00
parent 656e9dbae9
commit d3cccff654
11 changed files with 145 additions and 17 deletions

View File

@ -1,3 +1,5 @@
#include <divsufsort.h>
#include <iostream>
#include <fstream>
#include <boost/program_options.hpp>

View File

@ -24,6 +24,7 @@ install(FILES concordia.hpp DESTINATION include/concordia/)
target_link_libraries(concordia log4cpp)
target_link_libraries(concordia ${LIBSTEMMER_LIB})
target_link_libraries(concordia ${Boost_LIBRARIES})
target_link_libraries(concordia divsufsort)
if (WITH_RE2)
target_link_libraries(concordia re2)

View File

@ -8,7 +8,12 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
throw(ConcordiaException) {
if (boost::filesystem::exists(wordMapFilepath)) {
if (boost::filesystem::exists(hashedIndexFilepath)) {
// Reopen the existing hashed index for reading and writing, positioned at
// its end. New sentences are written there, and generateSuffixArray()
// reads the data back, which a write-only stream does not allow.
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in | ios::out |
    ios::ate | ios::binary);
if (!_hashedIndexFile.is_open()) {
    throw ConcordiaException("E03: Failed to open hashed index "
                             "file for appending.");
}
} else {
throw ConcordiaException("E01: Word map file exists "
"but hashed index file absent.");
@ -18,9 +23,21 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
throw ConcordiaException("E02: Hashed index file exists "
"but word map file absent.");
} else {
// Create an empty hashed index file. It is opened for reading as well as
// writing because generateSuffixArray() reads the written data back;
// ios::trunc is required so that the in|out combination creates the file.
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in | ios::out |
    ios::trunc | ios::binary);
if (!_hashedIndexFile.is_open()) {
    throw ConcordiaException("E04: Failed to open hashed index "
                             "file for writing.");
}
}
}
// Open the suffix array output file in binary mode.
_suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out |
    ios::binary);
// Verify the stream that was just opened (the suffix array file, not the
// hashed index file, whose state was already checked above).
if (!_suffixArrayFile.is_open()) {
    throw ConcordiaException("E05: Failed to open suffix array "
                             "file for writing.");
}
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(wordMapFilepath));
}
@ -33,8 +50,42 @@ void ConcordiaIndex::serializeWordMap() {
}
void ConcordiaIndex::generateSuffixArray() {
/* Get the file size. */
long n = _hashedIndexFile.tellg();
/* Allocate 5blocksize bytes of memory. */
sauchar_t *T;
saidx_t *SA;
T = reinterpret_cast<sauchar_t *> (malloc((size_t)n * sizeof(sauchar_t)));
SA = reinterpret_cast<saidx_t *> (malloc((size_t)n * sizeof(saidx_t)));
if ((T == NULL) || (SA == NULL)) {
throw ConcordiaException("Cannot allocate memory.");
}
/* Read n bytes of data. */
_hashedIndexFile.seekg(0, ios::beg);
_hashedIndexFile.read(reinterpret_cast<char*> (T), (size_t)n);
/* Construct the suffix array. */
if (divsufsort(T, SA, (saidx_t)n) != 0) {
throw ConcordiaException("Error creating suffix array.");
}
/* Write the suffix array. */
_suffixArrayFile << *SA;
/* Deallocate memory. */
free(SA);
free(T);
}
void ConcordiaIndex::addSentence(const string & sentence) {
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
for (vector<sauchar_t>::iterator it = hash.begin();
it != hash.end(); ++it) {
_hashedIndexFile << *it;
}
}

View File

@ -1,7 +1,12 @@
#ifndef CONCORDIA_INDEX_HDR
#define CONCORDIA_INDEX_HDR
#include <divsufsort.h>
#include <boost/shared_ptr.hpp>
#include <fstream>
#include <iostream>
#include <sstream>
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
@ -31,6 +36,10 @@ public:
private:
boost::shared_ptr<HashGenerator> _hashGenerator;
fstream _hashedIndexFile;
ofstream _suffixArrayFile;
};
#endif

View File

@ -20,15 +20,15 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
HashGenerator::~HashGenerator() {
}
vector<int> HashGenerator::generateHash(const string & sentence) {
vector<int> result;
vector<sauchar_t> HashGenerator::generateHash(const string & sentence) {
vector<sauchar_t> result;
vector<string> tokenTexts;
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
for (vector<string>::iterator it = tokenTexts.begin();
it != tokenTexts.end(); ++it) {
string token = *it;
int code = _wordMap->getWordCode(token);
sauchar_t code = _wordMap->getWordCode(token);
result.push_back(code);
}

View File

@ -8,6 +8,9 @@
#include "concordia/word_map.hpp"
#include "concordia/concordia_exception.hpp"
#include <divsufsort.h>
/*!
Class for generating a sentence hash.
@ -24,7 +27,7 @@ public:
*/
virtual ~HashGenerator();
vector<int> generateHash(const string & sentence);
vector<sauchar_t> generateHash(const string & sentence);
void serializeWordMap();

View File

@ -1,8 +1,12 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/concordia_exception.hpp"
#include "tests/common/test_resources_manager.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
using namespace std;
BOOST_AUTO_TEST_SUITE(concordia_index)
@ -15,4 +19,59 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
}
// Constructing an index from "mock_word_map.bin" and a hashed index file
// named "nonexistent.bin" is expected to throw a ConcordiaException whose
// message starts with "E01".
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
{
    string message;
    bool exceptionThrown = false;

    try {
        const string wordMapPath =
            TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin");
        const string hashIndexPath =
            TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin");
        const string suffixArrayPath =
            TestResourcesManager::getTestSuffixArrayFilePath();
        ConcordiaIndex index(wordMapPath, hashIndexPath, suffixArrayPath);
    } catch (ConcordiaException & e) {
        message = e.what();
        exceptionThrown = true;
    }

    BOOST_CHECK(exceptionThrown);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "E01"), true);
}
// Constructing an index from a word map named "nonexistent.bin" and the
// hashed index "mock_hash_index.bin" is expected to throw a
// ConcordiaException whose message starts with "E02".
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
{
    string message;
    bool exceptionThrown = false;

    try {
        const string wordMapPath =
            TestResourcesManager::getTestWordMapFilePath("nonexistent.bin");
        const string hashIndexPath =
            TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin");
        const string suffixArrayPath =
            TestResourcesManager::getTestSuffixArrayFilePath();
        ConcordiaIndex index(wordMapPath, hashIndexPath, suffixArrayPath);
    } catch (ConcordiaException & e) {
        message = e.what();
        exceptionThrown = true;
    }

    BOOST_CHECK(exceptionThrown);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "E02"), true);
}
// Creates an index, adds one sentence, generates the suffix array and
// serializes the word map; then checks that the word map, hashed index
// and suffix array files exist and removes them again.
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{
    const string wordMapPath =
        TestResourcesManager::getTestWordMapFilePath("test_word_map.bin");
    const string hashIndexPath =
        TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin");
    const string suffixArrayPath =
        TestResourcesManager::getTestSuffixArrayFilePath();

    ConcordiaIndex index(wordMapPath, hashIndexPath, suffixArrayPath);
    index.addSentence("Ala ma kota");
    index.generateSuffixArray();
    index.serializeWordMap();

    // All three resources must be present on disk.
    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")));
    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")));
    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath()));

    // Clean up the files produced by this test.
    boost::filesystem::remove(wordMapPath);
    boost::filesystem::remove(hashIndexPath);
    boost::filesystem::remove(suffixArrayPath);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -18,8 +18,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
vector<int> hash = hashGenerator.generateHash("Ala ma kota");
vector<int> expected;
vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota");
vector<sauchar_t> expected;
expected.push_back(0);
expected.push_back(1);
expected.push_back(2);
@ -34,8 +34,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
}
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
vector<int> hash1 = hashGenerator1.generateHash("Ala ma kota");
vector<int> expected1;
vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota");
vector<sauchar_t> expected1;
expected1.push_back(0);
expected1.push_back(1);
expected1.push_back(2);
@ -44,8 +44,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
vector<int> hash2 = hashGenerator2.generateHash("Ala ma psa");
vector<int> expected2;
vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa");
vector<sauchar_t> expected2;
expected2.push_back(0);
expected2.push_back(1);
expected2.push_back(3);

View File

@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
WordMap::~WordMap() {
}
int WordMap::getWordCode(const string & word) {
sauchar_t WordMap::getWordCode(const string & word) {
if (_map.find(word) == _map.end()) {
int newCode = _nextFree;
sauchar_t newCode = _nextFree;
_map[word] = newCode;
_nextFree++;
return newCode;

View File

@ -8,6 +8,9 @@
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/map.hpp>
#include <divsufsort.h>
/*!
Class representing dictionary for word to int encoding.
@ -24,7 +27,7 @@ public:
*/
virtual ~WordMap();
int getWordCode(const string & word);
sauchar_t getWordCode(const string & word);
private:
friend class boost::serialization::access;
@ -36,9 +39,9 @@ private:
ar & _nextFree;
}
map<string, int> _map;
map<string, sauchar_t> _map;
int _nextFree;
sauchar_t _nextFree;
};
#endif