concordia index
This commit is contained in:
parent
656e9dbae9
commit
d3cccff654
@ -1,3 +1,5 @@
|
||||
#include <divsufsort.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <boost/program_options.hpp>
|
||||
|
@ -24,6 +24,7 @@ install(FILES concordia.hpp DESTINATION include/concordia/)
|
||||
target_link_libraries(concordia log4cpp)
|
||||
target_link_libraries(concordia ${LIBSTEMMER_LIB})
|
||||
target_link_libraries(concordia ${Boost_LIBRARIES})
|
||||
target_link_libraries(concordia divsufsort)
|
||||
|
||||
if (WITH_RE2)
|
||||
target_link_libraries(concordia re2)
|
||||
|
@ -8,7 +8,12 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
|
||||
throw(ConcordiaException) {
|
||||
if (boost::filesystem::exists(wordMapFilepath)) {
|
||||
if (boost::filesystem::exists(hashedIndexFilepath)) {
|
||||
// create hashed index file for appending
|
||||
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
|
||||
ios::app | ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E03: Failed to open hashed index "
|
||||
"file for appending.");
|
||||
}
|
||||
} else {
|
||||
throw ConcordiaException("E01: Word map file exists "
|
||||
"but hashed index file absent.");
|
||||
@ -18,9 +23,21 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
|
||||
throw ConcordiaException("E02: Hashed index file exists "
|
||||
"but word map file absent.");
|
||||
} else {
|
||||
// create hashed index file for writing
|
||||
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
|
||||
ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E04: Failed to open hashed index "
|
||||
"file for writing.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out |
|
||||
ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E05: Failed to open suffix array "
|
||||
"file for writing.");
|
||||
}
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(wordMapFilepath));
|
||||
}
|
||||
@ -33,8 +50,42 @@ void ConcordiaIndex::serializeWordMap() {
|
||||
}
|
||||
|
||||
void ConcordiaIndex::generateSuffixArray() {
|
||||
/* Get the file size. */
|
||||
long n = _hashedIndexFile.tellg();
|
||||
|
||||
/* Allocate 5blocksize bytes of memory. */
|
||||
sauchar_t *T;
|
||||
saidx_t *SA;
|
||||
|
||||
T = reinterpret_cast<sauchar_t *> (malloc((size_t)n * sizeof(sauchar_t)));
|
||||
SA = reinterpret_cast<saidx_t *> (malloc((size_t)n * sizeof(saidx_t)));
|
||||
if ((T == NULL) || (SA == NULL)) {
|
||||
throw ConcordiaException("Cannot allocate memory.");
|
||||
}
|
||||
|
||||
/* Read n bytes of data. */
|
||||
|
||||
_hashedIndexFile.seekg(0, ios::beg);
|
||||
_hashedIndexFile.read(reinterpret_cast<char*> (T), (size_t)n);
|
||||
|
||||
/* Construct the suffix array. */
|
||||
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
||||
throw ConcordiaException("Error creating suffix array.");
|
||||
}
|
||||
|
||||
/* Write the suffix array. */
|
||||
_suffixArrayFile << *SA;
|
||||
|
||||
/* Deallocate memory. */
|
||||
free(SA);
|
||||
free(T);
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addSentence(const string & sentence) {
|
||||
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
|
||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
_hashedIndexFile << *it;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,12 @@
|
||||
#ifndef CONCORDIA_INDEX_HDR
|
||||
#define CONCORDIA_INDEX_HDR
|
||||
|
||||
#include <divsufsort.h>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
|
||||
@ -31,6 +36,10 @@ public:
|
||||
|
||||
private:
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
fstream _hashedIndexFile;
|
||||
|
||||
ofstream _suffixArrayFile;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -20,15 +20,15 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
|
||||
HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
vector<int> HashGenerator::generateHash(const string & sentence) {
|
||||
vector<int> result;
|
||||
vector<sauchar_t> HashGenerator::generateHash(const string & sentence) {
|
||||
vector<sauchar_t> result;
|
||||
vector<string> tokenTexts;
|
||||
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
|
||||
|
||||
for (vector<string>::iterator it = tokenTexts.begin();
|
||||
it != tokenTexts.end(); ++it) {
|
||||
string token = *it;
|
||||
int code = _wordMap->getWordCode(token);
|
||||
sauchar_t code = _wordMap->getWordCode(token);
|
||||
result.push_back(code);
|
||||
}
|
||||
|
||||
|
@ -8,6 +8,9 @@
|
||||
#include "concordia/word_map.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
|
||||
#include <divsufsort.h>
|
||||
|
||||
|
||||
/*!
|
||||
Class for generating a sentence hash.
|
||||
|
||||
@ -24,7 +27,7 @@ public:
|
||||
*/
|
||||
virtual ~HashGenerator();
|
||||
|
||||
vector<int> generateHash(const string & sentence);
|
||||
vector<sauchar_t> generateHash(const string & sentence);
|
||||
|
||||
void serializeWordMap();
|
||||
|
||||
|
@ -1,8 +1,12 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(concordia_index)
|
||||
@ -15,4 +19,59 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Constructing an index from the "mock_word_map.bin" word map together
// with a "nonexistent.bin" hashed index is expected to be rejected with
// an error whose message starts with "E01".
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
{
    string what;
    bool caught = false;

    try {
        ConcordiaIndex index(
            TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"),
            TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"),
            TestResourcesManager::getTestSuffixArrayFilePath());
    } catch (ConcordiaException & e) {
        caught = true;
        what = e.what();
    }

    BOOST_CHECK(caught);
    BOOST_CHECK_EQUAL(boost::starts_with(what, "E01"), true);
}
|
||||
|
||||
// Constructing an index from a "nonexistent.bin" word map together with
// the "mock_hash_index.bin" hashed index is expected to be rejected with
// an error whose message starts with "E02".
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
{
    string what;
    bool caught = false;

    try {
        ConcordiaIndex index(
            TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"),
            TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"),
            TestResourcesManager::getTestSuffixArrayFilePath());
    } catch (ConcordiaException & e) {
        caught = true;
        what = e.what();
    }

    BOOST_CHECK(caught);
    BOOST_CHECK_EQUAL(boost::starts_with(what, "E02"), true);
}
|
||||
|
||||
// Builds an index from scratch, adds one sentence, generates the suffix
// array and serializes the word map, then checks that all three resource
// files exist before removing them again.
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{
    ConcordiaIndex index(
        TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"),
        TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"),
        TestResourcesManager::getTestSuffixArrayFilePath());

    index.addSentence("Ala ma kota");
    index.generateSuffixArray();
    index.serializeWordMap();

    // All three files must be present on disk.
    BOOST_CHECK(boost::filesystem::exists(
        TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")));
    BOOST_CHECK(boost::filesystem::exists(
        TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")));
    BOOST_CHECK(boost::filesystem::exists(
        TestResourcesManager::getTestSuffixArrayFilePath()));

    // Clean up the generated files.
    boost::filesystem::remove(
        TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"));
    boost::filesystem::remove(
        TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"));
    boost::filesystem::remove(
        TestResourcesManager::getTestSuffixArrayFilePath());
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -18,8 +18,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
|
||||
vector<int> hash = hashGenerator.generateHash("Ala ma kota");
|
||||
vector<int> expected;
|
||||
vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota");
|
||||
vector<sauchar_t> expected;
|
||||
expected.push_back(0);
|
||||
expected.push_back(1);
|
||||
expected.push_back(2);
|
||||
@ -34,8 +34,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<int> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
vector<int> expected1;
|
||||
vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
vector<sauchar_t> expected1;
|
||||
expected1.push_back(0);
|
||||
expected1.push_back(1);
|
||||
expected1.push_back(2);
|
||||
@ -44,8 +44,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<int> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
vector<int> expected2;
|
||||
vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
vector<sauchar_t> expected2;
|
||||
expected2.push_back(0);
|
||||
expected2.push_back(1);
|
||||
expected2.push_back(3);
|
||||
|
@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
|
||||
WordMap::~WordMap() {
|
||||
}
|
||||
|
||||
int WordMap::getWordCode(const string & word) {
|
||||
sauchar_t WordMap::getWordCode(const string & word) {
|
||||
if (_map.find(word) == _map.end()) {
|
||||
int newCode = _nextFree;
|
||||
sauchar_t newCode = _nextFree;
|
||||
_map[word] = newCode;
|
||||
_nextFree++;
|
||||
return newCode;
|
||||
|
@ -8,6 +8,9 @@
|
||||
#include <boost/archive/text_iarchive.hpp>
|
||||
#include <boost/serialization/map.hpp>
|
||||
|
||||
#include <divsufsort.h>
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
Class representing dictionary for word to int encoding.
|
||||
@ -24,7 +27,7 @@ public:
|
||||
*/
|
||||
virtual ~WordMap();
|
||||
|
||||
int getWordCode(const string & word);
|
||||
sauchar_t getWordCode(const string & word);
|
||||
|
||||
private:
|
||||
friend class boost::serialization::access;
|
||||
@ -36,9 +39,9 @@ private:
|
||||
ar & _nextFree;
|
||||
}
|
||||
|
||||
map<string, int> _map;
|
||||
map<string, sauchar_t> _map;
|
||||
|
||||
int _nextFree;
|
||||
sauchar_t _nextFree;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user