concordia index
commit d3cccff654
parent 656e9dbae9
@@ -1,3 +1,5 @@
+#include <divsufsort.h>
+
 #include <iostream>
 #include <fstream>
 #include <boost/program_options.hpp>
@@ -24,6 +24,7 @@ install(FILES concordia.hpp DESTINATION include/concordia/)
 target_link_libraries(concordia log4cpp)
 target_link_libraries(concordia ${LIBSTEMMER_LIB})
 target_link_libraries(concordia ${Boost_LIBRARIES})
+target_link_libraries(concordia divsufsort)
 
 if (WITH_RE2)
     target_link_libraries(concordia re2)
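Linking concordia against divsufsort makes the library's C API visible to every target that includes <divsufsort.h>. As a quick sanity check of that API (a stand-alone sketch, independent of Concordia; the sample text is made up), the core call is divsufsort(T, SA, n), which fills SA with the starting positions of the suffixes of T in lexicographic order and returns 0 on success:

    #include <divsufsort.h>

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        // Any byte sequence can serve as divsufsort input; this one is arbitrary.
        const char *text = "abracadabra";
        saidx_t n = static_cast<saidx_t>(std::strlen(text));

        // The suffix array holds one index per input byte.
        std::vector<saidx_t> SA(n);

        // divsufsort returns 0 on success and a negative value on error.
        if (divsufsort(reinterpret_cast<const sauchar_t *>(text), &SA[0], n) != 0) {
            std::fprintf(stderr, "suffix array construction failed\n");
            return 1;
        }

        // Print the suffixes in lexicographic order.
        for (saidx_t i = 0; i < n; ++i) {
            std::printf("%2d: %s\n", static_cast<int>(SA[i]), text + SA[i]);
        }
        return 0;
    }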
@@ -8,7 +8,12 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
                                          throw(ConcordiaException) {
     if (boost::filesystem::exists(wordMapFilepath)) {
         if (boost::filesystem::exists(hashedIndexFilepath)) {
-            // create hashed index file for appending
+            _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
+                                             ios::app | ios::binary);
+            if (!_hashedIndexFile.is_open()) {
+                throw ConcordiaException("E03: Failed to open hashed index "
+                                         "file for appending.");
+            }
         } else {
             throw ConcordiaException("E01: Word map file exists "
                                      "but hashed index file absent.");
@@ -18,9 +23,21 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
             throw ConcordiaException("E02: Hashed index file exists "
                                      "but word map file absent.");
         } else {
-            // create hashed index file for writing
+            _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
+                                             ios::binary);
+            if (!_hashedIndexFile.is_open()) {
+                throw ConcordiaException("E04: Failed to open hashed index "
+                                         "file for writing.");
+            }
         }
     }
+
+    _suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out |
+                                     ios::binary);
+    if (!_hashedIndexFile.is_open()) {
+        throw ConcordiaException("E05: Failed to open suffix array "
+                                 "file for writing.");
+    }
     _hashGenerator = boost::shared_ptr<HashGenerator>(
                                 new HashGenerator(wordMapFilepath));
 }
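The constructor above now opens the hashed index itself: in append mode when both resource files already exist (error E03 on failure), in truncating write mode when neither exists (E04), and it always opens the suffix array file for writing (E05). As committed, the E05 branch re-checks _hashedIndexFile rather than the freshly opened _suffixArrayFile. A minimal sketch of the open-and-validate pattern as a free function, with std::runtime_error standing in for ConcordiaException (illustrative only, not the committed code):

    #include <fstream>
    #include <stdexcept>
    #include <string>

    // Open a binary stream with the given mode and fail loudly when the
    // open does not succeed, checking the stream that was just opened.
    void openOrThrow(std::fstream & file, const std::string & path,
                     std::ios::openmode mode) {
        file.open(path.c_str(), mode);
        if (!file.is_open()) {
            throw std::runtime_error("Failed to open " + path);
        }
    }

    // Example uses, mirroring the two modes in the constructor:
    //   openOrThrow(hashedIndexFile, hashedIndexFilepath,
    //               std::ios::out | std::ios::app | std::ios::binary);
    //   openOrThrow(hashedIndexFile, hashedIndexFilepath,
    //               std::ios::out | std::ios::binary);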
@@ -33,8 +50,42 @@ void ConcordiaIndex::serializeWordMap() {
 }
 
 void ConcordiaIndex::generateSuffixArray() {
+    /* Get the file size. */
+    long n = _hashedIndexFile.tellg();
+
+    /* Allocate 5blocksize bytes of memory. */
+    sauchar_t *T;
+    saidx_t *SA;
+
+    T = reinterpret_cast<sauchar_t *> (malloc((size_t)n * sizeof(sauchar_t)));
+    SA = reinterpret_cast<saidx_t *> (malloc((size_t)n * sizeof(saidx_t)));
+    if ((T == NULL) || (SA == NULL)) {
+        throw ConcordiaException("Cannot allocate memory.");
+    }
+
+    /* Read n bytes of data. */
+
+    _hashedIndexFile.seekg(0, ios::beg);
+    _hashedIndexFile.read(reinterpret_cast<char*> (T), (size_t)n);
+
+    /* Construct the suffix array. */
+    if (divsufsort(T, SA, (saidx_t)n) != 0) {
+        throw ConcordiaException("Error creating suffix array.");
+    }
+
+    /* Write the suffix array. */
+    _suffixArrayFile << *SA;
+
+    /* Deallocate memory. */
+    free(SA);
+    free(T);
 }
 
 void ConcordiaIndex::addSentence(const string & sentence) {
+    vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
+    for (vector<sauchar_t>::iterator it = hash.begin();
+            it != hash.end(); ++it) {
+        _hashedIndexFile << *it;
+    }
 }
 
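addSentence streams each one-byte word code into the hashed index file, so the file's contents are exactly the text T over which generateSuffixArray builds the index: it sizes the buffers from tellg(), reads the bytes back, and hands them to divsufsort. Note that the committed write step, _suffixArrayFile << *SA, formats only the first element of the array as text; a sketch of dumping all n entries in binary instead (an assumption on my part, with a hypothetical helper name, not the committed code):

    #include <divsufsort.h>

    #include <fstream>

    // Illustrative sketch (not the committed code): write every suffix array
    // entry as raw binary to an already-open output stream, instead of
    // formatting a single element with operator<<.
    void writeWholeSuffixArray(std::ofstream & out, const saidx_t *SA, saidx_t n) {
        out.write(reinterpret_cast<const char *>(SA),
                  static_cast<std::streamsize>(n) * sizeof(saidx_t));
    }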
@@ -1,7 +1,12 @@
 #ifndef CONCORDIA_INDEX_HDR
 #define CONCORDIA_INDEX_HDR
 
+#include <divsufsort.h>
 #include <boost/shared_ptr.hpp>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
 #include "concordia/hash_generator.hpp"
 #include "concordia/concordia_exception.hpp"
 
@@ -31,6 +36,10 @@ public:
 
 private:
     boost::shared_ptr<HashGenerator> _hashGenerator;
+
+    fstream _hashedIndexFile;
+
+    ofstream _suffixArrayFile;
 };
 
 #endif
@@ -20,15 +20,15 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
 HashGenerator::~HashGenerator() {
 }
 
-vector<int> HashGenerator::generateHash(const string & sentence) {
-    vector<int> result;
+vector<sauchar_t> HashGenerator::generateHash(const string & sentence) {
+    vector<sauchar_t> result;
     vector<string> tokenTexts;
     boost::split(tokenTexts, sentence, boost::is_any_of(" "));
 
     for (vector<string>::iterator it = tokenTexts.begin();
             it != tokenTexts.end(); ++it) {
         string token = *it;
-        int code = _wordMap->getWordCode(token);
+        sauchar_t code = _wordMap->getWordCode(token);
         result.push_back(code);
     }
 
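With this change a sentence hash is a vector of sauchar_t, i.e. one byte per token (sauchar_t is typically an alias for unsigned char in libdivsufsort), so the hash can be appended to the hashed index file and fed to divsufsort without any conversion. A self-contained sketch of the same idea with hypothetical names (a plain std::map stands in for WordMap; these are not the Concordia classes themselves):

    #include <boost/algorithm/string.hpp>

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Toy version of the hashing scheme: split on single spaces and map each
    // distinct token to the next free one-byte code.
    std::vector<unsigned char> toyHash(const std::string & sentence,
                                       std::map<std::string, unsigned char> & wordMap) {
        std::vector<std::string> tokens;
        boost::split(tokens, sentence, boost::is_any_of(" "));

        std::vector<unsigned char> result;
        for (std::vector<std::string>::const_iterator it = tokens.begin();
                it != tokens.end(); ++it) {
            std::map<std::string, unsigned char>::const_iterator found =
                wordMap.find(*it);
            if (found == wordMap.end()) {
                unsigned char newCode =
                    static_cast<unsigned char>(wordMap.size());
                wordMap[*it] = newCode;
                result.push_back(newCode);
            } else {
                result.push_back(found->second);
            }
        }
        return result;
    }

    int main() {
        std::map<std::string, unsigned char> wordMap;
        std::vector<unsigned char> hash = toyHash("Ala ma kota", wordMap);
        for (size_t i = 0; i < hash.size(); ++i) {
            std::cout << static_cast<int>(hash[i]) << " ";  // prints: 0 1 2
        }
        std::cout << std::endl;
        return 0;
    }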
@@ -8,6 +8,9 @@
 #include "concordia/word_map.hpp"
 #include "concordia/concordia_exception.hpp"
 
+#include <divsufsort.h>
+
+
 /*!
     Class for generating a sentence hash.
 
@@ -24,7 +27,7 @@ public:
     */
     virtual ~HashGenerator();
 
-    vector<int> generateHash(const string & sentence);
+    vector<sauchar_t> generateHash(const string & sentence);
 
     void serializeWordMap();
 
@@ -1,8 +1,12 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 
 #include "concordia/concordia_index.hpp"
+#include "concordia/concordia_exception.hpp"
 #include "tests/common/test_resources_manager.hpp"
 
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/filesystem.hpp>
+
 using namespace std;
 
 BOOST_AUTO_TEST_SUITE(concordia_index)
@@ -15,4 +19,59 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
 
 }
 
+
+BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
+{
+    bool exceptionThrown = false;
+    string message = "";
+
+    try {
+        ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"),
+                             TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"),
+                             TestResourcesManager::getTestSuffixArrayFilePath());
+    } catch (ConcordiaException & e) {
+        exceptionThrown = true;
+        message = e.what();
+    }
+
+    BOOST_CHECK(exceptionThrown);
+    BOOST_CHECK_EQUAL(boost::starts_with(message, "E01"), true);
+}
+
+BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
+{
+    bool exceptionThrown = false;
+    string message = "";
+
+    try {
+        ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"),
+                             TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"),
+                             TestResourcesManager::getTestSuffixArrayFilePath());
+    } catch (ConcordiaException & e) {
+        exceptionThrown = true;
+        message = e.what();
+    }
+
+    BOOST_CHECK(exceptionThrown);
+    BOOST_CHECK_EQUAL(boost::starts_with(message, "E02"), true);
+}
+
+BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
+{
+    ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"),
+                         TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"),
+                         TestResourcesManager::getTestSuffixArrayFilePath());
+    index.addSentence("Ala ma kota");
+    index.generateSuffixArray();
+    index.serializeWordMap();
+
+    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")));
+    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")));
+    BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath()));
+
+    boost::filesystem::remove(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"));
+    boost::filesystem::remove(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"));
+    boost::filesystem::remove(TestResourcesManager::getTestSuffixArrayFilePath());
+}
+
 BOOST_AUTO_TEST_SUITE_END()
@@ -18,8 +18,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
 
     HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
 
-    vector<int> hash = hashGenerator.generateHash("Ala ma kota");
-    vector<int> expected;
+    vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota");
+    vector<sauchar_t> expected;
     expected.push_back(0);
     expected.push_back(1);
     expected.push_back(2);
@@ -34,8 +34,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
     }
 
     HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
-    vector<int> hash1 = hashGenerator1.generateHash("Ala ma kota");
-    vector<int> expected1;
+    vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota");
+    vector<sauchar_t> expected1;
     expected1.push_back(0);
     expected1.push_back(1);
     expected1.push_back(2);
@@ -44,8 +44,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
     hashGenerator1.serializeWordMap();
 
     HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
-    vector<int> hash2 = hashGenerator2.generateHash("Ala ma psa");
-    vector<int> expected2;
+    vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa");
+    vector<sauchar_t> expected2;
     expected2.push_back(0);
     expected2.push_back(1);
     expected2.push_back(3);
@@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
 WordMap::~WordMap() {
 }
 
-int WordMap::getWordCode(const string & word) {
+sauchar_t WordMap::getWordCode(const string & word) {
     if (_map.find(word) == _map.end()) {
-        int newCode = _nextFree;
+        sauchar_t newCode = _nextFree;
         _map[word] = newCode;
         _nextFree++;
         return newCode;
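getWordCode and _nextFree now work with sauchar_t, so the word map can hand out at most 256 distinct codes on typical platforms; once _nextFree passes 255 it wraps back to 0 and new words would start sharing codes with the earliest ones. A stand-alone illustration of that wrap-around (not Concordia code):

    #include <iostream>

    // An unsigned 8-bit counter wraps back to 0 after 256 increments, so the
    // 257th distinct word would receive the same code as the first one.
    int main() {
        unsigned char nextFree = 0;
        for (int word = 0; word <= 257; ++word) {
            if (word == 0 || word == 255 || word == 256 || word == 257) {
                std::cout << "word " << word << " -> code "
                          << static_cast<int>(nextFree) << std::endl;
            }
            ++nextFree;
        }
        return 0;
    }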
@@ -8,6 +8,9 @@
 #include <boost/archive/text_iarchive.hpp>
 #include <boost/serialization/map.hpp>
 
+#include <divsufsort.h>
+
+
 
 /*!
     Class representing dictionary for word to int encoding.
@@ -24,7 +27,7 @@ public:
     */
     virtual ~WordMap();
 
-    int getWordCode(const string & word);
+    sauchar_t getWordCode(const string & word);
 
 private:
     friend class boost::serialization::access;
@@ -36,9 +39,9 @@ private:
         ar & _nextFree;
     }
 
-    map<string, int> _map;
+    map<string, sauchar_t> _map;
 
-    int _nextFree;
+    sauchar_t _nextFree;
 };
 
 #endif
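The serialize() member above archives _map and _nextFree through Boost.Serialization, with <boost/serialization/map.hpp> supplying the std::map support, so the on-disk word map now stores string-to-sauchar_t pairs. A self-contained sketch of round-tripping such a map with Boost text archives (the file name toy_word_map.txt is made up for this example):

    #include <boost/archive/text_iarchive.hpp>
    #include <boost/archive/text_oarchive.hpp>
    #include <boost/serialization/map.hpp>
    #include <boost/serialization/string.hpp>

    #include <fstream>
    #include <iostream>
    #include <map>
    #include <string>

    // Round-trip a string -> one-byte-code map through Boost text archives.
    int main() {
        std::map<std::string, unsigned char> original;
        original["Ala"] = 0;
        original["ma"] = 1;
        original["kota"] = 2;

        {
            std::ofstream out("toy_word_map.txt");
            boost::archive::text_oarchive archive(out);
            archive << original;
        }

        std::map<std::string, unsigned char> restored;
        {
            std::ifstream in("toy_word_map.txt");
            boost::archive::text_iarchive archive(in);
            archive >> restored;
        }

        std::cout << "restored " << restored.size() << " entries; code of \"kota\" = "
                  << static_cast<int>(restored["kota"]) << std::endl;
        return 0;
    }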
Binary file not shown.