// concordia-library/concordia/hash_generator.cpp

#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"

#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp>

#include <fstream>
/*
 * Constructs the hash generator from the given configuration.
 * Creates a fresh WordMap and a SentenceTokenizer; if a previously
 * serialized word map exists at the configured path, it is restored
 * so that word codes stay stable across program runs.
 *
 * @param config  shared Concordia configuration (provides the word map path)
 * @throws ConcordiaException (declared; archive errors propagate as
 *         boost::archive exceptions — NOTE(review): confirm callers expect this)
 */
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
                                new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
        // Deserialize directly into the member map. (A previous revision
        // also allocated an unused temporary WordMap here; removed.)
        ia >> *_wordMap;
    }
}
// Default teardown: shared_ptr members release their resources automatically.
HashGenerator::~HashGenerator() {
}
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> result;
std::vector<std::string> tokenTexts = generateTokenVector(sentence);
if (tokenTexts.size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}
for (std::vector<std::string>::iterator it = tokenTexts.begin();
it != tokenTexts.end(); ++it) {
std::string token = *it;
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
result.push_back(code);
2013-11-14 15:44:50 +01:00
}
2013-11-12 16:58:31 +01:00
return result;
}
std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) {
2015-06-25 10:12:51 +02:00
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
std::string tokenizedSentence = ts->getSentence();
boost::trim(tokenizedSentence);
std::vector<std::string> tokenTexts;
2015-06-25 10:12:51 +02:00
boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
boost::algorithm::token_compress_on);
return tokenTexts;
}
void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
2013-11-12 16:58:31 +01:00
boost::archive::binary_oarchive oa(ofs);
2013-11-14 15:44:50 +01:00
oa << *_wordMap;
2013-11-12 16:58:31 +01:00
}
// Discards all learned word codes: swaps in an empty in-memory map and
// deletes the serialized word map file from disk.
void HashGenerator::clearWordMap() {
    _wordMap.reset(new WordMap);
    boost::filesystem::remove(_wordMapFilePath);
}