2013-11-12 16:58:31 +01:00
|
|
|
#include "concordia/hash_generator.hpp"
|
2015-04-09 22:17:19 +02:00
|
|
|
#include "concordia/common/utils.hpp"
|
2015-06-26 15:38:24 +02:00
|
|
|
#include "concordia/token_annotation.hpp"
|
2014-08-15 13:22:04 +02:00
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
#include <boost/archive/binary_oarchive.hpp>
|
|
|
|
#include <boost/archive/binary_iarchive.hpp>
|
2013-11-12 22:08:37 +01:00
|
|
|
#include <boost/algorithm/string.hpp>
|
2015-06-26 15:38:24 +02:00
|
|
|
#include <boost/foreach.hpp>
|
2014-08-15 13:22:04 +02:00
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
#include <fstream>
|
|
|
|
|
2015-10-16 22:14:11 +02:00
|
|
|
/**
 * Constructs a hash generator bound to an index directory.
 *
 * The word map file path is built as indexPath + "/" + WORD_MAP_FILE_NAME.
 * A fresh, empty WordMap and a SentenceTokenizer (configured with `config`)
 * are created. If a word map file already exists at that path, its contents
 * are deserialized into the word map.
 *
 * \param indexPath directory in which the word map file is looked up
 * \param config configuration handed over to the SentenceTokenizer
 * \throws ConcordiaException if the existing word map file cannot be opened
 *         or its contents cannot be deserialized
 */
HashGenerator::HashGenerator(std::string indexPath,
                             boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(indexPath+"/"+WORD_MAP_FILE_NAME),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
                                       new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        if (!ifs.is_open()) {
            throw ConcordiaException(
                std::string("Could not open word map file: ")
                + _wordMapFilePath);
        }
        // Archive errors are not covered by this constructor's exception
        // specification, so they are translated into ConcordiaException
        // instead of being allowed to escape.
        try {
            boost::archive::binary_iarchive ia(ifs);
            ia >> *_wordMap;
        } catch (const boost::archive::archive_exception & e) {
            throw ConcordiaException(
                std::string("Could not restore word map from file ")
                + _wordMapFilePath + ": " + e.what());
        }
    }
}
|
|
|
|
|
|
|
|
/**
 * Destructor. Performs no explicit work of its own.
 */
HashGenerator::~HashGenerator() {}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
/**
 * Tokenizes a sentence and generates its hash using the word map.
 *
 * \param sentence text to be tokenized and hashed
 * \param byWhitespace forwarded unchanged to SentenceTokenizer::tokenize
 * \returns the tokenized sentence after its hash has been generated
 * \throws ConcordiaException if the number of tokens exceeds
 *         Utils::maxSentenceSize
 */
TokenizedSentence HashGenerator::generateHash(
    const std::string & sentence,
    bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenizedSentence =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);

    // NOTE(review): the length check below runs only after the word map
    // has been used for hashing; keep this order unless it is confirmed
    // that getTokens() does not depend on generateHash().
    tokenizedSentence.generateHash(_wordMap);

    if (tokenizedSentence.getTokens().size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenizedSentence;
}
|
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
void HashGenerator::serializeWordMap() {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
2013-11-12 16:58:31 +01:00
|
|
|
boost::archive::binary_oarchive oa(ofs);
|
2013-11-14 15:44:50 +01:00
|
|
|
oa << *_wordMap;
|
2013-11-12 16:58:31 +01:00
|
|
|
}
|
|
|
|
|
2015-05-04 20:40:44 +02:00
|
|
|
/**
 * Replaces the in-memory word map with a new, empty one and removes
 * the word map file from disk.
 */
void HashGenerator::clearWordMap() {
    // Drop the current map and start over with a fresh instance.
    _wordMap.reset(new WordMap);
    boost::filesystem::remove(_wordMapFilePath);
}
|
2013-11-12 16:58:31 +01:00
|
|
|
|