concordia-library/concordia/hash_generator.cpp

67 lines
2.1 KiB
C++

#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/token_annotation.hpp"
#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/foreach.hpp>
#include <fstream>
/**
 * Builds a hash generator bound to the given index directory.
 *
 * The word map file path is composed as indexPath + "/" + WORD_MAP_FILE_NAME.
 * The generator starts with a freshly allocated WordMap and a
 * SentenceTokenizer constructed from the supplied configuration. If a file
 * already exists at the word map path, its contents are deserialized from a
 * Boost binary archive into the word map.
 *
 * \param indexPath directory in which the word map file is looked up
 * \param config configuration handed to the SentenceTokenizer constructor
 */
HashGenerator::HashGenerator(std::string indexPath,
                             boost::shared_ptr<ConcordiaConfig> config):
    _wordMapFilePath(indexPath+"/"+WORD_MAP_FILE_NAME),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
                                    new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        // Restore the previously serialized map directly into the member;
        // no intermediate WordMap object is needed.
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
        ia >> *_wordMap;
    }
}
/**
 * Destructor. Performs no explicit work of its own.
 */
HashGenerator::~HashGenerator() {}
/**
 * Tokenizes a sentence and generates its hash using the word map.
 *
 * The sentence is tokenized by the sentence tokenizer, then generateHash()
 * is invoked on the result with this generator's word map. The token count
 * is checked only after that call.
 *
 * \param sentence text to process
 * \param byWhitespace forwarded unchanged to SentenceTokenizer::tokenize
 * \returns the tokenized sentence after hash generation
 * \throws ConcordiaException when the resulting number of tokens exceeds
 *         Utils::maxSentenceSize
 */
TokenizedSentence HashGenerator::generateHash(
    const std::string & sentence,
    bool byWhitespace) {
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateHash(_wordMap);

    const bool tooLong =
        tokenized.getTokens().size() > Utils::maxSentenceSize;
    if (tooLong) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenized;
}
/**
 * Tokenizes a sentence and generates its tokens, without using the word map.
 *
 * The sentence is tokenized by the sentence tokenizer, then generateTokens()
 * is invoked on the result. The token count is checked only after that call.
 *
 * \param sentence text to process
 * \param byWhitespace forwarded unchanged to SentenceTokenizer::tokenize
 * \returns the tokenized sentence after token generation
 * \throws ConcordiaException when the resulting number of tokens exceeds
 *         Utils::maxSentenceSize
 */
TokenizedSentence HashGenerator::generateTokens(
    const std::string & sentence,
    bool byWhitespace) {
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateTokens();

    const bool tooLong =
        tokenized.getTokens().size() > Utils::maxSentenceSize;
    if (tooLong) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenized;
}
void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs);
oa << *_wordMap;
}
/**
 * Replaces the in-memory word map with a newly allocated, empty WordMap and
 * then removes the word map file from the filesystem.
 */
void HashGenerator::clearWordMap() {
    // Swap in a fresh map first, then delete the on-disk copy.
    _wordMap.reset(new WordMap);
    boost::filesystem::remove(_wordMapFilePath);
}