#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/token_annotation.hpp"

// NOTE(review): the original include targets were lost in extraction; the set
// below is reconstructed from the names actually used in this file
// (boost::shared_ptr, boost::filesystem, binary archives, BOOST_FOREACH,
// std::ifstream/std::ofstream) — confirm against the project's build.
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <fstream>

/*!
  Constructor. Builds the sentence tokenizer from the given configuration and,
  if a previously serialized word map exists at the configured path, restores
  it from that file; otherwise starts with an empty word map.

  \param config Concordia configuration (provides the word map file path and
                tokenizer settings).
  \throws ConcordiaException (declared for interface compatibility; the
          dynamic exception specification is kept because the header and
          callers rely on it).
*/
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
    throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
                               new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        // Guard the stream before handing it to the archive: constructing a
        // binary_iarchive on a bad stream throws an opaque boost archive
        // exception instead of a meaningful Concordia error.
        if (!ifs.good()) {
            throw ConcordiaException("Unable to open word map file: "
                                     + _wordMapFilePath);
        }
        boost::archive::binary_iarchive ia(ifs);
        // Deserialize straight into the member word map. (The original code
        // also allocated an unused local "restoredWordMap" here — removed.)
        ia >> *_wordMap;
    }
}

/*!
  Destructor. All members are managed by shared_ptr / value types (RAII),
  so nothing to release explicitly.
*/
HashGenerator::~HashGenerator() {
}

/*!
  Tokenizes the sentence and produces its hash: a vector of integer codes,
  one per recognized token, assigned via the word map (new words are added
  to the map as a side effect of TokenizedSentence::generateHash).

  \param sentence input sentence to be hashed.
  \return vector of word codes for the sentence.
  \throws ConcordiaException when the tokenized sentence exceeds
          Utils::maxSentenceSize.

  NOTE(review): the length check must stay *after* generateHash, because the
  token list is populated during hash generation; a rejected over-long
  sentence may therefore still have added its words to the word map —
  verify whether that side effect is acceptable upstream.
*/
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
                const std::string & sentence) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> result;
    boost::shared_ptr<TokenizedSentence> ts =
        _sentenceTokenizer->tokenize(sentence);
    ts->generateHash(_wordMap);
    if (ts->getTokens().size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }
    return ts->getCodes();
}

/*!
  Tokenizes the sentence and returns the surface forms of its content
  tokens. Only annotations of type WORD or NE (named entity) are kept;
  other annotation types are skipped.

  \param sentence input sentence.
  \return vector of token strings in sentence order.
*/
std::vector<std::string> HashGenerator::generateTokenVector(
                                        const std::string & sentence) {
    boost::shared_ptr<TokenizedSentence> ts =
        _sentenceTokenizer->tokenize(sentence);
    std::vector<std::string> tokenTexts;
    // Iterate by const reference — the original copied each TokenAnnotation
    // per iteration.
    BOOST_FOREACH(const TokenAnnotation & annotation, ts->getAnnotations()) {
        if (annotation.getType() == TokenAnnotation::WORD ||
            annotation.getType() == TokenAnnotation::NE) {
            tokenTexts.push_back(annotation.getValue());
        }
    }
    return tokenTexts;
}

/*!
  Persists the current word map to the configured file path using boost
  binary serialization, overwriting any previous contents.
*/
void HashGenerator::serializeWordMap() {
    std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
    // Guard the stream before constructing the archive (see constructor).
    if (!ofs.good()) {
        throw ConcordiaException("Unable to write word map file: "
                                 + _wordMapFilePath);
    }
    boost::archive::binary_oarchive oa(ofs);
    oa << *_wordMap;
}

/*!
  Resets the in-memory word map to an empty one and deletes its serialized
  file from disk (boost::filesystem::remove is a no-op if the file does
  not exist).
*/
void HashGenerator::clearWordMap() {
    _wordMap = boost::shared_ptr<WordMap>(new WordMap);
    boost::filesystem::remove(_wordMapFilePath);
}