#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"

// NOTE(review): the targets of the system includes and all template argument
// lists were missing from the reviewed text. They are reconstructed here from
// what the code below actually uses -- confirm against hash_generator.hpp.
#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp>
#include <exception>
#include <fstream>

// Builds a hash generator from the given configuration.
//
// Creates an empty word map and a sentence anonymizer, then, if a file exists
// at the configured word map path, restores the word map from that file
// (Boost binary archive format).
//
// Throws ConcordiaException when the existing word map file cannot be opened
// or deserialized. Any other exception type escaping here would violate the
// dynamic exception specification, so archive failures are translated.
//
// NOTE(review): the parameter's template argument is assumed to be
// ConcordiaConfig -- verify against the declaration in the header.
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
                                    new SentenceAnonymizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        if (!ifs) {
            throw ConcordiaException("Could not open the word map file.");
        }
        try {
            boost::archive::binary_iarchive ia(ifs);
            // Deserialize straight into the member map; no temporary needed.
            ia >> *_wordMap;
        } catch (const std::exception &) {
            throw ConcordiaException("Could not read the word map file.");
        }
    }
}

HashGenerator::~HashGenerator() {
}

// Converts a sentence into a sequence of word codes.
//
// The sentence is tokenized with generateTokenVector() and every token is
// translated to its code via the word map's getWordCode().
//
// Returns a newly allocated vector holding one code per token, in token order.
// Throws ConcordiaException when the number of tokens exceeds
// Utils::maxSentenceSize.
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
                           const string & sentence) throw(ConcordiaException) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
                           result(new vector<INDEX_CHARACTER_TYPE>());
    boost::shared_ptr<vector<string> > tokenTexts =
                                        generateTokenVector(sentence);
    if (tokenTexts->size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    // The final size is known up front, so allocate once.
    result->reserve(tokenTexts->size());
    for (vector<string>::const_iterator it = tokenTexts->begin();
                                it != tokenTexts->end(); ++it) {
        // Bind by reference: the token is only read, never modified.
        const string & token = *it;
        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
        result->push_back(code);
    }

    return result;
}

// Splits a sentence into tokens.
//
// The sentence is first passed through the sentence anonymizer, trimmed of
// surrounding whitespace, and then split on spaces, tabs, carriage returns and
// newlines; runs of adjacent separators are compressed into one.
//
// Returns a newly allocated vector with the token texts in order.
// NOTE(review): for an input that is empty after trimming, boost::split is
// expected to yield a single empty token rather than no tokens -- confirm
// whether callers rely on that.
boost::shared_ptr<vector<string> >
        HashGenerator::generateTokenVector(const string & sentence) {
    string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
    boost::trim(anonymizedSentence);
    boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
    boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
                 boost::algorithm::token_compress_on);
    return tokenTexts;
}

// Writes the current word map to the configured word map file as a Boost
// binary archive, replacing the file's previous contents.
//
// NOTE(review): a failure to open the output file is not checked here; error
// reporting is left to whatever the archive itself raises.
void HashGenerator::serializeWordMap() {
    ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
    boost::archive::binary_oarchive oa(ofs);
    oa << *_wordMap;
}