2013-11-12 16:58:31 +01:00
|
|
|
#include "concordia/hash_generator.hpp"

#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp>

#include <exception>
#include <fstream>
|
|
|
|
|
2014-04-13 12:21:30 +02:00
|
|
|
// Builds a hash generator from the given configuration.
//
// Stores the word map file path read from the configuration, creates an empty
// word map and a sentence anonymizer (constructed from the same config) and,
// if a file already exists at the word map path, restores the word map from
// that binary archive.
//
// Throws ConcordiaException when the existing word map file cannot be opened
// or deserialized.
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
    _sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
                                    new SentenceAnonymizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        if (!ifs) {
            // The file exists but is not readable; report it through the
            // declared exception type instead of failing inside the archive.
            throw ConcordiaException("Unable to open the word map file.");
        }
        try {
            boost::archive::binary_iarchive ia(ifs);
            ia >> *_wordMap;
        } catch (const std::exception &) {
            // Any other exception type would violate the exception
            // specification above, so archive failures are translated.
            throw ConcordiaException("Unable to read the word map file.");
        }
    }
}
|
|
|
|
|
|
|
|
// Destructor: performs no explicit cleanup (the body is intentionally empty).
HashGenerator::~HashGenerator() {}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Converts a sentence into its hash: the sequence of word codes obtained by
// tokenizing the sentence with generateTokenVector() and looking every token
// up in the word map via getWordCode().
//
// Throws ConcordiaException when the sentence has more than MAX_SENTENCE_SIZE
// tokens.
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
    const string & sentence) throw(ConcordiaException) {
    boost::shared_ptr<vector<string> > tokenTexts =
        generateTokenVector(sentence);
    if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
        result(new vector<INDEX_CHARACTER_TYPE>());
    // The output has exactly one code per token, so allocate once up front.
    result->reserve(tokenTexts->size());
    for (vector<string>::iterator it = tokenTexts->begin();
         it != tokenTexts->end(); ++it) {
        // Pass the token straight from the vector; no temporary copy needed.
        result->push_back(_wordMap->getWordCode(*it));
    }

    return result;
}
|
|
|
|
|
2014-04-29 14:46:04 +02:00
|
|
|
// Splits a sentence into tokens.
//
// The sentence is first passed through the sentence anonymizer, then split on
// spaces, tabs, carriage returns and newlines; runs of adjacent delimiters
// count as a single separator.
//
// Leading and trailing delimiters are stripped before splitting, because
// boost::split would otherwise emit an empty token at either end. A sentence
// that is empty after stripping yields an empty vector rather than a vector
// holding one empty token.
boost::shared_ptr<vector<string> >
    HashGenerator::generateTokenVector(const string & sentence) {
    const char * const delimiters = " \t\r\n";

    string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
    boost::trim_if(anonymizedSentence, boost::is_any_of(delimiters));

    boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
    if (!anonymizedSentence.empty()) {
        boost::split(*tokenTexts, anonymizedSentence,
                     boost::is_any_of(delimiters),
                     boost::algorithm::token_compress_on);
    }

    return tokenTexts;
}
|
|
|
|
|
|
|
|
|
2013-11-12 16:58:31 +01:00
|
|
|
void HashGenerator::serializeWordMap() {
|
2013-11-14 20:36:34 +01:00
|
|
|
ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
2013-11-12 16:58:31 +01:00
|
|
|
boost::archive::binary_oarchive oa(ofs);
|
2013-11-14 15:44:50 +01:00
|
|
|
oa << *_wordMap;
|
2013-11-12 16:58:31 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|