working hash generator

This commit is contained in:
rjawor 2013-11-14 15:44:50 +01:00
parent 3aa4091e4d
commit b238995a16
6 changed files with 98 additions and 28 deletions

View File

@ -3,17 +3,17 @@
#include <boost/archive/binary_oarchive.hpp> #include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp> #include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/serialization/map.hpp>
#include <fstream> #include <fstream>
HashGenerator::HashGenerator(const string & wordMapFilename)
HashGenerator::HashGenerator(const string & wordMapFilename) throw(ConcordiaException) { throw(ConcordiaException) :
_wordMapFilename = wordMapFilename; _wordMapFilename(wordMapFilename),
_wordMap(boost::shared_ptr<WordMap>(new WordMap)) {
if (boost::filesystem::exists(_wordMapFilename)) { if (boost::filesystem::exists(_wordMapFilename)) {
ifstream ifs(_wordMapFilename.c_str(), std::ios::binary); ifstream ifs(_wordMapFilename.c_str(), std::ios::binary);
boost::archive::binary_iarchive ia(ifs); boost::archive::binary_iarchive ia(ifs);
ia >> _wordMap; boost::shared_ptr<WordMap> restoredWordMap(new WordMap);
ia >> *_wordMap;
} }
} }
@ -25,8 +25,11 @@ vector<int> HashGenerator::generateHash(const string & sentence) {
vector<string> tokenTexts; vector<string> tokenTexts;
boost::split(tokenTexts, sentence, boost::is_any_of(" ")); boost::split(tokenTexts, sentence, boost::is_any_of(" "));
for(vector<string>::iterator it = tokenTexts.begin(); it != tokenTexts.end(); ++it) { for (vector<string>::iterator it = tokenTexts.begin();
it != tokenTexts.end(); ++it) {
string token = *it; string token = *it;
int code = _wordMap->getWordCode(token);
result.push_back(code);
} }
return result; return result;
@ -35,7 +38,7 @@ vector<int> HashGenerator::generateHash(const string & sentence) {
void HashGenerator::serializeWordMap() { void HashGenerator::serializeWordMap() {
ofstream ofs(_wordMapFilename.c_str(), std::ios::binary); ofstream ofs(_wordMapFilename.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs); boost::archive::binary_oarchive oa(ofs);
oa << _wordMap; oa << *_wordMap;
} }

View File

@ -4,6 +4,8 @@
#include <string> #include <string>
#include <map> #include <map>
#include <vector> #include <vector>
#include <boost/shared_ptr.hpp>
#include "concordia/word_map.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
/*! /*!
@ -15,7 +17,8 @@ using namespace std;
class HashGenerator { class HashGenerator {
public: public:
explicit HashGenerator(const string & wordMapFilename) throw(ConcordiaException); explicit HashGenerator(const string & wordMapFilename)
throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
@ -26,11 +29,9 @@ public:
void serializeWordMap(); void serializeWordMap();
private: private:
boost::shared_ptr<WordMap> _wordMap;
map<string,int> _wordMap;
string _wordMapFilename; string _wordMapFilename;
}; };
#endif #endif

View File

@ -2,6 +2,7 @@ add_library(concordia-tests
test_concordia.cpp test_concordia.cpp
test_concordia_config.cpp test_concordia_config.cpp
test_word_map.cpp test_word_map.cpp
test_hash_generator.cpp
) )
target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common) target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common)

View File

@ -0,0 +1,57 @@
#include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include "concordia/hash_generator.hpp"
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
using namespace std;
BOOST_AUTO_TEST_SUITE(hash_generator)
// Checks the codes produced for a single sentence when the generator is
// created while no word map file exists at TEST_WORD_MAP_PATH: the three
// distinct words of "Ala ma kota" are expected to hash to 0, 1, 2.
BOOST_AUTO_TEST_CASE( SimpleHashTest )
{
    // Drop any word map file already present at the test path, so the
    // generator is not constructed from previously serialized data.
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
        boost::filesystem::remove(TEST_WORD_MAP_PATH);
    }

    HashGenerator generator = HashGenerator(TEST_WORD_MAP_PATH);
    vector<int> actualCodes = generator.generateHash("Ala ma kota");

    // Expected codes, in sentence order.
    const int expectedValues[] = {0, 1, 2};
    vector<int> expectedCodes(expectedValues, expectedValues + 3);

    BOOST_CHECK_EQUAL_COLLECTIONS(actualCodes.begin(), actualCodes.end(),
                                  expectedCodes.begin(), expectedCodes.end());
}
// Checks that a word map written by serializeWordMap() is picked up by a
// second generator built on the same file: words seen by the first generator
// keep their codes (0 and 1), while the new word "psa" is expected to get 3.
BOOST_AUTO_TEST_CASE( HashSerializationTest )
{
    // Drop any word map file already present at the test path, so the
    // first generator is not constructed from previously serialized data.
    if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
        boost::filesystem::remove(TEST_WORD_MAP_PATH);
    }

    // First generator: hash a sentence, verify the codes, then persist the map.
    HashGenerator firstGenerator = HashGenerator(TEST_WORD_MAP_PATH);
    vector<int> firstCodes = firstGenerator.generateHash("Ala ma kota");

    const int firstExpectedValues[] = {0, 1, 2};
    vector<int> firstExpected(firstExpectedValues, firstExpectedValues + 3);

    BOOST_CHECK_EQUAL_COLLECTIONS(firstCodes.begin(), firstCodes.end(),
                                  firstExpected.begin(), firstExpected.end());

    firstGenerator.serializeWordMap();

    // Second generator: constructed after the map file has been written.
    HashGenerator secondGenerator = HashGenerator(TEST_WORD_MAP_PATH);
    vector<int> secondCodes = secondGenerator.generateHash("Ala ma psa");

    const int secondExpectedValues[] = {0, 1, 3};
    vector<int> secondExpected(secondExpectedValues, secondExpectedValues + 3);

    BOOST_CHECK_EQUAL_COLLECTIONS(secondCodes.begin(), secondCodes.end(),
                                  secondExpected.begin(), secondExpected.end());

    // Clean up the file created by serializeWordMap().
    boost::filesystem::remove(TEST_WORD_MAP_PATH);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -9,8 +9,7 @@ WordMap::~WordMap() {
} }
int WordMap::getWordCode(const string & word) { int WordMap::getWordCode(const string & word) {
if (_map.find(word) == _map.end()) {
if (_map.find(word) == _map.end() ) {
int newCode = _nextFree; int newCode = _nextFree;
_map[word] = newCode; _map[word] = newCode;
_nextFree++; _nextFree++;
@ -18,7 +17,6 @@ int WordMap::getWordCode(const string & word) {
} else { } else {
return _map[word]; return _map[word];
} }
} }

View File

@ -4,10 +4,13 @@
#include <string> #include <string>
#include <map> #include <map>
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/map.hpp>
/*! /*!
Class for generating a sentence hash. Class representing dictionary for word to int encoding.
*/ */
@ -24,11 +27,18 @@ public:
int getWordCode(const string & word); int getWordCode(const string & word);
private: private:
friend class boost::serialization::access;
map<string,int> _map; template<class Archive>
void serialize(Archive & ar, const unsigned int version) {
ar & _map;
ar & _nextFree;
}
map<string, int> _map;
int _nextFree; int _nextFree;
}; };
#endif #endif