trimming anonymized sentence

Former-commit-id: 316b76717e4075e466828c628e064076d39481c5
This commit is contained in:
rjawor 2014-08-15 13:22:04 +02:00
parent 2a3c7eddfe
commit f83aaef4ed
5 changed files with 31 additions and 8 deletions

2
.gitignore vendored
View File

@ -6,4 +6,6 @@ concordia/common/config.hpp
tests/resources/concordia-config/concordia.cfg
tests/resources/temp
prod/resources/temp
prod/resources/text-files/jrc_smaller.txt

View File

@ -1,7 +1,5 @@
#!/bin/sh
# Runs the concordia-console binary from ./build against the configuration
# file in prod/resources; -c passes the config path and -s the search pattern.
# Announce and run a search for the fixed pattern "Parlamentu Europejskiego".
echo "CONCORDIA SEARCHER: searching for pattern: \"Parlamentu Europejskiego\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
# Announce and run a search for the fixed pattern "Dostęp do zatrudnienia".
echo "CONCORDIA SEARCHER: searching for pattern: \"Dostęp do zatrudnienia\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n
# Search for the pattern given as the first script argument; it is quoted so
# that a multi-word pattern is passed to -s as a single argument.
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "$1"

View File

@ -1,8 +1,10 @@
#include "concordia/hash_generator.hpp"
#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp>
#include <fstream>
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
@ -44,10 +46,10 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
/**
 * Splits a sentence into tokens.
 *
 * The sentence is first passed through the sentence anonymizer, then
 * stripped of leading and trailing whitespace, and finally split on
 * spaces, tabs, carriage returns and newlines. Adjacent separators are
 * compressed, so runs of whitespace inside the sentence do not produce
 * empty tokens.
 *
 * \param sentence the raw sentence text
 * \return shared pointer to the vector of token texts; the vector is empty
 *         when nothing but whitespace remains after anonymization
 */
boost::shared_ptr<vector<string> >
HashGenerator::generateTokenVector(const string & sentence) {
    string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
    // Leading/trailing separators would otherwise yield empty tokens at the
    // ends of the result, even with token compression enabled.
    boost::trim(anonymizedSentence);

    boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
    // boost::split reports a single empty token for empty input; return an
    // empty vector instead so that callers never see a spurious "" token.
    if (anonymizedSentence.empty()) {
        return tokenTexts;
    }

    boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
                 boost::algorithm::token_compress_on);
    return tokenTexts;
}

View File

@ -123,9 +123,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
}
// Adds two examples through one Concordia instance, then runs a simple
// search for a phrase taken from example 312 through a second instance
// created from the same configuration file.
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
{
    Concordia indexingConcordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    boost::ptr_vector<Example> examplesToIndex;
    examplesToIndex.push_back(new Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
    examplesToIndex.push_back(new Example("czy xjest żółte otwarte",202));
    indexingConcordia.addAllExamples(examplesToIndex);

    // A fresh instance is used for searching; presumably it picks up the
    // index data produced by the first instance - TODO confirm.
    Concordia searchingConcordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
    boost::ptr_vector<SubstringOccurence> foundOccurences = searchingConcordia.simpleSearch("on w szczególności prawo do podjęcia");

    // Remove the temporary index files before the assertions are evaluated.
    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));

    // Exactly one occurrence is expected: example 312 at offset 1.
    BOOST_CHECK_EQUAL(foundOccurences.size(), 1);
    BOOST_CHECK_EQUAL(foundOccurences.at(0).getId(), 312);
    BOOST_CHECK_EQUAL(foundOccurences.at(0).getOffset(), 1);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -43,8 +43,8 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
HashGenerator hashGenerator = HashGenerator(config);
stringstream ss;
for (int i=0;i<256;i++) {
ss << "a" << i << " ";
for (int i=0;i<257;i++) {
ss << "xx" << i << " ";
}
string longSentence = ss.str();