trimming anonymized sentence

Former-commit-id: 316b76717e4075e466828c628e064076d39481c5
This commit is contained in:
rjawor 2014-08-15 13:22:04 +02:00
parent 2a3c7eddfe
commit f83aaef4ed
5 changed files with 31 additions and 8 deletions

2
.gitignore vendored
View File

@ -6,4 +6,6 @@ concordia/common/config.hpp
tests/resources/concordia-config/concordia.cfg tests/resources/concordia-config/concordia.cfg
tests/resources/temp tests/resources/temp
prod/resources/temp prod/resources/temp
prod/resources/text-files/jrc_smaller.txt

View File

@ -1,7 +1,5 @@
#!/bin/sh #!/bin/sh
echo "CONCORDIA SEARCHER: searching for pattern: \"Parlamentu Europejskiego\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "$1"
echo "CONCORDIA SEARCHER: searching for pattern: \"Dostęp do zatrudnienia\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n

View File

@ -1,8 +1,10 @@
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp> #include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp> #include <boost/archive/binary_iarchive.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <fstream> #include <fstream>
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config) HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
@ -44,10 +46,10 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
boost::shared_ptr<vector<string> > boost::shared_ptr<vector<string> >
HashGenerator::generateTokenVector(const string & sentence) { HashGenerator::generateTokenVector(const string & sentence) {
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence); string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
boost::trim(anonymizedSentence);
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>()); boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"), boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
boost::algorithm::token_compress_on); boost::algorithm::token_compress_on);
return tokenTexts; return tokenTexts;
} }

View File

@ -123,9 +123,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<Example> testExamples;
testExamples.push_back(new Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
testExamples.push_back(new Example("czy xjest żółte otwarte",202));
concordia.addAllExamples(testExamples);
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -43,8 +43,8 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
HashGenerator hashGenerator = HashGenerator(config); HashGenerator hashGenerator = HashGenerator(config);
stringstream ss; stringstream ss;
for (int i=0;i<256;i++) { for (int i=0;i<257;i++) {
ss << "a" << i << " "; ss << "xx" << i << " ";
} }
string longSentence = ss.str(); string longSentence = ss.str();