trimming anonymized sentence
Former-commit-id: 316b76717e4075e466828c628e064076d39481c5
This commit is contained in:
parent
2a3c7eddfe
commit
f83aaef4ed
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,4 +6,6 @@ concordia/common/config.hpp
|
|||||||
tests/resources/concordia-config/concordia.cfg
|
tests/resources/concordia-config/concordia.cfg
|
||||||
tests/resources/temp
|
tests/resources/temp
|
||||||
prod/resources/temp
|
prod/resources/temp
|
||||||
|
prod/resources/text-files/jrc_smaller.txt
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
echo "CONCORDIA SEARCHER: searching for pattern: \"Parlamentu Europejskiego\""
|
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "$1"
|
||||||
echo "CONCORDIA SEARCHER: searching for pattern: \"Dostęp do zatrudnienia\""
|
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n
|
|
||||||
|
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
|
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/archive/binary_oarchive.hpp>
|
#include <boost/archive/binary_oarchive.hpp>
|
||||||
#include <boost/archive/binary_iarchive.hpp>
|
#include <boost/archive/binary_iarchive.hpp>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
@ -44,10 +46,10 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
|||||||
boost::shared_ptr<vector<string> >
|
boost::shared_ptr<vector<string> >
|
||||||
HashGenerator::generateTokenVector(const string & sentence) {
|
HashGenerator::generateTokenVector(const string & sentence) {
|
||||||
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
||||||
|
boost::trim(anonymizedSentence);
|
||||||
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
||||||
boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
||||||
boost::algorithm::token_compress_on);
|
boost::algorithm::token_compress_on);
|
||||||
|
|
||||||
return tokenTexts;
|
return tokenTexts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,9 +123,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||||
|
{
|
||||||
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
|
boost::ptr_vector<Example> testExamples;
|
||||||
|
testExamples.push_back(new Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
||||||
|
testExamples.push_back(new Example("czy xjest żółte otwarte",202));
|
||||||
|
concordia.addAllExamples(testExamples);
|
||||||
|
|
||||||
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
|
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -43,8 +43,8 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
|||||||
HashGenerator hashGenerator = HashGenerator(config);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
stringstream ss;
|
stringstream ss;
|
||||||
for (int i=0;i<256;i++) {
|
for (int i=0;i<257;i++) {
|
||||||
ss << "a" << i << " ";
|
ss << "xx" << i << " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
string longSentence = ss.str();
|
string longSentence = ss.str();
|
||||||
|
Loading…
Reference in New Issue
Block a user