removed stop words - works slower
Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203
This commit is contained in:
parent
5c2ae86097
commit
f64449311d
@ -6,6 +6,9 @@ project(concordia C CXX)
|
||||
set (CONCORDIA_VERSION_MAJOR 0)
|
||||
set (CONCORDIA_VERSION_MINOR 1)
|
||||
|
||||
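# Whether to use stop words: "false" disables stop-word removal, any other value enables it (exposed to the configuration templates as @STOP_WORDS_ENABLED@)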
|
||||
set (STOP_WORDS_ENABLED "false")
|
||||
|
||||
# Type of the characters in SA
|
||||
|
||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||
|
@ -16,6 +16,9 @@ echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
||||
echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12"
|
||||
echo "CONCORDIA RUNNER: concordia searching for pattern: \"Prawo europejskie umożliwia handel zagraniczny\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny"
|
||||
|
||||
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
||||
|
@ -10,6 +10,7 @@
|
||||
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
||||
#define HTML_TAGS_PARAM "html_tags_path"
|
||||
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
|
||||
// Name of the configuration parameter that switches stop-word handling on or
// off. The ConcordiaConfig constructor compares the value read for this key
// against the string "false"; every other value counts as enabled.
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
|
||||
#define STOP_WORDS_PARAM "stop_words_path"
|
||||
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
||||
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
||||
@ -40,6 +41,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
||||
_spaceSymbolsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
|
||||
_stopWordsEnabled =
|
||||
ConcordiaConfig::_readConfigParameterStr(
|
||||
STOP_WORDS_ENABLED_PARAM) != "false";
|
||||
_stopWordsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM);
|
||||
_namedEntitiesFilePath =
|
||||
|
@ -55,6 +55,10 @@ public:
|
||||
return _spaceSymbolsFilePath;
|
||||
}
|
||||
|
||||
// Tells whether stop-word handling is enabled in this configuration.
// Returns a non-const reference to the _stopWordsEnabled member rather than
// a copy, so a caller that keeps the reference can modify the stored flag.
// NOTE(review): the reference return mirrors the neighbouring string getters;
// confirm whether any caller relies on it before switching to a plain bool.
bool & isStopWordsEnabled() {
|
||||
return _stopWordsEnabled;
|
||||
}
|
||||
|
||||
// Gives access to the path of the stop-words file.
// Returns a non-const reference to the _stopWordsFilePath member, which the
// constructor fills from the "stop_words_path" configuration parameter.
std::string & getStopWordsFilePath() {
|
||||
return _stopWordsFilePath;
|
||||
}
|
||||
@ -88,6 +92,8 @@ private:
|
||||
|
||||
std::string _spaceSymbolsFilePath;
|
||||
|
||||
bool _stopWordsEnabled;
|
||||
|
||||
std::string _stopWordsFilePath;
|
||||
|
||||
std::string _namedEntitiesFilePath;
|
||||
|
@ -12,8 +12,11 @@ SentenceAnonymizer::SentenceAnonymizer(
|
||||
throw(ConcordiaException) {
|
||||
_createNeRules(config->getNamedEntitiesFilePath());
|
||||
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
||||
_stopWordsEnabled = config->isStopWordsEnabled();
|
||||
if (_stopWordsEnabled) {
|
||||
_stopWords = _getMultipleReplacementRule(
|
||||
config->getStopWordsFilePath(), "", true);
|
||||
}
|
||||
_stopSymbols = _getMultipleReplacementRule(
|
||||
config->getStopSymbolsFilePath(), "");
|
||||
_spaceSymbols = _getMultipleReplacementRule(
|
||||
@ -34,7 +37,9 @@ std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
|
||||
|
||||
result = TextUtils::getInstance().toLowerCase(result);
|
||||
|
||||
if (_stopWordsEnabled) {
|
||||
result = _stopWords->apply(result);
|
||||
}
|
||||
result = _stopSymbols->apply(result);
|
||||
result = _spaceSymbols->apply(result);
|
||||
|
||||
|
@ -41,6 +41,8 @@ private:
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _htmlTags;
|
||||
|
||||
bool _stopWordsEnabled;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopWords;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
||||
|
@ -121,7 +121,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||
}
|
||||
@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||
@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||
// best overlay: [0,2], [2,3], score =
|
||||
// best overlay: [0,2], [2,3], score = 0.695
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
|
||||
@ -226,32 +226,32 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
addFragment 167,1,2,1
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
||||
|
@ -106,15 +106,19 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
|
||||
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||
std::vector<std::string> expected;
|
||||
expected.push_back("ne_date");
|
||||
expected.push_back("o");
|
||||
expected.push_back("godzinie");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("doszło");
|
||||
expected.push_back("do");
|
||||
expected.push_back("kolizji");
|
||||
expected.push_back("na");
|
||||
expected.push_back("ulicy");
|
||||
expected.push_back("grobla");
|
||||
expected.push_back("policjanci");
|
||||
expected.push_back("ustalili");
|
||||
expected.push_back("że");
|
||||
expected.push_back("kierowca");
|
||||
expected.push_back("zaparkował");
|
||||
expected.push_back("samochód");
|
||||
|
@ -34,12 +34,11 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
||||
{
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||
if (config->isStopWordsEnabled()) {
|
||||
SentenceAnonymizer anonymizer(config);
|
||||
|
||||
|
||||
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
||||
|
@ -36,6 +36,8 @@ html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
||||
|
||||
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||
|
||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||
|
||||
stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
|
||||
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||
|
@ -21,6 +21,8 @@ html_tags_path = "/tmp/html_tags.txt"
|
||||
|
||||
space_symbols_path = "/tmp/space_symbols.txt"
|
||||
|
||||
stop_words_enabled = "true"
|
||||
|
||||
stop_words_path = "/tmp/stop_words.txt"
|
||||
|
||||
named_entities_path = "/tmp/named_entities.txt"
|
||||
|
@ -36,6 +36,8 @@ html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
||||
|
||||
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||
|
||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||
|
||||
stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
|
||||
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||
|
Loading…
Reference in New Issue
Block a user