From f64449311de0027b53a57899ce8b0cff77aee596 Mon Sep 17 00:00:00 2001 From: rjawor Date: Tue, 21 Apr 2015 21:33:08 +0200 Subject: [PATCH] removed stop words - works slower Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203 --- CMakeLists.txt | 3 +++ concordia-runner-jrc.sh | 3 +++ concordia/concordia_config.cpp | 4 ++++ concordia/concordia_config.hpp | 6 +++++ concordia/sentence_anonymizer.cpp | 11 +++++++--- concordia/sentence_anonymizer.hpp | 2 ++ concordia/t/test_concordia.cpp | 22 +++++++++---------- concordia/t/test_hash_generator.cpp | 4 ++++ concordia/t/test_sentence_anonymizer.cpp | 11 +++++----- .../concordia-config/concordia.cfg.in | 2 ++ .../concordia-config/concordia-mock.cfg | 2 ++ .../concordia-config/concordia.cfg.in | 2 ++ 12 files changed, 52 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7ecdd3..ea7e2aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,9 @@ project(concordia C CXX) set (CONCORDIA_VERSION_MAJOR 0) set (CONCORDIA_VERSION_MINOR 1) +# Whether to use stop words +set (STOP_WORDS_ENABLED "false") + # Type of the characters in SA set (INDEX_CHARACTER_TYPE "unsigned int") diff --git a/concordia-runner-jrc.sh b/concordia-runner-jrc.sh index 263b348..f9622dc 100755 --- a/concordia-runner-jrc.sh +++ b/concordia-runner-jrc.sh @@ -16,6 +16,9 @@ echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej" echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12" +echo "CONCORDIA RUNNER: concordia searching for pattern: \"Prawo europejskie umożliwia handel zagraniczny\"" +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny" + echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index 651bf28..8b0940e 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -10,6 +10,7 @@ #define SUFFIX_ARRAY_PARAM "suffix_array_path" #define HTML_TAGS_PARAM "html_tags_path" #define SPACE_SYMBOLS_PARAM "space_symbols_path" +#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled" #define STOP_WORDS_PARAM "stop_words_path" #define NAMED_ENTITIES_PARAM "named_entities_path" #define STOP_SYMBOLS_PARAM "stop_symbols_path" @@ -40,6 +41,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); _spaceSymbolsFilePath = ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM); + _stopWordsEnabled = + ConcordiaConfig::_readConfigParameterStr( + STOP_WORDS_ENABLED_PARAM) != "false"; _stopWordsFilePath = ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM); _namedEntitiesFilePath = diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index be69330..abdedb8 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -55,6 +55,10 @@ public: return _spaceSymbolsFilePath; } + bool & isStopWordsEnabled() { + return _stopWordsEnabled; + } + std::string & getStopWordsFilePath() { return _stopWordsFilePath; } @@ -88,6 +92,8 @@ private: std::string _spaceSymbolsFilePath; + bool _stopWordsEnabled; + std::string _stopWordsFilePath; std::string _namedEntitiesFilePath; diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_anonymizer.cpp index f356d8b..33894cc 100644 --- a/concordia/sentence_anonymizer.cpp +++ b/concordia/sentence_anonymizer.cpp @@ -12,8 +12,11 @@ SentenceAnonymizer::SentenceAnonymizer( throw(ConcordiaException) { _createNeRules(config->getNamedEntitiesFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath()); - _stopWords = _getMultipleReplacementRule( - config->getStopWordsFilePath(), "", true); + _stopWordsEnabled = config->isStopWordsEnabled(); + if (_stopWordsEnabled) { + _stopWords = _getMultipleReplacementRule( + config->getStopWordsFilePath(), "", true); + } _stopSymbols = _getMultipleReplacementRule( config->getStopSymbolsFilePath(), ""); _spaceSymbols = _getMultipleReplacementRule( @@ -34,7 +37,9 @@ std::string SentenceAnonymizer::anonymize(const std::string & sentence) { result = TextUtils::getInstance().toLowerCase(result); - result = _stopWords->apply(result); + if (_stopWordsEnabled) { + result = _stopWords->apply(result); + } result = _stopSymbols->apply(result); result = _spaceSymbols->apply(result); diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp index bfa3992..52c7404 100644 --- a/concordia/sentence_anonymizer.hpp +++ b/concordia/sentence_anonymizer.hpp @@ -41,6 +41,8 @@ private: boost::shared_ptr _htmlTags; + bool _stopWordsEnabled; + boost::shared_ptr _stopWords; boost::shared_ptr _stopSymbols; diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 3317cef..70ff375 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -121,7 +121,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); - BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0); + BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); } @@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); - BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2); } BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) @@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) concordia.refreshSAfromRAM(); boost::shared_ptr searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); - // best overlay: [0,2], [2,3], score = + // best overlay: [0,2], [2,3], score = 0.695 BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1); @@ -226,32 +226,32 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) addFragment 167,1,2,1 */ - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1); diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index fdc38b0..c69d966 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -106,15 +106,19 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest ) std::vector tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód."); std::vector expected; expected.push_back("ne_date"); + expected.push_back("o"); expected.push_back("godzinie"); expected.push_back("ne_number"); expected.push_back("ne_number"); expected.push_back("doszło"); + expected.push_back("do"); expected.push_back("kolizji"); + expected.push_back("na"); expected.push_back("ulicy"); expected.push_back("grobla"); expected.push_back("policjanci"); expected.push_back("ustalili"); + expected.push_back("że"); expected.push_back("kierowca"); expected.push_back("zaparkował"); expected.push_back("samochód"); diff --git a/concordia/t/test_sentence_anonymizer.cpp b/concordia/t/test_sentence_anonymizer.cpp index 72d995d..932552c 100644 --- a/concordia/t/test_sentence_anonymizer.cpp +++ b/concordia/t/test_sentence_anonymizer.cpp @@ -34,12 +34,11 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest ) BOOST_AUTO_TEST_CASE( StopWordsTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne"); - + if (config->isStopWordsEnabled()) { + SentenceAnonymizer anonymizer(config); + std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne"); + } } BOOST_AUTO_TEST_CASE( StopSymbolsTest ) diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index 38a13ed..9334a7f 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -36,6 +36,8 @@ html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +stop_words_enabled = "@STOP_WORDS_ENABLED@" + stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index cb7cab7..272f184 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -21,6 +21,8 @@ html_tags_path = "/tmp/html_tags.txt" space_symbols_path = "/tmp/space_symbols.txt" +stop_words_enabled = "true" + stop_words_path = "/tmp/stop_words.txt" named_entities_path = "/tmp/named_entities.txt" diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index a5a2bea..413a1a9 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -36,6 +36,8 @@ html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +stop_words_enabled = "@STOP_WORDS_ENABLED@" + stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"