Made stop-word removal configurable and disabled it by default - works slower

Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203
This commit is contained in:
rjawor 2015-04-21 21:33:08 +02:00
parent 5c2ae86097
commit f64449311d
12 changed files with 52 additions and 20 deletions

View File

@ -6,6 +6,9 @@ project(concordia C CXX)
set (CONCORDIA_VERSION_MAJOR 0) set (CONCORDIA_VERSION_MAJOR 0)
set (CONCORDIA_VERSION_MINOR 1) set (CONCORDIA_VERSION_MINOR 1)
# Whether to use stop words
set (STOP_WORDS_ENABLED "false")
# Type of the characters in SA # Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int") set (INDEX_CHARACTER_TYPE "unsigned int")

View File

@ -16,6 +16,9 @@ echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej"
echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\"" echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12"
echo "CONCORDIA RUNNER: concordia searching for pattern: \"Prawo europejskie umożliwia handel zagraniczny\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny"
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\"" echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n

View File

@ -10,6 +10,7 @@
#define SUFFIX_ARRAY_PARAM "suffix_array_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path"
#define HTML_TAGS_PARAM "html_tags_path" #define HTML_TAGS_PARAM "html_tags_path"
#define SPACE_SYMBOLS_PARAM "space_symbols_path" #define SPACE_SYMBOLS_PARAM "space_symbols_path"
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
#define STOP_WORDS_PARAM "stop_words_path" #define STOP_WORDS_PARAM "stop_words_path"
#define NAMED_ENTITIES_PARAM "named_entities_path" #define NAMED_ENTITIES_PARAM "named_entities_path"
#define STOP_SYMBOLS_PARAM "stop_symbols_path" #define STOP_SYMBOLS_PARAM "stop_symbols_path"
@ -40,6 +41,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
_spaceSymbolsFilePath = _spaceSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM); ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
_stopWordsEnabled =
ConcordiaConfig::_readConfigParameterStr(
STOP_WORDS_ENABLED_PARAM) != "false";
_stopWordsFilePath = _stopWordsFilePath =
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM); ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM);
_namedEntitiesFilePath = _namedEntitiesFilePath =

View File

@ -55,6 +55,10 @@ public:
return _spaceSymbolsFilePath; return _spaceSymbolsFilePath;
} }
/*!
 * Getter for the flag telling whether stop words removal is enabled.
 *
 * The flag is filled in by the constructor from the
 * "stop_words_enabled" configuration parameter: every value other
 * than the exact string "false" turns stop words removal on.
 *
 * \returns reference to the stored flag; the reference is non-const,
 *          so a caller is able to modify the flag through it.
 */
bool & isStopWordsEnabled() {
return _stopWordsEnabled;
}
std::string & getStopWordsFilePath() { std::string & getStopWordsFilePath() {
return _stopWordsFilePath; return _stopWordsFilePath;
} }
@ -88,6 +92,8 @@ private:
std::string _spaceSymbolsFilePath; std::string _spaceSymbolsFilePath;
bool _stopWordsEnabled;
std::string _stopWordsFilePath; std::string _stopWordsFilePath;
std::string _namedEntitiesFilePath; std::string _namedEntitiesFilePath;

View File

@ -12,8 +12,11 @@ SentenceAnonymizer::SentenceAnonymizer(
throw(ConcordiaException) { throw(ConcordiaException) {
_createNeRules(config->getNamedEntitiesFilePath()); _createNeRules(config->getNamedEntitiesFilePath());
_createHtmlTagsRule(config->getHtmlTagsFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath());
_stopWords = _getMultipleReplacementRule( _stopWordsEnabled = config->isStopWordsEnabled();
config->getStopWordsFilePath(), "", true); if (_stopWordsEnabled) {
_stopWords = _getMultipleReplacementRule(
config->getStopWordsFilePath(), "", true);
}
_stopSymbols = _getMultipleReplacementRule( _stopSymbols = _getMultipleReplacementRule(
config->getStopSymbolsFilePath(), ""); config->getStopSymbolsFilePath(), "");
_spaceSymbols = _getMultipleReplacementRule( _spaceSymbols = _getMultipleReplacementRule(
@ -34,7 +37,9 @@ std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
result = TextUtils::getInstance().toLowerCase(result); result = TextUtils::getInstance().toLowerCase(result);
result = _stopWords->apply(result); if (_stopWordsEnabled) {
result = _stopWords->apply(result);
}
result = _stopSymbols->apply(result); result = _stopSymbols->apply(result);
result = _spaceSymbols->apply(result); result = _spaceSymbols->apply(result);

View File

@ -41,6 +41,8 @@ private:
boost::shared_ptr<RegexReplacement> _htmlTags; boost::shared_ptr<RegexReplacement> _htmlTags;
bool _stopWordsEnabled;
boost::shared_ptr<RegexReplacement> _stopWords; boost::shared_ptr<RegexReplacement> _stopWords;
boost::shared_ptr<RegexReplacement> _stopSymbols; boost::shared_ptr<RegexReplacement> _stopSymbols;

View File

@ -121,7 +121,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
} }
@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2);
} }
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
// best overlay: [0,2], [2,3], score = // best overlay: [0,2], [2,3], score = 0.695
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
@ -226,32 +226,32 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
addFragment 167,1,2,1 addFragment 167,1,2,1
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);

View File

@ -106,15 +106,19 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód."); std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
std::vector<std::string> expected; std::vector<std::string> expected;
expected.push_back("ne_date"); expected.push_back("ne_date");
expected.push_back("o");
expected.push_back("godzinie"); expected.push_back("godzinie");
expected.push_back("ne_number"); expected.push_back("ne_number");
expected.push_back("ne_number"); expected.push_back("ne_number");
expected.push_back("doszło"); expected.push_back("doszło");
expected.push_back("do");
expected.push_back("kolizji"); expected.push_back("kolizji");
expected.push_back("na");
expected.push_back("ulicy"); expected.push_back("ulicy");
expected.push_back("grobla"); expected.push_back("grobla");
expected.push_back("policjanci"); expected.push_back("policjanci");
expected.push_back("ustalili"); expected.push_back("ustalili");
expected.push_back("że");
expected.push_back("kierowca"); expected.push_back("kierowca");
expected.push_back("zaparkował"); expected.push_back("zaparkował");
expected.push_back("samochód"); expected.push_back("samochód");

View File

@ -34,12 +34,11 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
BOOST_AUTO_TEST_CASE( StopWordsTest ) BOOST_AUTO_TEST_CASE( StopWordsTest )
{ {
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config); if (config->isStopWordsEnabled()) {
SentenceAnonymizer anonymizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne"); }
} }
BOOST_AUTO_TEST_CASE( StopSymbolsTest ) BOOST_AUTO_TEST_CASE( StopSymbolsTest )

View File

@ -36,6 +36,8 @@ html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
stop_words_enabled = "@STOP_WORDS_ENABLED@"
stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"

View File

@ -21,6 +21,8 @@ html_tags_path = "/tmp/html_tags.txt"
space_symbols_path = "/tmp/space_symbols.txt" space_symbols_path = "/tmp/space_symbols.txt"
stop_words_enabled = "true"
stop_words_path = "/tmp/stop_words.txt" stop_words_path = "/tmp/stop_words.txt"
named_entities_path = "/tmp/named_entities.txt" named_entities_path = "/tmp/named_entities.txt"

View File

@ -36,6 +36,8 @@ html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
stop_words_enabled = "@STOP_WORDS_ENABLED@"
stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"