removed stop words - works slower
Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203
This commit is contained in:
parent
5c2ae86097
commit
f64449311d
@ -6,6 +6,9 @@ project(concordia C CXX)
|
|||||||
set (CONCORDIA_VERSION_MAJOR 0)
|
set (CONCORDIA_VERSION_MAJOR 0)
|
||||||
set (CONCORDIA_VERSION_MINOR 1)
|
set (CONCORDIA_VERSION_MINOR 1)
|
||||||
|
|
||||||
|
# Whether to use stop words
|
||||||
|
set (STOP_WORDS_ENABLED "false")
|
||||||
|
|
||||||
# Type of the characters in SA
|
# Type of the characters in SA
|
||||||
|
|
||||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||||
|
@ -16,6 +16,9 @@ echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw
|
|||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
||||||
echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12"
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12"
|
||||||
|
echo "CONCORDIA RUNNER: concordia searching for pattern: \"Prawo europejskie umożliwia handel zagraniczny\""
|
||||||
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny"
|
||||||
|
|
||||||
|
|
||||||
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
||||||
#define HTML_TAGS_PARAM "html_tags_path"
|
#define HTML_TAGS_PARAM "html_tags_path"
|
||||||
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
|
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
|
||||||
|
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
|
||||||
#define STOP_WORDS_PARAM "stop_words_path"
|
#define STOP_WORDS_PARAM "stop_words_path"
|
||||||
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
||||||
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
||||||
@ -40,6 +41,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
|||||||
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
||||||
_spaceSymbolsFilePath =
|
_spaceSymbolsFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
|
||||||
|
_stopWordsEnabled =
|
||||||
|
ConcordiaConfig::_readConfigParameterStr(
|
||||||
|
STOP_WORDS_ENABLED_PARAM) != "false";
|
||||||
_stopWordsFilePath =
|
_stopWordsFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM);
|
||||||
_namedEntitiesFilePath =
|
_namedEntitiesFilePath =
|
||||||
|
@ -55,6 +55,10 @@ public:
|
|||||||
return _spaceSymbolsFilePath;
|
return _spaceSymbolsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool & isStopWordsEnabled() {
|
||||||
|
return _stopWordsEnabled;
|
||||||
|
}
|
||||||
|
|
||||||
std::string & getStopWordsFilePath() {
|
std::string & getStopWordsFilePath() {
|
||||||
return _stopWordsFilePath;
|
return _stopWordsFilePath;
|
||||||
}
|
}
|
||||||
@ -88,6 +92,8 @@ private:
|
|||||||
|
|
||||||
std::string _spaceSymbolsFilePath;
|
std::string _spaceSymbolsFilePath;
|
||||||
|
|
||||||
|
bool _stopWordsEnabled;
|
||||||
|
|
||||||
std::string _stopWordsFilePath;
|
std::string _stopWordsFilePath;
|
||||||
|
|
||||||
std::string _namedEntitiesFilePath;
|
std::string _namedEntitiesFilePath;
|
||||||
|
@ -12,8 +12,11 @@ SentenceAnonymizer::SentenceAnonymizer(
|
|||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
_createNeRules(config->getNamedEntitiesFilePath());
|
_createNeRules(config->getNamedEntitiesFilePath());
|
||||||
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
||||||
_stopWords = _getMultipleReplacementRule(
|
_stopWordsEnabled = config->isStopWordsEnabled();
|
||||||
config->getStopWordsFilePath(), "", true);
|
if (_stopWordsEnabled) {
|
||||||
|
_stopWords = _getMultipleReplacementRule(
|
||||||
|
config->getStopWordsFilePath(), "", true);
|
||||||
|
}
|
||||||
_stopSymbols = _getMultipleReplacementRule(
|
_stopSymbols = _getMultipleReplacementRule(
|
||||||
config->getStopSymbolsFilePath(), "");
|
config->getStopSymbolsFilePath(), "");
|
||||||
_spaceSymbols = _getMultipleReplacementRule(
|
_spaceSymbols = _getMultipleReplacementRule(
|
||||||
@ -34,7 +37,9 @@ std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
|
|||||||
|
|
||||||
result = TextUtils::getInstance().toLowerCase(result);
|
result = TextUtils::getInstance().toLowerCase(result);
|
||||||
|
|
||||||
result = _stopWords->apply(result);
|
if (_stopWordsEnabled) {
|
||||||
|
result = _stopWords->apply(result);
|
||||||
|
}
|
||||||
result = _stopSymbols->apply(result);
|
result = _stopSymbols->apply(result);
|
||||||
result = _spaceSymbols->apply(result);
|
result = _spaceSymbols->apply(result);
|
||||||
|
|
||||||
|
@ -41,6 +41,8 @@ private:
|
|||||||
|
|
||||||
boost::shared_ptr<RegexReplacement> _htmlTags;
|
boost::shared_ptr<RegexReplacement> _htmlTags;
|
||||||
|
|
||||||
|
bool _stopWordsEnabled;
|
||||||
|
|
||||||
boost::shared_ptr<RegexReplacement> _stopWords;
|
boost::shared_ptr<RegexReplacement> _stopWords;
|
||||||
|
|
||||||
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
||||||
|
@ -121,7 +121,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||||
}
|
}
|
||||||
@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||||
@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||||
// best overlay: [0,2], [2,3], score =
|
// best overlay: [0,2], [2,3], score = 0.695
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
|
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
|
||||||
@ -226,32 +226,32 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
addFragment 167,1,2,1
|
addFragment 167,1,2,1
|
||||||
*/
|
*/
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
||||||
|
@ -106,15 +106,19 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
|
|||||||
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||||
std::vector<std::string> expected;
|
std::vector<std::string> expected;
|
||||||
expected.push_back("ne_date");
|
expected.push_back("ne_date");
|
||||||
|
expected.push_back("o");
|
||||||
expected.push_back("godzinie");
|
expected.push_back("godzinie");
|
||||||
expected.push_back("ne_number");
|
expected.push_back("ne_number");
|
||||||
expected.push_back("ne_number");
|
expected.push_back("ne_number");
|
||||||
expected.push_back("doszło");
|
expected.push_back("doszło");
|
||||||
|
expected.push_back("do");
|
||||||
expected.push_back("kolizji");
|
expected.push_back("kolizji");
|
||||||
|
expected.push_back("na");
|
||||||
expected.push_back("ulicy");
|
expected.push_back("ulicy");
|
||||||
expected.push_back("grobla");
|
expected.push_back("grobla");
|
||||||
expected.push_back("policjanci");
|
expected.push_back("policjanci");
|
||||||
expected.push_back("ustalili");
|
expected.push_back("ustalili");
|
||||||
|
expected.push_back("że");
|
||||||
expected.push_back("kierowca");
|
expected.push_back("kierowca");
|
||||||
expected.push_back("zaparkował");
|
expected.push_back("zaparkował");
|
||||||
expected.push_back("samochód");
|
expected.push_back("samochód");
|
||||||
|
@ -34,12 +34,11 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
|||||||
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceAnonymizer anonymizer(config);
|
if (config->isStopWordsEnabled()) {
|
||||||
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
||||||
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
||||||
|
@ -36,6 +36,8 @@ html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
|||||||
|
|
||||||
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||||
|
|
||||||
|
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||||
|
|
||||||
stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||||
|
|
||||||
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||||
|
@ -21,6 +21,8 @@ html_tags_path = "/tmp/html_tags.txt"
|
|||||||
|
|
||||||
space_symbols_path = "/tmp/space_symbols.txt"
|
space_symbols_path = "/tmp/space_symbols.txt"
|
||||||
|
|
||||||
|
stop_words_enabled = "true"
|
||||||
|
|
||||||
stop_words_path = "/tmp/stop_words.txt"
|
stop_words_path = "/tmp/stop_words.txt"
|
||||||
|
|
||||||
named_entities_path = "/tmp/named_entities.txt"
|
named_entities_path = "/tmp/named_entities.txt"
|
||||||
|
@ -36,6 +36,8 @@ html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
|||||||
|
|
||||||
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||||
|
|
||||||
|
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||||
|
|
||||||
stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||||
|
|
||||||
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||||
|
Loading…
Reference in New Issue
Block a user