diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cff576..388e2d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,10 +57,6 @@ set (PROD_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/prod/resources") # ============================== # set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources") -set (TEMP_WORD_MAP "temp_word_map.bin") -set (TEMP_HASHED_INDEX "temp_hashed_index.bin") -set (TEMP_MARKERS "temp_markers.bin") -set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin") file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp) file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp) diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index 23951e1..afaa27f 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -108,6 +108,8 @@ int main(int argc, char** argv) { desc.add_options() ("help,h", "Display this message") + ("index,i", boost::program_options::value(), + "Index directory (required)") ("config,c", boost::program_options::value(), "Concordia configuration file (required)") ("simple-search,s", boost::program_options::value(), @@ -144,12 +146,20 @@ int main(int argc, char** argv) { return 1; } + std::string indexDirectory; + if (cli.count("index")) { + indexDirectory = cli["index"].as(); + } else { + std::cerr << "No index directory path given. Terminating." + << std::endl; + return 1; + } try { std::cout << "\tInitializing concordia..." << std::endl; boost::posix_time::ptime time_start = boost::posix_time::microsec_clock::local_time(); - Concordia concordia(configFile); + Concordia concordia(indexDirectory, configFile); boost::posix_time::ptime time_end = boost::posix_time::microsec_clock::local_time(); boost::posix_time::time_duration msdiff = time_end - time_start; diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index 51a8f5d..4f7bba5 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -2,10 +2,6 @@ #define CONCORDIA_VERSION_MINOR @CONCORDIA_VERSION_MINOR@ #define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@" -#define TEMP_WORD_MAP "@TEMP_WORD_MAP@" -#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@" -#define TEMP_MARKERS "@TEMP_MARKERS@" -#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@" #define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@" @@ -30,3 +26,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. #define CONCORDIA_SEARCH_MAX_RESULTS 3 + +#define WORD_MAP_FILE_NAME "word_map.bin" +#define MARKERS_FILE_NAME "markers.bin" +#define HASHED_INDEX_FILE_NAME "hashed_index.bin" diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index e855dc6..7ca51c4 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -15,13 +15,15 @@ std::string Concordia::_libraryVersion = _createLibraryVersion(); // =========================================== -Concordia::Concordia(const std::string & configFilePath) - throw(ConcordiaException) { +Concordia::Concordia(const std::string & indexPath, + const std::string & configFilePath) + throw(ConcordiaException) : + _indexPath(indexPath) { _config = boost::shared_ptr ( new ConcordiaConfig(configFilePath)); _index = boost::shared_ptr( - new ConcordiaIndex(_config->getHashedIndexFilePath(), - _config->getMarkersFilePath())); + new ConcordiaIndex(_getHashedIndexFilePath(), + _getMarkersFilePath())); _searcher = boost::shared_ptr(new IndexSearcher()); _initializeIndex(); } @@ -101,14 +103,14 @@ std::vector Concordia::addAllExamples( } void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { - if (boost::filesystem::exists(_config->getWordMapFilePath()) - && boost::filesystem::exists(_config->getHashedIndexFilePath()) - && boost::filesystem::exists(_config->getMarkersFilePath())) { + if (boost::filesystem::exists(_getWordMapFilePath()) + && boost::filesystem::exists(_getHashedIndexFilePath()) + && boost::filesystem::exists(_getMarkersFilePath())) { // reading index from file _T->clear(); std::ifstream hashedIndexFile; hashedIndexFile.open( - _config->getHashedIndexFilePath().c_str(), std::ios::in + _getHashedIndexFilePath().c_str(), std::ios::in | std::ios::ate | std::ios::binary); saidx_t hiFileSize = hashedIndexFile.tellg(); if (hiFileSize > 0) { @@ -128,7 +130,7 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { // reading markers from file _markers->clear(); std::ifstream markersFile; - markersFile.open(_config->getMarkersFilePath().c_str(), std::ios::in + markersFile.open(_getMarkersFilePath().c_str(), std::ios::in | std::ios::ate | std::ios::binary); saidx_t maFileSize = markersFile.tellg(); if (maFileSize > 0) { @@ -158,16 +160,17 @@ void Concordia::refreshSAfromRAM() throw(ConcordiaException) { void Concordia::_initializeIndex() throw(ConcordiaException) { _hashGenerator = boost::shared_ptr( - new HashGenerator(_config)); + new HashGenerator(_indexPath, + _config)); _T = boost::shared_ptr >( new std::vector); _markers = boost::shared_ptr >( new std::vector); - if (boost::filesystem::exists(_config->getWordMapFilePath()) - && boost::filesystem::exists(_config->getHashedIndexFilePath())) { + if (boost::filesystem::exists(_getWordMapFilePath()) + && boost::filesystem::exists(_getHashedIndexFilePath())) { loadRAMIndexFromDisk(); - } else if (!boost::filesystem::exists(_config->getWordMapFilePath()) - && !boost::filesystem::exists(_config->getHashedIndexFilePath())) { + } else if (!boost::filesystem::exists(_getWordMapFilePath()) + && !boost::filesystem::exists(_getHashedIndexFilePath())) { // empty index _SA = boost::shared_ptr >( new std::vector); @@ -233,7 +236,19 @@ void Concordia::clearIndex() throw(ConcordiaException) { _SA = boost::shared_ptr >( new std::vector); - boost::filesystem::remove(_config->getHashedIndexFilePath()); - boost::filesystem::remove(_config->getMarkersFilePath()); + boost::filesystem::remove(_getHashedIndexFilePath()); + boost::filesystem::remove(_getMarkersFilePath()); +} + +std::string Concordia::_getWordMapFilePath() { + return _indexPath+"/"+WORD_MAP_FILE_NAME; +} + +std::string Concordia::_getHashedIndexFilePath() { + return _indexPath+"/"+HASHED_INDEX_FILE_NAME; +} + +std::string Concordia::_getMarkersFilePath() { + return _indexPath+"/"+MARKERS_FILE_NAME; } diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 38bd034..ba168d9 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -38,10 +38,12 @@ class Concordia { public: /*! Constructor. + \param indexPath path to the index directory \param configFilePath path to the Concordia configuration file \throws ConcordiaException */ - explicit Concordia(const std::string & configFilePath) + explicit Concordia(const std::string & indexPath, + const std::string & configFilePath) throw(ConcordiaException); /*! Destructor. */ @@ -163,10 +165,18 @@ public: void clearIndex() throw(ConcordiaException); private: + std::string _getWordMapFilePath(); + + std::string _getHashedIndexFilePath(); + + std::string _getMarkersFilePath(); + void _initializeIndex() throw(ConcordiaException); static std::string _libraryVersion; + std::string _indexPath; + boost::shared_ptr _config; boost::shared_ptr _index; diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index 29e3080..23ff35c 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -4,9 +4,6 @@ #include "concordia/common/logging.hpp" #define PUDDLE_TAGSET_PARAM "puddle_tagset_path" -#define WORD_MAP_PARAM "word_map_path" -#define HASHED_INDEX_PARAM "hashed_index_path" -#define MARKERS_PARAM "markers_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path" #define HTML_TAGS_PARAM "html_tags_path" #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled" @@ -25,12 +22,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) +configFilePath); } - _wordMapFilePath = - ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM); - _hashedIndexFilePath = - ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM); - _markersFilePath = - ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM); _htmlTagsFilePath = ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); _stopWordsEnabled = diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index a7c12d6..5fbb3e7 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -24,30 +24,6 @@ public: */ virtual ~ConcordiaConfig(); - /*! Getter for word map file path. - For more information see \ref tutorial3. - \returns word map file path - */ - std::string & getWordMapFilePath() { - return _wordMapFilePath; - } - - /*! Getter for hashed index file path. - For more information see \ref tutorial3. - \returns hashed index file path - */ - std::string & getHashedIndexFilePath() { - return _hashedIndexFilePath; - } - - /*! Getter for markers file path. - For more information see \ref tutorial3. - \returns markers file path - */ - std::string & getMarkersFilePath() { - return _markersFilePath; - } - /*! Getter for html tags file path. For more information see \ref tutorial3. \returns html tags file path @@ -91,12 +67,6 @@ public: private: libconfig::Config _config; - std::string _wordMapFilePath; - - std::string _hashedIndexFilePath; - - std::string _markersFilePath; - std::string _htmlTagsFilePath; bool _stopWordsEnabled; diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 89d5997..04c7f3c 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -10,9 +10,10 @@ #include -HashGenerator::HashGenerator(boost::shared_ptr config) +HashGenerator::HashGenerator(std::string indexPath, + boost::shared_ptr config) throw(ConcordiaException) : - _wordMapFilePath(config->getWordMapFilePath()), + _wordMapFilePath(indexPath+"/"+WORD_MAP_FILE_NAME), _wordMap(boost::shared_ptr(new WordMap)), _sentenceTokenizer(boost::shared_ptr( new SentenceTokenizer(config))) { diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index e94f8d6..8c308c1 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -30,9 +30,11 @@ class HashGenerator { public: /*! Constructor. + \param indexPath path to the index directory \param config pointer to current config object */ - explicit HashGenerator(boost::shared_ptr config) + explicit HashGenerator(std::string indexPath, + boost::shared_ptr config) throw(ConcordiaException); /*! Destructor. diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index f257191..71a1b0c 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -62,16 +62,17 @@ SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( std::vector hash = hashGenerator->generateHash(pattern).getCodes(); - // append sentence boundary marker, as we are looking only for exact sentence matches + // append sentence boundary marker, + // as we are looking only for exact sentence matches hash.push_back(INDEX_CHARACTER_TYPE_MAX_VALUE); - + saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); int size = sa_search(T->data(), (saidx_t) T->size(), (const sauchar_t *) patternArray, patternLength, SA->data(), (saidx_t) SA->size(), &left); - + SUFFIX_MARKER_TYPE occurencesCount = 0; for (int i = 0; i < size; ++i) { saidx_t resultPos = SA->at(left + i); @@ -86,7 +87,7 @@ SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( } delete[] patternArray; - + return occurencesCount; } diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 01cc31a..178f22c 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -18,7 +18,8 @@ BOOST_AUTO_TEST_SUITE(concordia_main) BOOST_AUTO_TEST_CASE( ConcordiaVersion ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::string version = concordia.getVersion(); BOOST_CHECK_EQUAL( version , "1.0"); } @@ -26,7 +27,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14)); /* 0,3 type: 1 value: ala @@ -85,7 +87,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) { // modified stop words to avoid anonymization - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector testExamples; testExamples.push_back(Example("xto xjest okno",312)); testExamples.push_back(Example("czy xjest okno otwarte",202)); @@ -128,7 +131,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) */ - Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector searchResult1 = concordia2.simpleSearch("xto xjest"); std::vector searchResult2 = concordia2.simpleSearch("xjest okno"); @@ -156,13 +160,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector testExamples; testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312)); testExamples.push_back(Example("czy xjest żółte otwarte",202)); concordia.addAllExamples(testExamples); - Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); concordia2.clearIndex(); @@ -175,7 +181,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); concordia.addExample(Example("Ala posiada kota",14)); concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Marysia posiada rysia",123)); @@ -220,7 +227,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); concordia.addExample(Example("Ala posiada kota",14)); concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Marysia posiada rysia",123)); @@ -292,7 +300,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); /* concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a dog", 23)); @@ -349,7 +358,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) BOOST_AUTO_TEST_CASE( Tokenize ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TokenizedSentence ts = concordia.tokenize(" Ala posiada kota"); /* 0,3 type: 1 value: ala @@ -382,7 +392,8 @@ BOOST_AUTO_TEST_CASE( Tokenize ) BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) { - Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); concordia.addExample(Example("Ala posiada kota",14)); concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Ala posiada kota",16)); diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index dc05ec1..4e608d5 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -12,9 +12,6 @@ BOOST_AUTO_TEST_SUITE(concordia_config) BOOST_AUTO_TEST_CASE( ConfigParameters ) { ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")); - BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" ); - BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" ); - BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" ); BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" ); BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" ); BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" ); diff --git a/concordia/t/test_concordia_searcher.cpp b/concordia/t/test_concordia_searcher.cpp index f8ab3fa..ff4f911 100644 --- a/concordia/t/test_concordia_searcher.cpp +++ b/concordia/t/test_concordia_searcher.cpp @@ -356,11 +356,11 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest ) SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 */ - ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX), - TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); + ConcordiaIndex index(TestResourcesManager::getTempPath()+"/"+HASHED_INDEX_FILE_NAME, + TestResourcesManager::getTempPath()+"/"+MARKERS_FILE_NAME); boost::shared_ptr config( new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - boost::shared_ptr hashGenerator(new HashGenerator(config)); + boost::shared_ptr hashGenerator(new HashGenerator(TestResourcesManager::getTempPath(), config)); boost::shared_ptr > T(new std::vector()); boost::shared_ptr > markers(new std::vector()); @@ -428,9 +428,9 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest ) BOOST_CHECK_EQUAL(patternIntervals123[1].getEnd(), 4); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME); + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+HASHED_INDEX_FILE_NAME); + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+MARKERS_FILE_NAME); } diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index c1fd782..cf80ca4 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -17,11 +17,11 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (boost::filesystem::exists(config->getWordMapFilePath())) { - boost::filesystem::remove(config->getWordMapFilePath()); + if (boost::filesystem::exists(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME)) { + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME); } - HashGenerator hashGenerator = HashGenerator(config); + HashGenerator hashGenerator = HashGenerator(TestResourcesManager::getTempPath(), config); std::vector hash = hashGenerator.generateHash("Ala posiada kota").getCodes(); std::vector expected; @@ -38,11 +38,11 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (boost::filesystem::exists(config->getWordMapFilePath())) { - boost::filesystem::remove(config->getWordMapFilePath()); + if (boost::filesystem::exists(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME)) { + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME); } - HashGenerator hashGenerator = HashGenerator(config); + HashGenerator hashGenerator = HashGenerator(TestResourcesManager::getTempPath(), config); std::stringstream ss; for (int i=0;i<65537;i++) { @@ -70,11 +70,11 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (boost::filesystem::exists(config->getWordMapFilePath())) { - boost::filesystem::remove(config->getWordMapFilePath()); + if (boost::filesystem::exists(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME)) { + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME); } - HashGenerator hashGenerator1 = HashGenerator(config); + HashGenerator hashGenerator1 = HashGenerator(TestResourcesManager::getTempPath(), config); std::vector hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes(); std::vector expected1; @@ -85,7 +85,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) hashGenerator1.serializeWordMap(); - HashGenerator hashGenerator2 = HashGenerator(config); + HashGenerator hashGenerator2 = HashGenerator(TestResourcesManager::getTempPath(), config); std::vector hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes(); std::vector expected2; expected2.push_back(0); @@ -100,11 +100,11 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (boost::filesystem::exists(config->getWordMapFilePath())) { - boost::filesystem::remove(config->getWordMapFilePath()); + if (boost::filesystem::exists(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME)) { + boost::filesystem::remove(TestResourcesManager::getTempPath()+"/"+WORD_MAP_FILE_NAME); } - HashGenerator hashGenerator = HashGenerator(config); + HashGenerator hashGenerator = HashGenerator(TestResourcesManager::getTempPath(), config); TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód."); diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index 481c4a0..e913bc3 100644 --- a/concordia/tutorial.dox +++ b/concordia/tutorial.dox @@ -29,7 +29,7 @@ File first.cpp: using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); cout << concordia.getVersion() << endl; } \endverbatim @@ -52,7 +52,7 @@ File simple_search.cpp: using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); // adding sentences to index concordia.addExample(Example("Alice has a cat", 56)); @@ -116,7 +116,7 @@ File concordia_searching.cpp: using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); TokenizedSentence ts = concordia.addExample(Example("Alice has a cat", 56)); cout << "Added the following tokens: " << endl; @@ -209,17 +209,6 @@ Every option is documented in comments within the configuration file. #--------------------------- # -#------------------------------------------------------------------------------- -# The below set the paths for hashed index, markers array and word map files. -# If all the files pointed by these paths exist, Concordia reads them to its -# RAM index. When none of these files exist, a new empty index is created. -# However, if any of these files exist and any other is missing, the index -# is considered corrupt and Concordia does not start. - -hashed_index_path = "/tests/resources/temp/temp_hashed_index.bin" -markers_path = "/tests/resources/temp/temp_markers.bin" -word_map_path = "/tests/resources/temp/temp_word_map.bin" - #------------------------------------------------------------------------------- # The following settings control the sentence tokenizer mechanism. Tokenizer # takes into account html tags, substitutes predefined symbols @@ -260,6 +249,7 @@ The full list of program options is given below: \verbatim -h [ --help ] Display this message -c [ --config ] arg Concordia configuration file (required) + -i [ --index ] arg Index directory path (required) -s [ --simple-search ] arg Pattern to be searched in the index -n [ --silent ] While searching, do not output search results @@ -277,12 +267,12 @@ From directory: Read sentences from file sentences.txt \verbatim -./build/concordia-console/concordia-console -c tests/resources/concordia-config/concordia.cfg -r ~/sentences.txt +./build/concordia-console/concordia-console -i /tmp -c tests/resources/concordia-config/concordia.cfg -r ~/sentences.txt \endverbatim Run concordia search on the index \verbatim -./build/concordia-console/concordia-console -c tests/resources/concordia-config/concordia.cfg -x "some pattern" +./build/concordia-console/concordia-console -i /tmp -c tests/resources/concordia-config/concordia.cfg -x "some pattern" \endverbatim */ diff --git a/examples/concordia_search.cpp b/examples/concordia_search.cpp index c99932c..45a4a29 100644 --- a/examples/concordia_search.cpp +++ b/examples/concordia_search.cpp @@ -12,7 +12,7 @@ using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); TokenizedSentence ts = concordia.addExample(Example("Alice has a cat", 56)); cout << "Added the following tokens: " << endl; diff --git a/examples/first.cpp b/examples/first.cpp index 9ac4bf9..8d9e161 100644 --- a/examples/first.cpp +++ b/examples/first.cpp @@ -7,6 +7,6 @@ using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); cout << concordia.getVersion() << endl; } diff --git a/examples/simple_search.cpp b/examples/simple_search.cpp index 9c02b55..4883896 100644 --- a/examples/simple_search.cpp +++ b/examples/simple_search.cpp @@ -10,7 +10,7 @@ using namespace std; int main() { - Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + Concordia concordia("/tmp", EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); // adding sentences to index concordia.addExample(Example("Alice has a cat", 56)); diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index c14cb97..1616eb0 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -3,17 +3,6 @@ #--------------------------- # -#------------------------------------------------------------------------------- -# The below set the paths for hashed index, markers array and word map files. -# If all the files pointed by these paths exist, Concordia reads them to its -# RAM index. When none of these files exist, a new empty index is created. -# However, if any of these files exist and any other is missing, the index -# is considered corrupt and Concordia does not start. - -hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" -markers_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@" -word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" - #------------------------------------------------------------------------------- # The following settings control the sentence anonymizer mechanism. It is used to # remove unnecessary symbols and possibly words from sentences added to index diff --git a/scripts/concordia-anubissearch.sh b/scripts/concordia-anubissearch.sh index 5846737..2b9d03e 100755 --- a/scripts/concordia-anubissearch.sh +++ b/scripts/concordia-anubissearch.sh @@ -1,5 +1,5 @@ #!/bin/sh -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -a "$1" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -a "$1" diff --git a/scripts/concordia-concordiasearch.sh b/scripts/concordia-concordiasearch.sh index 899ad1e..9ec2f58 100755 --- a/scripts/concordia-concordiasearch.sh +++ b/scripts/concordia-concordiasearch.sh @@ -1,5 +1,5 @@ #!/bin/sh -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -x "$1" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -x "$1" diff --git a/scripts/concordia-index-jrc.sh b/scripts/concordia-index-jrc.sh index 695443d..6b9ed9c 100755 --- a/scripts/concordia-index-jrc.sh +++ b/scripts/concordia-index-jrc.sh @@ -9,6 +9,6 @@ echo "CONCORDIA INDEXER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA INDEXER: reading from file" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/jrc_smaller.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/jrc_smaller.txt diff --git a/scripts/concordia-runner-jrc.sh b/scripts/concordia-runner-jrc.sh index 1811b5f..064fa0e 100755 --- a/scripts/concordia-runner-jrc.sh +++ b/scripts/concordia-runner-jrc.sh @@ -9,22 +9,22 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: reading from file" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/jrc_smaller.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/jrc_smaller.txt echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej" echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12" echo "CONCORDIA RUNNER: concordia searching for pattern: \"Prawo europejskie umożliwia handel zagraniczny\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -x "Prawo europejskie umożliwia handel zagraniczny" echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n echo "CONCORDIA RUNNER: searching for pattern: \"Dostęp do zatrudnienia\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n echo "CONCORDIA RUNNER: searching for pattern: \"Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem" -n rm ../prod/resources/text-files/jrc_smaller.txt diff --git a/scripts/concordia-runner-large.sh b/scripts/concordia-runner-large.sh index e8c15be..8286406 100755 --- a/scripts/concordia-runner-large.sh +++ b/scripts/concordia-runner-large.sh @@ -9,10 +9,10 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: reading from file" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/large.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/large.txt echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" -n echo "CONCORDIA RUNNER: searching for pattern: \"it is\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n rm ../prod/resources/text-files/large.txt diff --git a/scripts/concordia-runner.sh b/scripts/concordia-runner.sh index 3cab4c5..e7c9b0c 100755 --- a/scripts/concordia-runner.sh +++ b/scripts/concordia-runner.sh @@ -4,8 +4,8 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: reading from file" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/medium.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/medium.txt echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" echo "CONCORDIA RUNNER: searching for pattern: \"it is\"" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n diff --git a/scripts/concordia-search.sh b/scripts/concordia-search.sh index 0e729b7..ef18bb9 100755 --- a/scripts/concordia-search.sh +++ b/scripts/concordia-search.sh @@ -1,5 +1,5 @@ #!/bin/sh -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -s "$1" -n +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "$1" -n diff --git a/scripts/concordia-test-jrc.sh b/scripts/concordia-test-jrc.sh index 2824f08..1881a6b 100755 --- a/scripts/concordia-test-jrc.sh +++ b/scripts/concordia-test-jrc.sh @@ -9,6 +9,6 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: testing" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -t ../prod/resources/text-files/jrc_smaller.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -t ../prod/resources/text-files/jrc_smaller.txt rm ../prod/resources/text-files/jrc_smaller.txt diff --git a/scripts/concordia-test-medium.sh b/scripts/concordia-test-medium.sh index 3138e3e..3e3c7ec 100755 --- a/scripts/concordia-test-medium.sh +++ b/scripts/concordia-test-medium.sh @@ -5,5 +5,5 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: testing" -../build/concordia-console/concordia-console -c ../prod/resources/concordia-config/concordia.cfg -t ../prod/resources/text-files/medium.txt +../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -t ../prod/resources/text-files/medium.txt diff --git a/tests/common/test_resources_manager.cpp b/tests/common/test_resources_manager.cpp index ffd52f9..df6fbc4 100644 --- a/tests/common/test_resources_manager.cpp +++ b/tests/common/test_resources_manager.cpp @@ -25,3 +25,8 @@ std::string TestResourcesManager::getTestFilePath(const std::string & module, co return result + "/" + module + "/" + filename; } +std::string TestResourcesManager::getTempPath() { + std::string result = std::string(TEST_RESOURCES_DIRECTORY); + return result + "/temp"; +} + diff --git a/tests/common/test_resources_manager.hpp b/tests/common/test_resources_manager.hpp index 029e253..ca8a01c 100644 --- a/tests/common/test_resources_manager.hpp +++ b/tests/common/test_resources_manager.hpp @@ -16,6 +16,7 @@ public: static std::string getTestFilePath(const std::string & module, const std::string & filename); + static std::string getTempPath(); }; #endif diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 6558a52..62000cd 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -6,12 +6,6 @@ # Anubis score threshold anubis_threshold = "0.3" -word_map_path = "/tmp/wm.bin" - -hashed_index_path = "/tmp/hi.bin" - -markers_path = "/tmp/ma.bin" - html_tags_path = "/tmp/html_tags.txt" stop_words_enabled = "true" diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index 5591f3b..f78e940 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -3,17 +3,6 @@ #--------------------------- # -#------------------------------------------------------------------------------- -# The below set the paths for hashed index, markers array and word map files. -# If all the files pointed by these paths exist, Concordia reads them to its -# RAM index. When none of these files exist, a new empty index is created. -# However, if any of these files exist and any other is missing, the index -# is considered corrupt and Concordia does not start. - -hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" -markers_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@" -word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" - #------------------------------------------------------------------------------- # The following settings control the sentence anonymizer mechanism. It is used to # remove unnecessary symbols and possibly words from sentences added to index