diff --git a/.gitignore b/.gitignore index 76ccb17..148a2d8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,5 @@ tests/resources/concordia-config/concordia.cfg tests/resources/temp prod/resources/temp prod/resources/text-files/jrc_smaller.txt - - +examples/build +examples/config.hpp diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 816b27d..70ac20d 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -71,9 +71,11 @@ if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) link_directories(${LIBCONFIG_LIB}) endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) +target_link_libraries(concordia config++) target_link_libraries(concordia log4cpp) target_link_libraries(concordia ${Boost_LIBRARIES}) target_link_libraries(concordia divsufsort) +target_link_libraries(concordia utf8case) if (WITH_RE2) target_link_libraries(concordia re2) diff --git a/concordia/compilation.dox b/concordia/compilation.dox index 32afcbc..c40141e 100644 --- a/concordia/compilation.dox +++ b/concordia/compilation.dox @@ -84,29 +84,6 @@ This should generate a single file called refman.pdf in the same directory. \section compilation4 Sample program -In order to verify whether Concordia has been installed successfully, run the following minimal example. Prepare the file test.cpp with the following contents (remember to substitute with the path of the unpacked Concordia package). 
- -\verbatim - -#include -#include - -using namespace std; - -int main() { - - Concordia concordia("/tests/resources/concordia-config/concordia.cfg"); - cout << concordia.getVersion() << endl; - -} - -\endverbatim - -Compilation method: - -\verbatim - -g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case - +In order to verify whether Concordia has been installed successfully, proceed to \ref tutorial1 and run sample programs. \endverbatim */ diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 54dbc17..4bdee82 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -169,4 +169,17 @@ boost::shared_ptr Concordia::concordiaSearch( } } +void Concordia::clearIndex() throw(ConcordiaException) { + _hashGenerator->clearWordMap(); + _T = boost::shared_ptr >( + new std::vector); + _markers = boost::shared_ptr >( + new std::vector); + _SA = boost::shared_ptr >( + new std::vector); + + boost::filesystem::remove(_config->getHashedIndexFilePath()); + boost::filesystem::remove(_config->getMarkersFilePath()); +} + diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 58efbd5..48f0fdd 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -108,6 +108,11 @@ public: */ void refreshSAfromRAM() throw(ConcordiaException); + /*! 
Clears all the examples from the index + \throws ConcordiaException + */ + void clearIndex() throw(ConcordiaException); + private: void _initializeIndex() throw(ConcordiaException); diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index cdb33b1..0385652 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -59,4 +59,8 @@ void HashGenerator::serializeWordMap() { oa << *_wordMap; } +void HashGenerator::clearWordMap() { + _wordMap = boost::shared_ptr(new WordMap); + boost::filesystem::remove(_wordMapFilePath); +} diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index cb92fb6..676abda 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -63,6 +63,11 @@ public: */ void serializeWordMap(); + /*! + Clears word map. + */ + void clearWordMap(); + private: boost::shared_ptr _wordMap; diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index d65bf21..0012e91 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -52,9 +52,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) std::vector searchResult1 = concordia.simpleSearch("posiada rysia"); std::vector searchResult2 = concordia.simpleSearch("posiada kota Ala"); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + concordia.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); @@ -107,9 +105,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) std::vector searchResult1 = concordia2.simpleSearch("xto xjest"); std::vector searchResult2 = concordia2.simpleSearch("xjest okno"); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - 
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + concordia2.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 3); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); @@ -137,9 +133,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + concordia2.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); @@ -188,9 +182,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51); BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + concordia.clearIndex(); } BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) @@ -262,9 +254,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + 
concordia.clearIndex(); } BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) @@ -308,8 +298,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getEnd(), 9); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); - boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + concordia.clearIndex(); } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index c69d966..8fdef81 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -90,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) expected2.push_back(3); BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end()); - boost::filesystem::remove(config->getWordMapFilePath()); + hashGenerator1.clearWordMap(); } BOOST_AUTO_TEST_CASE( TokenVectorTest ) diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index e4d29a1..2e0be41 100644 --- a/concordia/tutorial.dox +++ b/concordia/tutorial.dox @@ -2,33 +2,35 @@ \section tutorial1 Code examples -This section gives a few examples of programs in C++ which make use of the Concordia library. You can run them after successful installation of Concordia (the installation process is covered in \ref compilation). Each of these sample programs is compiled with the command: +This section describes a few examples of programs in C++ which make use of the Concordia library. You can run them after successful installation of Concordia (the installation process is covered in \ref compilation). Their source code is located in the project's main directory, in the subfolder "examples". 
+The directory also contains a simple CMakeLists.txt file, which helps to perform compilation and linking of the examples. In order to compile the examples, issue the following commands from within the examples directory: \verbatim -g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case +mkdir build +cd build +cmake .. +make \endverbatim -Do not forget to substitute "" with the path to unpacked Concordia sources. Also, make sure that the folder: /tests/resources/temp is empty before running each example (this is explained in \ref tutorial2): - -\verbatim -rm /tests/resources/temp/* -\endverbatim +After these operations, three executables are created in the build directory: first, simple_search and concordia_search. A small config.hpp file is also generated to store the path to the examples folder. \subsection tutorial1_1 Minimal example -Only crate the Concordia object and print version of the library. +This program only creates the Concordia object and prints the version of the library. +File first.cpp: \verbatim #include #include +#include "config.hpp" + + using namespace std; int main() { - - Concordia concordia("/tests/resources/concordia-config/concordia.cfg"); + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); cout << concordia.getVersion() << endl; - } \endverbatim @@ -36,19 +38,21 @@ int main() { This code snippet shows the basic Concordia functionality - simple substring lookup in the index. 
+File simple_search.cpp: \verbatim #include #include #include +#include "config.hpp" + #include #include using namespace std; int main() { - - Concordia concordia("/tests/resources/concordia-config/concordia.cfg"); + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); // adding sentences to index concordia.addExample(Example("Alice has a cat", 56)); @@ -67,7 +71,10 @@ int main() { for(vector::iterator it = result.begin(); it != result.end(); ++it) { cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; - } + } + + // clearing index + concordia.clearIndex(); } \endverbatim @@ -91,22 +98,22 @@ Concordia is equipped with a unique functionality of so called Concordia search, Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples. 
-Sample concordia searching: - +File concordia_search.cpp: \verbatim #include #include #include #include +#include "config.hpp" + #include #include using namespace std; int main() { - - Concordia concordia("/tests/resources/concordia-config/concordia.cfg"); + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a dog", 23)); @@ -138,7 +145,8 @@ int main() { cout << "Best overlay score: " << result->getBestOverlayScore() << endl; - + // clearing index + concordia.clearIndex(); } \endverbatim diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..c0033a7 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,44 @@ +# Tutorial: http://www.cmake.org/cmake/help/cmake_tutorial.html + +cmake_minimum_required(VERSION 2.6) +project(examples CXX) + +# Put the path to the examples folder in config.hpp +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/config.hpp @ONLY) + +# Find boost libraries +find_package(Boost COMPONENTS + serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED) + + +# 1. example - minimal program + +add_executable(first first.cpp) +target_link_libraries(first concordia) +target_link_libraries(first config++) +target_link_libraries(first log4cpp) +target_link_libraries(first ${Boost_LIBRARIES}) +target_link_libraries(first divsufsort) +target_link_libraries(first utf8case) + +# 2. example - simple substring lookup + +add_executable(simple_search simple_search.cpp) +target_link_libraries(simple_search concordia) +target_link_libraries(simple_search config++) +target_link_libraries(simple_search log4cpp) +target_link_libraries(simple_search ${Boost_LIBRARIES}) +target_link_libraries(simple_search divsufsort) +target_link_libraries(simple_search utf8case) + + +# 3. 
example - concordia search + +add_executable(concordia_search concordia_search.cpp) +target_link_libraries(concordia_search concordia) +target_link_libraries(concordia_search config++) +target_link_libraries(concordia_search log4cpp) +target_link_libraries(concordia_search ${Boost_LIBRARIES}) +target_link_libraries(concordia_search divsufsort) +target_link_libraries(concordia_search utf8case) + diff --git a/examples/concordia_search.cpp b/examples/concordia_search.cpp new file mode 100644 index 0000000..5a5cae8 --- /dev/null +++ b/examples/concordia_search.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +#include "config.hpp" + +#include +#include + +using namespace std; + +int main() { + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + + concordia.addExample(Example("Alice has a cat", 56)); + concordia.addExample(Example("Alice has a dog", 23)); + concordia.addExample(Example("New test product has a mistake", 321)); + concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); + + concordia.refreshSAfromRAM(); + + cout << "Searching for pattern: Our new test product has nothing to do with computers" << endl; + boost::shared_ptr result = + concordia.concordiaSearch("Our new test product has nothing to do with computers"); + + cout << "Printing all matched fragments:" << endl; + BOOST_FOREACH(MatchedPatternFragment fragment, result->getFragments()) { + cout << "Matched pattern fragment found. 
Pattern fragment: [" + << fragment.getStart() << "," << fragment.getEnd() << "]" + << " in sentence " << fragment.getExampleId() + << ", at offset: " << fragment.getExampleOffset() << endl; + } + + + cout << "Best overlay:" << endl; + BOOST_FOREACH(MatchedPatternFragment fragment, result->getBestOverlay()) { + cout << "\tPattern fragment: [" << fragment.getStart() + << "," << fragment.getEnd() << "]" + << " in sentence " << fragment.getExampleId() + << ", at offset: " << fragment.getExampleOffset() << endl; + } + + cout << "Best overlay score: " << result->getBestOverlayScore() << endl; + + // clearing index + concordia.clearIndex(); +} diff --git a/examples/config.hpp.in b/examples/config.hpp.in new file mode 100644 index 0000000..ce23d91 --- /dev/null +++ b/examples/config.hpp.in @@ -0,0 +1 @@ +#define EXAMPLES_DIR "@CMAKE_CURRENT_SOURCE_DIR@" diff --git a/examples/first.cpp b/examples/first.cpp new file mode 100644 index 0000000..9ac4bf9 --- /dev/null +++ b/examples/first.cpp @@ -0,0 +1,12 @@ +#include +#include + +#include "config.hpp" + + +using namespace std; + +int main() { + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + cout << concordia.getVersion() << endl; +} diff --git a/examples/simple_search.cpp b/examples/simple_search.cpp new file mode 100644 index 0000000..dbac325 --- /dev/null +++ b/examples/simple_search.cpp @@ -0,0 +1,36 @@ +#include +#include +#include + +#include "config.hpp" + +#include +#include + +using namespace std; + +int main() { + Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); + + // adding sentences to index + concordia.addExample(Example("Alice has a cat", 56)); + concordia.addExample(Example("Alice has a dog", 23)); + concordia.addExample(Example("New test product has a mistake", 321)); + concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); + + // generating index + concordia.refreshSAfromRAM(); + + // 
searching + cout << "Searching for pattern: has a" << endl; + vector result = concordia.simpleSearch("has a"); + + // printing results + for(vector::iterator it = result.begin(); + it != result.end(); ++it) { + cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; + } + + // clearing index + concordia.clearIndex(); +} diff --git a/run-checkers.sh b/run-checkers.sh index bb6e954..f922382 100755 --- a/run-checkers.sh +++ b/run-checkers.sh @@ -4,4 +4,4 @@ TARGET_DIR=build ./cpplint.py --filter=-legal,-build/namespaces,-whitespace/labels,-build/include_what_you_use,-runtime/int,-readability/streams,-build/include_order `find concordia concordia-console -type f -regextype posix-extended -regex '.*\.(cpp|hpp|h|c)' ! -regex '.*\./build.*' ! -regex '.*concordia/common/config.hpp' ! -regex '.*/(t|tests)/.*'` 2> cpplint-result.txt -cppcheck -D__cplusplus -D__GNUC__=3 -f --enable=all echo `find . -type d ! -path './.git*' ! -path "./${TARGET_DIR}"'*' | perl -ne 'chomp; print "-I$_ "'` `find . -type f -regextype posix-extended -regex '.*\.(cpp|hpp)' ! -regex '.*\./build.*'` 2> cppcheck-result.txt +cppcheck -D__cplusplus -D__GNUC__=3 -f --enable=all echo `find . -type d ! -path './.git*' ! -path "./${TARGET_DIR}"'*' | perl -ne 'chomp; print "-I$_ "'` `find . -type f -regextype posix-extended -regex '.*\.(cpp|hpp)' ! -regex '.*\./build.*' ! -regex '.*\./examples/build.*'` 2> cppcheck-result.txt