clear index, examples

This commit is contained in:
rjawor 2015-05-04 20:40:44 +02:00
parent abbd5b1ae8
commit 07d5d4438b
16 changed files with 208 additions and 65 deletions

4
.gitignore vendored
View File

@ -7,5 +7,5 @@ tests/resources/concordia-config/concordia.cfg
tests/resources/temp tests/resources/temp
prod/resources/temp prod/resources/temp
prod/resources/text-files/jrc_smaller.txt prod/resources/text-files/jrc_smaller.txt
examples/build
examples/config.hpp

View File

@ -71,9 +71,11 @@ if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
link_directories(${LIBCONFIG_LIB}) link_directories(${LIBCONFIG_LIB})
endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
target_link_libraries(concordia config++)
target_link_libraries(concordia log4cpp) target_link_libraries(concordia log4cpp)
target_link_libraries(concordia ${Boost_LIBRARIES}) target_link_libraries(concordia ${Boost_LIBRARIES})
target_link_libraries(concordia divsufsort) target_link_libraries(concordia divsufsort)
target_link_libraries(concordia utf8case)
if (WITH_RE2) if (WITH_RE2)
target_link_libraries(concordia re2) target_link_libraries(concordia re2)

View File

@ -84,29 +84,6 @@ This should generate a single file called refman.pdf in the same directory.
\section compilation4 Sample program \section compilation4 Sample program
In order to verify whether Concordia has been installed successfully, run the following minimal example. Prepare the file test.cpp with the following contents (remember to substitute <CONCORDIA_HOME> with the path of the unpacked Concordia package). In order to verify whether Concordia has been installed successfully, proceed to \ref tutorial1 and run sample programs.
\verbatim
#include <concordia/concordia.hpp>
#include <iostream>
using namespace std;
int main() {
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
cout << concordia.getVersion() << endl;
}
\endverbatim
Compilation method:
\verbatim
g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case
\endverbatim \endverbatim
*/ */

View File

@ -169,4 +169,17 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} }
} }
// Removes every example from the index: the in-memory structures are
// replaced with empty ones and the index files stored on disk are deleted.
// NOTE(review): boost::filesystem::remove(path) signals failures with
// boost::filesystem::filesystem_error, which is not listed in the exception
// specification below - confirm whether it should be translated.
void Concordia::clearIndex() throw(ConcordiaException) {
    // The hash generator takes care of its own word map.
    _hashGenerator->clearWordMap();

    // Point the in-memory members at fresh, empty vectors.
    _T.reset(new std::vector<sauchar_t>);
    _markers.reset(new std::vector<SUFFIX_MARKER_TYPE>);
    _SA.reset(new std::vector<saidx_t>);

    // Delete the files configured as hashed index and markers storage.
    boost::filesystem::remove(_config->getHashedIndexFilePath());
    boost::filesystem::remove(_config->getMarkersFilePath());
}

View File

@ -108,6 +108,11 @@ public:
*/ */
void refreshSAfromRAM() throw(ConcordiaException); void refreshSAfromRAM() throw(ConcordiaException);
/*! Clears all the examples from the index.
The word map is cleared, the index data held in memory is replaced
with empty containers and the hashed index and markers files
are removed from disk.
\throws ConcordiaException
*/
void clearIndex() throw(ConcordiaException);
private: private:
void _initializeIndex() throw(ConcordiaException); void _initializeIndex() throw(ConcordiaException);

View File

@ -59,4 +59,8 @@ void HashGenerator::serializeWordMap() {
oa << *_wordMap; oa << *_wordMap;
} }
// Discards the current word map in favour of a new, empty one and deletes
// the word map file from disk.
void HashGenerator::clearWordMap() {
    _wordMap.reset(new WordMap);
    boost::filesystem::remove(_wordMapFilePath);
}

View File

@ -63,6 +63,11 @@ public:
*/ */
void serializeWordMap(); void serializeWordMap();
/*!
Clears word map. The current map is replaced with a new, empty one
and the word map file is removed from disk.
*/
void clearWordMap();
private: private:
boost::shared_ptr<WordMap> _wordMap; boost::shared_ptr<WordMap> _wordMap;

View File

@ -52,9 +52,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
std::vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia"); std::vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala"); std::vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
@ -107,9 +105,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest"); std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno"); std::vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia2.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL(searchResult1.size(), 3); BOOST_CHECK_EQUAL(searchResult1.size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
@ -137,9 +133,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia2.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
@ -188,9 +182,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51);
BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1); BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1);
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
} }
BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
@ -262,9 +254,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
} }
BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
@ -308,8 +298,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getEnd(), 9); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getEnd(), 9);
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); concordia.clearIndex();
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -90,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
expected2.push_back(3); expected2.push_back(3);
BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end()); BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end());
boost::filesystem::remove(config->getWordMapFilePath()); hashGenerator1.clearWordMap();
} }
BOOST_AUTO_TEST_CASE( TokenVectorTest ) BOOST_AUTO_TEST_CASE( TokenVectorTest )

View File

@ -2,33 +2,35 @@
\section tutorial1 Code examples \section tutorial1 Code examples
This section gives a few examples of programs in C++ which make use of the Concordia library. You can run them after successful installation of Concordia (the installation process is covered in \ref compilation). Each of these sample programs is compiled with the command: This section describes a few examples of programs in C++ which make use of the Concordia library. You can run them after successful installation of Concordia (the installation process is covered in \ref compilation). Their source code is located in the project's main directory, in the subfolder "examples".
The directory also contains a simple CMakeLists.txt file, which helps to perform compilation and linking of the examples. In order to compile the examples, issue the following commands from within the examples directory:
\verbatim \verbatim
g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case mkdir build
cd build
cmake ..
make
\endverbatim \endverbatim
Do not forget to substitute "<CONCORDIA_HOME>" with the path to unpacked Concordia sources. Also, make sure that the folder: <CONCORDIA_HOME>/tests/resources/temp is empty before running each example (this is explained in \ref tutorial2): After these operations, three executables are created in the build directory: first, simple_search and concordia_search. A small config.hpp file is also generated to store the path to the examples folder.
\verbatim
rm <CONCORDIA_HOME>/tests/resources/temp/*
\endverbatim
\subsection tutorial1_1 Minimal example \subsection tutorial1_1 Minimal example
Only crate the Concordia object and print version of the library. This program only creates the Concordia object and prints the version of the library.
File first.cpp:
\verbatim \verbatim
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <iostream> #include <iostream>
#include "config.hpp"
using namespace std; using namespace std;
int main() { int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
cout << concordia.getVersion() << endl; cout << concordia.getVersion() << endl;
} }
\endverbatim \endverbatim
@ -36,19 +38,21 @@ int main() {
This code snippet shows the basic Concordia functionality - simple substring lookup in the index. This code snippet shows the basic Concordia functionality - simple substring lookup in the index.
File simple_search.cpp:
\verbatim \verbatim
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/substring_occurence.hpp> #include <concordia/substring_occurence.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include "config.hpp"
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <vector> #include <vector>
using namespace std; using namespace std;
int main() { int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
// adding sentences to index // adding sentences to index
concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a cat", 56));
@ -67,7 +71,10 @@ int main() {
for(vector<SubstringOccurence>::iterator it = result.begin(); for(vector<SubstringOccurence>::iterator it = result.begin();
it != result.end(); ++it) { it != result.end(); ++it) {
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl;
} }
// clearing index
concordia.clearIndex();
} }
\endverbatim \endverbatim
@ -91,22 +98,22 @@ Concordia is equipped with a unique functionality of so called Concordia search,
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples. Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
Sample concordia searching: File concordia_search.cpp:
\verbatim \verbatim
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp> #include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include "config.hpp"
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
using namespace std; using namespace std;
int main() { int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("Alice has a dog", 23));
@ -138,7 +145,8 @@ int main() {
cout << "Best overlay score: " << result->getBestOverlayScore() << endl; cout << "Best overlay score: " << result->getBestOverlayScore() << endl;
// clearing index
concordia.clearIndex();
} }
\endverbatim \endverbatim

44
examples/CMakeLists.txt Normal file
View File

@ -0,0 +1,44 @@
# Build script for the Concordia example programs.
# Tutorial: http://www.cmake.org/cmake/help/cmake_tutorial.html
cmake_minimum_required(VERSION 2.6)

project(examples CXX)

# Put the path to the examples folder in config.hpp
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/config.hpp @ONLY)

# Find boost libraries
find_package(Boost COMPONENTS
    serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)

# Libraries every example is linked against, listed in link order
set(EXAMPLE_LIBRARIES concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)

# One executable per example, each built from <name>.cpp:
#   1. first            - minimal program
#   2. simple_search    - simple substring lookup
#   3. concordia_search - concordia search
foreach(EXAMPLE_NAME first simple_search concordia_search)
    add_executable(${EXAMPLE_NAME} ${EXAMPLE_NAME}.cpp)
    target_link_libraries(${EXAMPLE_NAME} ${EXAMPLE_LIBRARIES})
endforeach(EXAMPLE_NAME)

View File

@ -0,0 +1,48 @@
// Example: concordia search.
// Adds a few sentences to the index, runs a concordia search for a pattern,
// prints all matched fragments, the best overlay and its score, and finally
// clears the index.
#include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp>

#include "config.hpp"

#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>

// cout and endl are used below; include the header that declares them
// instead of relying on another header pulling it in.
#include <iostream>

using namespace std;

int main() {
    Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");

    // adding sentences to index
    concordia.addExample(Example("Alice has a cat", 56));
    concordia.addExample(Example("Alice has a dog", 23));
    concordia.addExample(Example("New test product has a mistake", 321));
    concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));

    // generating index
    concordia.refreshSAfromRAM();

    // searching
    cout << "Searching for pattern: Our new test product has nothing to do with computers" << endl;
    boost::shared_ptr<ConcordiaSearchResult> result =
        concordia.concordiaSearch("Our new test product has nothing to do with computers");

    // printing every fragment of the pattern found in the index
    cout << "Printing all matched fragments:" << endl;
    BOOST_FOREACH(MatchedPatternFragment fragment, result->getFragments()) {
        cout << "Matched pattern fragment found. Pattern fragment: ["
             << fragment.getStart() << "," << fragment.getEnd() << "]"
             << " in sentence " << fragment.getExampleId()
             << ", at offset: " << fragment.getExampleOffset() << endl;
    }

    // printing the fragments that form the best overlay
    cout << "Best overlay:" << endl;
    BOOST_FOREACH(MatchedPatternFragment fragment, result->getBestOverlay()) {
        cout << "\tPattern fragment: [" << fragment.getStart()
             << "," << fragment.getEnd() << "]"
             << " in sentence " << fragment.getExampleId()
             << ", at offset: " << fragment.getExampleOffset() << endl;
    }

    cout << "Best overlay score: " << result->getBestOverlayScore() << endl;

    // clearing index
    concordia.clearIndex();
}

1
examples/config.hpp.in Normal file
View File

@ -0,0 +1 @@
// Path of the examples source directory; the value is filled in by CMake when config.hpp is generated from this template.
#define EXAMPLES_DIR "@CMAKE_CURRENT_SOURCE_DIR@"

12
examples/first.cpp Normal file
View File

@ -0,0 +1,12 @@
#include <concordia/concordia.hpp>
#include <iostream>
#include "config.hpp"
using namespace std;
// Minimal example: creates the Concordia object from the test configuration
// file and prints the version of the library.
int main() {
    Concordia concordia(EXAMPLES_DIR "/../tests/resources/concordia-config/concordia.cfg");
    std::cout << concordia.getVersion() << std::endl;
    return 0;
}

View File

@ -0,0 +1,36 @@
// Example: simple substring lookup.
// Adds a few sentences to the index, searches it for a substring, prints
// every occurrence found and finally clears the index.
#include <concordia/concordia.hpp>
#include <concordia/substring_occurence.hpp>
#include <concordia/example.hpp>

#include "config.hpp"

#include <boost/shared_ptr.hpp>

// cout and endl are used below; include the header that declares them
// instead of relying on another header pulling it in.
#include <iostream>
#include <vector>

using namespace std;

int main() {
    Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");

    // adding sentences to index
    concordia.addExample(Example("Alice has a cat", 56));
    concordia.addExample(Example("Alice has a dog", 23));
    concordia.addExample(Example("New test product has a mistake", 321));
    concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));

    // generating index
    concordia.refreshSAfromRAM();

    // searching
    cout << "Searching for pattern: has a" << endl;
    vector<SubstringOccurence> result = concordia.simpleSearch("has a");

    // printing results
    for(vector<SubstringOccurence>::iterator it = result.begin();
        it != result.end(); ++it) {
        cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl;
    }

    // clearing index
    concordia.clearIndex();
}

View File

@ -4,4 +4,4 @@ TARGET_DIR=build
./cpplint.py --filter=-legal,-build/namespaces,-whitespace/labels,-build/include_what_you_use,-runtime/int,-readability/streams,-build/include_order `find concordia concordia-console -type f -regextype posix-extended -regex '.*\.(cpp|hpp|h|c)' ! -regex '.*\./build.*' ! -regex '.*concordia/common/config.hpp' ! -regex '.*/(t|tests)/.*'` 2> cpplint-result.txt ./cpplint.py --filter=-legal,-build/namespaces,-whitespace/labels,-build/include_what_you_use,-runtime/int,-readability/streams,-build/include_order `find concordia concordia-console -type f -regextype posix-extended -regex '.*\.(cpp|hpp|h|c)' ! -regex '.*\./build.*' ! -regex '.*concordia/common/config.hpp' ! -regex '.*/(t|tests)/.*'` 2> cpplint-result.txt
cppcheck -D__cplusplus -D__GNUC__=3 -f --enable=all echo `find . -type d ! -path './.git*' ! -path "./${TARGET_DIR}"'*' | perl -ne 'chomp; print "-I$_ "'` `find . -type f -regextype posix-extended -regex '.*\.(cpp|hpp)' ! -regex '.*\./build.*'` 2> cppcheck-result.txt cppcheck -D__cplusplus -D__GNUC__=3 -f --enable=all echo `find . -type d ! -path './.git*' ! -path "./${TARGET_DIR}"'*' | perl -ne 'chomp; print "-I$_ "'` `find . -type f -regextype posix-extended -regex '.*\.(cpp|hpp)' ! -regex '.*\./build.*' ! -regex '.*\./examples/build.*'` 2> cppcheck-result.txt