cleaned configuration, doc
This commit is contained in:
parent
b790c6898f
commit
87a26bfa3b
@ -52,14 +52,11 @@ set (SUFFIX_MARKER_SENTENCE_BYTES 2)
|
||||
|
||||
set (PROD_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/prod/resources")
|
||||
|
||||
set (PROD_PUDDLE_TAGSET_PATH "${PROD_RESOURCES_DIRECTORY}/puddle/tagset.txt")
|
||||
|
||||
# ============================== #
|
||||
# Testing paths
|
||||
# ============================== #
|
||||
|
||||
set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources")
|
||||
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
|
||||
set (TEMP_WORD_MAP "temp_word_map.bin")
|
||||
set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
||||
set (TEMP_MARKERS "temp_markers.bin")
|
||||
|
@ -73,6 +73,8 @@ This should generate a single file called refman.pdf in the same directory.
|
||||
|
||||
In order to verify whether Concordia has been installed successfully, run the following minimal example. Prepare the file test.cpp with the following contents (remember to substitute <CONCORDIA_HOME> with the path of the unpacked Concordia package).
|
||||
|
||||
========= test.cpp ===================
|
||||
|
||||
#include <concordia/concordia.hpp>
|
||||
#include <iostream>
|
||||
|
||||
@ -85,6 +87,8 @@ int main() {
|
||||
|
||||
}
|
||||
|
||||
======================================
|
||||
|
||||
Compilation method:
|
||||
|
||||
g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case
|
||||
|
@ -80,6 +80,9 @@ void performSearch(Concordia & concordia,
|
||||
buffer.clear();
|
||||
|
||||
long timeElapsed = msdiff.total_milliseconds();
|
||||
if (timeElapsed == 0) {
|
||||
timeElapsed++;
|
||||
}
|
||||
double speed = static_cast<double>(
|
||||
1000 * sentencesSearched / timeElapsed);
|
||||
std::cout << "\tSearched a portion of " <<
|
||||
@ -103,7 +106,7 @@ int main(int argc, char** argv) {
|
||||
("simple-search,s", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched in the index")
|
||||
("silent,n",
|
||||
"While searching with simple-search, do not output search results")
|
||||
"While searching, do not output search results")
|
||||
("anubis-search,a", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched by anubis search in the index")
|
||||
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||
@ -240,6 +243,7 @@ int main(int argc, char** argv) {
|
||||
std::ifstream text_file(filePath.c_str());
|
||||
std::string line;
|
||||
if (text_file.is_open()) {
|
||||
std::cout << "\tFile open" << std::endl;
|
||||
long lineCount = 0;
|
||||
std::vector<Example> buffer;
|
||||
boost::posix_time::ptime timeStart =
|
||||
@ -255,6 +259,9 @@ int main(int argc, char** argv) {
|
||||
boost::posix_time::time_duration msdiff =
|
||||
timeEnd - timeStart;
|
||||
long timeElapsed = msdiff.total_milliseconds();
|
||||
if (timeElapsed == 0) {
|
||||
timeElapsed++;
|
||||
}
|
||||
double speed = static_cast<double>(
|
||||
1000 * lineCount / timeElapsed);
|
||||
std::cout << "\tRead and added to index " <<
|
||||
@ -272,6 +279,10 @@ int main(int argc, char** argv) {
|
||||
boost::posix_time::time_duration totalMsdiff =
|
||||
timeTotalEnd - timeStart;
|
||||
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||
if (totalTimeElapsed == 0) {
|
||||
totalTimeElapsed++;
|
||||
}
|
||||
|
||||
double totalSpeed =
|
||||
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||
std::cout << "\tReading finished. Read and added to index "
|
||||
@ -305,6 +316,9 @@ int main(int argc, char** argv) {
|
||||
boost::posix_time::time_duration msdiff =
|
||||
timeEnd - timeStart;
|
||||
long timeElapsed = msdiff.total_milliseconds();
|
||||
if (timeElapsed == 0) {
|
||||
timeElapsed++;
|
||||
}
|
||||
double speed = static_cast<double>(
|
||||
1000 * lineCount / timeElapsed);
|
||||
std::cout << "\tRead and added to index " <<
|
||||
@ -322,6 +336,9 @@ int main(int argc, char** argv) {
|
||||
boost::posix_time::time_duration totalMsdiff =
|
||||
timeTotalEnd - timeStart;
|
||||
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||
if (totalTimeElapsed == 0) {
|
||||
totalTimeElapsed++;
|
||||
}
|
||||
double totalSpeed =
|
||||
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||
std::cout << "\tReading finished. Read and added to index "
|
||||
|
@ -1,4 +1,4 @@
|
||||
/** \page compilation Concordia Installation & Build Manual
|
||||
/** \page compilation Build & installation
|
||||
|
||||
This page describes how to compile, build
|
||||
and install Concordia library.
|
||||
|
@ -27,16 +27,12 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||
+configFilePath);
|
||||
}
|
||||
|
||||
_puddleTagsetFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(PUDDLE_TAGSET_PARAM);
|
||||
_wordMapFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
|
||||
_hashedIndexFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
|
||||
_markersFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
|
||||
_suffixArrayFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
|
||||
_htmlTagsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
||||
_spaceSymbolsFilePath =
|
||||
@ -45,13 +41,14 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||
ConcordiaConfig::_readConfigParameterStr(
|
||||
STOP_WORDS_ENABLED_PARAM) != "false";
|
||||
_stopWordsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM);
|
||||
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
|
||||
_namedEntitiesFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
||||
_stopSymbolsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
|
||||
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
|
||||
ANUBIS_THRESHOLD_PARAM).c_str());
|
||||
ANUBIS_THRESHOLD_PARAM,
|
||||
"0.3").c_str());
|
||||
}
|
||||
|
||||
ConcordiaConfig::~ConcordiaConfig() {
|
||||
@ -65,3 +62,14 @@ std::string ConcordiaConfig::_readConfigParameterStr(const std::string & name)
|
||||
return _config.lookup(name);
|
||||
}
|
||||
}
|
||||
|
||||
// Reads a string configuration parameter, falling back to a default.
//
// \param name          key of the parameter to look up in _config
// \param defaultValue  value returned when _config has no entry for name
// \returns the result of _config.lookup(name) if the key exists,
//          otherwise defaultValue
std::string ConcordiaConfig::_readConfigParameterStr(
|
||||
const std::string & name,
|
||||
const std::string & defaultValue)
|
||||
throw(ConcordiaException) {
|
||||
// Key absent from the configuration: use the caller-supplied default.
if (!_config.exists(name)) {
|
||||
return defaultValue;
|
||||
} else {
|
||||
// Key present: return the configured value.
return _config.lookup(name);
|
||||
}
|
||||
}
|
||||
|
@ -24,13 +24,6 @@ public:
|
||||
*/
|
||||
virtual ~ConcordiaConfig();
|
||||
|
||||
/*! Getter for the puddle file path parameter.
|
||||
\returns file path of the puddle tagset
|
||||
*/
|
||||
std::string & getPuddleTagsetFilePath() {
|
||||
return _puddleTagsetFilePath;
|
||||
}
|
||||
|
||||
std::string & getWordMapFilePath() {
|
||||
return _wordMapFilePath;
|
||||
}
|
||||
@ -43,10 +36,6 @@ public:
|
||||
return _markersFilePath;
|
||||
}
|
||||
|
||||
std::string & getSuffixArrayFilePath() {
|
||||
return _suffixArrayFilePath;
|
||||
}
|
||||
|
||||
std::string & getHtmlTagsFilePath() {
|
||||
return _htmlTagsFilePath;
|
||||
}
|
||||
@ -78,16 +67,12 @@ public:
|
||||
private:
|
||||
libconfig::Config _config;
|
||||
|
||||
std::string _puddleTagsetFilePath;
|
||||
|
||||
std::string _wordMapFilePath;
|
||||
|
||||
std::string _hashedIndexFilePath;
|
||||
|
||||
std::string _markersFilePath;
|
||||
|
||||
std::string _suffixArrayFilePath;
|
||||
|
||||
std::string _htmlTagsFilePath;
|
||||
|
||||
std::string _spaceSymbolsFilePath;
|
||||
@ -104,6 +89,10 @@ private:
|
||||
|
||||
std::string _readConfigParameterStr(const std::string & name)
|
||||
throw(ConcordiaException);
|
||||
|
||||
std::string _readConfigParameterStr(const std::string & name,
|
||||
const std::string & defaultValue)
|
||||
throw(ConcordiaException);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,13 +1,13 @@
|
||||
/** \mainpage Introduction
|
||||
/** \mainpage Overview
|
||||
|
||||
\section main_1 Concordia
|
||||
### Full-text and concordance searcher for CAT
|
||||
|
||||
|
||||
\section main_2 Overview
|
||||
\section main_2 Help contents
|
||||
|
||||
- \subpage compilation This chapter contains instructions for compilation and installation of the Concordia library.
|
||||
- \subpage running The methods of making use of the Concordia library are described in this chapter.
|
||||
- \subpage tutorial Short tutorial, introducing main functionalities of Concordia.
|
||||
- \subpage technical In this chapter technical information about unit tests, project resources and code style is provided.
|
||||
|
||||
\section main_3 Acknowledgements
|
||||
|
@ -1,32 +0,0 @@
|
||||
/** \page running Running the Concordia library
|
||||
|
||||
\section running1 Programmatical use of the library
|
||||
|
||||
The main access point to the functionalities of the library is the Concordia class. An example programmatical use of the class is shown below:
|
||||
|
||||
\verbatim
|
||||
snippet
|
||||
\endverbatim
|
||||
|
||||
\section running2 The concordia-console program
|
||||
|
||||
|
||||
After successful build of the project (see \ref compilation2) the concordia-console program is available in the folder build/concordia-console.
|
||||
|
||||
\subsection running2_1 concordia-console options
|
||||
|
||||
The full list of program options is given below:
|
||||
|
||||
\verbatim
|
||||
-h [ --help ] Display this message
|
||||
-c [ --config ] arg Concordia configuration file (required)
|
||||
\endverbatim
|
||||
|
||||
\subsection running2_2 concordia-console example run
|
||||
|
||||
\subsection running2_3 concordia-console output format
|
||||
|
||||
|
||||
\section running3 The Concordia configuration
|
||||
|
||||
Concordia is configured by the means of a configuration file in the libconfig format (http://www.hyperrealm.com/libconfig/).
|
@ -267,4 +267,49 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
}
|
||||
|
||||
// Adds four examples to a Concordia instance built from concordia.cfg,
// refreshes the suffix array, runs concordiaSearch on a ten-word pattern and
// checks the number of returned fragments and the fields of the first one.
// The best-overlay assertions are currently disabled (commented out).
BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
concordia.addExample(Example("Alice has a cat", 56));
|
||||
concordia.addExample(Example("Alice has a dog", 23));
|
||||
concordia.addExample(Example("New test product has a mistake", 321));
|
||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
||||
// best overlay:
|
||||
|
||||
/*
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);
|
||||
|
||||
/*
|
||||
Matched pattern fragment found. Pattern fragment: [4,9]14,6,4,5
|
||||
Matched pattern fragment found. Pattern fragment: [1,5]321,0,1,4
|
||||
Matched pattern fragment found. Pattern fragment: [5,9]14,7,5,4
|
||||
Matched pattern fragment found. Pattern fragment: [2,5]321,1,2,3
|
||||
Matched pattern fragment found. Pattern fragment: [6,9]14,8,6,3
|
||||
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
|
||||
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
|
||||
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
|
||||
*/
|
||||
|
||||
// Only the first fragment is verified field by field.
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getEnd(), 9);
|
||||
|
||||
// Remove the word map, markers and hashed index files from the "temp" test directory.
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
}
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -12,11 +12,9 @@ BOOST_AUTO_TEST_SUITE(concordia_config)
|
||||
BOOST_AUTO_TEST_CASE( ConfigParameters )
|
||||
{
|
||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
|
||||
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "/tmp/sa.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
|
||||
BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
|
||||
BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
|
||||
|
@ -1,8 +1,6 @@
|
||||
/** \page technical Project technical information
|
||||
|
||||
\section technical1 Development
|
||||
|
||||
\subsection technical1_1 Code style
|
||||
\section technical1 Development - code style
|
||||
|
||||
Use the ./run-checkers.sh script to find most
|
||||
C++ coding errors. The script uses the following
|
||||
@ -14,7 +12,7 @@ external tools:
|
||||
The reports are stored in the XXX-result.txt files (where XXX is the name of the tool)
|
||||
in the current directory.
|
||||
|
||||
\subsection technical1_2 Unit tests
|
||||
\section technical2 Unit tests
|
||||
|
||||
Unit tests are integrated into makefiles. Unit test sources are
|
||||
put in the t/ subdirectory for each library.
|
||||
@ -26,3 +24,5 @@ make test
|
||||
You can get detailed test report by running:
|
||||
|
||||
./tests/unit-tests/test_runner
|
||||
|
||||
*/
|
||||
|
267
concordia/tutorial.dox
Normal file
267
concordia/tutorial.dox
Normal file
@ -0,0 +1,267 @@
|
||||
/** \page tutorial Quick tutorial
|
||||
|
||||
\section tutorial1 Code examples
|
||||
|
||||
This section gives a few examples of programs in C++ which make use of the Concordia library. You can run them after successful installation of Concordia (the installation process is covered in \ref compilation). Each of these sample programs is compiled with the command:
|
||||
|
||||
\verbatim
|
||||
g++ test.cpp -lconcordia -lconfig++ -lboost_system -lboost_serialization -lboost_unit_test_framework -lboost_filesystem -lboost_program_options -lboost_iostreams -lboost_regex -lboost_locale -lutf8case
|
||||
\endverbatim
|
||||
|
||||
Do not forget to substitute "<CONCORDIA_HOME>" with the path to unpacked Concordia sources. Also, make sure that the folder: <CONCORDIA_HOME>/tests/resources/temp is empty before running each example (this is explained in \ref tutorial2):
|
||||
|
||||
\verbatim
|
||||
rm <CONCORDIA_HOME>/tests/resources/temp/*
|
||||
\endverbatim
|
||||
|
||||
\subsection tutorial1_1 Minimal example
|
||||
|
||||
Only create the Concordia object and print the version of the library.
|
||||
|
||||
\verbatim
|
||||
#include <concordia/concordia.hpp>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
|
||||
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
|
||||
cout << concordia.getVersion() << endl;
|
||||
|
||||
}
|
||||
\endverbatim
|
||||
|
||||
\subsection tutorial1_2 Simple substring lookup
|
||||
|
||||
This code snippet shows the basic Concordia functionality - simple substring lookup in the index.
|
||||
|
||||
\verbatim
|
||||
#include <concordia/concordia.hpp>
|
||||
#include <concordia/substring_occurence.hpp>
|
||||
#include <concordia/example.hpp>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
|
||||
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
|
||||
|
||||
// adding sentences to index
|
||||
concordia.addExample(Example("Alice has a cat", 56));
|
||||
concordia.addExample(Example("Alice has a dog", 23));
|
||||
concordia.addExample(Example("New test product has a mistake", 321));
|
||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||
|
||||
// generating index
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
// searching
|
||||
cout << "Searching for pattern: has a" << endl;
|
||||
vector<SubstringOccurence> result = concordia.simpleSearch("has a");
|
||||
|
||||
// printing results
|
||||
for(vector<SubstringOccurence>::iterator it = result.begin();
|
||||
it != result.end(); ++it) {
|
||||
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl;
|
||||
}
|
||||
}
|
||||
\endverbatim
|
||||
|
||||
First, sentences are added to the index along with their integer IDs. The pair (sentence, id) is called an Example. Note that the IDs used in the above code are not consecutive, as there is no such requirement. Sentence ID may come from other data sources, e.g. a database and is used only as sentence meta-information.
|
||||
|
||||
After adding the examples, index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2.
|
||||
|
||||
The search returns a vector of SubstringOccurence objects, which is then printed out. Each occurrence represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: the ID of the sentence where the match occurred and the word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results:
|
||||
|
||||
\verbatim
|
||||
Found substring in sentence: 56 at offset: 1
|
||||
Found substring in sentence: 23 at offset: 1
|
||||
Found substring in sentence: 321 at offset: 3
|
||||
\endverbatim
|
||||
|
||||
Match (321, 3) represents matching of the pattern "has a" in the sentence 321 ("New test product has a mistake"), starting at position 3, i.e. after the third word, which is "product".
|
||||
|
||||
\subsection tutorial1_3 Concordia searching
|
||||
|
||||
Concordia is equipped with a unique functionality, the so-called Concordia search, which is best suited for use in Computer-Aided Translation systems. This operation is aimed at finding the longest matches from the index that cover the search pattern. Such a match is called a MatchedPatternFragment. Then, out of all matched pattern fragments, the best pattern overlay is computed. A pattern overlay is a set of matched pattern fragments which do not intersect with each other. The best pattern overlay is an overlay that matches the most of the pattern with the fewest fragments.
|
||||
|
||||
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - the pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
|
||||
|
||||
Sample concordia searching:
|
||||
|
||||
\verbatim
|
||||
#include <concordia/concordia.hpp>
|
||||
#include <concordia/concordia_search_result.hpp>
|
||||
#include <concordia/matched_pattern_fragment.hpp>
|
||||
#include <concordia/example.hpp>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
|
||||
Concordia concordia("<CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg");
|
||||
|
||||
concordia.addExample(Example("Alice has a cat", 56));
|
||||
concordia.addExample(Example("Alice has a dog", 23));
|
||||
concordia.addExample(Example("New test product has a mistake", 321));
|
||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
cout << "Searching for pattern: Our new test product has nothing to do with computers" << endl;
|
||||
boost::shared_ptr<ConcordiaSearchResult> result =
|
||||
concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
||||
|
||||
cout << "Printing all matched fragments:" << endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment, result->getFragments()) {
|
||||
cout << "Matched pattern fragment found. Pattern fragment: ["
|
||||
<< fragment.getStart() << "," << fragment.getEnd() << "]"
|
||||
<< " in sentence " << fragment.getExampleId()
|
||||
<< ", at offset: " << fragment.getExampleOffset() << endl;
|
||||
}
|
||||
|
||||
|
||||
cout << "Best overlay:" << endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment, result->getBestOverlay()) {
|
||||
cout << "\tPattern fragment: [" << fragment.getStart()
|
||||
<< "," << fragment.getEnd() << "]"
|
||||
<< " in sentence " << fragment.getExampleId()
|
||||
<< ", at offset: " << fragment.getExampleOffset() << endl;
|
||||
}
|
||||
|
||||
cout << "Best overlay score: " << result->getBestOverlayScore() << endl;
|
||||
|
||||
|
||||
}
|
||||
\endverbatim
|
||||
|
||||
This program should print:
|
||||
|
||||
\verbatim
|
||||
Searching for pattern: Our new test product has nothing to do with computers
|
||||
Printing all matched fragments:
|
||||
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6
|
||||
Matched pattern fragment found. Pattern fragment: [1,5] in sentence 321, at offset: 0
|
||||
Matched pattern fragment found. Pattern fragment: [5,9] in sentence 14, at offset: 7
|
||||
Matched pattern fragment found. Pattern fragment: [2,5] in sentence 321, at offset: 1
|
||||
Matched pattern fragment found. Pattern fragment: [6,9] in sentence 14, at offset: 8
|
||||
Matched pattern fragment found. Pattern fragment: [3,5] in sentence 321, at offset: 2
|
||||
Matched pattern fragment found. Pattern fragment: [7,9] in sentence 14, at offset: 9
|
||||
Matched pattern fragment found. Pattern fragment: [8,9] in sentence 14, at offset: 10
|
||||
Best overlay:
|
||||
Pattern fragment: [1,5] in sentence 321, at offset: 0
|
||||
Pattern fragment: [5,9] in sentence 14, at offset: 7
|
||||
Best overlay score: 0.53695
|
||||
\endverbatim
|
||||
|
||||
These results list all the longest matched pattern fragments. The longest is [4,9] (length 5, as the end index is exclusive), which corresponds to the pattern fragment "has nothing to do with", found in sentence 14 at offset 6. However, this longest fragment was not chosen for the best overlay. The best overlay consists of two fragments of length 4: [1,5] "new test product has" and [5,9] "nothing to do with". Notice that if the fragment [4,9] had been chosen for the overlay, it would have eliminated the [1,5] fragment.
|
||||
|
||||
The score of such overlay is 0.53695, which can be considered as quite satisfactory to serve as an aid for a translator.
|
||||
|
||||
\section tutorial2 Concept of HDD and RAM index
|
||||
|
||||
Concordia index consists of 4 data structures: hashed index, markers array, word map and suffix array. For searching to work, all of these structures must be present in RAM.
|
||||
|
||||
However, due to the fact that hashed index, markers array and word map are potentially large and their generation might take considerable amount of time, they are backed up on hard disk. Each operation of adding to index adds simultaneously to hashed index, markers array and word map in RAM and on HDD.
|
||||
|
||||
The last element of the index, the suffix array, is never backed up on disk but always dynamically generated. The generation is done by the method refreshSAfromRAM(). It is used to generate the suffix array (SA) based on the current hashed index, markers array and word map in RAM. Generation of the SA for an index containing 2 000 000 000 sentences takes about 7 seconds on a personal computer. The reason for not backing up the SA on HDD is that it needs to be freshly generated from scratch every time the index changes. There is no way of incrementally augmenting this structure.
|
||||
|
||||
There is another method: loadRAMIndexFromDisk(). It loads hashed index, markers array and word map from HDD to RAM and calls refreshSAfromRAM(). The method loadRAMIndexFromDisk() is called when Concordia starts and the paths of hashed index, markers array and word map point to non-empty files on HDD (i.e. something was added to the index in previous runs of Concordia).
|
||||
|
||||
\section tutorial3 Concordia configuration
|
||||
|
||||
Concordia is configured by the means of a configuration file in the libconfig format (http://www.hyperrealm.com/libconfig/). Here is the sample configuration file, which comes with the library. Its path is <CONCORDIA_HOME>/tests/resources/concordia-config/concordia.cfg. Note that all the settings in this file are required.
|
||||
|
||||
Every option is documented in comments within the configuration file.
|
||||
|
||||
\verbatim
|
||||
#----------------------------
|
||||
# Concordia configuration file
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# The below set the paths for hashed index, markers array and word map files.
|
||||
# If all the files pointed by these paths exist, Concordia reads them to its
|
||||
# RAM index. When none of these files exist, a new empty index is created.
|
||||
# However, if any of these files exist and any other is missing, the index
|
||||
# is considered corrupt and Concordia does not start.
|
||||
|
||||
hashed_index_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_hashed_index.bin"
|
||||
markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
|
||||
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# The following settings control the sentence anonymizer mechanism. It is used to
|
||||
# remove unnecessary symbols and possibly words from sentences added to index
|
||||
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
||||
# with a single space, removes stop words (if the option is enabled), as well as
|
||||
# named entities and special symbols. All these have to be listed in files.
|
||||
|
||||
# File containing all html tags (one per line)
|
||||
html_tags_path = "<CONCORDIA_HOME>/tests/resources/anonymizer/html_tags.txt"
|
||||
|
||||
# File containing all symbols to be replaced by spaces
|
||||
space_symbols_path = "<CONCORDIA_HOME>/tests/resources/anonymizer/space_symbols.txt"
|
||||
|
||||
# If set to true, words from predefined list are removed
|
||||
stop_words_enabled = "false"
|
||||
|
||||
# If stop_words_enabled is true, set the path to the stop words file
|
||||
#stop_words_path = "<CONCORDIA_HOME>/tests/resources/anonymizer/stop_words.txt"
|
||||
|
||||
# File containing regular expressions that match named entities
|
||||
named_entities_path = "<CONCORDIA_HOME>/tests/resources/anonymizer/named_entities.txt"
|
||||
|
||||
# File containing special symbols (one per line) to be removed
|
||||
stop_symbols_path = "<CONCORDIA_HOME>/tests/resources/anonymizer/stop_symbols.txt"
|
||||
|
||||
### eof
|
||||
\endverbatim
|
||||
|
||||
|
||||
\section tutorial4 The concordia-console program
|
||||
|
||||
|
||||
After successful build of the project (see \ref compilation2) the concordia-console program is available in the folder build/concordia-console.
|
||||
|
||||
\subsection tutorial4_1 concordia-console options
|
||||
|
||||
The full list of program options is given below:
|
||||
|
||||
\verbatim
|
||||
-h [ --help ] Display this message
|
||||
-c [ --config ] arg Concordia configuration file (required)
|
||||
-s [ --simple-search ] arg Pattern to be searched in the index
|
||||
-n [ --silent ] While searching, do not
|
||||
output search results
|
||||
-a [ --anubis-search ] arg Pattern to be searched by anubis search in the
|
||||
index
|
||||
-x [ --concordia-search ] arg Pattern to be searched by concordia search in
|
||||
the index
|
||||
-r [ --read-file ] arg File to be read and added to index
|
||||
-t [ --test ] arg Run performance and correctness tests on file
|
||||
\endverbatim
|
||||
|
||||
\subsection tutorial4_2 concordia-console example run
|
||||
|
||||
From <CONCORDIA_HOME> directory:
|
||||
|
||||
Read sentences from file sentences.txt
|
||||
\verbatim
|
||||
./build/concordia-console/concordia-console -c tests/resources/concordia-config/concordia.cfg -r ~/sentences.txt
|
||||
\endverbatim
|
||||
|
||||
Run concordia search on the index
|
||||
\verbatim
|
||||
./build/concordia-console/concordia-console -c tests/resources/concordia-config/concordia.cfg -x "some pattern"
|
||||
\endverbatim
|
||||
|
||||
*/
|
@ -3,45 +3,40 @@
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@"
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#Word map, hashed index and suffix array files are in a temporary directory
|
||||
#and should be deleted at the end of each test procedure.
|
||||
|
||||
#Word map file containing unique codes for tokens
|
||||
|
||||
word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#File containing the "text" for suffix array searching, i.e. sequence of codes
|
||||
# The below set the paths for hashed index, markers array and word map files.
|
||||
# If all the files pointed by these paths exist, Concordia reads them to its
|
||||
# RAM index. When none of these files exist, a new empty index is created.
|
||||
# However, if any of these files exist and any other is missing, the index
|
||||
# is considered corrupt and Concordia does not start.
|
||||
|
||||
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||
|
||||
#File containing suffix markers (sentence ids and offsets)
|
||||
|
||||
markers_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
|
||||
word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#Binarized suffix array
|
||||
|
||||
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||
#-------------------------------------------------------------------------------
|
||||
# The following settings control the sentence anonymizer mechanism. It is used to
|
||||
# remove unnecessary symbols and possibly words from sentences added to index
|
||||
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
||||
# with a single space, removes stop words (if the option is enabled), as well as
|
||||
# named entities and special symbols. All these have to be listed in files.
|
||||
|
||||
# Anonymizer -------------------------------------------------------------------
|
||||
|
||||
# File containing all html tags (one per line)
|
||||
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
||||
|
||||
# File containing all symbols to be replaced by spaces
|
||||
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||
|
||||
# If set to true, words from predefined list are removed
|
||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||
|
||||
stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
# If stop_words_enabled is true, set the path to the stop words file
|
||||
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
|
||||
# File containing regular expressions that match named entities
|
||||
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||
|
||||
# File containing special symbols (one per line) to be removed
|
||||
stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
|
||||
|
||||
### eof
|
||||
|
@ -1,7 +0,0 @@
|
||||
[ATTR]
|
||||
|
||||
case = nom gen dat acc inst loc voc
|
||||
|
||||
[POS]
|
||||
|
||||
subst = case
|
@ -6,17 +6,12 @@
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "puddle/tagset.txt";
|
||||
|
||||
word_map_path = "/tmp/wm.bin"
|
||||
|
||||
hashed_index_path = "/tmp/hi.bin"
|
||||
|
||||
markers_path = "/tmp/ma.bin"
|
||||
|
||||
suffix_array_path = "/tmp/sa.bin"
|
||||
|
||||
html_tags_path = "/tmp/html_tags.txt"
|
||||
|
||||
space_symbols_path = "/tmp/space_symbols.txt"
|
||||
|
@ -3,45 +3,40 @@
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#Word map, hashed index and suffix array files are in a temporary directory
|
||||
#and should be deleted at the end of each test procedure.
|
||||
|
||||
#Word map file containing unique codes for tokens
|
||||
|
||||
word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#File containing the "text" for suffix array searching, i.e. sequence of codes
|
||||
# The settings below define the paths to the hashed index, markers array and word map files.
|
||||
# If all the files pointed to by these paths exist, Concordia reads them into its
|
||||
# RAM index. When none of these files exist, a new empty index is created.
|
||||
# However, if any of these files exists and any other is missing, the index
|
||||
# is considered corrupt and Concordia does not start.
|
||||
|
||||
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||
|
||||
#File containing suffix markers (sentence ids and offsets)
|
||||
|
||||
markers_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
|
||||
word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#Binarized suffix array
|
||||
|
||||
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||
#-------------------------------------------------------------------------------
|
||||
# The following settings control the sentence anonymizer mechanism. It is used to
|
||||
# remove unnecessary symbols and possibly words from sentences added to index
|
||||
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
||||
# with a single space, removes stop words (if the option is enabled), as well as
|
||||
# named entities and special symbols. All these have to be listed in files.
|
||||
|
||||
# Anonymizer -------------------------------------------------------------------
|
||||
|
||||
# File containing all html tags (one per line)
|
||||
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
||||
|
||||
# File containing all symbols to be replaced by spaces
|
||||
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
||||
|
||||
# If set to true, words from predefined list are removed
|
||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||
|
||||
stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
# If stop_words_enabled is true, set the path to the stop words file
|
||||
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
||||
|
||||
# File containing regular expressions that match named entities
|
||||
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
||||
|
||||
# File containing special symbols (one per line) to be removed
|
||||
stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
|
||||
|
||||
### eof
|
||||
|
@ -1,44 +0,0 @@
|
||||
# ======================================
|
||||
# TIME
|
||||
# ======================================
|
||||
|
||||
Rule "test 17:00"
|
||||
Match: [orth~"([012][0-9]|[1-9]):([0-6][0-9])"];
|
||||
Eval: group(ne_TIME, 1);
|
||||
|
||||
# ======================================
|
||||
# DATES
|
||||
# ======================================
|
||||
|
||||
Rule "day number two digits"
|
||||
Match: [orth~"(0?[1-9]|[12][0-9]|3[01])"];
|
||||
Eval: group(at_DAY_OF_MONTH_NUMBER, 1);
|
||||
|
||||
Rule "ORDINAL as day number"
|
||||
Match: [orth~"(0?[1-9]|[12][0-9]|3[01]).*" && type~"ORDINAL"];
|
||||
Eval: group(at_DAY_OF_MONTH_NUMBER, 1);
|
||||
|
||||
Rule "year number four digits"
|
||||
Match: [orth~"(19[0-9][0-9]|2[01][0-9][0-9])"];
|
||||
Eval: group(at_YEAR_NUMBER, 1);
|
||||
|
||||
Rule "date: number MONTH_NAME year"
|
||||
Match: [type~"at_DAY_OF_MONTH_NUMBER"] [type~"MONTH_NAME"] [type~"at_YEAR_NUMBER"]?;
|
||||
Eval: group(ne_DATE, 1);
|
||||
|
||||
# =======================================
|
||||
# PERSON
|
||||
# =======================================
|
||||
|
||||
Rule "person: first_name and upper case"
|
||||
Match: [type~"FIRST_NAME"] [orth~"[A-Z].*"];
|
||||
Eval: group(ne_PERSON, 1);
|
||||
|
||||
# =======================================
|
||||
# Testing purposes
|
||||
# =======================================
|
||||
|
||||
Rule "city: city of XXX"
|
||||
Match: [orth~"city"/i] [orth~"of"/i] [type~"CITY"];
|
||||
Eval: group(ne_CITY, 3);
|
||||
|
@ -1,7 +0,0 @@
|
||||
[ATTR]
|
||||
|
||||
case = nom gen dat acc inst loc voc
|
||||
|
||||
[POS]
|
||||
|
||||
subst = case
|
Loading…
Reference in New Issue
Block a user