done word positions
This commit is contained in:
parent
724bf0d080
commit
dba70b4e24
10
TODO.txt
10
TODO.txt
@ -1,10 +1,7 @@
|
||||
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
||||
|
||||
- repair stop words feature
|
||||
- work on word regex pattern (allow for some symbols and digits within the word)
|
||||
- document the code (classes, cfg files) and update tutorial
|
||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
||||
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
|
||||
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
||||
- testy zużycia pamięci
|
||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||
@ -13,6 +10,11 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś
|
||||
|
||||
|
||||
---------------------------- Archive -----------------------------
|
||||
DONE - deal with 0 length patterns
|
||||
DONE - repair concordia-console test feature
|
||||
DONE - update tests
|
||||
DONE - work on word regex pattern (allow for some symbols and digits within the word)
|
||||
REJECTED - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
|
||||
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
|
||||
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
|
||||
DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
|
||||
@ -26,7 +28,7 @@ REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie
|
||||
|
||||
DONE - wyłączyć stopWords
|
||||
|
||||
DONE - Przy concordia searh dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||
DONE - Przy concordia searCh dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||
|
||||
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
@ -27,30 +28,32 @@ void checkConcordiaResults(
|
||||
long baseLineCount) {
|
||||
long lineIndex = 1;
|
||||
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
||||
SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
|
||||
|
||||
if (result.getBestOverlay().size() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay has more than one fragment.");
|
||||
}
|
||||
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay fragment has different size than pattern.");
|
||||
}
|
||||
if (result.getBestOverlayScore() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay score is not 1.");
|
||||
}
|
||||
if (result.getFragments().size() == 0) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"there are no matched fragments.");
|
||||
}
|
||||
if (result.getFragments().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"the first fragment does not cover the whole pattern.");
|
||||
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
|
||||
if (patternSize > 0) {
|
||||
if (result.getBestOverlay().size() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay has more than one fragment.");
|
||||
}
|
||||
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay fragment has different size than pattern.");
|
||||
}
|
||||
if (result.getBestOverlayScore() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay score is not 1.");
|
||||
}
|
||||
if (result.getFragments().size() == 0) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"there are no matched fragments.");
|
||||
}
|
||||
if (result.getFragments().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"the first fragment does not cover the whole pattern.");
|
||||
}
|
||||
}
|
||||
lineIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -198,8 +201,8 @@ int main(int argc, char** argv) {
|
||||
msdiff = time_end - time_start;
|
||||
|
||||
std::cout << "\tPattern used: " << std::endl << "\t\t";
|
||||
BOOST_FOREACH(std::string token, result->getTokenVector()) {
|
||||
std::cout << token << " ";
|
||||
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
|
||||
std::cout << annotation.getValue() << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
|
@ -44,16 +44,16 @@ std::string _createLibraryVersion() {
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by other methods.
|
||||
void Concordia::addExample(const Example & example)
|
||||
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
|
||||
throw(ConcordiaException) {
|
||||
_index->addExample(_hashGenerator, _T, _markers, example);
|
||||
return _index->addExample(_hashGenerator, _T, _markers, example);
|
||||
}
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by other methods.
|
||||
void Concordia::addAllExamples(const std::vector<Example> & examples)
|
||||
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException) {
|
||||
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||
}
|
||||
|
||||
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
||||
@ -163,9 +163,9 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
|
||||
return _searcher->concordiaSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern);
|
||||
} else {
|
||||
std::vector<std::string> empty;
|
||||
std::string empty;
|
||||
return boost::shared_ptr<ConcordiaSearchResult>(
|
||||
new ConcordiaSearchResult(empty));
|
||||
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/index_searcher.hpp"
|
||||
#include "concordia/concordia_search_result.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include "concordia/anubis_search_result.hpp"
|
||||
#include <divsufsort.h>
|
||||
|
||||
@ -55,13 +56,13 @@ public:
|
||||
\param example example to be added
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addExample(const Example & example) throw(ConcordiaException);
|
||||
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
|
||||
|
||||
/*! Adds multiple examples to the index.
|
||||
\param examples vector of examples to be added
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addAllExamples(const std::vector<Example> & examples)
|
||||
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Performs a simple substring lookup on the index.
|
||||
@ -97,7 +98,7 @@ public:
|
||||
|
||||
/*! Loads HDD stored index files to RAM and generates
|
||||
suffix array based on RAM stored data structures.
|
||||
For more info see \ref tutorial2.
|
||||
For more info see \ref tutorial2.
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||
|
@ -34,25 +34,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||
return result;
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addExample(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example) {
|
||||
std::ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
std::ofstream markersFile;
|
||||
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
T, markers, example);
|
||||
hashedIndexFile.close();
|
||||
markersFile.close();
|
||||
hashGenerator->serializeWordMap();
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addAllExamples(
|
||||
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
@ -64,25 +46,50 @@ void ConcordiaIndex::addAllExamples(
|
||||
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
|
||||
std::vector<TokenizedSentence> hashedPatterns;
|
||||
BOOST_FOREACH(Example example, examples) {
|
||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
T, markers, example);
|
||||
hashedPatterns.push_back(*hashedPattern);
|
||||
}
|
||||
|
||||
hashedIndexFile.close();
|
||||
markersFile.close();
|
||||
hashGenerator->serializeWordMap();
|
||||
|
||||
return hashedPatterns;
|
||||
}
|
||||
|
||||
void ConcordiaIndex::_addSingleExample(
|
||||
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example) {
|
||||
std::ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
std::ofstream markersFile;
|
||||
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
T, markers, example);
|
||||
hashedIndexFile.close();
|
||||
markersFile.close();
|
||||
hashGenerator->serializeWordMap();
|
||||
|
||||
return hashedPattern;
|
||||
}
|
||||
|
||||
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
||||
std::ofstream & hashedIndexFile,
|
||||
std::ofstream & markersFile,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash
|
||||
= hashGenerator->generateHash(example.getSentence());
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
|
||||
|
||||
int offset = 0;
|
||||
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
@ -110,5 +117,7 @@ void ConcordiaIndex::_addSingleExample(
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||
markers->push_back(sentenceBoundaryMA);
|
||||
|
||||
return hashedPattern;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "concordia/example.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include <divsufsort.h>
|
||||
|
||||
/*!
|
||||
@ -50,7 +51,7 @@ public:
|
||||
\param example example to be added to index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addExample(
|
||||
boost::shared_ptr<TokenizedSentence> addExample(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
@ -68,7 +69,7 @@ public:
|
||||
\param examples vector of examples to be added to index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addAllExamples(
|
||||
std::vector<TokenizedSentence> addAllExamples(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
@ -82,7 +83,7 @@ public:
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||
|
||||
private:
|
||||
void _addSingleExample(std::ofstream & hashedIndexFile,
|
||||
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
|
||||
std::ofstream & markersFile,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -4,8 +4,8 @@
|
||||
#include <algorithm>
|
||||
|
||||
ConcordiaSearchResult::ConcordiaSearchResult(
|
||||
const std::vector<std::string> & tokenVector):
|
||||
_tokenVector(tokenVector),
|
||||
boost::shared_ptr<TokenizedSentence> tokenizedPattern):
|
||||
_tokenizedPattern(tokenizedPattern),
|
||||
_bestOverlayScore(0) {
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
|
||||
// the fragments are already sorted by their ends, ascending
|
||||
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
|
||||
-1,
|
||||
_tokenVector.size());
|
||||
_tokenizedPattern->getTokens().size());
|
||||
}
|
||||
|
||||
void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||
|
@ -3,7 +3,9 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
@ -25,7 +27,7 @@ public:
|
||||
\param tokenizedPattern tokenized pattern which was used for searching
|
||||
*/
|
||||
explicit ConcordiaSearchResult(
|
||||
const std::vector<std::string> & tokenVector);
|
||||
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
@ -49,8 +51,8 @@ public:
|
||||
/*! Getter for tokenized pattern.
|
||||
\returns tokenized search pattern
|
||||
*/
|
||||
std::vector<std::string> getTokenVector() const {
|
||||
return _tokenVector;
|
||||
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
|
||||
return _tokenizedPattern;
|
||||
}
|
||||
|
||||
/*! Getter for all matched pattern fragments list.
|
||||
@ -80,7 +82,7 @@ private:
|
||||
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||
SUFFIX_MARKER_TYPE patternSize);
|
||||
|
||||
std::vector<std::string> _tokenVector;
|
||||
boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
|
||||
|
||||
std::vector<MatchedPatternFragment> _matchedPatternFragments;
|
||||
|
||||
|
@ -27,10 +27,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||
HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
|
||||
const std::string & sentence) throw(ConcordiaException) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> result;
|
||||
|
||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||
ts->generateHash(_wordMap);
|
||||
|
||||
@ -38,23 +36,9 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
throw ConcordiaException("Trying to add too long sentence.");
|
||||
}
|
||||
|
||||
return ts->getCodes();
|
||||
return ts;
|
||||
}
|
||||
|
||||
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||
const std::string & sentence) {
|
||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||
std::vector<std::string> tokenTexts;
|
||||
BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
|
||||
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||
annotation.getType() == TokenAnnotation::NE) {
|
||||
tokenTexts.push_back(annotation.getValue());
|
||||
}
|
||||
}
|
||||
return tokenTexts;
|
||||
}
|
||||
|
||||
|
||||
void HashGenerator::serializeWordMap() {
|
||||
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||
boost::archive::binary_oarchive oa(ofs);
|
||||
|
@ -44,20 +44,9 @@ public:
|
||||
\param sentence sentence to generate hash from
|
||||
\returns tokenized sentence holding the generated hash codes (available via getCodes())
|
||||
*/
|
||||
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
||||
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
Generates vector of tokens from a sentence. This method is internally
|
||||
used by generateHash. However, for the sake of concordiaSearch
|
||||
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
||||
tokenization is also needed.
|
||||
\param sentence sentence to tokenize
|
||||
\returns vector of tokens
|
||||
*/
|
||||
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
||||
|
||||
|
||||
/*!
|
||||
Saves the contents of current WordMap to HDD.
|
||||
*/
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "concordia/index_searcher.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
IndexSearcher::IndexSearcher() {
|
||||
@ -22,7 +23,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
|
||||
int left;
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
hashGenerator->generateHash(pattern)->getCodes();
|
||||
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||
|
||||
@ -56,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
hashGenerator->generateHash(pattern)->getCodes();
|
||||
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
|
||||
}
|
||||
|
||||
@ -66,12 +67,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||
hashGenerator->generateHash(pattern);
|
||||
boost::shared_ptr<ConcordiaSearchResult> result =
|
||||
boost::shared_ptr<ConcordiaSearchResult>(
|
||||
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
|
||||
new ConcordiaSearchResult(hashedPattern));
|
||||
|
||||
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
|
||||
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
|
||||
return result;
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_hash_generator.cpp
|
||||
test_regex_rule.cpp
|
||||
test_tokenized_sentence.cpp
|
||||
test_concordia_searcher.cpp
|
||||
@ -10,7 +11,6 @@ add_library(concordia-tests
|
||||
test_logging.cpp
|
||||
test_utils.cpp
|
||||
test_word_map.cpp
|
||||
test_hash_generator.cpp
|
||||
test_concordia_index.cpp
|
||||
test_concordia_config.cpp
|
||||
test_concordia.cpp
|
||||
|
@ -1,13 +1,17 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/anubis_search_result.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(concordia_main)
|
||||
|
||||
@ -22,7 +26,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
concordia.addExample(Example("Ala posiada kota",14));
|
||||
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
|
||||
/*
|
||||
0,3 type: 1 value: ala
|
||||
4,11 type: 1 value: posiada
|
||||
12,16 type: 1 value: kota
|
||||
*/
|
||||
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
|
||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
|
||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
|
||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
|
||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
|
||||
|
||||
concordia.addExample(Example("Ala posiada rysia",51));
|
||||
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||
concordia.refreshSAfromRAM();
|
||||
@ -62,7 +77,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
|
||||
// Checking pattern spanning over 2 segments
|
||||
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
@ -74,7 +88,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
testExamples.push_back(Example("czy xjest okno otwarte",202));
|
||||
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
|
||||
testExamples.push_back(Example("xto xjest",29));
|
||||
concordia.addAllExamples(testExamples);
|
||||
std::vector<TokenizedSentence> hashedPatterns = concordia.addAllExamples(testExamples);
|
||||
/* checking hashed pattern of sentence "chyba xto xjest xtutaj":
|
||||
0,5 type: 1 value: chyba
|
||||
6,9 type: 1 value: xto
|
||||
10,15 type: 1 value: xjest
|
||||
16,22 type: 1 value: xtutaj
|
||||
*/
|
||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getStart(), 10);
|
||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getEnd(), 15);
|
||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
|
||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
|
||||
|
||||
/*The test index contains 4 sentences:
|
||||
312: "xto xjest okno"
|
||||
@ -119,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
|
@ -354,16 +354,14 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||
|
||||
*/
|
||||
|
||||
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX),
|
||||
TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
boost::shared_ptr<ConcordiaConfig> config(
|
||||
new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||
boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator(config));
|
||||
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
||||
|
||||
@ -372,13 +370,13 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||
index.addExample(hashGenerator, T, markers, Example("Marysia posiada rysia",123));
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||
|
||||
|
||||
|
||||
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
|
||||
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
|
||||
|
||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
|
||||
|
||||
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
|
||||
|
||||
TmMatches * tmMatches14 = tmMatchesMap->find(14)->second;
|
||||
@ -436,5 +434,4 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -4,8 +4,11 @@
|
||||
#include <sstream>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(hash_generator)
|
||||
@ -20,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
|
||||
std::vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(0);
|
||||
expected.push_back(1);
|
||||
@ -73,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(config);
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
|
||||
std::vector<INDEX_CHARACTER_TYPE> expected1;
|
||||
expected1.push_back(0);
|
||||
expected1.push_back(1);
|
||||
@ -83,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(config);
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
|
||||
std::vector<INDEX_CHARACTER_TYPE> expected2;
|
||||
expected2.push_back(0);
|
||||
expected2.push_back(1);
|
||||
@ -103,27 +106,48 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||
std::vector<std::string> expected;
|
||||
expected.push_back("ne_date");
|
||||
expected.push_back("o");
|
||||
expected.push_back("godzinie");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("doszło");
|
||||
expected.push_back("do");
|
||||
expected.push_back("kolizji");
|
||||
expected.push_back("na");
|
||||
expected.push_back("ulicy");
|
||||
expected.push_back("grobla");
|
||||
expected.push_back("policjanci");
|
||||
expected.push_back("ustalili");
|
||||
expected.push_back("że");
|
||||
expected.push_back("kierowca");
|
||||
expected.push_back("zaparkował");
|
||||
expected.push_back("samochód");
|
||||
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
|
||||
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
|
||||
|
||||
/*
|
||||
BOOST_FOREACH(TokenAnnotation annotation, tokens) {
|
||||
std::cout << annotation.getStart() << ","
|
||||
<< annotation.getEnd() << " type: "
|
||||
<< annotation.getType() << " value: "
|
||||
<< annotation.getValue() << std::endl;
|
||||
}
|
||||
|
||||
0,10 type: 0 value: ne_date
|
||||
13,14 type: 1 value: o
|
||||
16,24 type: 1 value: godzinie
|
||||
25,27 type: 0 value: ne_number
|
||||
28,30 type: 0 value: ne_number
|
||||
31,37 type: 1 value: doszło
|
||||
38,40 type: 1 value: do
|
||||
41,48 type: 1 value: kolizji
|
||||
49,51 type: 1 value: na
|
||||
52,57 type: 1 value: ulicy
|
||||
58,64 type: 1 value: grobla
|
||||
66,76 type: 1 value: policjanci
|
||||
77,85 type: 1 value: ustalili
|
||||
87,89 type: 1 value: że
|
||||
93,101 type: 1 value: kierowca
|
||||
106,116 type: 1 value: zaparkował
|
||||
118,126 type: 1 value: samochód
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(17,tokens.size());
|
||||
|
||||
BOOST_CHECK_EQUAL(tokens.at(0).getStart(),0);
|
||||
BOOST_CHECK_EQUAL(tokens.at(0).getEnd(),10);
|
||||
BOOST_CHECK_EQUAL(tokens.at(0).getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(tokens.at(0).getValue(),"ne_date");
|
||||
|
||||
BOOST_CHECK_EQUAL(tokens.at(15).getStart(),106);
|
||||
BOOST_CHECK_EQUAL(tokens.at(15).getEnd(),116);
|
||||
BOOST_CHECK_EQUAL(tokens.at(15).getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(tokens.at(15).getValue(),"zaparkował");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -217,15 +217,6 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
/*
|
||||
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
|
||||
std::cout << annotation.getStart() << ","
|
||||
<< annotation.getEnd() << " type: "
|
||||
<< annotation.getType() << " value: "
|
||||
<< annotation.getValue() << std::endl;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
0,4 type: 1 value: this
|
||||
|
Loading…
Reference in New Issue
Block a user