done word positions

rjawor 2015-06-26 22:50:53 +02:00
parent 724bf0d080
commit dba70b4e24
19 changed files with 178 additions and 150 deletions

View File

@ -1,10 +1,7 @@
---------------------------- Developer's private notes (language may vary, because that is sometimes just more convenient) -----------------------------
- repair stop words feature
- work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search returns token positions taken from the hash. How should these be related to the examples in the corpus? The original token positions need to be stored in the concordia-server database. These positions will be computed by the generateTokenVector function (using a list of original positions, updated in sync with every anonymization step); see the sketch after this list.
- concordia_search_result does not need to store the whole tokenVector (it only uses it to read size()).
- multiple translation memories: they can be kept in a single index, but tm_id has to be added as sentence metadata (e.g. instead of the example length). At search time the results must be filtered so that they come from the right translation memory.
- memory usage tests
- The example length kept in the markers will probably be needed only by Anubis (which, as it turns out, is slow). Think about what else the markers could be used for, because the idea behind them seems good.
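A minimal sketch of the token-position idea from the IN PROGRESS note above: if the original character offsets of every token are kept next to the hash codes, a match that the index reports as a token interval can be mapped back to a substring of the stored example. The variable names and offsets below are illustrative, not library code.

    #include <iostream>
    #include <string>

    int main() {
        std::string sentence = "Ala posiada kota";
        // token offsets as a tokenizer might report them (assumed values)
        size_t starts[] = {0, 4, 12};   // token start offsets
        size_t ends[]   = {3, 11, 16};  // token end offsets

        // a match covering tokens 1..2 maps back to characters [4, 16)
        size_t firstToken = 1, lastToken = 2;
        std::cout << sentence.substr(starts[firstToken],
                                     ends[lastToken] - starts[firstToken])
                  << std::endl;   // prints "posiada kota"
        return 0;
    }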
@ -13,6 +10,11 @@ IN PROGRESS - concordia search returns token positions taken from the hash. How should
---------------------------- Archive -----------------------------
DONE - deal with 0 length patterns
DONE - repair concordia-console test feature
DONE - update tests
DONE - work on word regex pattern (allow for some symbols and digits within the word)
REJECTED - concordia_search_result does not need to store the whole tokenVector (it only uses it to read size()).
DONE - implement tokenAnnotations vector as interval tree (in the end a plain list turned out even better than an interval tree)
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
DONE - concordia-server (think about whether to write a CAT tool, and whether to split concordia-server into a separate project).
@ -26,7 +28,7 @@ REJECTED - think about empty hash examples (solution: simply do not
DONE - disable stopWords
DONE - During concordia search an optimal pattern coverage set should additionally be computed. Maybe by brute force? (assuming a maximum sentence length of 500 tokens this should not be too bad)
DONE - sentence search: find the longest matching fragments with Anubis, then 1D (approximate) bin packing. Let's call this concordia search. All longest matches of the pattern are found using LCP search. The result is a list of the longest matched fragments, sorted by length in descending order, with at most 3 representatives of each length.
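The best-overlay computation mentioned in the two notes above can be illustrated with a toy stand-in: accept non-overlapping fragments and score the overlay as the fraction of pattern tokens they cover. This single greedy pass is only a sketch; the library's _checkPossibleOverlays (see the diff further down) explores candidate overlays recursively, and the Fragment type and all values here are made up.

    #include <iostream>
    #include <vector>

    // a matched fragment of the search pattern: start token and length in tokens
    struct Fragment { int start; int length; };

    int main() {
        const int patternSize = 6;
        Fragment fragments[] = { {0, 4}, {2, 3}, {4, 2} };  // longest first (made-up values)
        const int fragmentCount = 3;

        std::vector<bool> covered(patternSize, false);
        int coveredCount = 0;
        for (int i = 0; i < fragmentCount; ++i) {
            bool overlaps = false;
            for (int t = fragments[i].start; t < fragments[i].start + fragments[i].length; ++t) {
                if (covered[t]) { overlaps = true; break; }
            }
            if (overlaps) continue;
            for (int t = fragments[i].start; t < fragments[i].start + fragments[i].length; ++t) {
                covered[t] = true;
                ++coveredCount;
            }
        }
        // a full cover of the pattern gives score 1, as the unit tests below expect
        std::cout << "overlay score: "
                  << static_cast<double>(coveredCount) / patternSize << std::endl;
        return 0;
    }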

View File

@ -8,6 +8,7 @@
#include "concordia/concordia.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
#include "build/libdivsufsort/include/divsufsort.h"
@ -27,30 +28,32 @@ void checkConcordiaResults(
long baseLineCount) {
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay has more than one fragment.");
}
if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern.");
}
if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay score is not 1.");
}
if (result.getFragments().size() == 0) {
reportError(baseLineCount + lineIndex,
"there are no matched fragments.");
}
if (result.getFragments().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"the first fragment does not cover the whole pattern.");
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay has more than one fragment.");
}
if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern.");
}
if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay score is not 1.");
}
if (result.getFragments().size() == 0) {
reportError(baseLineCount + lineIndex,
"there are no matched fragments.");
}
if (result.getFragments().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"the first fragment does not cover the whole pattern.");
}
}
lineIndex++;
}
}
@ -198,8 +201,8 @@ int main(int argc, char** argv) {
msdiff = time_end - time_start;
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(std::string token, result->getTokenVector()) {
std::cout << token << " ";
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -44,16 +44,16 @@ std::string _createLibraryVersion() {
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addExample(const Example & example)
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
throw(ConcordiaException) {
_index->addExample(_hashGenerator, _T, _markers, example);
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException) {
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
@ -163,9 +163,9 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
return _searcher->concordiaSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<std::string> empty;
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(empty));
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
}
}
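A hedged caller-side sketch of the API change above: addExample() now hands back the tokenized sentence, so the caller can record the original token positions (the same behaviour is exercised by the unit tests further down). The configuration path is a placeholder, not a file shipped with this commit.

    #include <iostream>
    #include <boost/foreach.hpp>
    #include <boost/shared_ptr.hpp>
    #include "concordia/concordia.hpp"
    #include "concordia/example.hpp"
    #include "concordia/tokenized_sentence.hpp"
    #include "concordia/token_annotation.hpp"

    int main() {
        Concordia concordia("concordia.cfg");  // placeholder configuration path
        boost::shared_ptr<TokenizedSentence> ts =
            concordia.addExample(Example("Ala posiada kota", 14));
        // every token keeps its character offsets in the original sentence
        BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
            std::cout << token.getStart() << "," << token.getEnd()
                      << " value: " << token.getValue() << std::endl;
        }
        return 0;
    }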

View File

@ -13,6 +13,7 @@
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"
#include <divsufsort.h>
@ -55,13 +56,13 @@ public:
\param example example to be added
\throws ConcordiaException
*/
void addExample(const Example & example) throw(ConcordiaException);
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\throws ConcordiaException
*/
void addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index.

View File

@ -34,25 +34,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
return result;
}
void ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
}
void ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -64,25 +46,50 @@ void ConcordiaIndex::addAllExamples(
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
}
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPatterns;
}
void ConcordiaIndex::_addSingleExample(
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPattern;
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::vector<INDEX_CHARACTER_TYPE> hash
= hashGenerator->generateHash(example.getSentence());
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
@ -110,5 +117,7 @@ void ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
return hashedPattern;
}

View File

@ -11,6 +11,7 @@
#include "concordia/example.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <divsufsort.h>
/*!
@ -50,7 +51,7 @@ public:
\param example example to be added to index
\throws ConcordiaException
*/
void addExample(
boost::shared_ptr<TokenizedSentence> addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -68,7 +69,7 @@ public:
\param examples vector of examples to be added to index
\throws ConcordiaException
*/
void addAllExamples(
std::vector<TokenizedSentence> addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -82,7 +83,7 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
void _addSingleExample(std::ofstream & hashedIndexFile,
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -4,8 +4,8 @@
#include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult(
const std::vector<std::string> & tokenVector):
_tokenVector(tokenVector),
boost::shared_ptr<TokenizedSentence> tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
}
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1,
_tokenVector.size());
_tokenizedPattern->getTokens().size());
}
void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -3,7 +3,9 @@
#include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <boost/shared_ptr.hpp>
#include <vector>
#include <string>
@ -25,7 +27,7 @@ public:
\param tokenizedPattern tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
const std::vector<std::string> & tokenVector);
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
/*! Destructor.
*/
@ -49,8 +51,8 @@ public:
/*! Getter for tokenized pattern.
\returns tokenized search pattern
*/
std::vector<std::string> getTokenVector() const {
return _tokenVector;
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
return _tokenizedPattern;
}
/*! Getter for all matched pattern fragments list.
@ -80,7 +82,7 @@ private:
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize);
std::vector<std::string> _tokenVector;
boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
std::vector<MatchedPatternFragment> _matchedPatternFragments;

View File

@ -27,10 +27,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() {
}
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> result;
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
@ -38,23 +36,9 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
throw ConcordiaException("Trying to add too long sentence.");
}
return ts->getCodes();
return ts;
}
std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) {
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
std::vector<std::string> tokenTexts;
BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
tokenTexts.push_back(annotation.getValue());
}
}
return tokenTexts;
}
void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs);

View File

@ -44,20 +44,9 @@ public:
\param sentence sentence to generate hash from
\returns tokenized sentence holding the hash codes
*/
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
throw(ConcordiaException);
/*!
Generates vector of tokens from a sentence. This method is internally
used by generateHash. However, for the sake of concordiaSearch
(see \ref tutorial1_3), the vector of tokens resulting from sentence
tokenization is also needed.
\param sentence sentence to tokenize
\returns vector of tokens
*/
std::vector<std::string> generateTokenVector(const std::string & sentence);
/*!
Saves the contents of current WordMap to HDD.
*/
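Since generateTokenVector is removed above, here is a sketch of how its former output can be recovered from the TokenizedSentence now returned by generateHash. The construction follows the unit tests; the config header name and the configuration path are assumptions.

    #include <iostream>
    #include <string>
    #include <vector>
    #include <boost/foreach.hpp>
    #include <boost/shared_ptr.hpp>
    #include "concordia/concordia_config.hpp"   // assumed header name
    #include "concordia/hash_generator.hpp"
    #include "concordia/tokenized_sentence.hpp"
    #include "concordia/token_annotation.hpp"

    int main() {
        boost::shared_ptr<ConcordiaConfig> config(
            new ConcordiaConfig("concordia.cfg"));   // placeholder path
        HashGenerator hashGenerator(config);

        boost::shared_ptr<TokenizedSentence> ts =
            hashGenerator.generateHash("Ala posiada kota");
        std::vector<std::string> tokenTexts;
        // keep word and named-entity tokens, as generateTokenVector used to do
        BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
            if (annotation.getType() == TokenAnnotation::WORD ||
                annotation.getType() == TokenAnnotation::NE) {
                tokenTexts.push_back(annotation.getValue());
            }
        }
        std::cout << tokenTexts.size() << " tokens" << std::endl;
        return 0;
    }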

View File

@ -1,6 +1,7 @@
#include "concordia/index_searcher.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <boost/filesystem.hpp>
IndexSearcher::IndexSearcher() {
@ -22,7 +23,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern);
hashGenerator->generateHash(pattern)->getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -56,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern);
hashGenerator->generateHash(pattern)->getCodes();
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
}
@ -66,12 +67,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(pattern);
boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
return result;
}

View File

@ -1,4 +1,5 @@
add_library(concordia-tests
test_hash_generator.cpp
test_regex_rule.cpp
test_tokenized_sentence.cpp
test_concordia_searcher.cpp
@ -10,7 +11,6 @@ add_library(concordia-tests
test_logging.cpp
test_utils.cpp
test_word_map.cpp
test_hash_generator.cpp
test_concordia_index.cpp
test_concordia_config.cpp
test_concordia.cpp

View File

@ -1,13 +1,17 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/concordia.hpp"
#include "concordia/anubis_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "tests/common/test_resources_manager.hpp"
#include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include <string>
#include <vector>
BOOST_AUTO_TEST_SUITE(concordia_main)
@ -22,7 +26,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
concordia.addExample(Example("Ala posiada kota",14));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM();
@ -62,7 +77,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
// Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -74,7 +88,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
testExamples.push_back(Example("czy xjest okno otwarte",202));
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
testExamples.push_back(Example("xto xjest",29));
concordia.addAllExamples(testExamples);
std::vector<TokenizedSentence> hashedPatterns = concordia.addAllExamples(testExamples);
/* checking hashed pattern of sentence "chyba xto xjest xtutaj":
0,5 type: 1 value: chyba
6,9 type: 1 value: xto
10,15 type: 1 value: xjest
16,22 type: 1 value: xtutaj
*/
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getStart(), 10);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getEnd(), 15);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
/*The test index contains 4 sentences:
312: "xto xjest okno"

View File

@ -354,7 +354,6 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX),
@ -363,7 +362,6 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator(config));
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
@ -373,12 +371,12 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
TmMatches * tmMatches14 = tmMatchesMap->find(14)->second;
@ -436,5 +434,4 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -4,8 +4,11 @@
#include <sstream>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/common/config.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(hash_generator)
@ -20,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0);
expected.push_back(1);
@ -73,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0);
expected1.push_back(1);
@ -83,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0);
expected2.push_back(1);
@ -103,27 +106,48 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
std::vector<std::string> expected;
expected.push_back("ne_date");
expected.push_back("o");
expected.push_back("godzinie");
expected.push_back("ne_number");
expected.push_back("ne_number");
expected.push_back("doszło");
expected.push_back("do");
expected.push_back("kolizji");
expected.push_back("na");
expected.push_back("ulicy");
expected.push_back("grobla");
expected.push_back("policjanci");
expected.push_back("ustalili");
expected.push_back("że");
expected.push_back("kierowca");
expected.push_back("zaparkował");
expected.push_back("samochód");
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
/*
BOOST_FOREACH(TokenAnnotation annotation, tokens) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
0,10 type: 0 value: ne_date
13,14 type: 1 value: o
16,24 type: 1 value: godzinie
25,27 type: 0 value: ne_number
28,30 type: 0 value: ne_number
31,37 type: 1 value: doszło
38,40 type: 1 value: do
41,48 type: 1 value: kolizji
49,51 type: 1 value: na
52,57 type: 1 value: ulicy
58,64 type: 1 value: grobla
66,76 type: 1 value: policjanci
77,85 type: 1 value: ustalili
87,89 type: 1 value: że
93,101 type: 1 value: kierowca
106,116 type: 1 value: zaparkował
118,126 type: 1 value: samochód
*/
BOOST_CHECK_EQUAL(17,tokens.size());
BOOST_CHECK_EQUAL(tokens.at(0).getStart(),0);
BOOST_CHECK_EQUAL(tokens.at(0).getEnd(),10);
BOOST_CHECK_EQUAL(tokens.at(0).getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(tokens.at(0).getValue(),"ne_date");
BOOST_CHECK_EQUAL(tokens.at(15).getStart(),106);
BOOST_CHECK_EQUAL(tokens.at(15).getEnd(),116);
BOOST_CHECK_EQUAL(tokens.at(15).getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(tokens.at(15).getValue(),"zaparkował");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -218,15 +218,6 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
*/
/*
0,4 type: 1 value: this
5,7 type: 1 value: is