done word positions

This commit is contained in:
rjawor 2015-06-26 22:50:53 +02:00
parent 724bf0d080
commit dba70b4e24
19 changed files with 178 additions and 150 deletions

View File

@ -1,10 +1,7 @@
---------------------------- Developer's private notes (language may vary, as it's sometimes more convenient) -----------------------------
- repair stop words feature
- work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search returns token positions from the hash. How can these be mapped back to examples in the corpus? The original token positions have to be stored in the concordia-server database. They will be computed by the generateTokenVector function (using a list of original positions, modified in sync with every anonymization); see the sketch below this list.
- concordia_search_result does not need to store the whole tokenVector (it only uses it to read size()).
- multiple translation memories: they can be kept in a single index, but tm_id has to be added as sentence metadata (e.g. instead of the example length). During search, results have to be filtered so that they come from the right translation memory.
- memory usage tests
- The example length in the markers will probably only be needed by anubis (which, as it turns out, is slow). Think about what the markers could be used for, because the idea behind them seems sound.
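A minimal sketch of that position bookkeeping, assuming anonymization is a sequence of in-place span replacements (everything below is illustrative, not Concordia API): keep, for each character of the current sentence, its index in the original sentence, and splice that list on every replacement.

    #include <string>
    #include <vector>

    // positions[i] = index, in the original sentence, of the i-th character
    // of the current (possibly anonymized) sentence
    void replaceSpan(std::string & sentence,
            std::vector<size_t> & positions,
            size_t from, size_t to,
            const std::string & replacement) {
        // characters inserted by the anonymization inherit the original
        // offset of the span they replace; that is enough to recover
        // a token's original start position later
        size_t origin = (from < positions.size()) ? positions[from] : 0;
        std::vector<size_t> patch(replacement.size(), origin);
        sentence.replace(from, to - from, replacement);
        positions.erase(positions.begin() + from, positions.begin() + to);
        positions.insert(positions.begin() + from, patch.begin(), patch.end());
    }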
@ -13,6 +10,11 @@ IN PROGRESS - concordia search returns token positions from the hash. How can
---------------------------- Archive -----------------------------
DONE - deal with 0 length patterns
DONE - repair concordia-console test feature
DONE - update tests
DONE - work on word regex pattern (allow for some symbols and digits within the word)
REJECTED - concordia_search_result does not need to store the whole tokenVector (it only uses it to read size()).
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
DONE - concordia-server (consider writing a CAT tool, and whether to split concordia-server off into a separate project).
@ -26,7 +28,7 @@ REJECTED - think about empty hash examples (solution: do not
DONE - disable stopWords
DONE - During concordia searh, a set forming an optimal overlay of the pattern should additionally be computed. Perhaps by brute force? (assuming a max sentence length of 500 tokens, it should not be too bad)
DONE - During concordia searCh, a set forming an optimal overlay of the pattern should additionally be computed. Perhaps by brute force? (assuming a max sentence length of 500 tokens, it should not be too bad)
DONE - sentence lookup: find the longest matching fragments with Anubis, then do 1D (approximate) bin packing. Let's call it concordia search. All longest matches of the pattern are found using LCP search. The result is a list of the longest matched fragments, sorted by length in descending order, with at most 3 representatives of each length.
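A toy illustration of the final filtering step described in the note above (the Fragment struct and the function are made up for this sketch, not Concordia API): given fragments already sorted by length in descending order, keep at most 3 representatives per length.

    #include <map>
    #include <vector>

    struct Fragment {      // simplified stand-in for a matched fragment
        int length;
        int exampleId;
    };

    // input is assumed to be sorted by length, descending;
    // at most 3 fragments of each length survive
    std::vector<Fragment> capRepresentatives(
            const std::vector<Fragment> & sorted) {
        std::map<int, int> countPerLength;
        std::vector<Fragment> result;
        for (size_t i = 0; i < sorted.size(); ++i) {
            if (++countPerLength[sorted[i].length] <= 3) {
                result.push_back(sorted[i]);
            }
        }
        return result;
    }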

View File

@ -8,6 +8,7 @@
#include "concordia/concordia.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
#include "build/libdivsufsort/include/divsufsort.h"
@ -27,30 +28,32 @@ void checkConcordiaResults(
long baseLineCount) {
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay has more than one fragment.");
}
if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern.");
}
if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay score is not 1.");
}
if (result.getFragments().size() == 0) {
reportError(baseLineCount + lineIndex,
"there are no matched fragments.");
}
if (result.getFragments().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"the first fragment does not cover the whole pattern.");
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay has more than one fragment.");
}
if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern.");
}
if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay score is not 1.");
}
if (result.getFragments().size() == 0) {
reportError(baseLineCount + lineIndex,
"there are no matched fragments.");
}
if (result.getFragments().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"the first fragment does not cover the whole pattern.");
}
}
lineIndex++;
}
}
@ -198,8 +201,8 @@ int main(int argc, char** argv) {
msdiff = time_end - time_start;
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(std::string token, result->getTokenVector()) {
std::cout << token << " ";
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -44,16 +44,16 @@ std::string _createLibraryVersion() {
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addExample(const Example & example)
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
throw(ConcordiaException) {
_index->addExample(_hashGenerator, _T, _markers, example);
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException) {
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
@ -163,9 +163,9 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
return _searcher->concordiaSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<std::string> empty;
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(empty));
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
}
}
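With this change the caller gets the tokenized, hashed sentence back when adding an example; a minimal usage sketch mirroring the unit tests further down (the config path is a placeholder):

    #include <iostream>
    #include <boost/foreach.hpp>
    #include <boost/shared_ptr.hpp>
    #include "concordia/concordia.hpp"
    #include "concordia/token_annotation.hpp"
    #include "concordia/tokenized_sentence.hpp"

    int main() {
        Concordia concordia("concordia.cfg");  // placeholder path
        boost::shared_ptr<TokenizedSentence> ts =
            concordia.addExample(Example("Ala posiada kota", 14));
        // every token now carries its character span in the original sentence
        BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
            std::cout << token.getStart() << "," << token.getEnd()
                      << " value: " << token.getValue() << std::endl;
        }
        return 0;
    }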

View File

@ -13,6 +13,7 @@
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"
#include <divsufsort.h>
@ -55,13 +56,13 @@ public:
\param example example to be added
\returns tokenized sentence of the added example
\throws ConcordiaException
*/
void addExample(const Example & example) throw(ConcordiaException);
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns tokenized sentences of the added examples
\throws ConcordiaException
*/
void addAllExamples(const std::vector<Example> & examples)
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index.
@ -97,7 +98,7 @@ public:
/*! Loads HDD stored index files to RAM and generates
suffix array based on RAM stored data structures.
For more info see \ref tutorial2.
For more info see \ref tutorial2.
\throws ConcordiaException
*/
void loadRAMIndexFromDisk() throw(ConcordiaException);

View File

@ -34,25 +34,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
return result;
}
void ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
}
void ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -64,25 +46,50 @@ void ConcordiaIndex::addAllExamples(
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
}
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPatterns;
}
void ConcordiaIndex::_addSingleExample(
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPattern;
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
std::vector<INDEX_CHARACTER_TYPE> hash
= hashGenerator->generateHash(example.getSentence());
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
@ -110,5 +117,7 @@ void ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
return hashedPattern;
}

View File

@ -11,6 +11,7 @@
#include "concordia/example.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <divsufsort.h>
/*!
@ -50,7 +51,7 @@ public:
\param example example to be added to index
\returns tokenized sentence of the added example
\throws ConcordiaException
*/
void addExample(
boost::shared_ptr<TokenizedSentence> addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -68,7 +69,7 @@ public:
\param examples vector of examples to be added to index
\returns tokenized sentences of the added examples
\throws ConcordiaException
*/
void addAllExamples(
std::vector<TokenizedSentence> addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -82,7 +83,7 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
void _addSingleExample(std::ofstream & hashedIndexFile,
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -4,8 +4,8 @@
#include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult(
const std::vector<std::string> & tokenVector):
_tokenVector(tokenVector),
boost::shared_ptr<TokenizedSentence> tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
}
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1,
_tokenVector.size());
_tokenizedPattern->getTokens().size());
}
void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -3,7 +3,9 @@
#include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <boost/shared_ptr.hpp>
#include <vector>
#include <string>
@ -25,7 +27,7 @@ public:
\param tokenizedPattern tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
const std::vector<std::string> & tokenVector);
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
/*! Destructor.
*/
@ -49,8 +51,8 @@ public:
/*! Getter for tokenized pattern.
\returns tokenized search pattern
*/
std::vector<std::string> getTokenVector() const {
return _tokenVector;
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
return _tokenizedPattern;
}
/*! Getter for all matched pattern fragments list.
@ -80,7 +82,7 @@ private:
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize);
std::vector<std::string> _tokenVector;
boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
std::vector<MatchedPatternFragment> _matchedPatternFragments;

View File

@ -27,10 +27,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() {
}
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> result;
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
@ -38,23 +36,9 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
throw ConcordiaException("Trying to add too long sentence.");
}
return ts->getCodes();
return ts;
}
std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) {
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
std::vector<std::string> tokenTexts;
BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
tokenTexts.push_back(annotation.getValue());
}
}
return tokenTexts;
}
void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs);

View File

@ -44,20 +44,9 @@ public:
\param sentence sentence to generate hash from
\returns tokenized sentence holding the hash codes
*/
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
throw(ConcordiaException);
/*!
Generates vector of tokens from a sentence. This method is internally
used by generateHash. However, for the sake of concordiaSearch
(see \ref tutorial1_3), the vector of tokens resulting from sentence
tokenization is also needed.
\param sentence sentence to tokenize
\returns vector of tokens
*/
std::vector<std::string> generateTokenVector(const std::string & sentence);
/*!
Saves the contents of current WordMap to HDD.
*/
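Since generateHash now returns the whole TokenizedSentence, former callers obtain the codes through getCodes(), as the updated call sites below show; a small sketch of the migrated pattern (the wrapper function is illustrative):

    #include <string>
    #include <vector>
    #include <boost/shared_ptr.hpp>
    #include "concordia/common/config.hpp"
    #include "concordia/hash_generator.hpp"
    #include "concordia/tokenized_sentence.hpp"

    // one tokenization pass now yields both the index codes and the
    // token annotations that hold the original word positions
    std::vector<INDEX_CHARACTER_TYPE> hashCodes(
            boost::shared_ptr<HashGenerator> hashGenerator,
            const std::string & sentence) {
        boost::shared_ptr<TokenizedSentence> ts =
            hashGenerator->generateHash(sentence);
        return ts->getCodes();
    }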

View File

@ -1,6 +1,7 @@
#include "concordia/index_searcher.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <boost/filesystem.hpp>
IndexSearcher::IndexSearcher() {
@ -22,7 +23,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern);
hashGenerator->generateHash(pattern)->getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -56,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern);
hashGenerator->generateHash(pattern)->getCodes();
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
}
@ -66,12 +67,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(pattern);
boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
return result;
}

View File

@ -1,4 +1,5 @@
add_library(concordia-tests
test_hash_generator.cpp
test_regex_rule.cpp
test_tokenized_sentence.cpp
test_concordia_searcher.cpp
@ -10,7 +11,6 @@ add_library(concordia-tests
test_logging.cpp
test_utils.cpp
test_word_map.cpp
test_hash_generator.cpp
test_concordia_index.cpp
test_concordia_config.cpp
test_concordia.cpp

View File

@ -1,13 +1,17 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/concordia.hpp"
#include "concordia/anubis_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "tests/common/test_resources_manager.hpp"
#include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include <string>
#include <vector>
BOOST_AUTO_TEST_SUITE(concordia_main)
@ -22,7 +26,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
concordia.addExample(Example("Ala posiada kota",14));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM();
@ -62,7 +77,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
// Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -74,7 +88,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
testExamples.push_back(Example("czy xjest okno otwarte",202));
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
testExamples.push_back(Example("xto xjest",29));
concordia.addAllExamples(testExamples);
std::vector<TokenizedSentence> hashedPatterns = concordia.addAllExamples(testExamples);
/* checking hashed pattern of sentence "chyba xto xjest xtutaj":
0,5 type: 1 value: chyba
6,9 type: 1 value: xto
10,15 type: 1 value: xjest
16,22 type: 1 value: xtutaj
*/
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getStart(), 10);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getEnd(), 15);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
/*The test index contains 4 sentences:
312: "xto xjest okno"
@ -119,7 +143,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )

View File

@ -354,16 +354,14 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX),
TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::shared_ptr<ConcordiaConfig> config(
new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator(config));
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
@ -372,13 +370,13 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
index.addExample(hashGenerator, T, markers, Example("Marysia posiada rysia",123));
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
TmMatches * tmMatches14 = tmMatchesMap->find(14)->second;
@ -436,5 +434,4 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -4,8 +4,11 @@
#include <sstream>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/common/config.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(hash_generator)
@ -20,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0);
expected.push_back(1);
@ -73,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0);
expected1.push_back(1);
@ -83,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0);
expected2.push_back(1);
@ -103,27 +106,48 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
std::vector<std::string> expected;
expected.push_back("ne_date");
expected.push_back("o");
expected.push_back("godzinie");
expected.push_back("ne_number");
expected.push_back("ne_number");
expected.push_back("doszło");
expected.push_back("do");
expected.push_back("kolizji");
expected.push_back("na");
expected.push_back("ulicy");
expected.push_back("grobla");
expected.push_back("policjanci");
expected.push_back("ustalili");
expected.push_back("że");
expected.push_back("kierowca");
expected.push_back("zaparkował");
expected.push_back("samochód");
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
/*
BOOST_FOREACH(TokenAnnotation annotation, tokens) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
0,10 type: 0 value: ne_date
13,14 type: 1 value: o
16,24 type: 1 value: godzinie
25,27 type: 0 value: ne_number
28,30 type: 0 value: ne_number
31,37 type: 1 value: doszło
38,40 type: 1 value: do
41,48 type: 1 value: kolizji
49,51 type: 1 value: na
52,57 type: 1 value: ulicy
58,64 type: 1 value: grobla
66,76 type: 1 value: policjanci
77,85 type: 1 value: ustalili
87,89 type: 1 value: że
93,101 type: 1 value: kierowca
106,116 type: 1 value: zaparkował
118,126 type: 1 value: samochód
*/
BOOST_CHECK_EQUAL(17,tokens.size());
BOOST_CHECK_EQUAL(tokens.at(0).getStart(),0);
BOOST_CHECK_EQUAL(tokens.at(0).getEnd(),10);
BOOST_CHECK_EQUAL(tokens.at(0).getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(tokens.at(0).getValue(),"ne_date");
BOOST_CHECK_EQUAL(tokens.at(15).getStart(),106);
BOOST_CHECK_EQUAL(tokens.at(15).getEnd(),116);
BOOST_CHECK_EQUAL(tokens.at(15).getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(tokens.at(15).getValue(),"zaparkował");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -217,15 +217,6 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
*/
/*
0,4 type: 1 value: this