adding all tokenized examples

This commit is contained in:
rjawor 2015-08-19 20:49:26 +02:00
parent a765443a01
commit 68fecaddf8
20 changed files with 220 additions and 119 deletions

View File

@ -1,4 +1,5 @@
---------------------------- Developer's private notes (language may vary, because it is sometimes more convenient) -----------------------------
DONE - change the arguments of addExample* functions to const reference to TokenizedSentence (not boost::shared_ptr<TokenizedSentence>)
- multiple indexes based on different hashes. One index can use WordNet base forms, another POS tags, and so on. Develop a method of combining their results.
IN PROGRESS - document the code (classes, cfg files) and update tutorial
- multiple translation memories: they can be stored in a single index, but a tm_id has to be added to the sentence metadata (e.g. in place of the example length). At search time, results must be filtered so that they come from the right translation memory (see the sketch below).
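
A rough sketch of that tm_id filtering idea, in the library's C++03 style; SearchHit, tmId and filterByTm are hypothetical names, nothing here exists in the current API:

#include <vector>

// Hypothetical search hit carrying the translation memory id as metadata.
struct SearchHit {
    unsigned long tmId;       // which translation memory the sentence belongs to
    unsigned long exampleId;  // id of the matched example
};

// Keep only the hits that come from the requested translation memory.
std::vector<SearchHit> filterByTm(const std::vector<SearchHit> & hits,
                                  unsigned long wantedTmId) {
    std::vector<SearchHit> result;
    for (std::vector<SearchHit>::const_iterator it = hits.begin();
         it != hits.end(); ++it) {
        if (it->tmId == wantedTmId) {
            result.push_back(*it);
        }
    }
    return result;
}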

View File

@ -29,7 +29,7 @@ void checkConcordiaResults(
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size();
result.getTokenizedPattern().getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
@ -203,7 +203,7 @@ int main(int argc, char** argv) {
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) {
result->getTokenizedPattern().getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -1,4 +1,5 @@
#include <sstream>
#include <boost/foreach.hpp>
#include "concordia/concordia.hpp"
#include "concordia/common/config.hpp"
@ -42,19 +43,31 @@ std::string _createLibraryVersion() {
return version.str();
}
boost::shared_ptr<TokenizedSentence>
TokenizedSentence
Concordia::tokenize(const std::string & sentence)
throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> result =
TokenizedSentence result =
_hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap();
return result;
}
std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException) {
std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence));
}
_hashGenerator->serializeWordMap();
return result;
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
TokenizedSentence Concordia::addExample(
const Example & example)
throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example);
@ -63,13 +76,21 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException) {
_index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id);
}
void Concordia::addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException) {
_index->addAllTokenizedExamples(_hashGenerator, _T,
_markers, tokenizedSentences, ids);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
@ -188,8 +209,7 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else {
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
new TokenizedSentence(empty))));
new ConcordiaSearchResult(TokenizedSentence(empty)));
}
}
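
For orientation, a minimal usage sketch of the new batch API added above, mirroring the updated unit tests further down; the plain config path is an assumption (the tests resolve it via TestResourcesManager), and error handling is omitted:

#include <string>
#include <vector>
#include "concordia/concordia.hpp"

int main() {
    Concordia concordia("concordia.cfg");  // assumed config file path

    std::vector<std::string> sentences;
    std::vector<SUFFIX_MARKER_TYPE> ids;
    sentences.push_back("Alice has a cat");
    ids.push_back(56);
    sentences.push_back("Alice has a dog");
    ids.push_back(23);

    // Tokenize the whole batch once, then add it to the index in one call.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);
    concordia.addAllTokenizedExamples(tokenized, ids);

    // The suffix array is only rebuilt on demand.
    concordia.refreshSAfromRAM();
    return 0;
}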

View File

@ -58,7 +58,16 @@ public:
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
TokenizedSentence tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized
\returns vector of tokenized sentence objects
\throws ConcordiaException
*/
std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException);
/*! Adds an Example to the index.
@ -67,17 +76,27 @@ public:
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
TokenizedSentence addExample(const Example & example)
throw(ConcordiaException);
/*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added
\param id of the sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
/*! Adds multiple tokenized examples to the index.
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException);
/*! Adds multiple examples to the index.

View File

@ -4,6 +4,8 @@
#include "concordia/common/config.hpp"
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/make_shared.hpp>
#include <iostream>
#include <climits>
@ -48,10 +50,10 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
hashedPatterns.push_back(hashedPattern);
}
hashedIndexFile.close();
@ -61,7 +63,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
return hashedPatterns;
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
TokenizedSentence ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -72,7 +74,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample(
markersFile.close();
}
void ConcordiaIndex::addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
int index = 0;
BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) {
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, tokenizedSentence, ids.at(index));
index++;
}
hashedIndexFile.close();
markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample(
markers->push_back(sentenceBoundaryMA);
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
TokenizedSentence ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
TokenizedSentence hashedPattern =
hashGenerator->generateHash(example.getSentence());
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, hashedPattern, example.getId());
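
A note on the design visible in this file: addAllTokenizedExamples opens the hashed index and markers files once and appends the whole batch before closing, while the per-example methods reopen both files on every call. A minimal sketch of that pattern, under assumed names (appendBatch and the int payload are illustrative, not library code):

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

// Open once, append N times, close once: the file-open cost is paid
// per batch instead of per sentence.
void appendBatch(const std::string & path, const std::vector<int> & items) {
    std::ofstream out;
    out.open(path.c_str(), std::ios::out | std::ios::app | std::ios::binary);
    for (std::size_t i = 0; i < items.size(); ++i) {
        out.write(reinterpret_cast<const char*>(&items[i]), sizeof(int));
    }
    out.close();
}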

View File

@ -53,7 +53,7 @@ public:
\returns tokenized example
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(
TokenizedSentence addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -63,7 +63,6 @@ public:
and markers array are appended with the example.
At the same time, HDD versions of these
two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
@ -77,8 +76,28 @@ public:
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
/*! Adds multiple tokenized examples to the index. Hashed index
and markers array are appended with the examples.
At the same time, HDD versions of these
two data structures are also appended with the same examples.
\param hashGenerator hash generator to be used to prepare the hashes
of the examples
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
@ -114,10 +133,10 @@ private:
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
boost::shared_ptr<TokenizedSentence> _addSingleExample(
TokenizedSentence _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -4,9 +4,9 @@
#include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult(
boost::shared_ptr<TokenizedSentence> tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
TokenizedSentence tokenizedPattern):
_tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) {
}
ConcordiaSearchResult::~ConcordiaSearchResult() {
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1,
_tokenizedPattern->getTokens().size());
_tokenizedPattern.getTokens().size());
}
void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -26,8 +26,7 @@ public:
/*! Constructor.
\param tokenizedPattern tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern);
/*! Destructor.
*/
@ -51,7 +50,7 @@ public:
/*! Getter for tokenized pattern.
\returns tokenized search pattern
*/
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
TokenizedSentence getTokenizedPattern() const {
return _tokenizedPattern;
}
@ -82,7 +81,7 @@ private:
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize);
boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
TokenizedSentence _tokenizedPattern;
std::vector<MatchedPatternFragment> _matchedPatternFragments;

View File

@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() {
}
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
TokenizedSentence HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts =
_sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
ts.generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) {
if (ts.getTokens().size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}

View File

@ -44,9 +44,8 @@ public:
\param sentence sentence to generate hash from
\returns tokenized sentence, containing the hash
*/
boost::shared_ptr<TokenizedSentence> generateHash(
const std::string & sentence)
throw(ConcordiaException);
TokenizedSentence generateHash(const std::string & sentence)
throw(ConcordiaException);
/*!
Saves the contents of current WordMap to HDD.

View File

@ -23,7 +23,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes();
hashGenerator->generateHash(pattern).getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -60,7 +60,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes();
hashGenerator->generateHash(pattern).getCodes();
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
}
@ -70,13 +70,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(pattern);
TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes());
SA, hashedPattern.getCodes());
return result;
}

View File

@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() {
}
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
void RegexRule::apply(TokenizedSentence & sentence) {
try {
UnicodeString s(sentence->getSentence().c_str());
UnicodeString s(sentence.getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end;
@ -58,12 +58,12 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
_annotationType, value);
annotations.push_back(annotation);
}
sentence->addAnnotations(annotations);
sentence.addAnnotations(annotations);
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
<< _annotationType << " to text: "
<< sentence->getSentence();
<< sentence.getSentence();
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}

View File

@ -42,7 +42,7 @@ public:
/*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence
*/
void apply(boost::shared_ptr<TokenizedSentence> sentence);
void apply(TokenizedSentence & sentence);
private:
int _annotationType;

View File

@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer(
SentenceTokenizer::~SentenceTokenizer() {
}
boost::shared_ptr<TokenizedSentence>
SentenceTokenizer::tokenize(const std::string & sentence) {
boost::shared_ptr<TokenizedSentence>
result(new TokenizedSentence(sentence));
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
TokenizedSentence result(sentence);
_htmlTags->apply(result);
@ -35,7 +33,7 @@ boost::shared_ptr<TokenizedSentence>
neRule.apply(result);
}
result->toLowerCase();
result.toLowerCase();
if (_stopWordsEnabled) {
_stopWords->apply(result);

View File

@ -36,8 +36,7 @@ public:
\param sentence input sentence
\returns tokenized sentence object built from the input sentence
*/
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
TokenizedSentence tokenize(const std::string & sentence);
private:
void _createNeRules(std::string & namedEntitiesPath);

View File

@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
/*
concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
*/
std::vector<std::string> sentences;
std::vector<SUFFIX_MARKER_TYPE> ids;
sentences.push_back("Alice has a cat");
ids.push_back(56);
sentences.push_back("Alice has a dog");
ids.push_back(23);
sentences.push_back("New test product has a mistake");
ids.push_back(321);
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
concordia.addAllTokenizedExamples(tokenizedSentences, ids);
TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay:
/*
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
*/
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);
@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
BOOST_AUTO_TEST_CASE( Tokenize )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
TokenizedSentence ts = concordia.tokenize(" Ala posiada kota");
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize )
concordia.clearIndex();
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
std::vector<std::string> sentences;
sentences.push_back("Marysia, ma rysia;");
sentences.push_back("Testing complete;");
sentences.push_back("This, is (a) weird;! sentence <>");
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes();
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);

View File

@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0);
expected.push_back(1);
@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0);
expected1.push_back(1);
@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0);
expected2.push_back(1);
@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
std::vector<TokenAnnotation> tokens = tokenizedSentence.getTokens();
/*
BOOST_FOREACH(TokenAnnotation annotation, tokens) {

View File

@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
BOOST_AUTO_TEST_CASE( SimpleAnnotation )
{
RegexRule rr("a", TokenAnnotation::WORD, "b");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex )
BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
{
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'.");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
{
RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
TokenizedSentence ts("This is AbC and ABC and abc and aBC.");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
{
RegexRule rr("ą", TokenAnnotation::WORD, "x");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
TokenizedSentence ts("zażółć gęślą jaźń");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
{
RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),2);

View File

@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size());
@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
if (config->isStopWordsEnabled()) {
SentenceTokenizer tokenizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne");
}
}
@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size());