adding all tokenized examples

This commit is contained in:
rjawor 2015-08-19 20:49:26 +02:00
parent a765443a01
commit 68fecaddf8
20 changed files with 220 additions and 119 deletions

View File

@ -1,4 +1,5 @@
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
DONE - change the arguments of addExample* fucntions to const reference to TokenizedSentence (not boost::shared_ptr<TokenizedSentence>
- multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results. - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results.
IN PROGRESS - document the code (classes, cfg files) and update tutorial IN PROGRESS - document the code (classes, cfg files) and update tutorial
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.

View File

@ -29,7 +29,7 @@ void checkConcordiaResults(
long lineIndex = 1; long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) { BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size(); result.getTokenizedPattern().getTokens().size();
if (patternSize > 0) { if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) { if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex, reportError(baseLineCount + lineIndex,
@ -203,7 +203,7 @@ int main(int argc, char** argv) {
std::cout << "\tPattern used: " << std::endl << "\t\t"; std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation, BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) { result->getTokenizedPattern().getTokens()) {
std::cout << annotation.getValue() << " "; std::cout << annotation.getValue() << " ";
} }
std::cout << std::endl; std::cout << std::endl;

View File

@ -1,4 +1,5 @@
#include <sstream> #include <sstream>
#include <boost/foreach.hpp>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
@ -42,19 +43,31 @@ std::string _createLibraryVersion() {
return version.str(); return version.str();
} }
boost::shared_ptr<TokenizedSentence> TokenizedSentence
Concordia::tokenize(const std::string & sentence) Concordia::tokenize(const std::string & sentence)
throw(ConcordiaException) { throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> result = TokenizedSentence result =
_hashGenerator->generateHash(sentence); _hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
return result; return result;
} }
std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException) {
std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence));
}
_hashGenerator->serializeWordMap();
return result;
}
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by other methods. // SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample( TokenizedSentence Concordia::addExample(
const Example & example) const Example & example)
throw(ConcordiaException) { throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example); return _index->addExample(_hashGenerator, _T, _markers, example);
@ -63,13 +76,21 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by other methods. // SA is generated on command by other methods.
void Concordia::addTokenizedExample( void Concordia::addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id) const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException) { throw(ConcordiaException) {
_index->addTokenizedExample(_hashGenerator, _T, _index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id); _markers, tokenizedSentence, id);
} }
void Concordia::addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException) {
_index->addAllTokenizedExamples(_hashGenerator, _T,
_markers, tokenizedSentences, ids);
}
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by other methods. // SA is generated on command by other methods.
@ -188,8 +209,7 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else { } else {
std::string empty; std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>( return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>( new ConcordiaSearchResult(TokenizedSentence(empty)));
new TokenizedSentence(empty))));
} }
} }

View File

@ -58,7 +58,16 @@ public:
containing information about original word positions containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence) TokenizedSentence tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized
\returns vector of tokenized sentence objects
\throws ConcordiaException
*/
std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds an Example to the index. /*! Adds an Example to the index.
@ -67,17 +76,27 @@ public:
containing information about original word positions containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) TokenizedSentence addExample(const Example & example)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds a tokenized example to the index. /*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added \param tokenizedSentence tokenized sentence to be added
\param id of the sentence to be added \param id id of the sentence to be added
\throws ConcordiaException \throws ConcordiaException
*/ */
void addTokenizedExample( void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id) const SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
/*! Adds multiple tokenized examples to the index.
\param examples vector of examples to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds multiple examples to the index. /*! Adds multiple examples to the index.

View File

@ -4,6 +4,8 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/make_shared.hpp>
#include <iostream> #include <iostream>
#include <climits> #include <climits>
@ -48,10 +50,10 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns; std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) { BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern = TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator, _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example); T, markers, example);
hashedPatterns.push_back(*hashedPattern); hashedPatterns.push_back(hashedPattern);
} }
hashedIndexFile.close(); hashedIndexFile.close();
@ -61,7 +63,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
return hashedPatterns; return hashedPatterns;
} }
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample( TokenizedSentence ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -72,7 +74,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile; std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out| markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary); std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = TokenizedSentence hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator, _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example); T, markers, example);
hashedIndexFile.close(); hashedIndexFile.close();
@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id) { const SUFFIX_MARKER_TYPE id) {
std::ofstream hashedIndexFile; std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary); std::ios::app|std::ios::binary);
@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample(
markersFile.close(); markersFile.close();
} }
void ConcordiaIndex::addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
int index = 0;
BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) {
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, tokenizedSentence, ids.at(index));
index++;
}
hashedIndexFile.close();
markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample( void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile, std::ofstream & hashedIndexFile,
std::ofstream & markersFile, std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id) { const SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
int offset = 0; int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin(); for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample(
markers->push_back(sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA);
} }
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample( TokenizedSentence ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile, std::ofstream & hashedIndexFile,
std::ofstream & markersFile, std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) { const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern = TokenizedSentence hashedPattern =
hashGenerator->generateHash(example.getSentence()); hashGenerator->generateHash(example.getSentence());
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, hashedPattern, example.getId()); T, markers, hashedPattern, example.getId());

View File

@ -53,7 +53,7 @@ public:
\returns tokenized example \returns tokenized example
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<TokenizedSentence> addExample( TokenizedSentence addExample(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -63,7 +63,6 @@ public:
and markers array are appended with the example. and markers array are appended with the example.
At the same time, HDD versions of these At the same time, HDD versions of these
two data structures are also appended with the same example. two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash \param hashGenerator hash generator to be used to prepare the hash
of the example of the example
\param T RAM-based hash index to be appended to \param T RAM-based hash index to be appended to
@ -77,8 +76,28 @@ public:
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id); const SUFFIX_MARKER_TYPE id);
/*! Adds multiple tokenized examples to the index. Hashed index
and markers array are appended with the examples.
At the same time, HDD versions of these
two data structures are also appended with the same examples.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
\throws ConcordiaException
*/
void addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids);
/*! Adds multiple examples to the index. Examples are first hashed using /*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index the hash generator passed to this method. Then, hashed index
@ -114,10 +133,10 @@ private:
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence, const TokenizedSentence & tokenizedSentence,
SUFFIX_MARKER_TYPE id); const SUFFIX_MARKER_TYPE id);
boost::shared_ptr<TokenizedSentence> _addSingleExample( TokenizedSentence _addSingleExample(
std::ofstream & hashedIndexFile, std::ofstream & hashedIndexFile,
std::ofstream & markersFile, std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -4,7 +4,7 @@
#include <algorithm> #include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult( ConcordiaSearchResult::ConcordiaSearchResult(
boost::shared_ptr<TokenizedSentence> tokenizedPattern): TokenizedSentence tokenizedPattern):
_tokenizedPattern(tokenizedPattern), _tokenizedPattern(tokenizedPattern),
_bestOverlayScore(0) { _bestOverlayScore(0) {
} }
@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending // the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(), _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1, -1,
_tokenizedPattern->getTokens().size()); _tokenizedPattern.getTokens().size());
} }
void ConcordiaSearchResult::_checkPossibleOverlays( void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -26,8 +26,7 @@ public:
/*! Constructor. /*! Constructor.
\param tokenVector tokenized pattern which was used for searching \param tokenVector tokenized pattern which was used for searching
*/ */
explicit ConcordiaSearchResult( explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern);
boost::shared_ptr<TokenizedSentence> tokenizedPattern);
/*! Destructor. /*! Destructor.
*/ */
@ -51,7 +50,7 @@ public:
/*! Getter for tokenized pattern. /*! Getter for tokenized pattern.
\returns tokenized search pattern \returns tokenized search pattern
*/ */
boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const { TokenizedSentence getTokenizedPattern() const {
return _tokenizedPattern; return _tokenizedPattern;
} }
@ -82,7 +81,7 @@ private:
SUFFIX_MARKER_TYPE lastAddedPos, SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize); SUFFIX_MARKER_TYPE patternSize);
boost::shared_ptr<TokenizedSentence> _tokenizedPattern; TokenizedSentence _tokenizedPattern;
std::vector<MatchedPatternFragment> _matchedPatternFragments; std::vector<MatchedPatternFragment> _matchedPatternFragments;

View File

@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() { HashGenerator::~HashGenerator() {
} }
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash( TokenizedSentence HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) { const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts = TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
_sentenceTokenizer->tokenize(sentence); ts.generateHash(_wordMap);
ts->generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) { if (ts.getTokens().size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence."); throw ConcordiaException("Trying to add too long sentence.");
} }

View File

@ -44,8 +44,7 @@ public:
\param sentence sentence to generate hash from \param sentence sentence to generate hash from
\returns tokenized sentence, containing the hash \returns tokenized sentence, containing the hash
*/ */
boost::shared_ptr<TokenizedSentence> generateHash( TokenizedSentence generateHash(const std::string & sentence)
const std::string & sentence)
throw(ConcordiaException); throw(ConcordiaException);
/*! /*!

View File

@ -23,7 +23,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int left; int left;
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes(); hashGenerator->generateHash(pattern).getCodes();
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
@ -60,7 +60,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern)->getCodes(); hashGenerator->generateHash(pattern).getCodes();
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash); return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
} }
@ -70,13 +70,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> hashedPattern = TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
hashGenerator->generateHash(pattern);
boost::shared_ptr<ConcordiaSearchResult> result = boost::shared_ptr<ConcordiaSearchResult> result =
boost::shared_ptr<ConcordiaSearchResult>( boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern)); new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, _concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes()); SA, hashedPattern.getCodes());
return result; return result;
} }

View File

@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() { RegexRule::~RegexRule() {
} }
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) { void RegexRule::apply(TokenizedSentence & sentence) {
try { try {
UnicodeString s(sentence->getSentence().c_str()); UnicodeString s(sentence.getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin( boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern)); boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end; boost::u32regex_iterator<const UChar*> end;
@ -58,12 +58,12 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
_annotationType, value); _annotationType, value);
annotations.push_back(annotation); annotations.push_back(annotation);
} }
sentence->addAnnotations(annotations); sentence.addAnnotations(annotations);
} catch(const std::exception & e) { } catch(const std::exception & e) {
std::stringstream ss; std::stringstream ss;
ss << "Exception while applying regex rule: " ss << "Exception while applying regex rule: "
<< _annotationType << " to text: " << _annotationType << " to text: "
<< sentence->getSentence(); << sentence.getSentence();
ss << ", message: " << e.what(); ss << ", message: " << e.what();
throw ConcordiaException(ss.str()); throw ConcordiaException(ss.str());
} }

View File

@ -42,7 +42,7 @@ public:
/*! Applies regex annotation on tokenized sentence. /*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence \param sentence the input sentence
*/ */
void apply(boost::shared_ptr<TokenizedSentence> sentence); void apply(TokenizedSentence & sentence);
private: private:
int _annotationType; int _annotationType;

View File

@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer(
SentenceTokenizer::~SentenceTokenizer() { SentenceTokenizer::~SentenceTokenizer() {
} }
boost::shared_ptr<TokenizedSentence> TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
SentenceTokenizer::tokenize(const std::string & sentence) { TokenizedSentence result(sentence);
boost::shared_ptr<TokenizedSentence>
result(new TokenizedSentence(sentence));
_htmlTags->apply(result); _htmlTags->apply(result);
@ -35,7 +33,7 @@ boost::shared_ptr<TokenizedSentence>
neRule.apply(result); neRule.apply(result);
} }
result->toLowerCase(); result.toLowerCase();
if (_stopWordsEnabled) { if (_stopWordsEnabled) {
_stopWords->apply(result); _stopWords->apply(result);

View File

@ -36,8 +36,7 @@ public:
\param sentence input sentence \param sentence input sentence
\returns tokenized sentence object build on the input sentence \returns tokenized sentence object build on the input sentence
*/ */
boost::shared_ptr<TokenizedSentence> TokenizedSentence tokenize(const std::string & sentence);
tokenize(const std::string & sentence);
private: private:
void _createNeRules(std::string & namedEntitiesPath); void _createNeRules(std::string & namedEntitiesPath);

View File

@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14)); TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
/* /*
0,3 type: 1 value: ala 0,3 type: 1 value: ala
4,11 type: 1 value: posiada 4,11 type: 1 value: posiada
12,16 type: 1 value: kota 12,16 type: 1 value: kota
*/ */
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123)); concordia.addExample(Example("Marysia posiada rysia",123));
@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
/*
concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("New test product has a mistake", 321));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above"); */
std::vector<std::string> sentences;
std::vector<SUFFIX_MARKER_TYPE> ids;
sentences.push_back("Alice has a cat");
ids.push_back(56);
sentences.push_back("Alice has a dog");
ids.push_back(23);
sentences.push_back("New test product has a mistake");
ids.push_back(321);
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
concordia.addAllTokenizedExamples(tokenizedSentences, ids);
TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
concordia.addTokenizedExample(ts, 14); concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay: // best overlay:
/*
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9);
*/
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);
@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
BOOST_AUTO_TEST_CASE( Tokenize ) BOOST_AUTO_TEST_CASE( Tokenize )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota"); TokenizedSentence ts = concordia.tokenize(" Ala posiada kota");
/* /*
0,3 type: 1 value: ala 0,3 type: 1 value: ala
4,11 type: 1 value: posiada 4,11 type: 1 value: posiada
@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize )
concordia.clearIndex(); concordia.clearIndex();
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
std::vector<std::string> sentences;
sentences.push_back("Marysia, ma rysia;");
sentences.push_back("Testing complete;");
sentences.push_back("This, is (a) weird;! sentence <>");
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4) // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes(); std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes();
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern); boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);

View File

@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config); HashGenerator hashGenerator = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected; std::vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0); expected.push_back(0);
expected.push_back(1); expected.push_back(1);
@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config); HashGenerator hashGenerator1 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected1; std::vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0); expected1.push_back(0);
expected1.push_back(1); expected1.push_back(1);
@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap(); hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config); HashGenerator hashGenerator2 = HashGenerator(config);
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes();
std::vector<INDEX_CHARACTER_TYPE> expected2; std::vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0); expected2.push_back(0);
expected2.push_back(1); expected2.push_back(1);
@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config); HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód."); TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens(); std::vector<TokenAnnotation> tokens = tokenizedSentence.getTokens();
/* /*
BOOST_FOREACH(TokenAnnotation annotation, tokens) { BOOST_FOREACH(TokenAnnotation annotation, tokens) {

View File

@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
BOOST_AUTO_TEST_CASE( SimpleAnnotation ) BOOST_AUTO_TEST_CASE( SimpleAnnotation )
{ {
RegexRule rr("a", TokenAnnotation::WORD, "b"); RegexRule rr("a", TokenAnnotation::WORD, "b");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa")); TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),7); BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex )
BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation ) BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
{ {
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, ""); RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'.");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation ) BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
{ {
RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false); RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC.")); TokenizedSentence ts("This is AbC and ABC and abc and aBC.");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_AUTO_TEST_CASE( UnicodeAnnotation ) BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
{ {
RegexRule rr("ą", TokenAnnotation::WORD, "x"); RegexRule rr("ą", TokenAnnotation::WORD, "x");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń")); TokenizedSentence ts("zażółć gęślą jaźń");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
{ {
RegexRule rr("ą", TokenAnnotation::WORD, "x", false); RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{ {
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false); RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
rr.apply(ts); rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18); BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),2); BOOST_CHECK_EQUAL(iter->getStart(),2);

View File

@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size()); BOOST_CHECK_EQUAL(14,annotations.size());
@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>"; std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
/* /*
@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń"; std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
/* /*
@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
if (config->isStopWordsEnabled()) { if (config->isStopWordsEnabled()) {
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne"); BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne");
} }
} }
@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size()); BOOST_CHECK_EQUAL(161, annotations.size());