adding all tokenized examples

parent a765443a01
commit 68fecaddf8

TODO.txt
@@ -1,4 +1,5 @@
 ---------------------------- Developer's private notes (language may vary, because sometimes that's just more convenient) -----------------------------
+DONE - change the arguments of addExample* functions to const reference to TokenizedSentence (not boost::shared_ptr<TokenizedSentence>)
 - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results.
 IN PROGRESS - document the code (classes, cfg files) and update tutorial
 - multiple translation memories: they can be stored in a single index, but tm_id has to be added as sentence metadata (e.g. instead of example length). When searching, results have to be filtered so that they come from the right translation memory.
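For reference, a minimal sketch of the call style this DONE item enables (the config path is hypothetical; the classes and methods are the ones touched by this commit):

    #include "concordia/concordia.hpp"

    int main() {
        Concordia concordia("concordia.cfg");  // hypothetical config path

        // tokenize() now returns TokenizedSentence by value...
        TokenizedSentence ts = concordia.tokenize("Ala posiada kota");

        // ...and addTokenizedExample() takes it by const reference.
        concordia.addTokenizedExample(ts, 14);
        concordia.refreshSAfromRAM();
        return 0;
    }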
@@ -29,7 +29,7 @@ void checkConcordiaResults(
     long lineIndex = 1;
     BOOST_FOREACH(ConcordiaSearchResult result, results) {
         SUFFIX_MARKER_TYPE patternSize =
-                result.getTokenizedPattern()->getTokens().size();
+                result.getTokenizedPattern().getTokens().size();
         if (patternSize > 0) {
             if (result.getBestOverlay().size() != 1) {
                 reportError(baseLineCount + lineIndex,
@@ -203,7 +203,7 @@ int main(int argc, char** argv) {

         std::cout << "\tPattern used: " << std::endl << "\t\t";
         BOOST_FOREACH(TokenAnnotation annotation,
-                result->getTokenizedPattern()->getTokens()) {
+                result->getTokenizedPattern().getTokens()) {
             std::cout << annotation.getValue() << " ";
         }
         std::cout << std::endl;
@@ -1,4 +1,5 @@
 #include <sstream>
+#include <boost/foreach.hpp>

 #include "concordia/concordia.hpp"
 #include "concordia/common/config.hpp"
@@ -42,19 +43,31 @@ std::string _createLibraryVersion() {
     return version.str();
 }

-boost::shared_ptr<TokenizedSentence>
+TokenizedSentence
     Concordia::tokenize(const std::string & sentence)
                                         throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> result =
+    TokenizedSentence result =
         _hashGenerator->generateHash(sentence);
     _hashGenerator->serializeWordMap();
     return result;
 }

+std::vector<TokenizedSentence> Concordia::tokenizeAll(
+    const std::vector<std::string> & sentences)
+                                        throw(ConcordiaException) {
+    std::vector<TokenizedSentence> result;
+    BOOST_FOREACH(std::string sentence, sentences) {
+        result.push_back(_hashGenerator->generateHash(sentence));
+    }
+
+    _hashGenerator->serializeWordMap();
+    return result;
+}
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
-boost::shared_ptr<TokenizedSentence> Concordia::addExample(
+TokenizedSentence Concordia::addExample(
     const Example & example)
                                         throw(ConcordiaException) {
     return _index->addExample(_hashGenerator, _T, _markers, example);
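A quick sketch of how the new tokenizeAll entry point reads from the caller's side (the sentence strings are illustrative, borrowed from the updated tests further down):

    std::vector<std::string> sentences;
    sentences.push_back("Alice has a cat");
    sentences.push_back("Alice has a dog");

    // One pass over the batch; note that serializeWordMap() runs once
    // per call, not once per sentence as repeated tokenize() calls would.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);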
@@ -63,13 +76,21 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
 void Concordia::addTokenizedExample(
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id)
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id)
                                         throw(ConcordiaException) {
     _index->addTokenizedExample(_hashGenerator, _T,
                                 _markers, tokenizedSentence, id);
 }

+void Concordia::addAllTokenizedExamples(
+    const std::vector<TokenizedSentence> & tokenizedSentences,
+    const std::vector<SUFFIX_MARKER_TYPE> & ids)
+                                        throw(ConcordiaException) {
+    _index->addAllTokenizedExamples(_hashGenerator, _T,
+                                    _markers, tokenizedSentences, ids);
+}
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
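Combined with tokenizeAll, this gives the batch indexing workflow the commit is named after (a sketch mirroring the updated ConcordiaSearch2 test below; the ids are illustrative):

    std::vector<std::string> sentences;
    std::vector<SUFFIX_MARKER_TYPE> ids;
    sentences.push_back("Alice has a cat");
    ids.push_back(56);
    sentences.push_back("Alice has a dog");
    ids.push_back(23);

    // Tokenize the whole batch, add it in one call, then rebuild the
    // suffix array on demand.
    std::vector<TokenizedSentence> tokenized = concordia.tokenizeAll(sentences);
    concordia.addAllTokenizedExamples(tokenized, ids);
    concordia.refreshSAfromRAM();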
@@ -188,8 +209,7 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
     } else {
         std::string empty;
         return boost::shared_ptr<ConcordiaSearchResult>(
-            new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
-                new TokenizedSentence(empty))));
+            new ConcordiaSearchResult(TokenizedSentence(empty)));
     }
 }

@@ -58,7 +58,16 @@ public:
       containing information about original word positions
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence)
+                                        throw(ConcordiaException);
+
+    /*! Tokenizes all the given sentences.
+      \param sentences vector of sentences to be tokenized
+      \returns vector of tokenized sentence objects
+      \throws ConcordiaException
+    */
+    std::vector<TokenizedSentence> tokenizeAll(
+        const std::vector<std::string> & sentences)
                                         throw(ConcordiaException);

     /*! Adds an Example to the index.
@@ -67,17 +76,27 @@ public:
       containing information about original word positions
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
+    TokenizedSentence addExample(const Example & example)
                                         throw(ConcordiaException);

     /*! Adds a tokenized example to the index.
       \param tokenizedSentence tokenized sentence to be added
-      \param id of the sentence to be added
+      \param id id of the sentence to be added
       \throws ConcordiaException
     */
     void addTokenizedExample(
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id)
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id)
+                                        throw(ConcordiaException);
+
+    /*! Adds multiple tokenized examples to the index.
+      \param tokenizedSentences vector of tokenized sentences to be added
+      \param ids vector of ids of the sentences to be added
+      \throws ConcordiaException
+    */
+    void addAllTokenizedExamples(
+        const std::vector<TokenizedSentence> & tokenizedSentences,
+        const std::vector<SUFFIX_MARKER_TYPE> & ids)
                                         throw(ConcordiaException);

     /*! Adds multiple examples to the index.
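The header now promises value semantics throughout; for instance, addExample can be used like this (the example text and id mirror the ConcordiaSimpleSearch1 test below):

    TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota", 14));
    size_t tokenCount = ts.getTokens().size();  // direct access, no shared_ptr dereference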
@@ -4,6 +4,8 @@
 #include "concordia/common/config.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/foreach.hpp>
+#include <boost/make_shared.hpp>
+
 #include <iostream>
 #include <climits>

@@ -48,10 +50,10 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(

     std::vector<TokenizedSentence> hashedPatterns;
     BOOST_FOREACH(Example example, examples) {
-        boost::shared_ptr<TokenizedSentence> hashedPattern =
+        TokenizedSentence hashedPattern =
             _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, example);
-        hashedPatterns.push_back(*hashedPattern);
+        hashedPatterns.push_back(hashedPattern);
     }

     hashedIndexFile.close();
@@ -61,7 +63,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
     return hashedPatterns;
 }

-boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
+TokenizedSentence ConcordiaIndex::addExample(
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@@ -72,7 +74,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
     std::ofstream markersFile;
     markersFile.open(_markersFilePath.c_str(), std::ios::out|
                                 std::ios::app|std::ios::binary);
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
+    TokenizedSentence hashedPattern =
         _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                           T, markers, example);
     hashedIndexFile.close();
@@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample(
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id) {
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id) {
     std::ofstream hashedIndexFile;
     hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
                                 std::ios::app|std::ios::binary);
@@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample(
     markersFile.close();
 }

+void ConcordiaIndex::addAllTokenizedExamples(
+    boost::shared_ptr<HashGenerator> hashGenerator,
+    boost::shared_ptr<std::vector<sauchar_t> > T,
+    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+    const std::vector<TokenizedSentence> & tokenizedSentences,
+    const std::vector<SUFFIX_MARKER_TYPE> & ids) {
+    std::ofstream hashedIndexFile;
+    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
+                                std::ios::app|std::ios::binary);
+    std::ofstream markersFile;
+    markersFile.open(_markersFilePath.c_str(), std::ios::out|
+                                std::ios::app|std::ios::binary);
+
+    int index = 0;
+    BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) {
+        _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
+                                   T, markers, tokenizedSentence, ids.at(index));
+        index++;
+    }
+    hashedIndexFile.close();
+    markersFile.close();
+}
+
 void ConcordiaIndex::_addSingleTokenizedExample(
     std::ofstream & hashedIndexFile,
     std::ofstream & markersFile,
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-    SUFFIX_MARKER_TYPE id) {
-    std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
+    const TokenizedSentence & tokenizedSentence,
+    const SUFFIX_MARKER_TYPE id) {
+    std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();

     int offset = 0;
     for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
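One behavioral note on the new batch method: sentences and ids are paired positionally through ids.at(index), so a size mismatch only surfaces as std::out_of_range part-way through writing to the index files. A defensive caller-side guard might look like this (a sketch; the library itself does not enforce the check here):

    if (tokenizedSentences.size() != ids.size()) {
        throw ConcordiaException("tokenizedSentences and ids differ in size");
    }
    concordia.addAllTokenizedExamples(tokenizedSentences, ids);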
@@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample(
         markers->push_back(sentenceBoundaryMA);
 }

-boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
+TokenizedSentence ConcordiaIndex::_addSingleExample(
     std::ofstream & hashedIndexFile,
     std::ofstream & markersFile,
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
     const Example & example) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
+    TokenizedSentence hashedPattern =
         hashGenerator->generateHash(example.getSentence());
     _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                                T, markers, hashedPattern, example.getId());
@@ -53,7 +53,7 @@ public:
       \returns tokenized example
       \throws ConcordiaException
     */
-    boost::shared_ptr<TokenizedSentence> addExample(
+    TokenizedSentence addExample(
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@@ -63,7 +63,6 @@ public:
       and markers array are appended with the example.
       At the same time, HDD versions of these
       two data structures are also appended with the same example.
-      The method returns a tokenized version of the example.
       \param hashGenerator hash generator to be used to prepare the hash
                            of the example
       \param T RAM-based hash index to be appended to
@@ -77,8 +76,28 @@ public:
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id);
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id);
+
+    /*! Adds multiple tokenized examples to the index. Hashed index
+      and markers array are appended with the examples.
+      At the same time, HDD versions of these
+      two data structures are also appended with the same examples.
+      \param hashGenerator hash generator to be used to prepare the hash
+                           of the examples
+      \param T RAM-based hash index to be appended to
+      \param markers RAM-based markers array to be appended to
+      \param tokenizedSentences vector of tokenized sentences to be added
+      \param ids vector of ids of the sentences to be added
+      \throws ConcordiaException
+    */
+    void addAllTokenizedExamples(
+        boost::shared_ptr<HashGenerator> hashGenerator,
+        boost::shared_ptr<std::vector<sauchar_t> > T,
+        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+        const std::vector<TokenizedSentence> & tokenizedSentences,
+        const std::vector<SUFFIX_MARKER_TYPE> & ids);

     /*! Adds multiple examples to the index. Examples are first hashed using
       the hash generator passed to this method. Then, hashed index
@@ -114,10 +133,10 @@ private:
         boost::shared_ptr<HashGenerator> hashGenerator,
         boost::shared_ptr<std::vector<sauchar_t> > T,
         boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-        boost::shared_ptr<TokenizedSentence> tokenizedSentence,
-        SUFFIX_MARKER_TYPE id);
+        const TokenizedSentence & tokenizedSentence,
+        const SUFFIX_MARKER_TYPE id);

-    boost::shared_ptr<TokenizedSentence> _addSingleExample(
+    TokenizedSentence _addSingleExample(
         std::ofstream & hashedIndexFile,
         std::ofstream & markersFile,
         boost::shared_ptr<HashGenerator> hashGenerator,
@@ -4,7 +4,7 @@
 #include <algorithm>

 ConcordiaSearchResult::ConcordiaSearchResult(
-    boost::shared_ptr<TokenizedSentence> tokenizedPattern):
+    TokenizedSentence tokenizedPattern):
     _tokenizedPattern(tokenizedPattern),
     _bestOverlayScore(0) {
 }
@@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() {
     // the fragments are already sorted by their ends, ascending
     _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
                            -1,
-                           _tokenizedPattern->getTokens().size());
+                           _tokenizedPattern.getTokens().size());
 }

 void ConcordiaSearchResult::_checkPossibleOverlays(
@@ -26,8 +26,7 @@ public:
     /*! Constructor.
       \param tokenVector tokenized pattern which was used for searching
     */
-    explicit ConcordiaSearchResult(
-        boost::shared_ptr<TokenizedSentence> tokenizedPattern);
+    explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern);

     /*! Destructor.
     */
@@ -51,7 +50,7 @@ public:
     /*! Getter for tokenized pattern.
       \returns tokenized search pattern
     */
-    boost::shared_ptr<TokenizedSentence> getTokenizedPattern() const {
+    TokenizedSentence getTokenizedPattern() const {
         return _tokenizedPattern;
     }

@@ -82,7 +81,7 @@ private:
         SUFFIX_MARKER_TYPE lastAddedPos,
         SUFFIX_MARKER_TYPE patternSize);

-    boost::shared_ptr<TokenizedSentence> _tokenizedPattern;
+    TokenizedSentence _tokenizedPattern;

     std::vector<MatchedPatternFragment> _matchedPatternFragments;

@@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
 HashGenerator::~HashGenerator() {
 }

-boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
+TokenizedSentence HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> ts =
-        _sentenceTokenizer->tokenize(sentence);
-    ts->generateHash(_wordMap);
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+    ts.generateHash(_wordMap);

-    if (ts->getTokens().size() > Utils::maxSentenceSize) {
+    if (ts.getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }

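Since generateHash still enforces Utils::maxSentenceSize, callers of the value-returning API keep the same failure mode (a sketch; longSentence is hypothetical):

    try {
        TokenizedSentence ts = concordia.tokenize(longSentence);
    } catch (ConcordiaException & e) {
        // Raised when the token count exceeds Utils::maxSentenceSize:
        // "Trying to add too long sentence."
    }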
@@ -44,8 +44,7 @@ public:
       \param sentence sentence to generate hash from
       \returns tokenized sentence, containing the hash
     */
-    boost::shared_ptr<TokenizedSentence> generateHash(
-        const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence)
                                         throw(ConcordiaException);

     /*!
@@ -23,7 +23,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(

     int left;
     std::vector<INDEX_CHARACTER_TYPE> hash =
-        hashGenerator->generateHash(pattern)->getCodes();
+        hashGenerator->generateHash(pattern).getCodes();
     saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
     sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

@@ -60,7 +60,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
     boost::shared_ptr<std::vector<saidx_t> > SA,
     const std::string & pattern) throw(ConcordiaException) {
     std::vector<INDEX_CHARACTER_TYPE> hash =
-        hashGenerator->generateHash(pattern)->getCodes();
+        hashGenerator->generateHash(pattern).getCodes();
     return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
 }

@@ -70,13 +70,12 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
     boost::shared_ptr<std::vector<saidx_t> > SA,
     const std::string & pattern) throw(ConcordiaException) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
-        hashGenerator->generateHash(pattern);
+    TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern);
     boost::shared_ptr<ConcordiaSearchResult> result =
         boost::shared_ptr<ConcordiaSearchResult>(
             new ConcordiaSearchResult(hashedPattern));

     _concordiaSearcher->concordiaSearch(result, T, markers,
-                                        SA, hashedPattern->getCodes());
+                                        SA, hashedPattern.getCodes());
     return result;
 }
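The search path is unchanged for callers; only the internals now pass TokenizedSentence by value. For orientation, a typical call (mirroring the tests below) still looks like:

    boost::shared_ptr<ConcordiaSearchResult> result =
        concordia.concordiaSearch("Our new test product has nothing to do with computers");

    // Best overlay of matched fragments over the pattern, with its score.
    size_t fragmentCount = result->getFragments().size();
    double score = result->getBestOverlayScore();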
@@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString,
 RegexRule::~RegexRule() {
 }

-void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
+void RegexRule::apply(TokenizedSentence & sentence) {
     try {
-        UnicodeString s(sentence->getSentence().c_str());
+        UnicodeString s(sentence.getSentence().c_str());
         boost::u32regex_iterator<const UChar*> begin(
             boost::make_u32regex_iterator(s, _pattern));
         boost::u32regex_iterator<const UChar*> end;
@@ -58,12 +58,12 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
                                  _annotationType, value);
             annotations.push_back(annotation);
         }
-        sentence->addAnnotations(annotations);
+        sentence.addAnnotations(annotations);
     } catch(const std::exception & e) {
         std::stringstream ss;
         ss << "Exception while applying regex rule: "
            << _annotationType << " to text: "
-           << sentence->getSentence();
+           << sentence.getSentence();
         ss << ", message: " << e.what();
         throw ConcordiaException(ss.str());
     }
@@ -42,7 +42,7 @@ public:
     /*! Applies regex annotation on tokenized sentence.
       \param sentence the input sentence
     */
-    void apply(boost::shared_ptr<TokenizedSentence> sentence);
+    void apply(TokenizedSentence & sentence);

 private:
     int _annotationType;
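With apply() taking a mutable reference, rules now annotate a stack-allocated sentence directly (a sketch following the updated regex_rule tests below):

    RegexRule rr("a", TokenAnnotation::WORD, "b");
    TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
    rr.apply(ts);  // annotations land in ts itself, no shared_ptr involved
    std::list<TokenAnnotation> annotations = ts.getAnnotations();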
@@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }

-boost::shared_ptr<TokenizedSentence>
-    SentenceTokenizer::tokenize(const std::string & sentence) {
-    boost::shared_ptr<TokenizedSentence>
-        result(new TokenizedSentence(sentence));
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+    TokenizedSentence result(sentence);

     _htmlTags->apply(result);

@@ -35,7 +33,7 @@ boost::shared_ptr<TokenizedSentence>
         neRule.apply(result);
     }

-    result->toLowerCase();
+    result.toLowerCase();

     if (_stopWordsEnabled) {
         _stopWords->apply(result);
@@ -36,8 +36,7 @@ public:
       \param sentence input sentence
       \returns tokenized sentence object build on the input sentence
     */
-    boost::shared_ptr<TokenizedSentence>
-        tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence);

 private:
     void _createNeRules(std::string & namedEntitiesPath);
@@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
 BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Ala posiada kota",14));
+    TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
     /*
        0,3 type: 1 value: ala
        4,11 type: 1 value: posiada
        12,16 type: 1 value: kota
     */
-    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");

     concordia.addExample(Example("Ala posiada rysia",51));
     concordia.addExample(Example("Marysia posiada rysia",123));
@@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
 BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    /*
     concordia.addExample(Example("Alice has a cat", 56));
     concordia.addExample(Example("Alice has a dog", 23));
     concordia.addExample(Example("New test product has a mistake", 321));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
+    */
+    std::vector<std::string> sentences;
+    std::vector<SUFFIX_MARKER_TYPE> ids;
+    sentences.push_back("Alice has a cat");
+    ids.push_back(56);
+    sentences.push_back("Alice has a dog");
+    ids.push_back(23);
+    sentences.push_back("New test product has a mistake");
+    ids.push_back(321);
+    std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
+    concordia.addAllTokenizedExamples(tokenizedSentences, ids);
+
+    TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
     concordia.addTokenizedExample(ts, 14);

     concordia.refreshSAfromRAM();

     boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
     // best overlay:

-    /*
     BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
-    BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
-    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
-    */
+    BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5);
+    BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9);

     BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8);

@@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
 BOOST_AUTO_TEST_CASE( Tokenize )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
-    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
+    TokenizedSentence ts = concordia.tokenize(" Ala posiada kota");
     /*
        0,3 type: 1 value: ala
        4,11 type: 1 value: posiada
@@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize )

     concordia.clearIndex();

-    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
-    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
+
+    std::vector<std::string> sentences;
+    sentences.push_back("Marysia, ma rysia;");
+    sentences.push_back("Testing complete;");
+    sentences.push_back("This, is (a) weird;! sentence <>");
+    std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
+
+    BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
+    BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
+
 }
 BOOST_AUTO_TEST_SUITE_END()
@@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )

     // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)

-    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes();

     boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);

@@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )

     HashGenerator hashGenerator = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected;
     expected.push_back(0);
     expected.push_back(1);
@@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )

     HashGenerator hashGenerator1 = HashGenerator(config);

-    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected1;
     expected1.push_back(0);
     expected1.push_back(1);
@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
     hashGenerator1.serializeWordMap();

     HashGenerator hashGenerator2 = HashGenerator(config);
-    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes();
+    std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes();
     std::vector<INDEX_CHARACTER_TYPE> expected2;
     expected2.push_back(0);
     expected2.push_back(1);
@@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )

     HashGenerator hashGenerator = HashGenerator(config);

-    boost::shared_ptr<TokenizedSentence> tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");
+    TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że <b>kierowca</b> zaparkował samochód.");

-    std::vector<TokenAnnotation> tokens = tokenizedSentence->getTokens();
+    std::vector<TokenAnnotation> tokens = tokenizedSentence.getTokens();

     /*
     BOOST_FOREACH(TokenAnnotation annotation, tokens) {
@@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
 BOOST_AUTO_TEST_CASE( SimpleAnnotation )
 {
     RegexRule rr("a", TokenAnnotation::WORD, "b");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),7);
@@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex )
 BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 {
     RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'.");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),3);
@@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 {
     RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
+    TokenizedSentence ts("This is AbC and ABC and abc and aBC.");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),8);
@@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
 BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
 {
     RegexRule rr("ą", TokenAnnotation::WORD, "x");
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
+    TokenizedSentence ts("zażółć gęślą jaźń");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 {
     RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
 BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
 {
     RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
-    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
     rr.apply(ts);
-    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(iter->getStart(),2);
@@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest )


     std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(14,annotations.size());
@@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     /*
@@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     /*
@@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
     if (config->isStopWordsEnabled()) {
         SentenceTokenizer tokenizer(config);
         std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
+        BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne");
     }
 }

@@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
     SentenceTokenizer tokenizer(config);

     std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
     std::list<TokenAnnotation>::iterator iter = annotations.begin();

     BOOST_CHECK_EQUAL(161, annotations.size());