tokenize only option - no word map
This commit is contained in:
parent
bbf3853d2a
commit
b3d7c993aa
@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
|
|||||||
|
|
||||||
TokenizedSentence
|
TokenizedSentence
|
||||||
Concordia::tokenize(const std::string & sentence,
|
Concordia::tokenize(const std::string & sentence,
|
||||||
bool byWhitespace)
|
bool byWhitespace,
|
||||||
|
bool generateCodes)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
TokenizedSentence result =
|
if (generateCodes) {
|
||||||
_hashGenerator->generateHash(sentence, byWhitespace);
|
TokenizedSentence result =
|
||||||
_hashGenerator->serializeWordMap();
|
_hashGenerator->generateHash(sentence, byWhitespace);
|
||||||
return result;
|
_hashGenerator->serializeWordMap();
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
return _hashGenerator->generateTokens(sentence, byWhitespace);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
||||||
const std::vector<std::string> & sentences,
|
const std::vector<std::string> & sentences,
|
||||||
bool byWhitespace)
|
bool byWhitespace,
|
||||||
|
bool generateCodes)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
std::vector<TokenizedSentence> result;
|
std::vector<TokenizedSentence> result;
|
||||||
BOOST_FOREACH(std::string sentence, sentences) {
|
|
||||||
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
if (generateCodes) {
|
||||||
}
|
BOOST_FOREACH(std::string sentence, sentences) {
|
||||||
|
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
||||||
|
}
|
||||||
|
|
||||||
_hashGenerator->serializeWordMap();
|
_hashGenerator->serializeWordMap();
|
||||||
|
} else {
|
||||||
|
BOOST_FOREACH(std::string sentence, sentences) {
|
||||||
|
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
|
||||||
|
}
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,23 +61,27 @@ public:
|
|||||||
/*! Tokenizes the given sentence.
|
/*! Tokenizes the given sentence.
|
||||||
\param sentence sentence to be tokenized
|
\param sentence sentence to be tokenized
|
||||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||||
|
\param generateCodes whether to generate codes for tokens using WordMap
|
||||||
\returns tokenized sentence object,
|
\returns tokenized sentence object,
|
||||||
containing information about original word positions
|
containing information about original word positions
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
TokenizedSentence tokenize(const std::string & sentence,
|
TokenizedSentence tokenize(const std::string & sentence,
|
||||||
bool byWhitespace = false)
|
bool byWhitespace = false,
|
||||||
|
bool generateCodes = true)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Tokenizes all the given sentences.
|
/*! Tokenizes all the given sentences.
|
||||||
\param sentences vector of sentences to be tokenized
|
\param sentences vector of sentences to be tokenized
|
||||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||||
|
\param generateCodes whether to generate codes for tokens using WordMap
|
||||||
\returns vector of tokenized sentence objects
|
\returns vector of tokenized sentence objects
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<TokenizedSentence> tokenizeAll(
|
std::vector<TokenizedSentence> tokenizeAll(
|
||||||
const std::vector<std::string> & sentences,
|
const std::vector<std::string> & sentences,
|
||||||
bool byWhitespace = false)
|
bool byWhitespace = false,
|
||||||
|
bool generateCodes = true)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Adds an Example to the index.
|
/*! Adds an Example to the index.
|
||||||
|
@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
|
|||||||
return ts;
|
return ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tokenize-only counterpart of generateHash: the sentence is split by the
// sentence tokenizer and its token list is filled, but no word-map codes
// are produced.
//
// \param sentence sentence to tokenize
// \param byWhitespace forwarded unchanged to the sentence tokenizer
// \returns the tokenized sentence with its tokens list populated
// \throws ConcordiaException when the number of tokens exceeds
//         Utils::maxSentenceSize
TokenizedSentence HashGenerator::generateTokens(
                                 const std::string & sentence,
                                 bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateTokens();

    // The length limit is checked only after the tokens have been generated.
    const bool tooLong =
        tokenized.getTokens().size() > Utils::maxSentenceSize;
    if (tooLong) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenized;
}
|
||||||
|
|
||||||
void HashGenerator::serializeWordMap() {
|
void HashGenerator::serializeWordMap() {
|
||||||
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||||
boost::archive::binary_oarchive oa(ofs);
|
boost::archive::binary_oarchive oa(ofs);
|
||||||
|
@ -51,6 +51,17 @@ public:
|
|||||||
bool byWhitespace = false)
|
bool byWhitespace = false)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Tokenize-only counterpart of generateHash: tokenizes the sentence
|
||||||
|
and fills its tokens list. The resulting TokenizedSentence carries no
|
||||||
|
token codes, because the codes list is not filled by this method.
|
||||||
|
\param sentence sentence to tokenize
|
||||||
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||||
|
\returns tokenized sentence, containing the tokens (and no codes)
|
||||||
|
\throws ConcordiaException when the sentence yields more tokens than Utils::maxSentenceSize
|
||||||
|
*/
|
||||||
|
TokenizedSentence generateTokens(const std::string & sentence,
|
||||||
|
bool byWhitespace = false)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Saves the contents of current WordMap to HDD.
|
Saves the contents of current WordMap to HDD.
|
||||||
*/
|
*/
|
||||||
|
@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
|
|||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exercises the tokenize-only mode (generateCodes == false): token positions,
// types and values are checked, and the code list must stay empty.
BOOST_AUTO_TEST_CASE( TokenizeOnly )
{
    Concordia concordia(
        TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    const bool byWhitespace = true;
    const bool generateCodes = false;
    TokenizedSentence tokenized = concordia.tokenize(
        "Ala 23 --- ..//,./ '''8902347 posiada kota",
        byWhitespace,
        generateCodes);

    BOOST_CHECK_EQUAL(tokenized.getTokens().size(), 7);

    // Second token: the number "23".
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getEnd(), 6);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getValue(), "23");

    // Fourth token: the punctuation run, kept whole when splitting by whitespace.
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getStart(), 11);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getEnd(), 18);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getType(), 1);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getValue(), "..//,./");

    // Tokenize-only mode must leave the code list empty.
    BOOST_CHECK_EQUAL(tokenized.getCodes().size(), 0);

    concordia.clearIndex();
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fills the tokens list from the token annotations without producing codes.
// Only annotations of type WORD or NE are kept; each one is appended to
// _tokens in the order the annotations are visited.
void TokenizedSentence::generateTokens() {
    BOOST_FOREACH(TokenAnnotation candidate, _tokenAnnotations) {
        if (!(candidate.getType() == TokenAnnotation::WORD ||
              candidate.getType() == TokenAnnotation::NE)) {
            continue;  // annotations of any other type are not tokens
        }
        _tokens.push_back(candidate);
    }
}
|
||||||
|
|
||||||
|
@ -67,13 +67,21 @@ public:
|
|||||||
/*! Method for generating hash based on annotations.
|
/*! Method for generating hash based on annotations.
|
||||||
This method takes into account annotations of type
|
This method takes into account annotations of type
|
||||||
word and named entity. These are encoded and added
|
word and named entity. These are encoded and added
|
||||||
to to code list. Annotations corresponding to these
|
to code list. Annotations corresponding to these
|
||||||
tokens are added to the tokens list.
|
tokens are added to the tokens list.
|
||||||
\param wordMap word map to use when encoding tokens
|
\param wordMap word map to use when encoding tokens
|
||||||
\returns tokens list
|
|
||||||
*/
|
*/
|
||||||
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||||
|
|
||||||
|
/*! Method for generating tokens based on annotations.
|
||||||
|
Only annotations of type word and named entity are
|
||||||
|
taken into account. Unlike in generateHash, they are
|
||||||
|
not encoded and nothing is added to the code list.
|
||||||
|
Each matching annotation is appended
|
||||||
|
to the tokens list; annotations of other types are skipped.
|
||||||
|
*/
|
||||||
|
void generateTokens();
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Transform the sentence to lower case.
|
Transform the sentence to lower case.
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user