Add tokenize-only option (no word map codes or serialization)
This commit is contained in:
parent
bbf3853d2a
commit
b3d7c993aa
@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
|
||||
|
||||
// Tokenizes a single sentence. When generateCodes is true the sentence is
// hashed — tokens receive integer codes from the WordMap — and the updated
// word map is persisted to disk. Otherwise only plain tokenization is
// performed and the resulting TokenizedSentence carries no codes.
// NOTE(review): dynamic exception specifications (throw(...)) are deprecated
// in C++11 and removed in C++17; kept here for consistency with the project.
TokenizedSentence
    Concordia::tokenize(const std::string & sentence,
                        bool byWhitespace,
                        bool generateCodes)
                                       throw(ConcordiaException) {
    if (!generateCodes) {
        // Tokenize-only path: no codes generated, word map left untouched.
        return _hashGenerator->generateTokens(sentence, byWhitespace);
    }
    TokenizedSentence hashed =
                   _hashGenerator->generateHash(sentence, byWhitespace);
    // generateHash may have added new entries to the word map — persist it.
    _hashGenerator->serializeWordMap();
    return hashed;
}
|
||||
|
||||
std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
||||
const std::vector<std::string> & sentences,
|
||||
bool byWhitespace)
|
||||
bool byWhitespace,
|
||||
bool generateCodes)
|
||||
throw(ConcordiaException) {
|
||||
std::vector<TokenizedSentence> result;
|
||||
|
||||
if (generateCodes) {
|
||||
BOOST_FOREACH(std::string sentence, sentences) {
|
||||
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
||||
}
|
||||
|
||||
_hashGenerator->serializeWordMap();
|
||||
} else {
|
||||
BOOST_FOREACH(std::string sentence, sentences) {
|
||||
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -61,23 +61,27 @@ public:
|
||||
/*! Tokenizes the given sentence.
|
||||
\param sentence sentence to be tokenized
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\param generateCodes whether to generate codes for tokens using WordMap
|
||||
\returns tokenized sentence object,
|
||||
containing information about original word positions
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
TokenizedSentence tokenize(const std::string & sentence,
|
||||
bool byWhitespace = false)
|
||||
bool byWhitespace = false,
|
||||
bool generateCodes = true)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Tokenizes all the given sentences.
|
||||
\param sentences vector of sentences to be tokenized
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\param generateCodes whether to generate codes for tokens using WordMap
|
||||
\returns vector of tokenized sentence objects
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<TokenizedSentence> tokenizeAll(
|
||||
const std::vector<std::string> & sentences,
|
||||
bool byWhitespace = false)
|
||||
bool byWhitespace = false,
|
||||
bool generateCodes = true)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Adds an Example to the index.
|
||||
|
@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
|
||||
return ts;
|
||||
}
|
||||
|
||||
// Tokenize-only counterpart of generateHash: annotations are extracted but
// no token codes are assigned and the word map is not modified.
// Throws ConcordiaException when the sentence exceeds the maximum length.
TokenizedSentence HashGenerator::generateTokens(
                const std::string & sentence,
                bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateTokens();

    // Enforce the same sentence-length limit as generateHash.
    if (tokenized.getTokens().size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenized;
}
|
||||
|
||||
void HashGenerator::serializeWordMap() {
|
||||
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||
boost::archive::binary_oarchive oa(ofs);
|
||||
|
@ -51,6 +51,17 @@ public:
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
This method acts like generateHash, but only performs tokenization.
|
||||
Resulting TokenizedSentence does not have token codes information.
|
||||
\param sentence sentence to tokenize
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\returns tokenized sentence, containing the tokens
|
||||
*/
|
||||
TokenizedSentence generateTokens(const std::string & sentence,
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
Saves the contents of current WordMap to HDD.
|
||||
*/
|
||||
|
@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
|
||||
|
||||
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
|
||||
|
||||
concordia.clearIndex();
|
||||
|
||||
}
|
||||
|
||||
// Verifies the tokenize-only mode: with generateCodes == false the returned
// TokenizedSentence must contain token annotations but an empty code list.
BOOST_AUTO_TEST_CASE( TokenizeOnly )
{
    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
           TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
    // byWhitespace = true, generateCodes = false.
    TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);

    // Second whitespace token: "23" at [4, 6).
    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");

    // Fourth whitespace token: "..//,./" at [11, 18).
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");

    BOOST_CHECK_EQUAL(ts.getCodes().size(), 0);  // there should be no codes, only tokens

    concordia.clearIndex();
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
||||
}
|
||||
}
|
||||
|
||||
void TokenizedSentence::generateTokens() {
|
||||
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
|
||||
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||
annotation.getType() == TokenAnnotation::NE) {
|
||||
_tokens.push_back(annotation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -67,13 +67,21 @@ public:
|
||||
/*! Method for generating hash based on annotations.
|
||||
This method takes into account annotations of type
|
||||
word and named entity. These are encoded and added
|
||||
to to code list. Annotations corresponding to these
|
||||
to code list. Annotations corresponding to these
|
||||
tokens are added to the tokens list.
|
||||
\param wordMap word map to use when encoding tokens
|
||||
\returns tokens list
|
||||
*/
|
||||
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||
|
||||
/*! Method for generating tokens based on annotations.
|
||||
This method takes into account annotations of type
|
||||
word and named entity. Unlike in generateHash,
|
||||
these are not encoded or added to code list.
|
||||
Annotations corresponding to these
|
||||
tokens are added to the tokens list.
|
||||
*/
|
||||
void generateTokens();
|
||||
|
||||
/*!
|
||||
Transform the sentence to lower case.
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user