tokenize only option - no word map

This commit is contained in:
rjawor 2016-01-01 20:45:07 +01:00
parent bbf3853d2a
commit b3d7c993aa
7 changed files with 96 additions and 14 deletions

View File

@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
TokenizedSentence TokenizedSentence
Concordia::tokenize(const std::string & sentence, Concordia::tokenize(const std::string & sentence,
bool byWhitespace) bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) { throw(ConcordiaException) {
TokenizedSentence result = if (generateCodes) {
_hashGenerator->generateHash(sentence, byWhitespace); TokenizedSentence result =
_hashGenerator->serializeWordMap(); _hashGenerator->generateHash(sentence, byWhitespace);
return result; _hashGenerator->serializeWordMap();
return result;
} else {
return _hashGenerator->generateTokens(sentence, byWhitespace);
}
} }
std::vector<TokenizedSentence> Concordia::tokenizeAll( std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences, const std::vector<std::string> & sentences,
bool byWhitespace) bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) { throw(ConcordiaException) {
std::vector<TokenizedSentence> result; std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); if (generateCodes) {
} BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
}
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
}
}
return result; return result;
} }

View File

@ -61,23 +61,27 @@ public:
/*! Tokenizes the given sentence. /*! Tokenizes the given sentence.
\param sentence sentence to be tokenized \param sentence sentence to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace \param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns tokenized sentence object, \returns tokenized sentence object,
containing information about original word positions containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
TokenizedSentence tokenize(const std::string & sentence, TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false) bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException); throw(ConcordiaException);
/*! Tokenizes all the given sentences. /*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized \param sentences vector of sentences to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace \param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns vector of tokenized sentence objects \returns vector of tokenized sentence objects
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<TokenizedSentence> tokenizeAll( std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences, const std::vector<std::string> & sentences,
bool byWhitespace = false) bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds an Example to the index. /*! Adds an Example to the index.

View File

@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
return ts; return ts;
} }
// Tokenizes the sentence and collects its tokens without generating
// any token codes. A sentence whose token count exceeds
// Utils::maxSentenceSize is rejected with a ConcordiaException.
TokenizedSentence HashGenerator::generateTokens(
                                 const std::string & sentence,
                                 bool byWhitespace) throw(ConcordiaException) {
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateTokens();

    // Happy path first: the sentence fits within the allowed size.
    if (tokenized.getTokens().size() <= Utils::maxSentenceSize) {
        return tokenized;
    }

    throw ConcordiaException("Trying to add too long sentence.");
}
void HashGenerator::serializeWordMap() { void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary); std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs); boost::archive::binary_oarchive oa(ofs);

View File

@ -51,6 +51,17 @@ public:
bool byWhitespace = false) bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*!
Tokenizes the sentence like generateHash does, but stops after
tokenization: the resulting TokenizedSentence holds tokens only
and carries no token codes.
\param sentence sentence to tokenize
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence, containing the tokens
\throws ConcordiaException when the sentence yields more tokens
than Utils::maxSentenceSize
*/
TokenizedSentence generateTokens(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException);
/*! /*!
Saves the contents of current WordMap to HDD. Saves the contents of current WordMap to HDD.
*/ */

View File

@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./"); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
concordia.clearIndex(); concordia.clearIndex();
} }
// Verifies the tokenize-only mode: with generateCodes == false the
// sentence is still split into tokens (positions, types and values are
// checked for two of them), but the code list stays empty.
BOOST_AUTO_TEST_CASE( TokenizeOnly )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
// Second argument: byWhitespace = true; third argument: generateCodes = false.
TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
// The input consists of seven whitespace-separated chunks.
BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
// Token 1 is "23", occupying characters 4..5 (end offset 6 is exclusive).
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
// Token 3 is the punctuation run "..//,./", occupying characters 11..17.
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); // tokenize-only mode: there should be no codes, only tokens
// Clean up the index so the test leaves no state behind.
concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
} }
} }
// Appends every word and named-entity annotation to the token list,
// in annotation order. No codes are produced here.
void TokenizedSentence::generateTokens() {
    BOOST_FOREACH(TokenAnnotation tokenAnnotation, _tokenAnnotations) {
        // Skip anything that is neither a word nor a named entity.
        if (tokenAnnotation.getType() != TokenAnnotation::WORD &&
            tokenAnnotation.getType() != TokenAnnotation::NE) {
            continue;
        }
        _tokens.push_back(tokenAnnotation);
    }
}

View File

@ -67,13 +67,21 @@ public:
/*! Method for generating hash based on annotations. /*! Method for generating hash based on annotations.
This method takes into account annotations of type This method takes into account annotations of type
word and named entity. These are encoded and added word and named entity. These are encoded and added
to to code list. Annotations corresponding to these to code list. Annotations corresponding to these
tokens are added to the tokens list. tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens \param wordMap word map to use when encoding tokens
\returns tokens list
*/ */
void generateHash(boost::shared_ptr<WordMap> wordMap); void generateHash(boost::shared_ptr<WordMap> wordMap);
/*! Method for generating tokens based on annotations.
Only annotations of type word and named entity are
taken into account; each of them is appended to the
tokens list in annotation order. Unlike in generateHash,
no word map is involved: the annotations are not
encoded and nothing is added to the code list.
The tokens list is not cleared beforehand, so tokens
are appended to whatever it already holds.
*/
void generateTokens();
/*! /*!
Transform the sentence to lower case. Transform the sentence to lower case.
*/ */