tokenize only option - no word map

This commit is contained in:
rjawor 2016-01-01 20:45:07 +01:00
parent bbf3853d2a
commit b3d7c993aa
7 changed files with 96 additions and 14 deletions

View File

@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
TokenizedSentence
Concordia::tokenize(const std::string & sentence,
bool byWhitespace)
bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) {
TokenizedSentence result =
_hashGenerator->generateHash(sentence, byWhitespace);
_hashGenerator->serializeWordMap();
return result;
if (generateCodes) {
TokenizedSentence result =
_hashGenerator->generateHash(sentence, byWhitespace);
_hashGenerator->serializeWordMap();
return result;
} else {
return _hashGenerator->generateTokens(sentence, byWhitespace);
}
}
std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences,
bool byWhitespace)
bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) {
std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
}
if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
}
_hashGenerator->serializeWordMap();
_hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
}
}
return result;
}

View File

@ -61,23 +61,27 @@ public:
/*! Tokenizes the given sentence.
\param sentence sentence to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false)
bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException);
/*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns vector of tokenized sentence objects
\throws ConcordiaException
*/
std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences,
bool byWhitespace = false)
bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException);
/*! Adds an Example to the index.

View File

@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
return ts;
}
TokenizedSentence HashGenerator::generateTokens(
    const std::string & sentence,
    bool byWhitespace) throw(ConcordiaException) {
    // Run the sentence tokenizer, then let the result build its tokens
    // list from the annotations it carries.
    TokenizedSentence tokenized =
        _sentenceTokenizer->tokenize(sentence, byWhitespace);
    tokenized.generateTokens();

    // Sentences with more tokens than Utils::maxSentenceSize are rejected.
    const bool exceedsLimit =
        tokenized.getTokens().size() > Utils::maxSentenceSize;
    if (exceedsLimit) {
        throw ConcordiaException("Trying to add too long sentence.");
    }

    return tokenized;
}
void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs);

View File

@ -51,6 +51,17 @@ public:
bool byWhitespace = false)
throw(ConcordiaException);
/*!
Tokenizes the sentence without assigning codes to the tokens.
This method acts like generateHash, but only performs tokenization.
Resulting TokenizedSentence does not have token codes information.
\param sentence sentence to tokenize
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence, containing the tokens
\throws ConcordiaException when the resulting number of tokens
exceeds Utils::maxSentenceSize
*/
TokenizedSentence generateTokens(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException);
/*!
Saves the contents of current WordMap to HDD.
*/

View File

@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
concordia.clearIndex();
}
BOOST_AUTO_TEST_CASE( TokenizeOnly )
{
    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
                                    TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    // Tokenize by whitespace (2nd argument) with code generation
    // switched off (3rd argument).
    const std::string sentence = "Ala 23 --- ..//,./ '''8902347 posiada kota";
    TokenizedSentence tokenized = concordia.tokenize(sentence, true, false);

    BOOST_CHECK_EQUAL(tokenized.getTokens().size(), 7);

    // Second token: "23"
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getEnd(), 6);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(1).getValue(), "23");

    // Fourth token: "..//,./"
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getStart(), 11);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getEnd(), 18);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getType(), 1);
    BOOST_CHECK_EQUAL(tokenized.getTokens().at(3).getValue(), "..//,./");

    // Tokenize-only mode must yield tokens but no codes.
    BOOST_CHECK_EQUAL(tokenized.getCodes().size(), 0);

    concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
}
}
void TokenizedSentence::generateTokens() {
    // Walk all annotations and keep only words and named entities;
    // the kept annotations are appended to the tokens list as they are.
    BOOST_FOREACH(TokenAnnotation candidate, _tokenAnnotations) {
        if (candidate.getType() != TokenAnnotation::WORD &&
            candidate.getType() != TokenAnnotation::NE) {
            continue;
        }
        _tokens.push_back(candidate);
    }
}

View File

@ -67,13 +67,21 @@ public:
/*! Method for generating hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to to code list. Annotations corresponding to these
to code list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
\returns tokens list
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
/*! Method for generating tokens based on annotations.
This method takes into account only annotations of type
word (TokenAnnotation::WORD) and named entity
(TokenAnnotation::NE). Unlike in generateHash,
these are not encoded or added to code list, and no
word map is needed. Annotations corresponding to these
tokens are appended to the tokens list.
*/
void generateTokens();
/*!
Transform the sentence to lower case.
*/