new responsibilities of tokenized sentence

rjawor 2015-06-26 15:38:24 +02:00
parent 9b1735516c
commit 724bf0d080
8 changed files with 45 additions and 85 deletions

concordia/CMakeLists.txt

@@ -8,7 +8,6 @@ endforeach(dir)
 add_library(concordia SHARED
   token_annotation.cpp
   tokenized_sentence.cpp
-  hashed_sentence.cpp
   concordia_search_result.cpp
   matched_pattern_fragment.cpp
   concordia_searcher.cpp
@@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
 install(FILES
   token_annotation.hpp
   tokenized_sentence.hpp
-  hashed_sentence.hpp
   concordia_search_result.hpp
   matched_pattern_fragment.hpp
   concordia_searcher.hpp

concordia/hash_generator.cpp

@@ -1,10 +1,12 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
+#include "concordia/token_annotation.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
 #include <boost/algorithm/string.hpp>
+#include <boost/foreach.hpp>
 #include <fstream>
@@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
 std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
-    std::vector<INDEX_CHARACTER_TYPE> result;
-    std::vector<std::string> tokenTexts = generateTokenVector(sentence);
-    if (tokenTexts.size() > Utils::maxSentenceSize) {
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    ts->generateHash(_wordMap);
+    if (ts->getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }
-    for (std::vector<std::string>::iterator it = tokenTexts.begin();
-        it != tokenTexts.end(); ++it) {
-        std::string token = *it;
-        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
-        result.push_back(code);
-    }
-    return result;
+    return ts->getCodes();
 }
 
 std::vector<std::string> HashGenerator::generateTokenVector(
     const std::string & sentence) {
     boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(tokenizedSentence);
     std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
-        boost::algorithm::token_compress_on);
+    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            tokenTexts.push_back(annotation.getValue());
+        }
+    }
     return tokenTexts;
 }
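
Note on the refactor: generateHash is now a thin wrapper, with tokenization, the WORD/NE filter and word-code lookup all delegated to TokenizedSentence. A minimal caller-side sketch of the new flow; hashGenerator stands for a hypothetical instance already constructed with a sentence tokenizer and word map:

    // Hypothetical usage sketch; setup of hashGenerator is assumed.
    std::string sentence = "Alice has a cat";
    std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash(sentence);
    // Internally this now performs:
    //   ts = _sentenceTokenizer->tokenize(sentence);  // annotate the sentence
    //   ts->generateHash(_wordMap);                   // encode WORD/NE tokens
    //   return ts->getCodes();                        // codes in sentence order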

concordia/hash_generator.hpp

@@ -51,7 +51,7 @@ public:
     Generates vector of tokens from a sentence. This method is internally
     used by generateHash. However, for the sake of concordiaSearch
     (see \ref tutorial1_3), the vector of tokens resulting from sentence
-    anonymizing and tokenization is also needed.
+    tokenization is also needed.
     \param sentence sentence to tokenize
     \returns vector of tokens
     */
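
With the same change, generateTokenVector returns exactly the surface forms that generateHash encodes, since both now draw on the same WORD/NE annotations. A short hedged sketch (hashGenerator is again a hypothetical configured instance):

    // Token texts for concordiaSearch-style processing; annotations that are
    // neither WORD nor NE do not appear in the result.
    std::vector<std::string> tokenVector =
        hashGenerator.generateTokenVector("Alice has a cat");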

concordia/hashed_sentence.cpp

@@ -1,7 +0,0 @@
-#include "concordia/hashed_sentence.hpp"
-
-HashedSentence::HashedSentence() {
-}
-
-HashedSentence::~HashedSentence() {
-}

concordia/hashed_sentence.hpp

@@ -1,61 +0,0 @@
-#ifndef HASHED_SENTENCE_HDR
-#define HASHED_SENTENCE_HDR
-
-#include "concordia/common/config.hpp"
-#include "concordia/interval.hpp"
-
-#include <vector>
-#include <string>
-
-/*!
-  A sentence after hashing by the HashGenerator. The class holds
-  the list of word codes and intervals representing original
-  word positions in the sentence (char-based).
-*/
-class HashedSentence {
-public:
-    /*!
-      Constructor.
-    */
-    HashedSentence();
-
-    /*! Destructor.
-    */
-    virtual ~HashedSentence();
-
-    /*! Getter for original word positions list.
-      \returns original word positions list
-    */
-    std::vector<Interval> getOriginalWordPositions() const {
-        return _originalWordPositions;
-    }
-
-    /*! Getter for word codes list.
-      \returns word codes list
-    */
-    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
-        return _wordCodes;
-    }
-
-    /*! Method for adding a word code to the list
-      \param word code to be added
-    */
-    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
-        _wordCodes.push_back(wordCode);
-    }
-
-    /*! Method for adding an original word position to the list.
-      \param original word position
-    */
-    void addOriginalWordPosition(Interval & originalWordPosition) {
-        _originalWordPositions.push_back(originalWordPosition);
-    }
-
-private:
-    std::vector<Interval> _originalWordPositions;
-
-    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
-};
-
-#endif
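
The deleted HashedSentence paired word codes with char-based intervals locating each word in the original sentence. After this commit those responsibilities sit in TokenizedSentence: getCodes() takes over from getWordCodes(), and getTokens() returns the matching TokenAnnotation objects, which carry the positional information that _originalWordPositions held (assuming, as elsewhere in this codebase, that TokenAnnotation exposes its interval bounds). A minimal sketch:

    // tokenizer and wordMap are assumed to be configured elsewhere.
    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    ts->generateHash(wordMap);
    std::vector<INDEX_CHARACTER_TYPE> codes = ts->getCodes();  // was getWordCodes()
    std::vector<TokenAnnotation> tokens = ts->getTokens();     // replaces the interval list
    // codes[i] and tokens[i] describe the same word, by construction.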

concordia/t/test_hash_generator.cpp

@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
     BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
 }
 
-/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
    Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {

concordia/tokenized_sentence.cpp

@@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
 void TokenizedSentence::toLowerCase() {
     _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
+
+void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
+            _tokens.push_back(annotation);
+        }
+    }
+}
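
Note that generateHash appends to _codes and _tokens without clearing them, so it appears intended to be called once per TokenizedSentence instance. A hedged sketch of its effect, with an assumed word map that hands out consecutive codes to unseen words:

    // Suppose tokenization produced: "alice" (WORD), "," (non-word), "has" (WORD).
    ts->generateHash(wordMap);
    // ts->getCodes()  -> { code("alice"), code("has") }  // the "," is skipped
    // ts->getTokens() -> the two WORD annotations, index-aligned with the codes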

concordia/tokenized_sentence.hpp

@@ -3,6 +3,9 @@
 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
+#include "concordia/word_map.hpp"
+#include <boost/shared_ptr.hpp>
 #include <string>
+#include <vector>
 #include <list>
@@ -39,6 +42,16 @@ public:
         return _tokenAnnotations;
     }
 
+    std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
+        return _codes;
+    }
+
+    std::vector<TokenAnnotation> getTokens() const {
+        return _tokens;
+    }
+
+    void generateHash(boost::shared_ptr<WordMap> wordMap);
+
     /*!
       Transform the sentence to lower case.
     */
@@ -59,6 +72,10 @@
     std::string _sentence;
 
     std::list<TokenAnnotation> _tokenAnnotations;
 
+    std::vector<INDEX_CHARACTER_TYPE> _codes;
+
+    std::vector<TokenAnnotation> _tokens;
 };
 
 #endif
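
Taken together, a minimal end-to-end sketch of the new TokenizedSentence responsibilities (construction of sentenceTokenizer and wordMap is assumed and elided):

    boost::shared_ptr<TokenizedSentence> ts =
        sentenceTokenizer->tokenize("Alice has a cat");
    ts->generateHash(wordMap);  // fills _codes and _tokens from WORD/NE annotations
    assert(ts->getCodes().size() == ts->getTokens().size());  // aligned by construction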