new responsibilities of tokenized sentence

rjawor 2015-06-26 15:38:24 +02:00
parent 9b1735516c
commit 724bf0d080
8 changed files with 45 additions and 85 deletions

View File

@@ -8,7 +8,6 @@ endforeach(dir)
 add_library(concordia SHARED
     token_annotation.cpp
     tokenized_sentence.cpp
-    hashed_sentence.cpp
     concordia_search_result.cpp
     matched_pattern_fragment.cpp
     concordia_searcher.cpp
@@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
 install(FILES
     token_annotation.hpp
     tokenized_sentence.hpp
-    hashed_sentence.hpp
     concordia_search_result.hpp
     matched_pattern_fragment.hpp
     concordia_searcher.hpp

View File

@@ -1,10 +1,12 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
+#include "concordia/token_annotation.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
 #include <boost/algorithm/string.hpp>
+#include <boost/foreach.hpp>
 #include <fstream>
@@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
 std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
     const std::string & sentence) throw(ConcordiaException) {
     std::vector<INDEX_CHARACTER_TYPE> result;
-    std::vector<std::string> tokenTexts = generateTokenVector(sentence);
-    if (tokenTexts.size() > Utils::maxSentenceSize) {
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    ts->generateHash(_wordMap);
+    if (ts->getTokens().size() > Utils::maxSentenceSize) {
         throw ConcordiaException("Trying to add too long sentence.");
     }
-    for (std::vector<std::string>::iterator it = tokenTexts.begin();
-        it != tokenTexts.end(); ++it) {
-        std::string token = *it;
-        INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
-        result.push_back(code);
-    }
-    return result;
+    return ts->getCodes();
 }

 std::vector<std::string> HashGenerator::generateTokenVector(
     const std::string & sentence) {
     boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
-    std::string tokenizedSentence = ts->getSentence();
-    boost::trim(tokenizedSentence);
     std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
-        boost::algorithm::token_compress_on);
+    BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            tokenTexts.push_back(annotation.getValue());
+        }
+    }
     return tokenTexts;
 }
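Two observations on this hunk. generateHash is now a thin wrapper: tokenization and coding happen inside TokenizedSentence, which leaves the surviving `result` vector as dead code. generateTokenVector also changes behavior: the old whitespace split returned every token of the processed text, while the new loop returns only values annotated as WORD or NE. A minimal usage sketch (the example sentence and the hashGenerator instance are illustrative; only the two calls shown in this diff are assumed to exist):

    // Sketch only: HashGenerator construction is outside this commit.
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash("Alice has a cat");
    std::vector<std::string> tokenTexts =
        hashGenerator->generateTokenVector("Alice has a cat");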

View File

@@ -51,7 +51,7 @@ public:
     Generates vector of tokens from a sentence. This method is internally
     used by generateHash. However, for the sake of concordiaSearch
     (see \ref tutorial1_3), the vector of tokens resulting from sentence
-    anonymizing and tokenization is also needed.
+    tokenization is also needed.
     \param sentence sentence to tokenize
     \returns vector of tokens
     */
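Since generateHash and generateTokenVector now apply the same WORD / NE filter to the same tokenizer output, their results are index-aligned, which is what the concordiaSearch use case described above relies on. A hedged illustration of that pairing (instance construction omitted, as elsewhere in this commit):

    // tokenVector[i] is the surface form of the i-th code in the sentence hash.
    std::vector<std::string> tokenVector =
        hashGenerator->generateTokenVector(sentence);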

View File

@@ -1,7 +0,0 @@
-#include "concordia/hashed_sentence.hpp"
-
-HashedSentence::HashedSentence() {
-}
-
-HashedSentence::~HashedSentence() {
-}

View File

@@ -1,61 +0,0 @@
-#ifndef HASHED_SENTENCE_HDR
-#define HASHED_SENTENCE_HDR
-
-#include "concordia/common/config.hpp"
-#include "concordia/interval.hpp"
-#include <vector>
-#include <string>
-
-/*!
-  A sentence after hashing by the HashGenerator. The class holds
-  the list of word codes and intervals representing original
-  word positions in the sentence (char-based).
-*/
-class HashedSentence {
-public:
-    /*!
-      Constructor.
-    */
-    HashedSentence();
-
-    /*! Destructor.
-    */
-    virtual ~HashedSentence();
-
-    /*! Getter for original word positions list.
-      \returns original word positions list
-    */
-    std::vector<Interval> getOriginalWordPositions() const {
-        return _originalWordPositions;
-    }
-
-    /*! Getter for word codes list.
-      \returns word codes list
-    */
-    std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
-        return _wordCodes;
-    }
-
-    /*! Method for adding a word code to the list
-      \param word code to be added
-    */
-    void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
-        _wordCodes.push_back(wordCode);
-    }
-
-    /*! Method for adding an original word position to the list.
-      \param original word position
-    */
-    void addOriginalWordPosition(Interval & originalWordPosition) {
-        _originalWordPositions.push_back(originalWordPosition);
-    }
-
-private:
-    std::vector<Interval> _originalWordPositions;
-
-    std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
-};
-
-#endif
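This header was the last place word codes lived outside TokenizedSentence. Under the new API the HashedSentence workflow collapses to a hedged two-step (tokenizer and wordMap assumed constructed elsewhere); note that getOriginalWordPositions() has no direct one-call replacement in this commit, so any positional information has to be recovered from the stored TokenAnnotation objects, assuming they carry it:

    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    ts->generateHash(wordMap);                                     // replaces addWordCode() bookkeeping
    std::vector<INDEX_CHARACTER_TYPE> wordCodes = ts->getCodes();  // was HashedSentence::getWordCodes()
    std::vector<TokenAnnotation> tokens = ts->getTokens();         // per-token information travels here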

View File

@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
     BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
 }

-/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
+/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
    Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
 BOOST_AUTO_TEST_CASE( TooLongHashTest )
 {

View File

@@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
 void TokenizedSentence::toLowerCase() {
     _sentence = TextUtils::getInstance().toLowerCase(_sentence);
 }
+
+void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
+            _tokens.push_back(annotation);
+        }
+    }
+}
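The new method walks the annotation list once and pushes onto _codes and _tokens in the same iteration, so the two vectors stay index-aligned: _codes[i] is the code of _tokens[i]. A self-contained analogue of that filtering pass (plain C++ with stand-ins for TokenAnnotation and WordMap, which are assumptions, not the Concordia types) illustrates the invariant:

    #include <boost/foreach.hpp>
    #include <cassert>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Stand-ins for Concordia's TokenAnnotation and WordMap, kept only as
    // detailed as this commit's diff requires.
    enum TokenType { WORD, NE, OTHER };
    struct Annotation {
        TokenType type;
        std::string value;
        Annotation(TokenType t, const std::string & v) : type(t), value(v) {}
    };

    int main() {
        std::vector<Annotation> annotations;
        annotations.push_back(Annotation(WORD, "alice"));
        annotations.push_back(Annotation(OTHER, ","));
        annotations.push_back(Annotation(NE, "wonderland"));
        annotations.push_back(Annotation(WORD, "cat"));

        std::map<std::string, int> wordMap;  // WordMap::getWordCode stand-in
        std::vector<int> codes;
        std::vector<Annotation> tokens;

        BOOST_FOREACH(Annotation annotation, annotations) {
            if (annotation.type == WORD || annotation.type == NE) {
                std::map<std::string, int>::const_iterator found =
                    wordMap.find(annotation.value);
                int code;
                if (found == wordMap.end()) {
                    // first occurrence gets a fresh code, as a word map would assign
                    code = static_cast<int>(wordMap.size());
                    wordMap[annotation.value] = code;
                } else {
                    code = found->second;
                }
                codes.push_back(code);
                tokens.push_back(annotation);
            }
        }

        // The invariant the new method maintains: codes[i] codes tokens[i].
        assert(codes.size() == tokens.size());
        for (std::size_t i = 0; i < codes.size(); ++i)
            std::cout << tokens[i].value << " -> " << codes[i] << std::endl;
        return 0;
    }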

View File

@@ -3,6 +3,9 @@
 #include "concordia/common/config.hpp"
 #include "concordia/token_annotation.hpp"
+#include "concordia/word_map.hpp"
+
+#include <boost/shared_ptr.hpp>
 #include <string>
 #include <vector>
 #include <list>
@@ -39,6 +42,16 @@ public:
         return _tokenAnnotations;
     }

+    std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
+        return _codes;
+    }
+
+    std::vector<TokenAnnotation> getTokens() const {
+        return _tokens;
+    }
+
+    void generateHash(boost::shared_ptr<WordMap> wordMap);
+
     /*!
     Transform the sentence to lower case.
     */
@@ -59,6 +72,10 @@ private:
     std::string _sentence;

     std::list<TokenAnnotation> _tokenAnnotations;
+
+    std::vector<INDEX_CHARACTER_TYPE> _codes;
+
+    std::vector<TokenAnnotation> _tokens;
 };

 #endif
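One caveat the header itself doesn't state: _codes and _tokens are empty until generateHash runs, and both getters return copies rather than references. A hedged sketch of the calling contract implied by this commit (tokenizer, wordMap, and sentence assumed constructed elsewhere; this is not library documentation):

    boost::shared_ptr<TokenizedSentence> ts = tokenizer->tokenize(sentence);
    assert(ts->getCodes().empty());                           // nothing coded yet
    ts->generateHash(wordMap);                                // codes WORD / NE annotations
    assert(ts->getCodes().size() == ts->getTokens().size());  // index-aligned results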