new responsibilities of tokenized sentence
This commit is contained in:
parent
9b1735516c
commit
724bf0d080
@ -8,7 +8,6 @@ endforeach(dir)
|
||||
add_library(concordia SHARED
|
||||
token_annotation.cpp
|
||||
tokenized_sentence.cpp
|
||||
hashed_sentence.cpp
|
||||
concordia_search_result.cpp
|
||||
matched_pattern_fragment.cpp
|
||||
concordia_searcher.cpp
|
||||
@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
|
||||
install(FILES
|
||||
token_annotation.hpp
|
||||
tokenized_sentence.hpp
|
||||
hashed_sentence.hpp
|
||||
concordia_search_result.hpp
|
||||
matched_pattern_fragment.hpp
|
||||
concordia_searcher.hpp
|
||||
|
@ -1,10 +1,12 @@
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <boost/archive/binary_oarchive.hpp>
|
||||
#include <boost/archive/binary_iarchive.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
#include <fstream>
|
||||
|
||||
@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
|
||||
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
const std::string & sentence) throw(ConcordiaException) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> result;
|
||||
std::vector<std::string> tokenTexts = generateTokenVector(sentence);
|
||||
if (tokenTexts.size() > Utils::maxSentenceSize) {
|
||||
|
||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||
ts->generateHash(_wordMap);
|
||||
|
||||
if (ts->getTokens().size() > Utils::maxSentenceSize) {
|
||||
throw ConcordiaException("Trying to add too long sentence.");
|
||||
}
|
||||
for (std::vector<std::string>::iterator it = tokenTexts.begin();
|
||||
it != tokenTexts.end(); ++it) {
|
||||
std::string token = *it;
|
||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||
result.push_back(code);
|
||||
}
|
||||
|
||||
return result;
|
||||
return ts->getCodes();
|
||||
}
|
||||
|
||||
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||
const std::string & sentence) {
|
||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||
std::string tokenizedSentence = ts->getSentence();
|
||||
boost::trim(tokenizedSentence);
|
||||
std::vector<std::string> tokenTexts;
|
||||
boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
|
||||
boost::algorithm::token_compress_on);
|
||||
BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
|
||||
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||
annotation.getType() == TokenAnnotation::NE) {
|
||||
tokenTexts.push_back(annotation.getValue());
|
||||
}
|
||||
}
|
||||
return tokenTexts;
|
||||
}
|
||||
|
||||
|
@ -51,7 +51,7 @@ public:
|
||||
Generates vector of tokens from a sentence. This method is internally
|
||||
used by generateHash. However, for the sake of concordiaSearch
|
||||
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
||||
anonymizing and tokenization is also needed.
|
||||
tokenization is also needed.
|
||||
\param sentence sentence to tokenize
|
||||
\returns vector of tokens
|
||||
*/
|
||||
|
@ -1,7 +0,0 @@
|
||||
#include "concordia/hashed_sentence.hpp"
|
||||
|
||||
HashedSentence::HashedSentence() {
|
||||
}
|
||||
|
||||
HashedSentence::~HashedSentence() {
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
#ifndef HASHED_SENTENCE_HDR
|
||||
#define HASHED_SENTENCE_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
A sentence after hashing by the HashGenerator. The class holds
|
||||
the list of word codes and intervals representing original
|
||||
word positions in the sentence (char-based).
|
||||
*/
|
||||
|
||||
class HashedSentence {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
HashedSentence();
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~HashedSentence();
|
||||
|
||||
/*! Getter for original word positions list.
|
||||
\returns original word positions list
|
||||
*/
|
||||
std::vector<Interval> getOriginalWordPositions() const {
|
||||
return _originalWordPositions;
|
||||
}
|
||||
|
||||
/*! Getter for word codes list.
|
||||
\returns word codes list
|
||||
*/
|
||||
std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
|
||||
return _wordCodes;
|
||||
}
|
||||
|
||||
/*! Method for adding a word code to the list
|
||||
\param wordCode word code to be added
|
||||
*/
|
||||
void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
|
||||
_wordCodes.push_back(wordCode);
|
||||
}
|
||||
|
||||
/*! Method for adding an original word position to the list.
|
||||
\param originalWordPosition original word position to be added
|
||||
*/
|
||||
void addOriginalWordPosition(Interval & originalWordPosition) {
|
||||
_originalWordPositions.push_back(originalWordPosition);
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<Interval> _originalWordPositions;
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
|
||||
};
|
||||
|
||||
#endif
|
@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
||||
/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
||||
Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
|
||||
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||
{
|
||||
|
@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
|
||||
void TokenizedSentence::toLowerCase() {
|
||||
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
||||
}
|
||||
|
||||
/*!
Fills the hashed representation of this sentence.
Iterates over _tokenAnnotations in order; for every annotation whose
type is WORD or NE it appends the code returned by
wordMap->getWordCode() for the annotation value to _codes and the
annotation itself to _tokens. Annotations of any other type are skipped.
Nothing is cleared beforehand, so calling this method more than once
appends the same entries again.
\param wordMap word map queried for the code of each token value
*/
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
||||
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
|
||||
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||
annotation.getType() == TokenAnnotation::NE) {
|
||||
// Each accepted annotation adds exactly one entry to both vectors,
// so _codes and _tokens grow in lockstep.
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
|
||||
_tokens.push_back(annotation);
|
||||
}
|
||||
}
|
||||
|
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,9 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
#include "concordia/word_map.hpp"
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
@ -39,6 +42,16 @@ public:
|
||||
return _tokenAnnotations;
|
||||
}
|
||||
|
||||
/*! Getter for the word codes list.
\returns copy of the codes vector (one code is appended per WORD/NE
annotation by generateHash)
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
||||
return _codes;
|
||||
}
|
||||
|
||||
/*! Getter for the tokens list.
\returns copy of the vector of annotations that generateHash accepted
(those of type WORD or NE)
*/
std::vector<TokenAnnotation> getTokens() const {
|
||||
return _tokens;
|
||||
}
|
||||
|
||||
/*!
Generates the word codes and the token list for this sentence from
its WORD and NE annotations. The results are available through
getCodes() and getTokens().
\param wordMap word map used to obtain the code of each token value
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||
|
||||
/*!
|
||||
Transform the sentence to lower case.
|
||||
*/
|
||||
@ -59,6 +72,10 @@ private:
|
||||
std::string _sentence;
|
||||
|
||||
std::list<TokenAnnotation> _tokenAnnotations;
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
||||
|
||||
std::vector<TokenAnnotation> _tokens;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user