new responsibilities of tokenized sentence
This commit is contained in:
parent
9b1735516c
commit
724bf0d080
@ -8,7 +8,6 @@ endforeach(dir)
|
|||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
token_annotation.cpp
|
token_annotation.cpp
|
||||||
tokenized_sentence.cpp
|
tokenized_sentence.cpp
|
||||||
hashed_sentence.cpp
|
|
||||||
concordia_search_result.cpp
|
concordia_search_result.cpp
|
||||||
matched_pattern_fragment.cpp
|
matched_pattern_fragment.cpp
|
||||||
concordia_searcher.cpp
|
concordia_searcher.cpp
|
||||||
@ -38,7 +37,6 @@ install(TARGETS concordia DESTINATION lib/)
|
|||||||
install(FILES
|
install(FILES
|
||||||
token_annotation.hpp
|
token_annotation.hpp
|
||||||
tokenized_sentence.hpp
|
tokenized_sentence.hpp
|
||||||
hashed_sentence.hpp
|
|
||||||
concordia_search_result.hpp
|
concordia_search_result.hpp
|
||||||
matched_pattern_fragment.hpp
|
matched_pattern_fragment.hpp
|
||||||
concordia_searcher.hpp
|
concordia_searcher.hpp
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
|
#include "concordia/token_annotation.hpp"
|
||||||
|
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/archive/binary_oarchive.hpp>
|
#include <boost/archive/binary_oarchive.hpp>
|
||||||
#include <boost/archive/binary_iarchive.hpp>
|
#include <boost/archive/binary_iarchive.hpp>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
@ -28,28 +30,27 @@ HashGenerator::~HashGenerator() {
|
|||||||
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||||
const std::string & sentence) throw(ConcordiaException) {
|
const std::string & sentence) throw(ConcordiaException) {
|
||||||
std::vector<INDEX_CHARACTER_TYPE> result;
|
std::vector<INDEX_CHARACTER_TYPE> result;
|
||||||
std::vector<std::string> tokenTexts = generateTokenVector(sentence);
|
|
||||||
if (tokenTexts.size() > Utils::maxSentenceSize) {
|
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||||
|
ts->generateHash(_wordMap);
|
||||||
|
|
||||||
|
if (ts->getTokens().size() > Utils::maxSentenceSize) {
|
||||||
throw ConcordiaException("Trying to add too long sentence.");
|
throw ConcordiaException("Trying to add too long sentence.");
|
||||||
}
|
}
|
||||||
for (std::vector<std::string>::iterator it = tokenTexts.begin();
|
|
||||||
it != tokenTexts.end(); ++it) {
|
|
||||||
std::string token = *it;
|
|
||||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
|
||||||
result.push_back(code);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
return ts->getCodes();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> HashGenerator::generateTokenVector(
|
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||||
const std::string & sentence) {
|
const std::string & sentence) {
|
||||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||||
std::string tokenizedSentence = ts->getSentence();
|
|
||||||
boost::trim(tokenizedSentence);
|
|
||||||
std::vector<std::string> tokenTexts;
|
std::vector<std::string> tokenTexts;
|
||||||
boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
|
BOOST_FOREACH(TokenAnnotation annotation, ts->getAnnotations()) {
|
||||||
boost::algorithm::token_compress_on);
|
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||||
|
annotation.getType() == TokenAnnotation::NE) {
|
||||||
|
tokenTexts.push_back(annotation.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
return tokenTexts;
|
return tokenTexts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@ public:
|
|||||||
Generates vector of tokens from a sentence. This method is internally
|
Generates vector of tokens from a sentence. This method is internally
|
||||||
used by generateHash. However, for the sake of concordiaSearch
|
used by generateHash. However, for the sake of concordiaSearch
|
||||||
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
||||||
anonymizing and tokenization is also needed.
|
tokenization is also needed.
|
||||||
\param sentence sentence to tokenize
|
\param sentence sentence to tokenize
|
||||||
\returns vector of tokens
|
\returns vector of tokens
|
||||||
*/
|
*/
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
#include "concordia/hashed_sentence.hpp"
|
|
||||||
|
|
||||||
HashedSentence::HashedSentence() {
|
|
||||||
}
|
|
||||||
|
|
||||||
HashedSentence::~HashedSentence() {
|
|
||||||
}
|
|
@ -1,61 +0,0 @@
|
|||||||
#ifndef HASHED_SENTENCE_HDR
|
|
||||||
#define HASHED_SENTENCE_HDR
|
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
|
||||||
#include "concordia/interval.hpp"
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
/*!
|
|
||||||
A sentence after hashing by the HashGenerator. The class holds
|
|
||||||
the list of word codes and intervals representing original
|
|
||||||
word positions in the sentence (char-based).
|
|
||||||
*/
|
|
||||||
|
|
||||||
class HashedSentence {
|
|
||||||
public:
|
|
||||||
/*!
|
|
||||||
Constructor.
|
|
||||||
|
|
||||||
*/
|
|
||||||
HashedSentence();
|
|
||||||
|
|
||||||
/*! Destructor.
|
|
||||||
*/
|
|
||||||
virtual ~HashedSentence();
|
|
||||||
|
|
||||||
/*! Getter for original word positions list.
|
|
||||||
\returns original word positions list
|
|
||||||
*/
|
|
||||||
std::vector<Interval> getOriginalWordPositions() const {
|
|
||||||
return _originalWordPositions;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Getter for word codes list.
|
|
||||||
\returns word codes list
|
|
||||||
*/
|
|
||||||
std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
|
|
||||||
return _wordCodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Method for adding a word code to the list
|
|
||||||
\param word code to be added
|
|
||||||
*/
|
|
||||||
void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
|
|
||||||
_wordCodes.push_back(wordCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Method for adding an original word position to the list.
|
|
||||||
\param original word position
|
|
||||||
*/
|
|
||||||
void addOriginalWordPosition(Interval & originalWordPosition) {
|
|
||||||
_originalWordPositions.push_back(originalWordPosition);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::vector<Interval> _originalWordPositions;
|
|
||||||
|
|
||||||
std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
||||||
Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
|
Or don't run it at all, whatever! I don't care! There is still the test for max sentence size in test_utils.cpp
|
||||||
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||||
{
|
{
|
||||||
|
@ -47,3 +47,15 @@ void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations)
|
|||||||
void TokenizedSentence::toLowerCase() {
|
void TokenizedSentence::toLowerCase() {
|
||||||
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
||||||
|
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
|
||||||
|
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||||
|
annotation.getType() == TokenAnnotation::NE) {
|
||||||
|
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
|
||||||
|
_tokens.push_back(annotation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -3,6 +3,9 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/token_annotation.hpp"
|
#include "concordia/token_annotation.hpp"
|
||||||
|
#include "concordia/word_map.hpp"
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -39,6 +42,16 @@ public:
|
|||||||
return _tokenAnnotations;
|
return _tokenAnnotations;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
||||||
|
return _codes;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<TokenAnnotation> getTokens() const {
|
||||||
|
return _tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Transform the sentence to lower case.
|
Transform the sentence to lower case.
|
||||||
*/
|
*/
|
||||||
@ -59,6 +72,10 @@ private:
|
|||||||
std::string _sentence;
|
std::string _sentence;
|
||||||
|
|
||||||
std::list<TokenAnnotation> _tokenAnnotations;
|
std::list<TokenAnnotation> _tokenAnnotations;
|
||||||
|
|
||||||
|
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
||||||
|
|
||||||
|
std::vector<TokenAnnotation> _tokens;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user