full search stub - tests needed

This commit is contained in:
rjawor 2019-01-09 15:30:56 +01:00
parent 53b100b2e4
commit 5a7cbbe9e9
8 changed files with 186 additions and 5 deletions

View File

@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
#define CONCORDIA_SEARCH_MAX_RESULTS 5
#define CONCORDIA_SEARCH_MAX_RESULTS 3
#define WORD_MAP_FILE_NAME "word_map.bin"
#define MARKERS_FILE_NAME "markers.bin"

View File

@ -224,6 +224,23 @@ MatchedPatternFragment Concordia::simpleSearch(
}
}
// Full search entry point: delegates to the index searcher, passing along
// the hash generator, hashed index, markers and suffix array held by
// this object.
OccurencesList Concordia::fullSearch(
    const std::string & pattern,
    SUFFIX_MARKER_TYPE limit,
    SUFFIX_MARKER_TYPE offset,
    bool byWhitespace)
    throw(ConcordiaException) {
    // With an empty index or an empty pattern there is nothing to look up:
    // hand back an empty list whose total count is zero.
    const bool indexIsEmpty = (_T->size() == 0);
    if (indexIsEmpty || pattern.empty()) {
        return OccurencesList(0);
    }

    return _searcher->fullSearch(_hashGenerator, _T, _markers, _SA,
                                 pattern, limit, offset, byWhitespace);
}
MatchedPatternFragment Concordia::lexiconSearch(
const std::string & pattern,
bool byWhitespace)

View File

@ -9,6 +9,7 @@
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
@ -134,6 +135,23 @@ public:
bool byWhitespace = false)
throw(ConcordiaException);
/*! Performs a substring lookup in RAM-based index, returning all occurences.
The result contains no more than "limit" occurences, starting at "offset".
If the index or the pattern is empty, an empty list with
a total count of 0 is returned.
\param pattern string pattern to be searched in the index.
\param limit maximum number of occurences to return
\param offset starting occurence
\param byWhitespace should the pattern be tokenized by white space
\returns list of occurences of the pattern in the index
\throws ConcordiaException
*/
OccurencesList fullSearch(
const std::string & pattern,
SUFFIX_MARKER_TYPE limit,
SUFFIX_MARKER_TYPE offset,
bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires

View File

@ -54,6 +54,47 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
return result;
}
/*! Looks the pattern up in the suffix array and returns one "page" of
    its occurences.

    The page is taken from the raw suffix-array matches: it starts at match
    number "offset" and spans at most "limit" matches, never reaching past
    the last match. Matches that do not start on an INDEX_CHARACTER_TYPE
    boundary are dropped, so the returned list may hold fewer than "limit"
    entries. The total count stored in the result is the raw number of
    matches reported by sa_search.

    \param hashGenerator hash generator used to convert the pattern to a hash
    \param T hashed index to search in
    \param markers markers array, indexed by character position in the index
    \param SA suffix array built over T
    \param pattern string pattern to be searched in the index
    \param limit maximum number of suffix-array matches to examine
    \param offset number of leading suffix-array matches to skip
    \param byWhitespace should the pattern be tokenized by white space
    \returns list of occurences found in the requested page
    \throws ConcordiaException
*/
OccurencesList IndexSearcher::fullSearch(
    boost::shared_ptr<HashGenerator> hashGenerator,
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    const std::string & pattern,
    SUFFIX_MARKER_TYPE limit,
    SUFFIX_MARKER_TYPE offset,
    bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    int size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray, patternLength,
                         SA->data(), (saidx_t) SA->size(), &left);

    // The pattern buffer is only read by sa_search. Releasing it right away
    // keeps it from leaking should any of the at() calls below throw.
    delete[] patternArray;

    // sa_search signals failure with a negative value; treat that as
    // "no matches" instead of converting it to a huge unsigned count.
    if (size < 0) {
        size = 0;
    }

    OccurencesList result(size);

    // Matches occupy SA[left .. left + size - 1]. Clamp the requested
    // window [offset, offset + limit) to that range so that we never read
    // suffix-array entries which do not match the pattern.
    const SUFFIX_MARKER_TYPE total = static_cast<SUFFIX_MARKER_TYPE>(size);
    if (offset < total) {
        const SUFFIX_MARKER_TYPE available = total - offset;
        const SUFFIX_MARKER_TYPE count =
            (limit < available) ? limit : available;

        for (SUFFIX_MARKER_TYPE i = 0; i < count; ++i) {
            saidx_t resultPos = SA->at(left + offset + i);
            if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
                // As we are looking for a pattern in an array of higher
                // resolution than the hashed index file, we might
                // obtain accidental results exceeding the boundaries
                // of characters in hashed index. The above check
                // removes these accidental results.
                saidx_t actualResultPos =
                    resultPos / sizeof(INDEX_CHARACTER_TYPE);
                SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

                SubstringOccurence occurence;
                occurence.enterDataFromMarker(marker);
                result.addOccurence(occurence);
            }
        }
    }

    return result;
}
MatchedPatternFragment IndexSearcher::lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -8,6 +8,7 @@
#include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/concordia_searcher.hpp"
@ -53,6 +54,30 @@ public:
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a substring lookup in RAM-based index, returning all occurences.
The result contains no more than "limit" occurences, starting at "offset".
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param limit maximum number of occurences to return
\param offset starting occurence
\param byWhitespace should the pattern be tokenized by white space
\returns list of occurences of the pattern in the index
\throws ConcordiaException
*/
OccurencesList fullSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
SUFFIX_MARKER_TYPE limit,
SUFFIX_MARKER_TYPE offset,
bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires

View File

@ -0,0 +1,13 @@
#include "concordia/occurences_list.hpp"
// Remembers the total number of occurences; the list itself starts empty.
OccurencesList::OccurencesList(const SUFFIX_MARKER_TYPE & totalCount)
    : _occurences(),
      _totalCount(totalCount) {
}

// Nothing to release explicitly: the members clean up after themselves.
OccurencesList::~OccurencesList() {
}

// Appends a copy of the given occurence at the end of the list.
void OccurencesList::addOccurence(const SubstringOccurence & occurence) {
    _occurences.insert(_occurences.end(), occurence);
}

View File

@ -0,0 +1,50 @@
#ifndef OCCURENCES_LIST_HDR
#define OCCURENCES_LIST_HDR
#include "concordia/common/config.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*!
  Class representing the occurences list in full search. The list only
  contains as many occurences as specified in the "limit" parameter
  for full search. The "totalCount" field stores the total number
  of occurences available.
*/
class OccurencesList {
public:
    /*! Constructor.
      \param totalCount total number of occurences available
    */
    explicit OccurencesList(const SUFFIX_MARKER_TYPE & totalCount);

    /*! Destructor.
    */
    virtual ~OccurencesList();

    /*! Getter for occurences.
      \returns copy of the occurences stored in this list
    */
    std::vector<SubstringOccurence> getOccurences() const {
        return _occurences;
    }

    /*! Getter for total count. Declared const so that it can also
        be called on a const list.
      \returns total number of occurences, as passed to the constructor
    */
    SUFFIX_MARKER_TYPE getTotalCount() const {
        return _totalCount;
    }

    /*! Adds an occurence to the list.
      \param occurence occurence to be added
    */
    void addOccurence(const SubstringOccurence & occurence);

private:
    std::vector<SubstringOccurence> _occurences;

    SUFFIX_MARKER_TYPE _totalCount;
};
#endif

View File

@ -171,6 +171,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
}
BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
{
    // First instance: feeds the examples into the index.
    Concordia indexingConcordia(
        TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    std::vector<Example> examples;
    examples.push_back(Example("xto xjest okno",1));
    examples.push_back(Example("czy xjest okno otwarte",2));
    examples.push_back(Example("chyba xto okno xjest xtutaj",3));
    examples.push_back(Example("xto okno xjest okno",4));
    std::vector<TokenizedSentence> tokenizedExamples =
        indexingConcordia.addAllExamples(examples);

    // Second instance, created with the same temp path and config,
    // performs the search and clears the index afterwards.
    Concordia searchingConcordia(
        TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
    OccurencesList okno = searchingConcordia.fullSearch("okno", 2, 1);
    searchingConcordia.clearIndex();

    // NOTE(review): the word "okno" appears 5 times in the examples above,
    // yet a total of 10 is expected - presumably the count covers raw
    // suffix-array matches rather than word occurences; confirm.
    BOOST_CHECK_EQUAL(okno.getTotalCount(), 10);
}
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
@ -328,8 +349,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
@ -339,8 +358,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);