full search stub - tests needed
This commit is contained in:
parent
53b100b2e4
commit
5a7cbbe9e9
@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||
|
||||
#define CONCORDIA_SEARCH_MAX_RESULTS 5
|
||||
#define CONCORDIA_SEARCH_MAX_RESULTS 3
|
||||
|
||||
#define WORD_MAP_FILE_NAME "word_map.bin"
|
||||
#define MARKERS_FILE_NAME "markers.bin"
|
||||
|
@ -224,6 +224,23 @@ MatchedPatternFragment Concordia::simpleSearch(
|
||||
}
|
||||
}
|
||||
|
||||
// Full search entry point: forwards the query to the index searcher together
// with this object's hash generator, hashed index, markers and suffix array.
OccurencesList Concordia::fullSearch(
                             const std::string & pattern,
                             SUFFIX_MARKER_TYPE limit,
                             SUFFIX_MARKER_TYPE offset,
                             bool byWhitespace)
                             throw(ConcordiaException) {
    // If the index or search pattern are empty, return an empty result.
    if (_T->size() == 0 || pattern.empty()) {
        return OccurencesList(0);
    }

    return _searcher->fullSearch(_hashGenerator, _T, _markers, _SA,
                                 pattern, limit, offset, byWhitespace);
}
|
||||
|
||||
|
||||
MatchedPatternFragment Concordia::lexiconSearch(
|
||||
const std::string & pattern,
|
||||
bool byWhitespace)
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/example.hpp"
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
#include "concordia/occurences_list.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/index_searcher.hpp"
|
||||
@ -134,6 +135,23 @@ public:
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Performs a substring lookup in RAM-based index, returning all occurences.
    The result contains no more than "limit" occurences, starting at "offset".
    If the index or the pattern is empty, an empty list is returned.
    \param pattern string pattern to be searched in the index.
    \param limit maximum number of occurences to return
    \param offset starting occurence
    \param byWhitespace should the pattern be tokenized by white space
    \returns list of occurences of the pattern in the index
    \throws ConcordiaException
*/
|
||||
OccurencesList fullSearch(
|
||||
const std::string & pattern,
|
||||
SUFFIX_MARKER_TYPE limit,
|
||||
SUFFIX_MARKER_TYPE offset,
|
||||
bool byWhitespace = false) throw(ConcordiaException);
|
||||
|
||||
/*! Performs a search useful for lexicons in the following scenario:
|
||||
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||
The lexicon search performs as simple search - it requires
|
||||
|
@ -54,6 +54,47 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
|
||||
return result;
|
||||
}
|
||||
|
||||
// Looks the pattern up in the suffix array and returns one "page" of its
// occurences.
//
// hashGenerator - converts the pattern into index character codes
// T             - hashed index to search in
// markers       - marker for every character position of the hashed index
// SA            - suffix array built over T
// pattern       - string pattern to be searched in the index
// limit         - maximum number of suffix array matches to examine
// offset        - number of leading suffix array matches to skip
// byWhitespace  - passed on to the hash generator
//
// Returns a list whose total count is the number of matches reported by
// sa_search and whose occurences come from the matches in the window
// [offset, offset + limit), clamped to the matches actually found.
//
// NOTE(review): the total count and the window are both expressed in raw
// suffix array matches, i.e. before the character alignment filter below,
// so a page may hold fewer than "limit" occurences.
OccurencesList IndexSearcher::fullSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        SUFFIX_MARKER_TYPE limit,
        SUFFIX_MARKER_TYPE offset,
        bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    int size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray, patternLength,
                         SA->data(), (saidx_t) SA->size(), &left);

    // The pattern buffer is not used after the search. Releasing it here
    // keeps it from leaking if one of the at() calls below throws.
    delete[] patternArray;

    // Do not let a negative result turn into a bogus total count.
    // NOTE(review): libdivsufsort documents -1 as the error return - confirm.
    if (size < 0) {
        size = 0;
    }

    const SUFFIX_MARKER_TYPE total = (SUFFIX_MARKER_TYPE) size;
    OccurencesList result(total);

    if (offset < total) {
        // End of the requested window. Comparing against "total - offset"
        // avoids overflowing "offset + limit" and never leaves the range
        // of suffix array entries that really match the pattern.
        SUFFIX_MARKER_TYPE end = total;
        if (limit < total - offset) {
            end = offset + limit;
        }

        for (SUFFIX_MARKER_TYPE i = offset; i < end; ++i) {
            saidx_t resultPos = SA->at(left + i);
            if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
                // As we are looking for a pattern in an array of higher
                // resolution than the hashed index file, we might
                // obtain accidental results exceeding the boundaries
                // of characters in hashed index. The above check
                // removes these accidental results.
                saidx_t actualResultPos =
                    resultPos / sizeof(INDEX_CHARACTER_TYPE);
                SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

                SubstringOccurence occurence;
                occurence.enterDataFromMarker(marker);
                result.addOccurence(occurence);
            }
        }
    }

    return result;
}
|
||||
|
||||
MatchedPatternFragment IndexSearcher::lexiconSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
#include "concordia/occurences_list.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/concordia_searcher.hpp"
|
||||
@ -53,6 +54,30 @@ public:
|
||||
const std::string & pattern,
|
||||
bool byWhitespace = false) throw(ConcordiaException);
|
||||
|
||||
/*! Performs a substring lookup in RAM-based index, returning all occurences.
    The result contains no more than "limit" occurences, starting at "offset".
    \param hashGenerator hash generator to be used to convert
                         input sentence to a hash
    \param T hashed index to search in
    \param markers markers array for the needs of searching
    \param SA suffix array for the needs of searching
    \param pattern string pattern to be searched in the index.
    \param limit maximum number of occurences to return
    \param offset starting occurence
    \param byWhitespace should the pattern be tokenized by white space
    \returns list of occurences of the pattern in the index
    \throws ConcordiaException
*/
|
||||
OccurencesList fullSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern,
|
||||
SUFFIX_MARKER_TYPE limit,
|
||||
SUFFIX_MARKER_TYPE offset,
|
||||
bool byWhitespace = false) throw(ConcordiaException);
|
||||
|
||||
/*! Performs a search useful for lexicons in the following scenario:
|
||||
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||
The lexicon search performs as simple search - it requires
|
||||
|
13
concordia/occurences_list.cpp
Normal file
13
concordia/occurences_list.cpp
Normal file
@ -0,0 +1,13 @@
|
||||
#include "concordia/occurences_list.hpp"
|
||||
|
||||
// Builds a list with no occurences that remembers the given total count.
OccurencesList::OccurencesList(const SUFFIX_MARKER_TYPE & totalCount)
    : _occurences(),
      _totalCount(totalCount) {
}
|
||||
|
||||
// Empty on purpose: there is nothing to release by hand.
OccurencesList::~OccurencesList() {}
|
||||
|
||||
void OccurencesList::addOccurence(
|
||||
const SubstringOccurence & occurence) {
|
||||
_occurences.push_back(occurence);
|
||||
}
|
50
concordia/occurences_list.hpp
Normal file
50
concordia/occurences_list.hpp
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef OCCURENCES_LIST_HDR
|
||||
#define OCCURENCES_LIST_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
/*!
|
||||
Class representing the occurences list in full search. The list only
|
||||
contains as many occurences as specified in the "limit" parameter for full search.
|
||||
The "totalCount" field stores the total number of occurences available.
|
||||
|
||||
*/
|
||||
|
||||
class OccurencesList {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit OccurencesList(const SUFFIX_MARKER_TYPE & totalCount);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~OccurencesList();
|
||||
|
||||
/*! Getter for occurences.
|
||||
\returns occurences
|
||||
*/
|
||||
std::vector<SubstringOccurence> getOccurences() const {
|
||||
return _occurences;
|
||||
}
|
||||
|
||||
SUFFIX_MARKER_TYPE getTotalCount() {
|
||||
return _totalCount;
|
||||
}
|
||||
|
||||
/*! Adds an occurence to the list.
|
||||
\param fragment occurence to be added
|
||||
*/
|
||||
void addOccurence(const SubstringOccurence & occurence);
|
||||
|
||||
|
||||
private:
|
||||
std::vector<SubstringOccurence> _occurences;
|
||||
|
||||
SUFFIX_MARKER_TYPE _totalCount;
|
||||
};
|
||||
|
||||
#endif
|
@ -171,6 +171,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||
}
|
||||
|
||||
// Adds four example sentences through one Concordia object, then runs a
// full search for "okno" through a second object created from the same
// temp path and config file, and checks the reported total count.
BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
{
    // First object: feeds the examples.
    Concordia indexer = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    std::vector<Example> sentences;
    sentences.push_back(Example("xto xjest okno", 1));
    sentences.push_back(Example("czy xjest okno otwarte", 2));
    sentences.push_back(Example("chyba xto okno xjest xtutaj", 3));
    sentences.push_back(Example("xto okno xjest okno", 4));

    std::vector<TokenizedSentence> tokenized =
        indexer.addAllExamples(sentences);

    // Second object: presumably picks up what the first one stored.
    Concordia searcher = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    // limit = 2, offset = 1
    OccurencesList found = searcher.fullSearch("okno", 2, 1);

    // Clean up before asserting, as in the other search tests.
    searcher.clearIndex();

    BOOST_CHECK_EQUAL(found.getTotalCount(), 10);
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||
@ -328,8 +349,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
|
||||
@ -339,8 +358,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
|
||||
|
Loading…
Reference in New Issue
Block a user