full search stub - tests needed
This commit is contained in:
parent
53b100b2e4
commit
5a7cbbe9e9
@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
|||||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||||
|
|
||||||
#define CONCORDIA_SEARCH_MAX_RESULTS 5
|
#define CONCORDIA_SEARCH_MAX_RESULTS 3
|
||||||
|
|
||||||
#define WORD_MAP_FILE_NAME "word_map.bin"
|
#define WORD_MAP_FILE_NAME "word_map.bin"
|
||||||
#define MARKERS_FILE_NAME "markers.bin"
|
#define MARKERS_FILE_NAME "markers.bin"
|
||||||
|
@ -224,6 +224,23 @@ MatchedPatternFragment Concordia::simpleSearch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Full search entry point. Forwards the query to the index searcher
    together with the hash generator, hashed index, markers and suffix
    array held by this object. When the index or the pattern is empty,
    an occurrence list with a total count of 0 is returned instead.
*/
OccurencesList Concordia::fullSearch(
                             const std::string & pattern,
                             SUFFIX_MARKER_TYPE limit,
                             SUFFIX_MARKER_TYPE offset,
                             bool byWhitespace)
                             throw(ConcordiaException) {
    // Guard clause: nothing can match in an empty index or for an
    // empty pattern, so skip the searcher altogether.
    if (_T->size() == 0 || pattern.size() == 0) {
        return OccurencesList(0);
    }

    return _searcher->fullSearch(_hashGenerator, _T, _markers, _SA,
                                 pattern, limit, offset, byWhitespace);
}
|
||||||
|
|
||||||
|
|
||||||
MatchedPatternFragment Concordia::lexiconSearch(
|
MatchedPatternFragment Concordia::lexiconSearch(
|
||||||
const std::string & pattern,
|
const std::string & pattern,
|
||||||
bool byWhitespace)
|
bool byWhitespace)
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
#include "concordia/matched_pattern_fragment.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
#include "concordia/occurences_list.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
@ -134,6 +135,23 @@ public:
|
|||||||
bool byWhitespace = false)
|
bool byWhitespace = false)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Performs a substring lookup in RAM-based index, returning all occurences.
|
||||||
|
The result contains no more than "limit" occurences, starting at "offset".
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param pattern string pattern to be searched in the index.
|
||||||
|
\param limit maximum number of occurences to return
|
||||||
|
\param offset starting occurence
|
||||||
|
\param byWhitespace should the pattern by tokenized by white space
|
||||||
|
\returns list of occurences of the pattern in the index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
OccurencesList fullSearch(
|
||||||
|
const std::string & pattern,
|
||||||
|
SUFFIX_MARKER_TYPE limit,
|
||||||
|
SUFFIX_MARKER_TYPE offset,
|
||||||
|
bool byWhitespace = false) throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Performs a search useful for lexicons in the following scenario:
|
/*! Performs a search useful for lexicons in the following scenario:
|
||||||
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||||
The lexicon search performs as simple search - it requires
|
The lexicon search performs as simple search - it requires
|
||||||
|
@ -54,6 +54,47 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Looks up a pattern in the RAM-based index and returns one page of
    its occurrences.

    The pattern is hashed, converted to a byte array and searched for in
    the suffix array with sa_search. The total count stored in the result
    is the number of hits reported by sa_search. The returned page covers
    the hits [offset, offset + limit), clipped to the number of hits.
    Hits that do not start on an INDEX_CHARACTER_TYPE boundary are
    dropped, so the page may hold fewer than "limit" occurrences.

    NOTE(review): the total count is the raw sa_search hit count and
    therefore may include such misaligned hits - confirm whether callers
    expect only aligned occurrences to be counted.

  \param hashGenerator hash generator used to convert the pattern to a hash
  \param T hashed index to search in
  \param markers markers array, indexed by position in the hashed index
  \param SA suffix array of T
  \param pattern string pattern to be searched in the index
  \param limit maximum number of occurrences to return
  \param offset index of the first hit to return
  \param byWhitespace should the pattern be tokenized by white space
  \returns list of occurrences of the pattern in the index
  \throws ConcordiaException
*/
OccurencesList IndexSearcher::fullSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        SUFFIX_MARKER_TYPE limit,
        SUFFIX_MARKER_TYPE offset,
        bool byWhitespace) throw(ConcordiaException) {
    int left = 0;
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    int size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray, patternLength,
                         SA->data(), (saidx_t) SA->size(), &left);

    // The byte pattern is only needed by sa_search. Releasing it here
    // means it cannot leak if one of the at() calls below throws.
    delete[] patternArray;

    // A negative value signals an sa_search failure; treat it as
    // "no hits" instead of converting it into a huge total count.
    const SUFFIX_MARKER_TYPE totalCount =
        static_cast<SUFFIX_MARKER_TYPE>(size > 0 ? size : 0);

    OccurencesList result(totalCount);
    if (offset >= totalCount) {
        // The requested page starts past the last hit.
        return result;
    }

    // Clip the page to the hits that are actually available. Computing
    // the page size this way avoids overflowing offset + limit.
    const SUFFIX_MARKER_TYPE available = totalCount - offset;
    const SUFFIX_MARKER_TYPE pageSize = limit < available ? limit : available;

    for (SUFFIX_MARKER_TYPE i = 0; i < pageSize; ++i) {
        const saidx_t resultPos = SA->at(left + offset + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. The above check
            // removes these accidental results.
            const saidx_t actualResultPos =
                resultPos / sizeof(INDEX_CHARACTER_TYPE);
            const SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

            SubstringOccurence occurence;
            occurence.enterDataFromMarker(marker);
            result.addOccurence(occurence);
        }
    }

    return result;
}
|
||||||
|
|
||||||
MatchedPatternFragment IndexSearcher::lexiconSearch(
|
MatchedPatternFragment IndexSearcher::lexiconSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/matched_pattern_fragment.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
#include "concordia/occurences_list.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/concordia_searcher.hpp"
|
#include "concordia/concordia_searcher.hpp"
|
||||||
@ -53,6 +54,30 @@ public:
|
|||||||
const std::string & pattern,
|
const std::string & pattern,
|
||||||
bool byWhitespace = false) throw(ConcordiaException);
|
bool byWhitespace = false) throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Performs a substring lookup in RAM-based index, returning all occurences.
|
||||||
|
The result contains no more than "limit" occurences, starting at "offset".
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern string pattern to be searched in the index.
|
||||||
|
\param limit maximum number of occurences to return
|
||||||
|
\param offset starting occurence
|
||||||
|
\param byWhitespace should the pattern by tokenized by white space
|
||||||
|
\returns list of occurences of the pattern in the index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
OccurencesList fullSearch(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
const std::string & pattern,
|
||||||
|
SUFFIX_MARKER_TYPE limit,
|
||||||
|
SUFFIX_MARKER_TYPE offset,
|
||||||
|
bool byWhitespace = false) throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Performs a search useful for lexicons in the following scenario:
|
/*! Performs a search useful for lexicons in the following scenario:
|
||||||
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||||
The lexicon search performs as simple search - it requires
|
The lexicon search performs as simple search - it requires
|
||||||
|
13
concordia/occurences_list.cpp
Normal file
13
concordia/occurences_list.cpp
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#include "concordia/occurences_list.hpp"
|
||||||
|
|
||||||
|
OccurencesList::OccurencesList(const SUFFIX_MARKER_TYPE & totalCount):
|
||||||
|
_totalCount(totalCount) {
|
||||||
|
}
|
||||||
|
|
||||||
|
OccurencesList::~OccurencesList() {
|
||||||
|
}
|
||||||
|
|
||||||
|
void OccurencesList::addOccurence(
|
||||||
|
const SubstringOccurence & occurence) {
|
||||||
|
_occurences.push_back(occurence);
|
||||||
|
}
|
50
concordia/occurences_list.hpp
Normal file
50
concordia/occurences_list.hpp
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef OCCURENCES_LIST_HDR
|
||||||
|
#define OCCURENCES_LIST_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class representing the occurences list in full search. The list only
|
||||||
|
contains as many occurences as specified in the "limit" parameter for full search.
|
||||||
|
The "totalCount" field stores the total number of occurences available.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class OccurencesList {
|
||||||
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
*/
|
||||||
|
explicit OccurencesList(const SUFFIX_MARKER_TYPE & totalCount);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~OccurencesList();
|
||||||
|
|
||||||
|
/*! Getter for occurences.
|
||||||
|
\returns occurences
|
||||||
|
*/
|
||||||
|
std::vector<SubstringOccurence> getOccurences() const {
|
||||||
|
return _occurences;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getTotalCount() {
|
||||||
|
return _totalCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*! Adds an occurence to the list.
|
||||||
|
\param fragment occurence to be added
|
||||||
|
*/
|
||||||
|
void addOccurence(const SubstringOccurence & occurence);
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<SubstringOccurence> _occurences;
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE _totalCount;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -171,6 +171,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Full search stub: feeds four example sentences to one Concordia
// instance, runs a full search for "okno" through a second instance
// built from the same temp path and config, and checks the total count.
BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
{
    Concordia indexer = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    std::vector<Example> examples;
    examples.push_back(Example("xto xjest okno", 1));
    examples.push_back(Example("czy xjest okno otwarte", 2));
    examples.push_back(Example("chyba xto okno xjest xtutaj", 3));
    examples.push_back(Example("xto okno xjest okno", 4));
    // The tokenized sentences returned here are not needed by this test.
    indexer.addAllExamples(examples);

    // Presumably picks up the index written by the first instance -
    // TODO confirm against the Concordia constructor.
    Concordia searcher = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
    OccurencesList occurences = searcher.fullSearch("okno", 2, 1);

    // Clear the index before asserting.
    searcher.clearIndex();

    BOOST_CHECK_EQUAL(occurences.getTotalCount(), 10);
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||||
@ -328,8 +349,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
|
||||||
@ -339,8 +358,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
|
||||||
|
Loading…
Reference in New Issue
Block a user