diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in
index e59b33c..4f7bba5 100644
--- a/concordia/common/config.hpp.in
+++ b/concordia/common/config.hpp.in
@@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
 // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
 // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
 
-#define CONCORDIA_SEARCH_MAX_RESULTS 5
+#define CONCORDIA_SEARCH_MAX_RESULTS 3
 
 #define WORD_MAP_FILE_NAME "word_map.bin"
 #define MARKERS_FILE_NAME "markers.bin"
diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp
index 8d483d5..f15b1b5 100644
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@@ -224,6 +224,23 @@ MatchedPatternFragment Concordia::simpleSearch(
     }
 }
 
+OccurencesList Concordia::fullSearch(
+                            const std::string & pattern,
+                            SUFFIX_MARKER_TYPE limit,
+                            SUFFIX_MARKER_TYPE offset,
+                            bool byWhitespace)
+                                 throw(ConcordiaException) {
+    // Nothing can match when either the index or the pattern is empty,
+    // so report zero occurences without consulting the searcher.
+    if (_T->size() == 0 || pattern.size() == 0) {
+        return OccurencesList(0);
+    }
+
+    return _searcher->fullSearch(_hashGenerator, _T, _markers, _SA,
+                                 pattern, limit, offset, byWhitespace);
+}
+
+
 MatchedPatternFragment Concordia::lexiconSearch(
                             const std::string & pattern,
                             bool byWhitespace)
diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp
index e4329af..67ab351 100644
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@@ -9,6 +9,7 @@
 #include "concordia/common/config.hpp"
 #include "concordia/example.hpp"
 #include "concordia/matched_pattern_fragment.hpp"
+#include "concordia/occurences_list.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_index.hpp"
 #include "concordia/index_searcher.hpp"
@@ -134,6 +135,23 @@ public:
                  bool byWhitespace = false)
                                  throw(ConcordiaException);
 
+    /*! Performs a substring lookup in RAM-based index, returning all occurences.
+      The result contains no more than "limit" occurences, starting at "offset".
+      Matches are taken in suffix array order: the first "offset" matches are
+      skipped and at most "limit" of the following ones are returned.
+      \param pattern string pattern to be searched in the index.
+      \param limit maximum number of occurences to return
+      \param offset starting occurence
+      \param byWhitespace should the pattern be tokenized by white space
+      \returns list of occurences of the pattern in the index
+      \throws ConcordiaException
+    */
+    OccurencesList fullSearch(
+        const std::string & pattern,
+        SUFFIX_MARKER_TYPE limit,
+        SUFFIX_MARKER_TYPE offset,
+        bool byWhitespace = false) throw(ConcordiaException);
+
     /*! Performs a search useful for lexicons in the following scenario:
       Concordia gets fed by a lexicon (glossary) instead of a TM.
       The lexicon search performs as simple search - it requires
diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp
index 6ff535f..c043510 100644
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@@ -54,6 +54,65 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
     return result;
 }
 
+// Looks up "pattern" in the suffix array and returns at most "limit"
+// occurences, skipping the first "offset" raw matches.
+OccurencesList IndexSearcher::fullSearch(
+    boost::shared_ptr hashGenerator,
+    boost::shared_ptr > T,
+    boost::shared_ptr > markers,
+    boost::shared_ptr > SA,
+    const std::string & pattern,
+    SUFFIX_MARKER_TYPE limit,
+    SUFFIX_MARKER_TYPE offset,
+    bool byWhitespace) throw(ConcordiaException) {
+    int left;
+    std::vector hash =
+        hashGenerator->generateHash(pattern, byWhitespace).getCodes();
+    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
+    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
+
+    int size = sa_search(T->data(), (saidx_t) T->size(),
+                         (const sauchar_t *) patternArray, patternLength,
+                         SA->data(), (saidx_t) SA->size(), &left);
+
+    // The pattern buffer is only needed by sa_search; release it right away
+    // so it cannot leak if one of the at() calls below throws.
+    delete[] patternArray;
+
+    // sa_search signals an error with a negative count; report no matches.
+    const SUFFIX_MARKER_TYPE totalCount =
+        (size > 0) ? static_cast<SUFFIX_MARKER_TYPE>(size) : 0;
+    OccurencesList result(totalCount);
+
+    // Clamp the window [offset, offset + limit) to the matches actually
+    // found. Written so that offset + limit is never computed unclamped.
+    // NOTE(review): assumes SUFFIX_MARKER_TYPE is unsigned - confirm.
+    SUFFIX_MARKER_TYPE windowEnd = offset;
+    if (offset < totalCount) {
+        const SUFFIX_MARKER_TYPE available = totalCount - offset;
+        windowEnd += (limit < available) ? limit : available;
+    }
+
+    for (SUFFIX_MARKER_TYPE i = offset; i < windowEnd; ++i) {
+        saidx_t resultPos = SA->at(left + i);
+        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
+            // As we are looking for a pattern in an array of higher
+            // resolution than the hashed index file, we might
+            // obtain accidental results exceeding the boundaries
+            // of characters in hashed index. The above check
+            // removes these accidental results.
+            saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
+            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
+
+            SubstringOccurence occurence;
+            occurence.enterDataFromMarker(marker);
+            result.addOccurence(occurence);
+        }
+    }
+
+    return result;
+}
+
 MatchedPatternFragment IndexSearcher::lexiconSearch(
     boost::shared_ptr hashGenerator,
     boost::shared_ptr > T,
diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp
index 7af4245..67bea2f 100644
--- a/concordia/index_searcher.hpp
+++ b/concordia/index_searcher.hpp
@@ -8,6 +8,7 @@
 
 #include "concordia/common/config.hpp"
 #include "concordia/matched_pattern_fragment.hpp"
+#include "concordia/occurences_list.hpp"
 #include "concordia/hash_generator.hpp"
 #include "concordia/concordia_exception.hpp"
 #include "concordia/concordia_searcher.hpp"
@@ -53,6 +54,30 @@ public:
                 const std::string & pattern,
                 bool byWhitespace = false) throw(ConcordiaException);
 
+    /*! Performs a substring lookup in RAM-based index, returning all occurences.
+      The result contains no more than "limit" occurences, starting at "offset".
+      \param hashGenerator hash generator to be used to convert
+                           input sentence to a hash
+      \param T hashed index to search in
+      \param markers markers array for the needs of searching
+      \param SA suffix array for the needs of searching
+      \param pattern string pattern to be searched in the index.
+      \param limit maximum number of occurences to return
+      \param offset starting occurence
+      \param byWhitespace should the pattern be tokenized by white space
+      \returns list of occurences of the pattern in the index
+      \throws ConcordiaException
+    */
+    OccurencesList fullSearch(
+        boost::shared_ptr hashGenerator,
+        boost::shared_ptr > T,
+        boost::shared_ptr > markers,
+        boost::shared_ptr > SA,
+        const std::string & pattern,
+        SUFFIX_MARKER_TYPE limit,
+        SUFFIX_MARKER_TYPE offset,
+        bool byWhitespace = false) throw(ConcordiaException);
+
     /*! Performs a search useful for lexicons in the following scenario:
       Concordia gets fed by a lexicon (glossary) instead of a TM.
       The lexicon search performs as simple search - it requires
diff --git a/concordia/occurences_list.cpp b/concordia/occurences_list.cpp
new file mode 100644
index 0000000..83cde5d
--- /dev/null
+++ b/concordia/occurences_list.cpp
@@ -0,0 +1,13 @@
+#include "concordia/occurences_list.hpp"
+
+OccurencesList::OccurencesList(const SUFFIX_MARKER_TYPE & totalCount):
+    _totalCount(totalCount) {
+}
+
+OccurencesList::~OccurencesList() {
+}
+
+void OccurencesList::addOccurence(
+    const SubstringOccurence & occurence) {
+    _occurences.push_back(occurence);
+}
diff --git a/concordia/occurences_list.hpp b/concordia/occurences_list.hpp
new file mode 100644
index 0000000..8ca7267
--- /dev/null
+++ b/concordia/occurences_list.hpp
@@ -0,0 +1,53 @@
+#ifndef OCCURENCES_LIST_HDR
+#define OCCURENCES_LIST_HDR
+
+#include "concordia/common/config.hpp"
+#include "concordia/substring_occurence.hpp"
+#include
+#include
+#include
+
+/*!
+  Class representing the occurences list in full search. The list only
+  contains as many occurences as specified in the "limit" parameter for
+  full search. The "totalCount" field stores the total number of
+  occurences available.
+*/
+
+class OccurencesList {
+public:
+    /*! Constructor.
+      \param totalCount total number of occurences available in the index
+    */
+    explicit OccurencesList(const SUFFIX_MARKER_TYPE & totalCount);
+
+    /*! Destructor.
+    */
+    virtual ~OccurencesList();
+
+    /*! Getter for occurences.
+      \returns copy of the occurences collected so far
+    */
+    std::vector getOccurences() const {
+        return _occurences;
+    }
+
+    /*! Getter for the total count.
+      \returns total number of occurences passed to the constructor
+    */
+    SUFFIX_MARKER_TYPE getTotalCount() const {
+        return _totalCount;
+    }
+
+    /*! Adds an occurence to the list.
+      \param occurence occurence to be added
+    */
+    void addOccurence(const SubstringOccurence & occurence);
+
+private:
+    std::vector _occurences;
+
+    SUFFIX_MARKER_TYPE _totalCount;
+};
+
+#endif
diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp
index 2d5063c..67caa73 100644
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@@ -171,6 +171,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
     BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
 }
 
+BOOST_AUTO_TEST_CASE( ConcordiaFullSearch1 )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+                              TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    std::vector testExamples;
+    testExamples.push_back(Example("xto xjest okno",1));
+    testExamples.push_back(Example("czy xjest okno otwarte",2));
+    testExamples.push_back(Example("chyba xto okno xjest xtutaj",3));
+    testExamples.push_back(Example("xto okno xjest okno",4));
+    std::vector hashedPatterns = concordia.addAllExamples(testExamples);
+
+    Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
+                              TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    OccurencesList searchResult1 = concordia2.fullSearch("okno", 2, 1);
+
+    concordia2.clearIndex();
+
+    BOOST_CHECK_EQUAL(searchResult1.getTotalCount(), 10);
+
+}
+
 BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
 {
     Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
@@ -328,8 +349,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
     BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
     BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1); @@ -339,8 +358,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);