From 61631c52a318be3093f03627602df7aff920a908 Mon Sep 17 00:00:00 2001 From: rjawor Date: Tue, 10 Oct 2017 15:39:47 +0200 Subject: [PATCH] lexicon search --- concordia/concordia.cpp | 20 ++++++++++- concordia/concordia.hpp | 16 ++++++++- concordia/concordia_index.cpp | 16 ++++++--- concordia/index_searcher.cpp | 60 ++++++++++++++++++++++++++++++++ concordia/index_searcher.hpp | 22 ++++++++++++ concordia/t/test_concordia.cpp | 63 ++++++++++++++++++++++++++++++++++ 6 files changed, 191 insertions(+), 6 deletions(-) diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 848eca6..8d483d5 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -214,10 +214,28 @@ MatchedPatternFragment Concordia::simpleSearch( const std::string & pattern, bool byWhitespace) throw(ConcordiaException) { - if (_T->size() > 0) { + if (_T->size() > 0 && pattern.size() > 0) { return _searcher->simpleSearch(_hashGenerator, _T, _markers, _SA, pattern, byWhitespace); } else { + // If the index or search pattern are empty, return an empty result. + MatchedPatternFragment result(0, 0); + return result; + } +} + +MatchedPatternFragment Concordia::lexiconSearch( + const std::string & pattern, + bool byWhitespace) + throw(ConcordiaException) { + if (_T->size() > 0 && pattern.size() > 0) { + return _searcher->lexiconSearch(_hashGenerator, _T, + _markers, _SA, pattern, byWhitespace); + } else { + // If the index or search pattern are empty, return an empty result. + // Especially performing the lexicon search with an empty pattern + // would not be funny, as it would effectively search for double EOS, + // which is very frequent in the index. 
MatchedPatternFragment result(0, 0); return result; } diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index b373da6..e4329af 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -131,9 +131,23 @@ public: \throws ConcordiaException */ MatchedPatternFragment simpleSearch(const std::string & pattern, - bool byWhitespace = false) + bool byWhitespace = false) throw(ConcordiaException); + /*! Performs a search useful for lexicons in the following scenario: + Concordia gets fed by a lexicon (glossary) instead of a TM. + The lexicon search performs as simple search - it requires + the match to cover the whole pattern, but additionally + the lexicon search requires that the match is the whole example source. + \param pattern pattern to be searched in the index + \param byWhitespace whether to tokenize the pattern by white space + \returns matched pattern fragment containing vector of occurrences + \throws ConcordiaException + */ + MatchedPatternFragment lexiconSearch(const std::string & pattern, + bool byWhitespace = false) + throw(ConcordiaException); + SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) throw(ConcordiaException); diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 3eb98d7..d602564 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -133,6 +133,16 @@ void ConcordiaIndex::_addSingleTokenizedExample( boost::shared_ptr > markers, const TokenizedSentence & tokenizedSentence, const SUFFIX_MARKER_TYPE id) { + + // prepend sentence boundary marker + INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; + Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); + Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); + SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; + Utils::writeMarker(markersFile, sentenceBoundaryMA); + markers->push_back(sentenceBoundaryMA); + + std::vector hash = tokenizedSentence.getCodes(); 
int offset = 0; @@ -155,11 +165,10 @@ void ConcordiaIndex::_addSingleTokenizedExample( } // append sentence boundary marker - INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; + sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); - - SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; + sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; Utils::writeMarker(markersFile, sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA); } @@ -178,4 +187,3 @@ TokenizedSentence ConcordiaIndex::_addSingleExample( return hashedPattern; } - diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index e5f6310..6ff535f 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -54,6 +54,66 @@ MatchedPatternFragment IndexSearcher::simpleSearch( return result; } +MatchedPatternFragment IndexSearcher::lexiconSearch( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::string & pattern, + bool byWhitespace) throw(ConcordiaException) { + int left; + std::vector hash = + hashGenerator->generateHash(pattern, byWhitespace).getCodes(); + + if (hash.size() == 0) { + // If the hash is empty, return empty result + return MatchedPatternFragment(0, 0); + } + + // append and prepend query with EOS (sentenceBoundaryHI) for lexicon search + INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; + hash.insert(hash.begin(), sentenceBoundaryHI); + hash.push_back(sentenceBoundaryHI); + + saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); + sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); + + int size = sa_search(T->data(), (saidx_t) T->size(), + (const sauchar_t *) patternArray, patternLength, + SA->data(), (saidx_t) SA->size(), &left); + + // In this scenario the whole 
pattern is matched, but + // the pattern was artificially augmented by two EOS's. + // Therefore, the matched pattern fragment starts at 0 + // and is hash.size() - 2 long. + MatchedPatternFragment result(0, hash.size()-2); + for (int i = 0; i < size; ++i) { + saidx_t resultPos = SA->at(left + i); + if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { + // As we are looking for a pattern in an array of higher + // resolution than the hashed index file, we might + // obtain accidental results exceeding the boundaries + // of characters in hashed index. The above check + // removes these accidental results. + saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); + + // Our search query started with an EOS and is non-empty, + // so we should look at the marker of the next character + SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1); + + SubstringOccurence occurence; + occurence.enterDataFromMarker(marker); + result.addOccurence(occurence); + if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) { + break; + } + } + } + + delete[] patternArray; + return result; +} + SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( boost::shared_ptr hashGenerator, boost::shared_ptr > T, diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index aa5b6e0..7af4245 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -53,6 +53,28 @@ public: const std::string & pattern, bool byWhitespace = false) throw(ConcordiaException); + /*! Performs a search useful for lexicons in the following scenario: + Concordia gets fed by a lexicon (glossary) instead of a TM. + The lexicon search performs as simple search - it requires + the match to cover the whole pattern, but additionally + the lexicon search requires that the match is the whole example source. 
+ \param hashGenerator hash generator to be used to convert + input sentence to a hash + \param T hashed index to search in + \param markers markers array for the needs of searching + \param SA suffix array for the needs of searching + \param pattern string pattern to be searched in the index. + \returns matched pattern fragment, containing occurrences of the pattern in the index + \throws ConcordiaException + */ + MatchedPatternFragment lexiconSearch( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const std::string & pattern, + bool byWhitespace = false) throw(ConcordiaException); + SUFFIX_MARKER_TYPE countOccurences( boost::shared_ptr hashGenerator, boost::shared_ptr > T, diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index cd524b9..f0c008a 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -171,6 +171,69 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2); } +BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14)); + /* + 0,3 type: 1 value: ala + 4,11 type: 1 value: posiada + 12,16 type: 1 value: kota + */ + BOOST_CHECK_EQUAL(ts.getTokens().size(), 3); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); + + concordia.addExample(Example("Ala posiada rysia",51)); + concordia.addExample(Example("Marysia posiada rysia",123)); + concordia.refreshSAfromRAM(); + + /*The test index contains 3 sentences: + 14: "Ala posiada kota" + 51: "Ala posiada rysia" + 123: "Marysia posiada 
rysia" + + Test word map: + Ala -> 0 + posiada -> 1 + kota -> 2 + rysia -> 3 + Marysia -> 4 + + Test hashed index: + n: 0 1 2 3 4 5 6 7 8 9 10 11 + T[n]: 0 1 2 | 0 1 3 | 4 1 3 | + + Test suffix array: + n: 0 1 2 3 4 5 6 7 8 9 10 11 + SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + + */ + + MatchedPatternFragment searchResult1 = concordia.lexiconSearch("posiada rysia"); + MatchedPatternFragment searchResult2 = concordia.lexiconSearch("Ala posiada"); + MatchedPatternFragment searchResult3 = concordia.lexiconSearch("Marysia posiada rysia"); + MatchedPatternFragment searchResult4 = concordia.lexiconSearch("Ala posiada kota"); + + concordia.clearIndex(); + + // first two patterns do not cover the whole example source + BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 0); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0); + + BOOST_CHECK_EQUAL(searchResult3.getOccurences().size(), 1); + BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getId(), 123); + BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getOffset(), 0); + + BOOST_CHECK_EQUAL(searchResult4.getOccurences().size(), 1); + BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getId(), 14); + BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getOffset(), 0); + +} + BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) { Concordia concordia = Concordia(TestResourcesManager::getTempPath(),