lexicon search

This commit is contained in:
rjawor 2017-10-10 15:39:47 +02:00
parent 5e809efcce
commit 61631c52a3
6 changed files with 191 additions and 6 deletions

View File

@ -214,10 +214,28 @@ MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern, const std::string & pattern,
bool byWhitespace) bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace); _markers, _SA, pattern, byWhitespace);
} else { } else {
// If the index or search pattern are empty, return an empty result.
MatchedPatternFragment result(0, 0);
return result;
}
}
MatchedPatternFragment Concordia::lexiconSearch(
const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) {
if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->lexiconSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace);
} else {
// If the index or search pattern are empty, return an empty result.
// Especially performing the lexicon search with an empty pattern
// would not be funny, as it would effectively search for double EOS,
// which is very frequent in the index.
MatchedPatternFragment result(0, 0); MatchedPatternFragment result(0, 0);
return result; return result;
} }

View File

@ -134,6 +134,20 @@ public:
bool byWhitespace = false) bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires
the match to cover the whole pattern, but additionally
the lexicon search requires that the match is the whole example source.
\param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException
*/
MatchedPatternFragment lexiconSearch(const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);

View File

@ -133,6 +133,16 @@ void ConcordiaIndex::_addSingleTokenizedExample(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const TokenizedSentence & tokenizedSentence, const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) { const SUFFIX_MARKER_TYPE id) {
// prepend sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
int offset = 0; int offset = 0;
@ -155,11 +165,10 @@ void ConcordiaIndex::_addSingleTokenizedExample(
} }
// append sentence boundary marker // append sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA); Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA);
} }
@ -178,4 +187,3 @@ TokenizedSentence ConcordiaIndex::_addSingleExample(
return hashedPattern; return hashedPattern;
} }

View File

@ -54,6 +54,66 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
return result; return result;
} }
// Whole-example ("lexicon") search: unlike simpleSearch, a hit is only
// reported where the pattern covers an entire example source. This is
// achieved by wrapping the hashed pattern in sentence-boundary (EOS)
// markers on both sides before the exact suffix-array lookup, so a
// match can only occur between two example delimiters in the index.
MatchedPatternFragment IndexSearcher::lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace) throw(ConcordiaException) {
// index (within SA) of the first suffix matching the pattern;
// filled in by sa_search below
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern, byWhitespace).getCodes();
if (hash.size() == 0) {
// If the hash is empty, return empty result
return MatchedPatternFragment(0, 0);
}
// append and prepend query with EOS (sentenceBoundaryHI) for lexicon search
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
hash.insert(hash.begin(), sentenceBoundaryHI);
hash.push_back(sentenceBoundaryHI);
// the suffix array works on a byte-level view of the index, so the
// pattern length is expressed in bytes, not in hashed tokens
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
// NOTE(review): raw owning pointer released by delete[] at the end of
// this function; assumes nothing in between throws - consider RAII.
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left);
// In this scenario the whole pattern is matched, but
// the pattern was artificially augmented by two EOS's.
// Therefore, the matched pattern fragment starts at 0
// and is hash.size() - 2 long.
MatchedPatternFragment result(0, hash.size()-2);
for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
// As we are looking for a pattern in an array of higher
// resolution than the hashed index file, we might
// obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check
// removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
// Our search query started with an EOS and is non-empty,
// so we should look at the marker of the next character
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);
SubstringOccurence occurence;
occurence.enterDataFromMarker(marker);
result.addOccurence(occurence);
// cap the result list to avoid unbounded responses
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break;
}
}
}
delete[] patternArray;
return result;
}
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -53,6 +53,28 @@ public:
const std::string & pattern, const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException); bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires
the match to cover the whole pattern, but additionally
the lexicon search requires that the match is the whole example source.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException
*/
MatchedPatternFragment lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences( SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -171,6 +171,69 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
} }
// Verifies the lexicon-search contract on a three-sentence index:
// a pattern yields an occurrence only when it covers an entire
// example source; proper sub-phrases of an example must not match.
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
// add the first example and sanity-check its tokenization
TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
// rebuild the suffix array so the searches below see all 3 examples
concordia.refreshSAfromRAM();
/*The test index contains 3 sentences:
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
MatchedPatternFragment searchResult1 = concordia.lexiconSearch("posiada rysia");
MatchedPatternFragment searchResult2 = concordia.lexiconSearch("Ala posiada");
MatchedPatternFragment searchResult3 = concordia.lexiconSearch("Marysia posiada rysia");
MatchedPatternFragment searchResult4 = concordia.lexiconSearch("Ala posiada kota");
// clear before asserting so a failed check cannot leak index state
// into subsequent test cases
concordia.clearIndex();
// first two patterns do not cover the whole example source
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getOffset(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTempPath(), Concordia concordia = Concordia(TestResourcesManager::getTempPath(),