lexicon search

This commit is contained in:
rjawor 2017-10-10 15:39:47 +02:00
parent 5e809efcce
commit 61631c52a3
6 changed files with 191 additions and 6 deletions

View File

@ -214,10 +214,28 @@ MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern, const std::string & pattern,
bool byWhitespace) bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace); _markers, _SA, pattern, byWhitespace);
} else { } else {
// If the index or search pattern are empty, return an empty result.
MatchedPatternFragment result(0, 0);
return result;
}
}
MatchedPatternFragment Concordia::lexiconSearch(
const std::string & pattern,
bool byWhitespace)
throw(ConcordiaException) {
if (_T->size() > 0 && pattern.size() > 0) {
return _searcher->lexiconSearch(_hashGenerator, _T,
_markers, _SA, pattern, byWhitespace);
} else {
// If the index or search pattern are empty, return an empty result.
// Especially performing the lexicon search with an empty pattern
// would not be funny, as it would effectively search for double EOS,
// which is very frequent in the index.
MatchedPatternFragment result(0, 0); MatchedPatternFragment result(0, 0);
return result; return result;
} }

View File

@ -134,6 +134,20 @@ public:
bool byWhitespace = false) bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires
the match to cover the whole pattern, but additionally
the lexicon search requires that the match is the whole example source.
\param pattern pattern to be searched in the index
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment containing vector of occurrences
\throws ConcordiaException
*/
MatchedPatternFragment lexiconSearch(const std::string & pattern,
bool byWhitespace = false)
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);

View File

@ -133,6 +133,16 @@ void ConcordiaIndex::_addSingleTokenizedExample(
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const TokenizedSentence & tokenizedSentence, const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) { const SUFFIX_MARKER_TYPE id) {
// prepend sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
int offset = 0; int offset = 0;
@ -155,11 +165,10 @@ void ConcordiaIndex::_addSingleTokenizedExample(
} }
// append sentence boundary marker // append sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE; sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA); Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA);
} }
@ -178,4 +187,3 @@ TokenizedSentence ConcordiaIndex::_addSingleExample(
return hashedPattern; return hashedPattern;
} }

View File

@ -54,6 +54,66 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
return result; return result;
} }
// Whole-example ("lexicon") search: unlike simpleSearch, a hit is only
// reported where the pattern covers an entire example source. This is
// achieved by wrapping the hashed pattern in sentence-boundary (EOS)
// markers on both sides before the exact suffix-array lookup, so a
// match can only occur between two example delimiters in the index.
MatchedPatternFragment IndexSearcher::lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace) throw(ConcordiaException) {
// index (within SA) of the first suffix matching the pattern;
// filled in by sa_search below
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern, byWhitespace).getCodes();
if (hash.size() == 0) {
// If the hash is empty, return empty result
return MatchedPatternFragment(0, 0);
}
// append and prepend query with EOS (sentenceBoundaryHI) for lexicon search
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
hash.insert(hash.begin(), sentenceBoundaryHI);
hash.push_back(sentenceBoundaryHI);
// the suffix array works on a byte-level view of the index, so the
// pattern length is expressed in bytes, not in hashed tokens
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
// NOTE(review): raw owning pointer released by delete[] at the end of
// this function; assumes nothing in between throws - consider RAII.
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left);
// In this scenario the whole pattern is matched, but
// the pattern was artificially augmented by two EOS's.
// Therefore, the matched pattern fragment starts at 0
// and is hash.size() - 2 long.
MatchedPatternFragment result(0, hash.size()-2);
for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
// As we are looking for a pattern in an array of higher
// resolution than the hashed index file, we might
// obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check
// removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
// Our search query started with an EOS and is non-empty,
// so we should look at the marker of the next character
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);
SubstringOccurence occurence;
occurence.enterDataFromMarker(marker);
result.addOccurence(occurence);
// cap the result list to avoid unbounded responses
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break;
}
}
}
delete[] patternArray;
return result;
}
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences( SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -53,6 +53,28 @@ public:
const std::string & pattern, const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException); bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search performs as simple search - it requires
the match to cover the whole pattern, but additionally
the lexicon search requires that the match is the whole example source.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
\returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException
*/
MatchedPatternFragment lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences( SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -171,6 +171,69 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
} }
// Verifies the lexicon-search contract on a three-sentence index:
// a pattern yields an occurrence only when it covers an entire
// example source; proper sub-phrases of an example must not match.
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
// add the first example and sanity-check its tokenization
TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14));
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts.getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
// rebuild the suffix array so the searches below see all 3 examples
concordia.refreshSAfromRAM();
/*The test index contains 3 sentences:
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
MatchedPatternFragment searchResult1 = concordia.lexiconSearch("posiada rysia");
MatchedPatternFragment searchResult2 = concordia.lexiconSearch("Ala posiada");
MatchedPatternFragment searchResult3 = concordia.lexiconSearch("Marysia posiada rysia");
MatchedPatternFragment searchResult4 = concordia.lexiconSearch("Ala posiada kota");
// clear before asserting so a failed check cannot leak index state
// into subsequent test cases
concordia.clearIndex();
// first two patterns do not cover the whole example source
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult3.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult4.getOccurences().at(0).getOffset(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTempPath(), Concordia concordia = Concordia(TestResourcesManager::getTempPath(),