lexicon search
This commit is contained in:
parent
5e809efcce
commit
61631c52a3
@ -214,10 +214,28 @@ MatchedPatternFragment Concordia::simpleSearch(
|
||||
const std::string & pattern,
|
||||
bool byWhitespace)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0) {
|
||||
if (_T->size() > 0 && pattern.size() > 0) {
|
||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern, byWhitespace);
|
||||
} else {
|
||||
// If the index or search pattern are empty, return an empty result.
|
||||
MatchedPatternFragment result(0, 0);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
MatchedPatternFragment Concordia::lexiconSearch(
|
||||
const std::string & pattern,
|
||||
bool byWhitespace)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0 && pattern.size() > 0) {
|
||||
return _searcher->lexiconSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern, byWhitespace);
|
||||
} else {
|
||||
// If the index or search pattern are empty, return an empty result.
|
||||
// Especially performing the lexicon search with an empty pattern
|
||||
// would not be funny, as it would effectively search for double EOS,
|
||||
// which is very frequent in the index.
|
||||
MatchedPatternFragment result(0, 0);
|
||||
return result;
|
||||
}
|
||||
|
@ -134,6 +134,20 @@ public:
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Performs a search intended for lexicons, i.e. for the scenario
    in which Concordia is fed with a lexicon (glossary) instead of a TM.
    Like simple search, lexicon search requires the match to cover
    the whole pattern. Additionally, it requires the match to be
    the whole example source.
    If the index or the pattern is empty, no search is run and
    a fragment constructed as MatchedPatternFragment(0, 0) is returned.
    \param pattern pattern to be searched in the index
    \param byWhitespace whether to tokenize the pattern by white space
    \returns matched pattern fragment containing vector of occurrences
    \throws ConcordiaException
*/
|
||||
MatchedPatternFragment lexiconSearch(const std::string & pattern,
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
|
@ -133,6 +133,16 @@ void ConcordiaIndex::_addSingleTokenizedExample(
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const TokenizedSentence & tokenizedSentence,
|
||||
const SUFFIX_MARKER_TYPE id) {
|
||||
|
||||
// prepend sentence boundary marker
|
||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||
markers->push_back(sentenceBoundaryMA);
|
||||
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
|
||||
|
||||
int offset = 0;
|
||||
@ -155,11 +165,10 @@ void ConcordiaIndex::_addSingleTokenizedExample(
|
||||
}
|
||||
|
||||
// append sentence boundary marker
|
||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||
sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||
markers->push_back(sentenceBoundaryMA);
|
||||
}
|
||||
@ -178,4 +187,3 @@ TokenizedSentence ConcordiaIndex::_addSingleExample(
|
||||
|
||||
return hashedPattern;
|
||||
}
|
||||
|
||||
|
@ -54,6 +54,66 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
|
||||
return result;
|
||||
}
|
||||
|
||||
/*! Lexicon search: reports only those occurrences of the pattern
    that are delimited by sentence boundary markers on both sides,
    i.e. the pattern has to be the whole example source.

    The pattern is hashed, surrounded with two sentence boundary (EOS)
    characters and looked up in the suffix array. The returned fragment
    starts at 0 and is as long as the hashed pattern (the two artificial
    EOS characters are not counted).

    \param hashGenerator hash generator used to convert the pattern to a hash
    \param T hashed index to search in
    \param markers markers array, read at the position following a match start
    \param SA suffix array of T
    \param pattern string pattern to be searched in the index
    \param byWhitespace passed to the hash generator together with the pattern
    \returns matched pattern fragment; MatchedPatternFragment(0, 0) when
             the hashed pattern is empty. At most
             CONCORDIA_SEARCH_MAX_RESULTS occurrences are collected.
    \throws ConcordiaException
*/
MatchedPatternFragment IndexSearcher::lexiconSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();

    if (hash.empty()) {
        // If the hash is empty, return empty result
        return MatchedPatternFragment(0, 0);
    }

    // The whole pattern is matched, so the fragment starts at 0 and spans
    // the hashed pattern. It is created before the query gets augmented,
    // so that the two artificial EOS characters are not counted.
    MatchedPatternFragment result(0, hash.size());

    // append and prepend query with EOS (sentenceBoundaryHI) for lexicon search
    const INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
    hash.insert(hash.begin(), sentenceBoundaryHI);
    hash.push_back(sentenceBoundaryHI);

    saidx_t patternLength = hash.size() * sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);

    int left = 0;
    int size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray, patternLength,
                         SA->data(), static_cast<saidx_t>(SA->size()), &left);

    // The raw pattern buffer is needed only by sa_search. Releasing it
    // here keeps it from leaking if anything in the loop below throws.
    delete[] patternArray;

    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. Skip these.
            continue;
        }

        saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);

        // Our search query started with an EOS and is non-empty,
        // so we should look at the marker of the next character
        SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);

        SubstringOccurence occurence;
        occurence.enterDataFromMarker(marker);
        result.addOccurence(occurence);
        if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
            break;
        }
    }

    return result;
}
|
||||
|
||||
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -53,6 +53,28 @@ public:
|
||||
const std::string & pattern,
|
||||
bool byWhitespace = false) throw(ConcordiaException);
|
||||
|
||||
/*! Performs a search intended for lexicons, i.e. for the scenario
    in which Concordia is fed with a lexicon (glossary) instead of a TM.
    Like simple search, lexicon search requires the match to cover
    the whole pattern. Additionally, it requires the match to be
    the whole example source.
    If the pattern hashes to an empty code sequence, a fragment
    constructed as MatchedPatternFragment(0, 0) is returned. The number
    of collected occurrences is capped at CONCORDIA_SEARCH_MAX_RESULTS.
    \param hashGenerator hash generator to be used to convert
                         input sentence to a hash
    \param T hashed index to search in
    \param markers markers array for the needs of searching
    \param SA suffix array for the needs of searching
    \param pattern string pattern to be searched in the index.
    \param byWhitespace whether to tokenize the pattern by white space
                        (passed on to the hash generator)
    \returns matched pattern fragment, containing occurrences of the pattern in the index
    \throws ConcordiaException
*/
|
||||
MatchedPatternFragment lexiconSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern,
|
||||
bool byWhitespace = false) throw(ConcordiaException);
|
||||
|
||||
SUFFIX_MARKER_TYPE countOccurences(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -171,6 +171,69 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
{
    // Lexicon search must report an example only when the pattern
    // is equal to the whole source of that example.
    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    // Expected tokenization of the first example:
    //   0,3   type: 1 value: ala
    //   4,11  type: 1 value: posiada
    //   12,16 type: 1 value: kota
    TokenizedSentence firstSentence =
        concordia.addExample(Example("Ala posiada kota",14));
    BOOST_CHECK_EQUAL(firstSentence.getTokens().size(), 3);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getEnd(), 11);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getValue(), "posiada");

    concordia.addExample(Example("Ala posiada rysia",51));
    concordia.addExample(Example("Marysia posiada rysia",123));
    concordia.refreshSAfromRAM();

    // The test index now holds 3 sentences:
    //    14: "Ala posiada kota"
    //    51: "Ala posiada rysia"
    //   123: "Marysia posiada rysia"
    //
    // Test word map:
    //   Ala -> 0, posiada -> 1, kota -> 2, rysia -> 3, Marysia -> 4
    //
    // Test hashed index:
    //   n:    0 1 2 3 4 5 6 7 8 9 10 11
    //   T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
    //
    // Test suffix array:
    //   n:     0 1 2 3 4 5 6  7 8 9  10 11
    //   SA[n]: 0 4 1 9 5 2 10 6 8 11 3  7
    //
    // NOTE(review): the layout above shows a single boundary marker after
    // each sentence. _addSingleTokenizedExample also writes a marker before
    // each sentence, so the real T and SA presumably differ - confirm.

    MatchedPatternFragment tailOnly = concordia.lexiconSearch("posiada rysia");
    MatchedPatternFragment headOnly = concordia.lexiconSearch("Ala posiada");
    MatchedPatternFragment wholeMarysia =
        concordia.lexiconSearch("Marysia posiada rysia");
    MatchedPatternFragment wholeAlaKota =
        concordia.lexiconSearch("Ala posiada kota");

    concordia.clearIndex();

    // Patterns that are only a part of an example source yield nothing.
    BOOST_CHECK_EQUAL(tailOnly.getOccurences().size(), 0);
    BOOST_CHECK_EQUAL(headOnly.getOccurences().size(), 0);

    // Patterns equal to a whole example source yield exactly that example.
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().size(), 1);
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().at(0).getId(), 123);
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().at(0).getOffset(), 0);

    BOOST_CHECK_EQUAL(wholeAlaKota.getOccurences().size(), 1);
    BOOST_CHECK_EQUAL(wholeAlaKota.getOccurences().at(0).getId(), 14);
    BOOST_CHECK_EQUAL(wholeAlaKota.getOccurences().at(0).getOffset(), 0);
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||
|
Loading…
Reference in New Issue
Block a user