lexicon search
This commit is contained in:
parent
5e809efcce
commit
61631c52a3
@ -214,10 +214,28 @@ MatchedPatternFragment Concordia::simpleSearch(
|
|||||||
const std::string & pattern,
|
const std::string & pattern,
|
||||||
bool byWhitespace)
|
bool byWhitespace)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0 && pattern.size() > 0) {
|
||||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||||
_markers, _SA, pattern, byWhitespace);
|
_markers, _SA, pattern, byWhitespace);
|
||||||
} else {
|
} else {
|
||||||
|
// If the index or the search pattern is empty, return an empty result.
|
||||||
|
MatchedPatternFragment result(0, 0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchedPatternFragment Concordia::lexiconSearch(
|
||||||
|
const std::string & pattern,
|
||||||
|
bool byWhitespace)
|
||||||
|
throw(ConcordiaException) {
|
||||||
|
if (_T->size() > 0 && pattern.size() > 0) {
|
||||||
|
return _searcher->lexiconSearch(_hashGenerator, _T,
|
||||||
|
_markers, _SA, pattern, byWhitespace);
|
||||||
|
} else {
|
||||||
|
// If the index or the search pattern is empty, return an empty result.
|
||||||
|
// Especially performing the lexicon search with an empty pattern
|
||||||
|
// would not be funny, as it would effectively search for double EOS,
|
||||||
|
// which is very frequent in the index.
|
||||||
MatchedPatternFragment result(0, 0);
|
MatchedPatternFragment result(0, 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -134,6 +134,20 @@ public:
|
|||||||
bool byWhitespace = false)
|
bool byWhitespace = false)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Performs a search useful for lexicons in the following scenario:
|
||||||
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||||
|
The lexicon search works like the simple search - it requires
|
||||||
|
the match to cover the whole pattern, but additionally
|
||||||
|
the lexicon search requires that the match is the whole example source.
|
||||||
|
\param pattern pattern to be searched in the index
|
||||||
|
\param byWhitespace whether to tokenize the pattern by white space
|
||||||
|
\returns matched pattern fragment containing vector of occurrences
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
MatchedPatternFragment lexiconSearch(const std::string & pattern,
|
||||||
|
bool byWhitespace = false)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
@ -133,6 +133,16 @@ void ConcordiaIndex::_addSingleTokenizedExample(
|
|||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const TokenizedSentence & tokenizedSentence,
|
const TokenizedSentence & tokenizedSentence,
|
||||||
const SUFFIX_MARKER_TYPE id) {
|
const SUFFIX_MARKER_TYPE id) {
|
||||||
|
|
||||||
|
// prepend sentence boundary marker
|
||||||
|
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||||
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||||
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||||
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||||
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
|
markers->push_back(sentenceBoundaryMA);
|
||||||
|
|
||||||
|
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
|
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
|
||||||
|
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
@ -155,11 +165,10 @@ void ConcordiaIndex::_addSingleTokenizedExample(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// append sentence boundary marker
|
// append sentence boundary marker
|
||||||
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
||||||
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||||
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||||
|
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
|
||||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
markers->push_back(sentenceBoundaryMA);
|
markers->push_back(sentenceBoundaryMA);
|
||||||
}
|
}
|
||||||
@ -178,4 +187,3 @@ TokenizedSentence ConcordiaIndex::_addSingleExample(
|
|||||||
|
|
||||||
return hashedPattern;
|
return hashedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -54,6 +54,66 @@ MatchedPatternFragment IndexSearcher::simpleSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Lexicon search: behaves like the simple search (the whole pattern
    has to be matched), but additionally the match has to span an entire
    indexed example. This is enforced by surrounding the hashed pattern
    with sentence boundary (EOS) characters before it is looked up in
    the suffix array.
    \param hashGenerator hash generator used to convert the pattern to a hash
    \param T hashed index to search in
    \param markers markers array, read to identify each occurrence
    \param SA suffix array of T
    \param pattern string pattern to be searched in the index
    \param byWhitespace whether to tokenize the pattern by white space
    \returns matched pattern fragment starting at 0 and as long as the
             hashed pattern, holding the occurrences found (empty when
             the pattern hashes to nothing)
    \throws ConcordiaException
*/
MatchedPatternFragment IndexSearcher::lexiconSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern,
        bool byWhitespace) throw(ConcordiaException) {
    // Releases the raw pattern buffer on every exit path, including
    // exceptions thrown while the results are being collected.
    struct PatternArrayGuard {
        sauchar_t * array;
        explicit PatternArrayGuard(sauchar_t * a) : array(a) {}
        ~PatternArrayGuard() { delete[] array; }
    private:
        // not copyable - the buffer has exactly one owner
        PatternArrayGuard(const PatternArrayGuard &);
        PatternArrayGuard & operator=(const PatternArrayGuard &);
    };

    std::vector<INDEX_CHARACTER_TYPE> hash =
        hashGenerator->generateHash(pattern, byWhitespace).getCodes();

    if (hash.size() == 0) {
        // If the hash is empty, return empty result
        return MatchedPatternFragment(0, 0);
    }

    // append and prepend query with EOS (sentenceBoundaryHI) for lexicon search
    INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
    hash.insert(hash.begin(), sentenceBoundaryHI);
    hash.push_back(sentenceBoundaryHI);

    // sa_search works on sauchar_t units, so the length is given in those
    saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
    PatternArrayGuard patternArray(Utils::indexVectorToSaucharArray(hash));

    // The return value is used as the number of matching suffixes and
    // `left` as the position of the first of them in SA.
    int left = 0;
    int size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray.array, patternLength,
                         SA->data(), (saidx_t) SA->size(), &left);

    // In this scenario the whole pattern is matched, but
    // the pattern was artificially augmented by two EOS's.
    // Therefore, the matched pattern fragment starts at 0
    // and is hash.size() - 2 long.
    MatchedPatternFragment result(0, hash.size()-2);
    for (int i = 0; i < size; ++i) {
        saidx_t resultPos = SA->at(left + i);
        if (resultPos % sizeof(INDEX_CHARACTER_TYPE) != 0) {
            // As we are looking for a pattern in an array of higher
            // resolution than the hashed index file, we might
            // obtain accidental results exceeding the boundaries
            // of characters in hashed index. Such hits are skipped.
            continue;
        }
        saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);

        // Our search query started with an EOS and is non-empty,
        // so we should look at the marker of the next character
        SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos + 1);

        SubstringOccurence occurence;
        occurence.enterDataFromMarker(marker);
        result.addOccurence(occurence);
        if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
            break;
        }
    }

    return result;
}
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
|
SUFFIX_MARKER_TYPE IndexSearcher::countOccurences(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -53,6 +53,28 @@ public:
|
|||||||
const std::string & pattern,
|
const std::string & pattern,
|
||||||
bool byWhitespace = false) throw(ConcordiaException);
|
bool byWhitespace = false) throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Performs a search useful for lexicons in the following scenario:
|
||||||
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
||||||
|
The lexicon search works like the simple search - it requires
|
||||||
|
the match to cover the whole pattern, but additionally
|
||||||
|
the lexicon search requires that the match is the whole example source.
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern string pattern to be searched in the index.
|
||||||
|
\param byWhitespace whether to tokenize the pattern by white space
|
||||||
|
\returns matched pattern fragment, containing occurrences of the pattern in the index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
MatchedPatternFragment lexiconSearch(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
const std::string & pattern,
|
||||||
|
bool byWhitespace = false) throw(ConcordiaException);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE countOccurences(
|
SUFFIX_MARKER_TYPE countOccurences(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -171,6 +171,69 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Lexicon search must report a hit only when the pattern is identical
// to a complete indexed example; partial overlaps must yield nothing.
BOOST_AUTO_TEST_CASE( ConcordiaLexiconSearch1 )
{
    Concordia concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    // The first example is also used to verify tokenization. Expected tokens:
    //    0,3   type: 1  value: ala
    //    4,11  type: 1  value: posiada
    //    12,16 type: 1  value: kota
    TokenizedSentence firstSentence =
        concordia.addExample(Example("Ala posiada kota",14));
    BOOST_CHECK_EQUAL(firstSentence.getTokens().size(), 3);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getEnd(), 11);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(firstSentence.getTokens().at(1).getValue(), "posiada");

    concordia.addExample(Example("Ala posiada rysia",51));
    concordia.addExample(Example("Marysia posiada rysia",123));
    concordia.refreshSAfromRAM();

    /* Indexed examples (id: text):
         14:  "Ala posiada kota"
         51:  "Ala posiada rysia"
         123: "Marysia posiada rysia"

       Word map:
         Ala -> 0, posiada -> 1, kota -> 2, rysia -> 3, Marysia -> 4

       Hashed index ('|' is the sentence boundary marker, which is
       written both before and after every example):
         n:    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
         T[n]: | 0 1 2 | | 0 1 3 |  |  4  1  3  |

       NOTE(review): a suffix array listing is intentionally not given
       here - confirm the layout above against the indexing code before
       relying on exact positions.
    */

    // All queries are issued before the index is cleared.
    MatchedPatternFragment suffixOnly = concordia.lexiconSearch("posiada rysia");
    MatchedPatternFragment prefixOnly = concordia.lexiconSearch("Ala posiada");
    MatchedPatternFragment wholeMarysia =
        concordia.lexiconSearch("Marysia posiada rysia");
    MatchedPatternFragment wholeKota = concordia.lexiconSearch("Ala posiada kota");

    concordia.clearIndex();

    // Patterns covering only a part of an example source must not match.
    BOOST_CHECK_EQUAL(suffixOnly.getOccurences().size(), 0);
    BOOST_CHECK_EQUAL(prefixOnly.getOccurences().size(), 0);

    // Patterns equal to a whole example match exactly that example.
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().size(), 1);
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().at(0).getId(), 123);
    BOOST_CHECK_EQUAL(wholeMarysia.getOccurences().at(0).getOffset(), 0);

    BOOST_CHECK_EQUAL(wholeKota.getOccurences().size(), 1);
    BOOST_CHECK_EQUAL(wholeKota.getOccurences().at(0).getId(), 14);
    BOOST_CHECK_EQUAL(wholeKota.getOccurences().at(0).getOffset(), 0);
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||||
|
Loading…
Reference in New Issue
Block a user