best overlay computation

Former-commit-id: 986f3d6b611fd276a7b26073daa0094caf078d1e
This commit is contained in:
rjawor 2015-04-21 15:14:48 +02:00
parent 9b97ff2fa9
commit 7549703414
14 changed files with 156 additions and 39 deletions

View File

@ -6,21 +6,19 @@ getPossibleCoverages
param A - set of concordia results, current coverage param A - set of concordia results, current coverage
return isTerminal - returns true if nothing from S can be added to A return isTerminal - returns true if nothing from S can be added to A
*/ */
boolean getPossibleCoverages(A) { void getPossibleCoverages(A) {
allTerminal = true allTerminal = true
for s in S: // to consider - sort intervals in S and always search from the last interval in A for s in S: // to consider - sort intervals in S and always search from the last interval in A
// however - how to sort the intervals? maybe by their ends? // however - how to sort the intervals? maybe by their ends?
if not A intersects {s} // given the above, this check would only require to check if s overlaps with the last interval in A if not A intersects {s} // given the above, this check would only require to check if s overlaps with the last interval in A
allTerminal = allTerminal and getPossibleCoverages(A+{s}) getPossibleCoverages(A+{s})
allTerminal = false
if allTerminal then if allTerminal then
score = scoreCoverage(A) score = scoreCoverage(A)
if score > scoreCoverage(maxCoverage) if score > scoreCoverage(maxCoverage)
maxCoverage = A maxCoverage = A
return true
else
return false
} }

View File

@ -1,7 +1,6 @@
- wyłączyć stopWords - wyłączyć stopWords
- concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
- testy zużycia pamięci - testy zużycia pamięci
- Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
- Multi-threading? - Multi-threading?
- concordia-server (zastanowić się, czy nie napisać CAT-a) - concordia-server (zastanowić się, czy nie napisać CAT-a)
@ -12,6 +11,8 @@
---------------------------- Archive ----------------------------- ---------------------------- Archive -----------------------------
DONE - Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy) DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)

View File

@ -122,18 +122,34 @@ int main(int argc, char** argv) {
std::cout << std::endl; std::cout << std::endl;
std::cout << "\tFound: " << result->getFragments().size() std::cout << "\tFound: " << result->getFragments().size()
<< " matches. " << "Search took: " << << " fragments. " << "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl; msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
std::cout << "\tBest overlay: " << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment, BOOST_FOREACH(MatchedPatternFragment fragment,
result->getFragments()) { result->getBestOverlay()) {
std::cout << "\t\tfound matching fragment " std::cout << "\t\tfragment [" << fragment.getStart()
<< "(exampleId, exampleOffset," << "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getExampleId() << "," << fragment.getExampleId() << ","
<< fragment.getExampleOffset() << "," << fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << "," << fragment.getMatchedLength()
<< std::endl;
}
std::cout << "\tAll pattern fragments: " << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result->getFragments()) {
std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< " patternOffset, length): "
<< fragment.getExampleId() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << ","
<< fragment.getMatchedLength()
<< std::endl; << std::endl;
} }
} }

View File

@ -45,6 +45,9 @@ void AnubisSearcher::concordiaSearch(
} }
} }
// compute best overlay of the pattern by matched fragments
result->computeBestOverlay(pattern.size());
result->sortFragments(); result->sortFragments();
} }

View File

@ -114,6 +114,37 @@ SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
return result; return result;
} }
double Utils::getLogarithmicOverlay(const std::vector<Interval> & intervalList,
SUFFIX_MARKER_TYPE sentenceSize,
double k) {
double overlayScore = 0;
BOOST_FOREACH(Interval interval, intervalList) {
double intervalOverlay = static_cast<double>(interval.getLength())
/ static_cast<double>(sentenceSize);
double significanceFactor = pow(log(interval.getLength()+1)
/ log(sentenceSize+1), 1/k);
overlayScore += intervalOverlay * significanceFactor;
}
return overlayScore;
}
double Utils::getLogarithmicOverlay(
const std::vector<MatchedPatternFragment> & fragmentList,
SUFFIX_MARKER_TYPE patternSize,
double k) {
double overlayScore = 0;
BOOST_FOREACH(MatchedPatternFragment fragment, fragmentList) {
double intervalOverlay = static_cast<double>(fragment.getLength())
/ static_cast<double>(patternSize);
double significanceFactor = pow(log(fragment.getLength()+1)
/ log(patternSize+1), 1/k);
overlayScore += intervalOverlay * significanceFactor;
}
return overlayScore;
}
SUFFIX_MARKER_TYPE Utils::maxSentenceSize = SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8); pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);

View File

@ -9,6 +9,8 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/interval.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include <divsufsort.h> #include <divsufsort.h>
class Utils { class Utils {
@ -56,6 +58,16 @@ public:
SUFFIX_MARKER_TYPE offset, SUFFIX_MARKER_TYPE offset,
SUFFIX_MARKER_TYPE length); SUFFIX_MARKER_TYPE length);
static double getLogarithmicOverlay(
const std::vector<Interval> & intervalList,
SUFFIX_MARKER_TYPE sentenceSize,
double k);
static double getLogarithmicOverlay(
const std::vector<MatchedPatternFragment> & fragmentList,
SUFFIX_MARKER_TYPE patternSize,
double k);
static SUFFIX_MARKER_TYPE maxSentenceSize; static SUFFIX_MARKER_TYPE maxSentenceSize;
private: private:

View File

@ -1,10 +1,12 @@
#include "concordia/concordia_search_result.hpp" #include "concordia/concordia_search_result.hpp"
#include "concordia/common/utils.hpp"
#include <algorithm> #include <algorithm>
ConcordiaSearchResult::ConcordiaSearchResult( ConcordiaSearchResult::ConcordiaSearchResult(
const std::vector<std::string> & tokenVector): const std::vector<std::string> & tokenVector):
_tokenVector(tokenVector) { _tokenVector(tokenVector),
_bestOverlayScore(0) {
} }
ConcordiaSearchResult::~ConcordiaSearchResult() { ConcordiaSearchResult::~ConcordiaSearchResult() {
@ -20,3 +22,40 @@ void ConcordiaSearchResult::sortFragments() {
_matchedPatternFragments.end(), _matchedPatternFragments.end(),
std::greater<MatchedPatternFragment>()); std::greater<MatchedPatternFragment>());
} }
/*!
  Searches the matched fragments for the overlay with the highest
  logarithmic overlay score and stores it, so that it can be read with
  getBestOverlay().

  \param patternSize size of the searched pattern, passed on to the scoring
*/
void ConcordiaSearchResult::computeBestOverlay(
                                        SUFFIX_MARKER_TYPE patternSize) {
    // Forget the outcome of any previous call; otherwise an overlay computed
    // for an earlier set of fragments would survive whenever the new
    // candidates do not score strictly higher.
    _bestOverlay.clear();
    _bestOverlayScore = 0;

    // NOTE(review): the search assumes that _matchedPatternFragments is
    // ordered by fragment end, ascending - confirm with the code that fills
    // the list.
    // -1 means "no fragment added yet", so the search starts at index 0.
    _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
                           -1,
                           patternSize);
}
void ConcordiaSearchResult::_checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay,
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize) {
bool allTerminal = true;
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
// if fragment does not intersect currentOverlay
if (currentOverlay.size() == 0 ||
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
currentOverlay.push_back(fragment);
_checkPossibleOverlays(currentOverlay, i, patternSize);
allTerminal = false;
}
}
if (allTerminal) {
double score = Utils::getLogarithmicOverlay(currentOverlay,
patternSize,
1.0);
if (score > _bestOverlayScore) {
_bestOverlay = currentOverlay;
_bestOverlayScore = score;
}
}
}

View File

@ -1,6 +1,7 @@
#ifndef CONCORDIA_SEARCH_RESULT_HDR #ifndef CONCORDIA_SEARCH_RESULT_HDR
#define CONCORDIA_SEARCH_RESULT_HDR #define CONCORDIA_SEARCH_RESULT_HDR
#include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include <vector> #include <vector>
@ -24,6 +25,8 @@ public:
void sortFragments(); void sortFragments();
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
std::vector<std::string> getTokenVector() const { std::vector<std::string> getTokenVector() const {
return _tokenVector; return _tokenVector;
} }
@ -32,10 +35,23 @@ public:
return _matchedPatternFragments; return _matchedPatternFragments;
} }
std::vector<MatchedPatternFragment> getBestOverlay() const {
return _bestOverlay;
}
private: private:
void _checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay,
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize);
std::vector<std::string> _tokenVector; std::vector<std::string> _tokenVector;
std::vector<MatchedPatternFragment> _matchedPatternFragments; std::vector<MatchedPatternFragment> _matchedPatternFragments;
std::vector<MatchedPatternFragment> _bestOverlay;
double _bestOverlayScore;
}; };
#endif #endif

View File

@ -29,10 +29,16 @@ public:
return _end; return _end;
} }
private: protected:
SUFFIX_MARKER_TYPE _start; SUFFIX_MARKER_TYPE _start;
SUFFIX_MARKER_TYPE _end; SUFFIX_MARKER_TYPE _end;
}; };
/*!
  Comparator ordering intervals by their end, ascending.
  Returns true when lhs ends before rhs.
*/
struct intervalEndComparator {
    // const, so that the comparator can also be invoked through a const
    // object (e.g. when stored inside an ordered container).
    bool operator() (const Interval & lhs, const Interval & rhs) const {
        return lhs.getEnd() < rhs.getEnd();
    }
};
#endif #endif

View File

@ -5,6 +5,8 @@ MatchedPatternFragment::MatchedPatternFragment(
const SUFFIX_MARKER_TYPE & exampleOffset, const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset, const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength): const SUFFIX_MARKER_TYPE & matchedLength):
Interval(patternOffset,
patternOffset + matchedLength),
_exampleId(exampleId), _exampleId(exampleId),
_exampleOffset(exampleOffset), _exampleOffset(exampleOffset),
_patternOffset(patternOffset), _patternOffset(patternOffset),

View File

@ -2,13 +2,14 @@
#define MATCHED_PATTERN_FRAGMENT_HDR #define MATCHED_PATTERN_FRAGMENT_HDR
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
/*! /*!
Class representing matched pattern fragment in concordia search. Class representing matched pattern fragment in concordia search.
This fragment can be seen as an interval of the pattern.
*/ */
class MatchedPatternFragment { class MatchedPatternFragment : public Interval {
public: public:
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId, MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset, const SUFFIX_MARKER_TYPE & exampleOffset,

View File

@ -204,6 +204,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
// best overlay: [], []
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
/* /*

View File

@ -1,4 +1,6 @@
#include "concordia/tm_matches.hpp" #include "concordia/tm_matches.hpp"
#include "concordia/common/utils.hpp"
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <math.h> #include <math.h>
@ -18,10 +20,12 @@ TmMatches::~TmMatches() {
} }
void TmMatches::calculateScore() { void TmMatches::calculateScore() {
double exampleOverlay = _getLogarithmicOverlay(_exampleMatchedRegions, double exampleOverlay =
Utils::getLogarithmicOverlay(_exampleMatchedRegions,
_exampleSize, 1.0); _exampleSize, 1.0);
double patternOverlay = _getLogarithmicOverlay(_patternMatchedRegions, double patternOverlay =
Utils::getLogarithmicOverlay(_patternMatchedRegions,
_patternSize, 2.0); _patternSize, 2.0);
_score = (exampleOverlay + patternOverlay) / 2.0; _score = (exampleOverlay + patternOverlay) / 2.0;
} }
@ -64,20 +68,3 @@ bool TmMatches::_alreadyIntersects(
} }
return false; return false;
} }
// Computes the logarithmic overlay score of a sentence by a list of intervals.
// Each interval contributes
//     (length / sentenceSize) * (log(length + 1) / log(sentenceSize + 1)) ^ (1 / k)
// and the contributions are summed up.
// NOTE(review): this appears to be the removed side of the diff - the
// equivalent code is Utils::getLogarithmicOverlay.
double TmMatches::_getLogarithmicOverlay(
const std::vector<Interval> & intervalList,
SUFFIX_MARKER_TYPE sentenceSize,
double k) {
double overlayScore = 0;
BOOST_FOREACH(Interval interval, intervalList) {
// fraction of the sentence covered by this interval
double intervalOverlay = static_cast<double>(interval.getLength())
/ static_cast<double>(sentenceSize);
// ratio of the two logarithms, raised to the power 1/k
double significanceFactor = pow(log(interval.getLength()+1)
/ log(sentenceSize+1), 1/k);
overlayScore += intervalOverlay * significanceFactor;
}
return overlayScore;
}

View File

@ -54,10 +54,6 @@ private:
bool _alreadyIntersects(const std::vector<Interval> & intervalList, bool _alreadyIntersects(const std::vector<Interval> & intervalList,
int start, int end); int start, int end);
double _getLogarithmicOverlay(const std::vector<Interval> & intervalList,
SUFFIX_MARKER_TYPE sentenceSize,
double k);
SUFFIX_MARKER_TYPE _exampleId; SUFFIX_MARKER_TYPE _exampleId;
std::vector<Interval> _exampleMatchedRegions; std::vector<Interval> _exampleMatchedRegions;