best overlay computation
Former-commit-id: 986f3d6b611fd276a7b26073daa0094caf078d1e
This commit is contained in:
parent
9b97ff2fa9
commit
7549703414
@ -6,21 +6,19 @@ getPossibleCoverages
|
||||
param A - set of concordia results, current coverage
|
||||
return isTerminal - returns true if nothing from S can be added to A
|
||||
*/
|
||||
boolean getPossibleCoverages(A) {
|
||||
void getPossibleCoverages(A) {
|
||||
allTerminal = true
|
||||
for s in S: // to consider - sort intervals in S and always search from the last interval in A
|
||||
// however - how to sort the intervals? maybe by their ends?
|
||||
if not A intersects {s} // given the above, this check would only require to check if s overlaps with the last interval in A
|
||||
allTerminal = allTerminal and getPossibleCoverages(A+{s})
|
||||
getPossibleCoverages(A+{s})
|
||||
allTerminal = false
|
||||
|
||||
if allTerminal then
|
||||
score = scoreCoverage(A)
|
||||
if score > scoreCoverage(maxCoverage)
|
||||
maxCoverage = A
|
||||
|
||||
return true
|
||||
else
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
|
5
TODO.txt
5
TODO.txt
@ -1,7 +1,6 @@
|
||||
- wyłączyć stopWords
|
||||
- concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||
- testy zużycia pamięci
|
||||
- Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||
- Multi-threading?
|
||||
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
||||
@ -12,6 +11,8 @@
|
||||
|
||||
---------------------------- Archive -----------------------------
|
||||
|
||||
DONE - Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||
|
||||
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
|
||||
|
||||
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||
|
@ -122,18 +122,34 @@ int main(int argc, char** argv) {
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "\tFound: " << result->getFragments().size()
|
||||
<< " matches. " << "Search took: " <<
|
||||
<< " fragments. " << "Search took: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
if (!cli.count("silent")) {
|
||||
std::cout << "\tBest overlay: " << std::endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||
result->getFragments()) {
|
||||
std::cout << "\t\tfound matching fragment "
|
||||
<< "(exampleId, exampleOffset,"
|
||||
result->getBestOverlay()) {
|
||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||
<< "," << fragment.getEnd()
|
||||
<< "] (exampleId, exampleOffset,"
|
||||
<< " patternOffset, length): "
|
||||
<< fragment.getExampleId() << ","
|
||||
<< fragment.getExampleOffset() << ","
|
||||
<< fragment.getPatternOffset() << ","
|
||||
<< fragment.getMatchedLength() << ","
|
||||
<< fragment.getMatchedLength()
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
std::cout << "\tAll pattern fragments: " << std::endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||
result->getFragments()) {
|
||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||
<< "," << fragment.getEnd()
|
||||
<< "] (exampleId, exampleOffset,"
|
||||
<< " patternOffset, length): "
|
||||
<< fragment.getExampleId() << ","
|
||||
<< fragment.getExampleOffset() << ","
|
||||
<< fragment.getPatternOffset() << ","
|
||||
<< fragment.getMatchedLength()
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
@ -45,6 +45,9 @@ void AnubisSearcher::concordiaSearch(
|
||||
}
|
||||
}
|
||||
|
||||
// compute best overlay of the pattern by matched fragments
|
||||
result->computeBestOverlay(pattern.size());
|
||||
|
||||
result->sortFragments();
|
||||
}
|
||||
|
||||
|
@ -114,6 +114,37 @@ SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
|
||||
return result;
|
||||
}
|
||||
|
||||
double Utils::getLogarithmicOverlay(const std::vector<Interval> & intervalList,
|
||||
SUFFIX_MARKER_TYPE sentenceSize,
|
||||
double k) {
|
||||
double overlayScore = 0;
|
||||
BOOST_FOREACH(Interval interval, intervalList) {
|
||||
double intervalOverlay = static_cast<double>(interval.getLength())
|
||||
/ static_cast<double>(sentenceSize);
|
||||
double significanceFactor = pow(log(interval.getLength()+1)
|
||||
/ log(sentenceSize+1), 1/k);
|
||||
|
||||
overlayScore += intervalOverlay * significanceFactor;
|
||||
}
|
||||
return overlayScore;
|
||||
}
|
||||
|
||||
double Utils::getLogarithmicOverlay(
|
||||
const std::vector<MatchedPatternFragment> & fragmentList,
|
||||
SUFFIX_MARKER_TYPE patternSize,
|
||||
double k) {
|
||||
double overlayScore = 0;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment, fragmentList) {
|
||||
double intervalOverlay = static_cast<double>(fragment.getLength())
|
||||
/ static_cast<double>(patternSize);
|
||||
double significanceFactor = pow(log(fragment.getLength()+1)
|
||||
/ log(patternSize+1), 1/k);
|
||||
|
||||
overlayScore += intervalOverlay * significanceFactor;
|
||||
}
|
||||
return overlayScore;
|
||||
}
|
||||
|
||||
// Definition of the static member Utils::maxSentenceSize. It is initialised
// to 2 raised to the number of bits contained in SUFFIX_MARKER_SENTENCE_BYTES
// bytes; the floating-point result of pow() is converted to
// SUFFIX_MARKER_TYPE on initialisation.
// NOTE(review): presumably the exclusive upper bound on sentence length that
// fits in the marker's sentence field - confirm against the marker layout.
SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
|
||||
pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);
|
||||
|
||||
|
@ -9,6 +9,8 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
#include <divsufsort.h>
|
||||
|
||||
class Utils {
|
||||
@ -56,6 +58,16 @@ public:
|
||||
SUFFIX_MARKER_TYPE offset,
|
||||
SUFFIX_MARKER_TYPE length);
|
||||
|
||||
static double getLogarithmicOverlay(
|
||||
const std::vector<Interval> & intervalList,
|
||||
SUFFIX_MARKER_TYPE sentenceSize,
|
||||
double k);
|
||||
|
||||
static double getLogarithmicOverlay(
|
||||
const std::vector<MatchedPatternFragment> & fragmentList,
|
||||
SUFFIX_MARKER_TYPE patternSize,
|
||||
double k);
|
||||
|
||||
static SUFFIX_MARKER_TYPE maxSentenceSize;
|
||||
|
||||
private:
|
||||
|
@ -1,10 +1,12 @@
|
||||
#include "concordia/concordia_search_result.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
ConcordiaSearchResult::ConcordiaSearchResult(
|
||||
const std::vector<std::string> & tokenVector):
|
||||
_tokenVector(tokenVector) {
|
||||
_tokenVector(tokenVector),
|
||||
_bestOverlayScore(0) {
|
||||
}
|
||||
|
||||
ConcordiaSearchResult::~ConcordiaSearchResult() {
|
||||
@ -20,3 +22,40 @@ void ConcordiaSearchResult::sortFragments() {
|
||||
_matchedPatternFragments.end(),
|
||||
std::greater<MatchedPatternFragment>());
|
||||
}
|
||||
|
||||
/*!
  Searches the stored matched pattern fragments for the overlay with the
  highest logarithmic overlay score and stores it, together with its
  score, in _bestOverlay and _bestOverlayScore.

  \param patternSize size of the searched pattern, used for scoring
*/
void ConcordiaSearchResult::computeBestOverlay(
                                SUFFIX_MARKER_TYPE patternSize) {
    // Start from a clean state, so that a repeated call (possibly with
    // a different patternSize) is not compared against a stale best score.
    _bestOverlay.clear();
    _bestOverlayScore = 0;

    // Sentinel meaning "no fragment has been added yet": the search
    // starts at index lastAddedPos + 1, which is 0 for this value
    // (by wraparound when SUFFIX_MARKER_TYPE is unsigned).
    const SUFFIX_MARKER_TYPE noFragmentAdded =
                                static_cast<SUFFIX_MARKER_TYPE>(-1);

    // NOTE(review): the search only compares a candidate with the last
    // fragment of the overlay, which assumes the fragments are ordered
    // by their ends, ascending - confirm the callers guarantee this.
    _checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
                           noFragmentAdded,
                           patternSize);
}
|
||||
|
||||
void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||
std::vector<MatchedPatternFragment> currentOverlay,
|
||||
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||
SUFFIX_MARKER_TYPE patternSize) {
|
||||
bool allTerminal = true;
|
||||
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
|
||||
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
|
||||
|
||||
// if fragment does not intersect currentOverlay
|
||||
if (currentOverlay.size() == 0 ||
|
||||
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
|
||||
currentOverlay.push_back(fragment);
|
||||
_checkPossibleOverlays(currentOverlay, i, patternSize);
|
||||
allTerminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (allTerminal) {
|
||||
double score = Utils::getLogarithmicOverlay(currentOverlay,
|
||||
patternSize,
|
||||
1.0);
|
||||
if (score > _bestOverlayScore) {
|
||||
_bestOverlay = currentOverlay;
|
||||
_bestOverlayScore = score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef CONCORDIA_SEARCH_RESULT_HDR
|
||||
#define CONCORDIA_SEARCH_RESULT_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
|
||||
#include <vector>
|
||||
@ -24,6 +25,8 @@ public:
|
||||
|
||||
void sortFragments();
|
||||
|
||||
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
|
||||
|
||||
// Returns a copy of the token vector stored in this search result.
std::vector<std::string> getTokenVector() const {
|
||||
return _tokenVector;
|
||||
}
|
||||
@ -32,10 +35,23 @@ public:
|
||||
return _matchedPatternFragments;
|
||||
}
|
||||
|
||||
// Returns a copy of the best overlay stored in _bestOverlay.
std::vector<MatchedPatternFragment> getBestOverlay() const {
|
||||
return _bestOverlay;
|
||||
}
|
||||
|
||||
private:
|
||||
void _checkPossibleOverlays(
|
||||
std::vector<MatchedPatternFragment> currentOverlay,
|
||||
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||
SUFFIX_MARKER_TYPE patternSize);
|
||||
|
||||
std::vector<std::string> _tokenVector;
|
||||
|
||||
std::vector<MatchedPatternFragment> _matchedPatternFragments;
|
||||
|
||||
std::vector<MatchedPatternFragment> _bestOverlay;
|
||||
|
||||
double _bestOverlayScore;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -29,10 +29,16 @@ public:
|
||||
return _end;
|
||||
}
|
||||
|
||||
private:
|
||||
protected:
|
||||
SUFFIX_MARKER_TYPE _start;
|
||||
|
||||
SUFFIX_MARKER_TYPE _end;
|
||||
};
|
||||
|
||||
struct intervalEndComparator {
|
||||
inline bool operator() (const Interval & lhs, const Interval & rhs) {
|
||||
return (lhs.getEnd() < rhs.getEnd());
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -5,6 +5,8 @@ MatchedPatternFragment::MatchedPatternFragment(
|
||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||
const SUFFIX_MARKER_TYPE & matchedLength):
|
||||
Interval(patternOffset,
|
||||
patternOffset + matchedLength),
|
||||
_exampleId(exampleId),
|
||||
_exampleOffset(exampleOffset),
|
||||
_patternOffset(patternOffset),
|
||||
|
@ -2,13 +2,14 @@
|
||||
#define MATCHED_PATTERN_FRAGMENT_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
|
||||
/*!
|
||||
Class representing matched pattern fragment in concordia search.
|
||||
|
||||
This fragment can be seen as an interval of the pattern.
|
||||
*/
|
||||
|
||||
class MatchedPatternFragment {
|
||||
class MatchedPatternFragment : public Interval {
|
||||
public:
|
||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||
|
@ -204,6 +204,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||
// best overlay: [0,2], [2,3]
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
||||
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
||||
|
||||
/*
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "concordia/tm_matches.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include <boost/foreach.hpp>
|
||||
#include <math.h>
|
||||
|
||||
@ -18,10 +20,12 @@ TmMatches::~TmMatches() {
|
||||
}
|
||||
|
||||
void TmMatches::calculateScore() {
|
||||
double exampleOverlay = _getLogarithmicOverlay(_exampleMatchedRegions,
|
||||
double exampleOverlay =
|
||||
Utils::getLogarithmicOverlay(_exampleMatchedRegions,
|
||||
_exampleSize, 1.0);
|
||||
|
||||
double patternOverlay = _getLogarithmicOverlay(_patternMatchedRegions,
|
||||
double patternOverlay =
|
||||
Utils::getLogarithmicOverlay(_patternMatchedRegions,
|
||||
_patternSize, 2.0);
|
||||
_score = (exampleOverlay + patternOverlay) / 2.0;
|
||||
}
|
||||
@ -64,20 +68,3 @@ bool TmMatches::_alreadyIntersects(
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
double TmMatches::_getLogarithmicOverlay(
|
||||
const std::vector<Interval> & intervalList,
|
||||
SUFFIX_MARKER_TYPE sentenceSize,
|
||||
double k) {
|
||||
double overlayScore = 0;
|
||||
BOOST_FOREACH(Interval interval, intervalList) {
|
||||
double intervalOverlay = static_cast<double>(interval.getLength())
|
||||
/ static_cast<double>(sentenceSize);
|
||||
double significanceFactor = pow(log(interval.getLength()+1)
|
||||
/ log(sentenceSize+1), 1/k);
|
||||
|
||||
overlayScore += intervalOverlay * significanceFactor;
|
||||
}
|
||||
return overlayScore;
|
||||
}
|
||||
|
||||
|
@ -54,10 +54,6 @@ private:
|
||||
bool _alreadyIntersects(const std::vector<Interval> & intervalList,
|
||||
int start, int end);
|
||||
|
||||
double _getLogarithmicOverlay(const std::vector<Interval> & intervalList,
|
||||
SUFFIX_MARKER_TYPE sentenceSize,
|
||||
double k);
|
||||
|
||||
SUFFIX_MARKER_TYPE _exampleId;
|
||||
|
||||
std::vector<Interval> _exampleMatchedRegions;
|
||||
|
Loading…
Reference in New Issue
Block a user