best overlay computation
Former-commit-id: 986f3d6b611fd276a7b26073daa0094caf078d1e
This commit is contained in:
parent
9b97ff2fa9
commit
7549703414
@ -6,21 +6,19 @@ getPossibleCoverages
|
|||||||
param A - set of concordia results, current coverage
|
param A - set of concordia results, current coverage
|
||||||
return isTerminal - returns true if nothing from S can be added to A
|
return isTerminal - returns true if nothing from S can be added to A
|
||||||
*/
|
*/
|
||||||
boolean getPossibleCoverages(A) {
|
void getPossibleCoverages(A) {
|
||||||
allTerminal = true
|
allTerminal = true
|
||||||
for s in S: // to consider - sort intervals in S and always search from the last interval in A
|
for s in S: // to consider - sort intervals in S and always search from the last interval in A
|
||||||
// however - how to sort the intervals? maybe by their ends?
|
// however - how to sort the intervals? maybe by their ends?
|
||||||
if not A intersects {s} // given the above, this check would only require to check if s overlaps with the last interval in A
|
if not A intersects {s} // given the above, this check would only require to check if s overlaps with the last interval in A
|
||||||
allTerminal = allTerminal and getPossibleCoverages(A+{s})
|
getPossibleCoverages(A+{s})
|
||||||
|
allTerminal = false
|
||||||
|
|
||||||
if allTerminal then
|
if allTerminal then
|
||||||
score = scoreCoverage(A)
|
score = scoreCoverage(A)
|
||||||
if score > scoreCoverage(maxCoverage)
|
if score > scoreCoverage(maxCoverage)
|
||||||
maxCoverage = A
|
maxCoverage = A
|
||||||
|
|
||||||
return true
|
|
||||||
else
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
5
TODO.txt
5
TODO.txt
@ -1,7 +1,6 @@
|
|||||||
- wyłączyć stopWords
|
- wyłączyć stopWords
|
||||||
- concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||||
- testy zużycia pamięci
|
- testy zużycia pamięci
|
||||||
- Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
|
||||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||||
- Multi-threading?
|
- Multi-threading?
|
||||||
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
||||||
@ -12,6 +11,8 @@
|
|||||||
|
|
||||||
---------------------------- Archive -----------------------------
|
---------------------------- Archive -----------------------------
|
||||||
|
|
||||||
|
DONE - Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||||
|
|
||||||
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
|
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
|
||||||
|
|
||||||
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||||
|
@ -122,18 +122,34 @@ int main(int argc, char** argv) {
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
|
||||||
std::cout << "\tFound: " << result->getFragments().size()
|
std::cout << "\tFound: " << result->getFragments().size()
|
||||||
<< " matches. " << "Search took: " <<
|
<< " fragments. " << "Search took: " <<
|
||||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
if (!cli.count("silent")) {
|
if (!cli.count("silent")) {
|
||||||
|
std::cout << "\tBest overlay: " << std::endl;
|
||||||
BOOST_FOREACH(MatchedPatternFragment fragment,
|
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||||
result->getFragments()) {
|
result->getBestOverlay()) {
|
||||||
std::cout << "\t\tfound matching fragment "
|
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||||
<< "(exampleId, exampleOffset,"
|
<< "," << fragment.getEnd()
|
||||||
|
<< "] (exampleId, exampleOffset,"
|
||||||
<< " patternOffset, length): "
|
<< " patternOffset, length): "
|
||||||
<< fragment.getExampleId() << ","
|
<< fragment.getExampleId() << ","
|
||||||
<< fragment.getExampleOffset() << ","
|
<< fragment.getExampleOffset() << ","
|
||||||
<< fragment.getPatternOffset() << ","
|
<< fragment.getPatternOffset() << ","
|
||||||
<< fragment.getMatchedLength() << ","
|
<< fragment.getMatchedLength()
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "\tAll pattern fragments: " << std::endl;
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||||
|
result->getFragments()) {
|
||||||
|
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||||
|
<< "," << fragment.getEnd()
|
||||||
|
<< "] (exampleId, exampleOffset,"
|
||||||
|
<< " patternOffset, length): "
|
||||||
|
<< fragment.getExampleId() << ","
|
||||||
|
<< fragment.getExampleOffset() << ","
|
||||||
|
<< fragment.getPatternOffset() << ","
|
||||||
|
<< fragment.getMatchedLength()
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,9 @@ void AnubisSearcher::concordiaSearch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute best overlay of the pattern by matched fragments
|
||||||
|
result->computeBestOverlay(pattern.size());
|
||||||
|
|
||||||
result->sortFragments();
|
result->sortFragments();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,6 +114,37 @@ SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double Utils::getLogarithmicOverlay(const std::vector<Interval> & intervalList,
|
||||||
|
SUFFIX_MARKER_TYPE sentenceSize,
|
||||||
|
double k) {
|
||||||
|
double overlayScore = 0;
|
||||||
|
BOOST_FOREACH(Interval interval, intervalList) {
|
||||||
|
double intervalOverlay = static_cast<double>(interval.getLength())
|
||||||
|
/ static_cast<double>(sentenceSize);
|
||||||
|
double significanceFactor = pow(log(interval.getLength()+1)
|
||||||
|
/ log(sentenceSize+1), 1/k);
|
||||||
|
|
||||||
|
overlayScore += intervalOverlay * significanceFactor;
|
||||||
|
}
|
||||||
|
return overlayScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
double Utils::getLogarithmicOverlay(
|
||||||
|
const std::vector<MatchedPatternFragment> & fragmentList,
|
||||||
|
SUFFIX_MARKER_TYPE patternSize,
|
||||||
|
double k) {
|
||||||
|
double overlayScore = 0;
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment fragment, fragmentList) {
|
||||||
|
double intervalOverlay = static_cast<double>(fragment.getLength())
|
||||||
|
/ static_cast<double>(patternSize);
|
||||||
|
double significanceFactor = pow(log(fragment.getLength()+1)
|
||||||
|
/ log(patternSize+1), 1/k);
|
||||||
|
|
||||||
|
overlayScore += intervalOverlay * significanceFactor;
|
||||||
|
}
|
||||||
|
return overlayScore;
|
||||||
|
}
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
|
SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
|
||||||
pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);
|
pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);
|
||||||
|
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include "concordia/interval.hpp"
|
||||||
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
class Utils {
|
class Utils {
|
||||||
@ -56,6 +58,16 @@ public:
|
|||||||
SUFFIX_MARKER_TYPE offset,
|
SUFFIX_MARKER_TYPE offset,
|
||||||
SUFFIX_MARKER_TYPE length);
|
SUFFIX_MARKER_TYPE length);
|
||||||
|
|
||||||
|
static double getLogarithmicOverlay(
|
||||||
|
const std::vector<Interval> & intervalList,
|
||||||
|
SUFFIX_MARKER_TYPE sentenceSize,
|
||||||
|
double k);
|
||||||
|
|
||||||
|
static double getLogarithmicOverlay(
|
||||||
|
const std::vector<MatchedPatternFragment> & fragmentList,
|
||||||
|
SUFFIX_MARKER_TYPE patternSize,
|
||||||
|
double k);
|
||||||
|
|
||||||
static SUFFIX_MARKER_TYPE maxSentenceSize;
|
static SUFFIX_MARKER_TYPE maxSentenceSize;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
#include "concordia/concordia_search_result.hpp"
|
#include "concordia/concordia_search_result.hpp"
|
||||||
|
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
ConcordiaSearchResult::ConcordiaSearchResult(
|
ConcordiaSearchResult::ConcordiaSearchResult(
|
||||||
const std::vector<std::string> & tokenVector):
|
const std::vector<std::string> & tokenVector):
|
||||||
_tokenVector(tokenVector) {
|
_tokenVector(tokenVector),
|
||||||
|
_bestOverlayScore(0) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcordiaSearchResult::~ConcordiaSearchResult() {
|
ConcordiaSearchResult::~ConcordiaSearchResult() {
|
||||||
@ -20,3 +22,40 @@ void ConcordiaSearchResult::sortFragments() {
|
|||||||
_matchedPatternFragments.end(),
|
_matchedPatternFragments.end(),
|
||||||
std::greater<MatchedPatternFragment>());
|
std::greater<MatchedPatternFragment>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ConcordiaSearchResult::computeBestOverlay(
|
||||||
|
SUFFIX_MARKER_TYPE patternSize) {
|
||||||
|
// the fragments are already sorted by their ends, ascending
|
||||||
|
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
|
||||||
|
-1,
|
||||||
|
patternSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||||
|
std::vector<MatchedPatternFragment> currentOverlay,
|
||||||
|
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||||
|
SUFFIX_MARKER_TYPE patternSize) {
|
||||||
|
bool allTerminal = true;
|
||||||
|
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
|
||||||
|
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
|
||||||
|
|
||||||
|
// if fragment does not intersect currentOverlay
|
||||||
|
if (currentOverlay.size() == 0 ||
|
||||||
|
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
|
||||||
|
currentOverlay.push_back(fragment);
|
||||||
|
_checkPossibleOverlays(currentOverlay, i, patternSize);
|
||||||
|
allTerminal = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (allTerminal) {
|
||||||
|
double score = Utils::getLogarithmicOverlay(currentOverlay,
|
||||||
|
patternSize,
|
||||||
|
1.0);
|
||||||
|
if (score > _bestOverlayScore) {
|
||||||
|
_bestOverlay = currentOverlay;
|
||||||
|
_bestOverlayScore = score;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#ifndef CONCORDIA_SEARCH_RESULT_HDR
|
#ifndef CONCORDIA_SEARCH_RESULT_HDR
|
||||||
#define CONCORDIA_SEARCH_RESULT_HDR
|
#define CONCORDIA_SEARCH_RESULT_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/matched_pattern_fragment.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -24,6 +25,8 @@ public:
|
|||||||
|
|
||||||
void sortFragments();
|
void sortFragments();
|
||||||
|
|
||||||
|
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
|
||||||
|
|
||||||
std::vector<std::string> getTokenVector() const {
|
std::vector<std::string> getTokenVector() const {
|
||||||
return _tokenVector;
|
return _tokenVector;
|
||||||
}
|
}
|
||||||
@ -32,10 +35,23 @@ public:
|
|||||||
return _matchedPatternFragments;
|
return _matchedPatternFragments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<MatchedPatternFragment> getBestOverlay() const {
|
||||||
|
return _bestOverlay;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void _checkPossibleOverlays(
|
||||||
|
std::vector<MatchedPatternFragment> currentOverlay,
|
||||||
|
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||||
|
SUFFIX_MARKER_TYPE patternSize);
|
||||||
|
|
||||||
std::vector<std::string> _tokenVector;
|
std::vector<std::string> _tokenVector;
|
||||||
|
|
||||||
std::vector<MatchedPatternFragment> _matchedPatternFragments;
|
std::vector<MatchedPatternFragment> _matchedPatternFragments;
|
||||||
|
|
||||||
|
std::vector<MatchedPatternFragment> _bestOverlay;
|
||||||
|
|
||||||
|
double _bestOverlayScore;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -29,10 +29,16 @@ public:
|
|||||||
return _end;
|
return _end;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
protected:
|
||||||
SUFFIX_MARKER_TYPE _start;
|
SUFFIX_MARKER_TYPE _start;
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _end;
|
SUFFIX_MARKER_TYPE _end;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct intervalEndComparator {
|
||||||
|
inline bool operator() (const Interval & lhs, const Interval & rhs) {
|
||||||
|
return (lhs.getEnd() < rhs.getEnd());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -5,6 +5,8 @@ MatchedPatternFragment::MatchedPatternFragment(
|
|||||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||||
const SUFFIX_MARKER_TYPE & matchedLength):
|
const SUFFIX_MARKER_TYPE & matchedLength):
|
||||||
|
Interval(patternOffset,
|
||||||
|
patternOffset + matchedLength),
|
||||||
_exampleId(exampleId),
|
_exampleId(exampleId),
|
||||||
_exampleOffset(exampleOffset),
|
_exampleOffset(exampleOffset),
|
||||||
_patternOffset(patternOffset),
|
_patternOffset(patternOffset),
|
||||||
|
@ -2,13 +2,14 @@
|
|||||||
#define MATCHED_PATTERN_FRAGMENT_HDR
|
#define MATCHED_PATTERN_FRAGMENT_HDR
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/interval.hpp"
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing matched pattern fragment in concordia search.
|
Class representing matched pattern fragment in concordia search.
|
||||||
|
This fragment can be seen as an interval of the pattern.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class MatchedPatternFragment {
|
class MatchedPatternFragment : public Interval {
|
||||||
public:
|
public:
|
||||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||||
|
@ -204,6 +204,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||||
|
// expected best overlay: two fragments covering pattern intervals [0,2] and [2,3]
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
||||||
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
|
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
@ -18,10 +20,12 @@ TmMatches::~TmMatches() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void TmMatches::calculateScore() {
|
void TmMatches::calculateScore() {
|
||||||
double exampleOverlay = _getLogarithmicOverlay(_exampleMatchedRegions,
|
double exampleOverlay =
|
||||||
|
Utils::getLogarithmicOverlay(_exampleMatchedRegions,
|
||||||
_exampleSize, 1.0);
|
_exampleSize, 1.0);
|
||||||
|
|
||||||
double patternOverlay = _getLogarithmicOverlay(_patternMatchedRegions,
|
double patternOverlay =
|
||||||
|
Utils::getLogarithmicOverlay(_patternMatchedRegions,
|
||||||
_patternSize, 2.0);
|
_patternSize, 2.0);
|
||||||
_score = (exampleOverlay + patternOverlay) / 2.0;
|
_score = (exampleOverlay + patternOverlay) / 2.0;
|
||||||
}
|
}
|
||||||
@ -64,20 +68,3 @@ bool TmMatches::_alreadyIntersects(
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
double TmMatches::_getLogarithmicOverlay(
|
|
||||||
const std::vector<Interval> & intervalList,
|
|
||||||
SUFFIX_MARKER_TYPE sentenceSize,
|
|
||||||
double k) {
|
|
||||||
double overlayScore = 0;
|
|
||||||
BOOST_FOREACH(Interval interval, intervalList) {
|
|
||||||
double intervalOverlay = static_cast<double>(interval.getLength())
|
|
||||||
/ static_cast<double>(sentenceSize);
|
|
||||||
double significanceFactor = pow(log(interval.getLength()+1)
|
|
||||||
/ log(sentenceSize+1), 1/k);
|
|
||||||
|
|
||||||
overlayScore += intervalOverlay * significanceFactor;
|
|
||||||
}
|
|
||||||
return overlayScore;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
@ -54,10 +54,6 @@ private:
|
|||||||
bool _alreadyIntersects(const std::vector<Interval> & intervalList,
|
bool _alreadyIntersects(const std::vector<Interval> & intervalList,
|
||||||
int start, int end);
|
int start, int end);
|
||||||
|
|
||||||
double _getLogarithmicOverlay(const std::vector<Interval> & intervalList,
|
|
||||||
SUFFIX_MARKER_TYPE sentenceSize,
|
|
||||||
double k);
|
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _exampleId;
|
SUFFIX_MARKER_TYPE _exampleId;
|
||||||
|
|
||||||
std::vector<Interval> _exampleMatchedRegions;
|
std::vector<Interval> _exampleMatchedRegions;
|
||||||
|
Loading…
Reference in New Issue
Block a user