From 8a3883130628706c41fe55cfe808cf6a39f389a4 Mon Sep 17 00:00:00 2001 From: rjawor Date: Fri, 14 Mar 2014 12:05:06 +0100 Subject: [PATCH] logarithmic score Former-commit-id: ec2704b3a206cc39ed42d19620bef6ce0fedbc7e --- concordia/t/test_tm_matches.cpp | 28 ++++++++++++++++++++++++++ concordia/tm_matches.cpp | 35 ++++++++++++++++++++------------- concordia/tm_matches.hpp | 4 ++++ 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/concordia/t/test_tm_matches.cpp b/concordia/t/test_tm_matches.cpp index 5cc3108..484bfcc 100644 --- a/concordia/t/test_tm_matches.cpp +++ b/concordia/t/test_tm_matches.cpp @@ -17,4 +17,32 @@ BOOST_AUTO_TEST_CASE( TmMatchesSimpleScore1 ) BOOST_CHECK_EQUAL(tmMatches.getScore(),0.35); } +BOOST_AUTO_TEST_CASE( TmMatchesLogarithmicScore1 ) +{ + /* No intervals are registered before calculateScore(), so the expected score is exactly 0.0. */ TmMatches tmMatches(0,10,10); + tmMatches.calculateScore(); + + BOOST_CHECK_EQUAL(tmMatches.getScore(),0.0); +} + +BOOST_AUTO_TEST_CASE( TmMatchesLogarithmicScore2 ) +{ + /* One interval (0,10) is registered on both the pattern side and the example side; the expected score is exactly 1.0. NOTE(review): the two 10 arguments are presumably exampleSize and patternSize - confirm against the constructor. */ TmMatches tmMatches(0,10,10); + tmMatches.addPatternInterval(0,10); + tmMatches.addExampleInterval(0,10); + tmMatches.calculateScore(); + + BOOST_CHECK_EQUAL(tmMatches.getScore(),1.0); +} + +BOOST_AUTO_TEST_CASE( TmMatchesLogarithmicScore3 ) +{ + /* Partial match: pattern interval (2,5) and example interval (1,5). The score is compared against 0.2482 with BOOST_CHECK_CLOSE and a tolerance argument of 0.1 (a percentage according to Boost.Test). */ TmMatches tmMatches(0,10,10); + tmMatches.addPatternInterval(2,5); + tmMatches.addExampleInterval(1,5); + tmMatches.calculateScore(); + + BOOST_CHECK_CLOSE(tmMatches.getScore(),0.2482, 0.1); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/tm_matches.cpp b/concordia/tm_matches.cpp index 4b7c8ca..178221f 100644 --- a/concordia/tm_matches.cpp +++ b/concordia/tm_matches.cpp @@ -1,6 +1,6 @@ #include "concordia/tm_matches.hpp" #include - +#include TmMatches::TmMatches(const SUFFIX_MARKER_TYPE exampleId, const unsigned char exampleSize, @@ -14,20 +14,13 @@ TmMatches::~TmMatches() { } void TmMatches::calculateScore() { - /* TODO logarithmic function - unsigned char exampleMatchedLength = 0; - BOOST_FOREACH(Interval & interval, _exampleMatchedRegions) { - exampleMatchedLength += 
interval.getLength(); - } - unsigned char patternMatchedLength = 0; - BOOST_FOREACH(Interval & interval, _patternMatchedRegions) { - patternMatchedLength += interval.getLength(); - } - - _score = (double) (exampleMatchedLength + patternMatchedLength) - / (double) (_exampleSize + _patternSize); - */ + /* The score is the arithmetic mean of two overlays: the example side computed with k = 1.0 and the pattern side computed with k = 2.0. */ double exampleOverlay = _getLogarithmicOverlay(_exampleMatchedRegions, + _exampleSize, 1.0); + + double patternOverlay = _getLogarithmicOverlay(_patternMatchedRegions, + _patternSize, 2.0); + _score = (exampleOverlay + patternOverlay) / 2.0; } void TmMatches::calculateSimpleScore() { @@ -72,3 +65,17 @@ bool TmMatches::_alreadyIntersects( return false; } +double TmMatches::_getLogarithmicOverlay(boost::ptr_vector intervalList, + unsigned char sentenceSize, + double k) { + /* Returns the sum, over every interval in intervalList, of (length / sentenceSize) multiplied by (log(length + 1) / log(sentenceSize + 1)) raised to the power 1 / k; an empty list yields 0. NOTE(review): a sentenceSize of 0 makes both divisors zero, and intervalList is received by value - confirm both are intended. */ double overlayScore = 0; + BOOST_FOREACH(Interval & interval, intervalList) { + double intervalOverlay = (double) interval.getLength() / (double) sentenceSize; + double significanceFactor = pow(log(interval.getLength()+1) / log(sentenceSize+1), 1/k); + + overlayScore += intervalOverlay * significanceFactor; + } + return overlayScore; +} + + diff --git a/concordia/tm_matches.hpp b/concordia/tm_matches.hpp index fee8f6f..12748fb 100644 --- a/concordia/tm_matches.hpp +++ b/concordia/tm_matches.hpp @@ -44,6 +44,10 @@ private: bool _alreadyIntersects(boost::ptr_vector intervalList, int start, int end); + /* Helper called by calculateScore(). sentenceSize is the divisor for each interval length; the logarithm ratio is raised to the power 1 / k. */ double _getLogarithmicOverlay(boost::ptr_vector intervalList, + unsigned char sentenceSize, + double k); + SUFFIX_MARKER_TYPE _exampleId; boost::ptr_vector _exampleMatchedRegions;