diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index afaa27f..95c8456 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -174,17 +174,18 @@ int main(int argc, char** argv) { std::cout << "\tSearching for pattern: \"" << pattern << "\"" << std::endl; time_start = boost::posix_time::microsec_clock::local_time(); - std::vector result = + MatchedPatternFragment result = concordia.simpleSearch(pattern); time_end = boost::posix_time::microsec_clock::local_time(); msdiff = time_end - time_start; - std::cout << "\tFound: " << result.size() << " matches. " - << "Search took: " << - msdiff.total_milliseconds() << "ms." << std::endl; + std::cout << "\tFound: " << result.getOccurences().size() + << " matches. " << "Search took: " + << msdiff.total_milliseconds() << "ms." << std::endl; if (!cli.count("silent")) { - BOOST_FOREACH(MatchedPatternFragment occurence, result) { + BOOST_FOREACH(SubstringOccurence occurence, + result.getOccurences()) { std::cout << "\t\tfound match in sentence number: " - << occurence.getExampleId() << std::endl; + << occurence.getId() << std::endl; } } } else if (cli.count("anubis-search")) { @@ -234,10 +235,9 @@ int main(int argc, char** argv) { result->getBestOverlay()) { std::cout << "\t\tfragment [" << fragment.getStart() << "," << fragment.getEnd() - << "] (exampleId, exampleOffset," + << "] (exampleCount," << " patternOffset, length): " - << fragment.getExampleId() << "," - << fragment.getExampleOffset() << "," + << fragment.getOccurences().size() << "," << fragment.getPatternOffset() << "," << fragment.getMatchedLength() << std::endl; @@ -248,10 +248,9 @@ int main(int argc, char** argv) { result->getFragments()) { std::cout << "\t\tfragment [" << fragment.getStart() << "," << fragment.getEnd() - << "] (exampleId, exampleOffset," + << "] (exampleCount," << " patternOffset, length): " - << fragment.getExampleId() << "," - << fragment.getExampleOffset() << "," + << fragment.getOccurences().size() << "," << fragment.getPatternOffset() << "," << fragment.getMatchedLength() << std::endl; diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index 4f7bba5..e59b33c 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. -#define CONCORDIA_SEARCH_MAX_RESULTS 3 +#define CONCORDIA_SEARCH_MAX_RESULTS 5 #define WORD_MAP_FILE_NAME "word_map.bin" #define MARKERS_FILE_NAME "markers.bin" diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 6c1949f..e262f97 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -69,17 +69,19 @@ std::vector Concordia::tokenizeAll( bool generateCodes) throw(ConcordiaException) { std::vector result; - + if (generateCodes) { BOOST_FOREACH(std::string sentence, sentences) { - result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); + result.push_back(_hashGenerator->generateHash(sentence, + byWhitespace)); } _hashGenerator->serializeWordMap(); } else { BOOST_FOREACH(std::string sentence, sentences) { - result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace)); - } + result.push_back(_hashGenerator->generateTokens(sentence, + byWhitespace)); + } } return result; } @@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern) } -std::vector Concordia::simpleSearch( - const std::string & pattern) +MatchedPatternFragment Concordia::simpleSearch( + const std::string & pattern) throw(ConcordiaException) { if (_T->size() > 0) { return _searcher->simpleSearch(_hashGenerator, _T, _markers, _SA, pattern); } else { - std::vector result; + MatchedPatternFragment result(0, 0); return result; } } @@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() { std::string Concordia::_getMarkersFilePath() { return _indexPath+"/"+MARKERS_FILE_NAME; } - diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 5b0a8b3..dcf436d 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -126,12 +126,11 @@ public: /*! Performs a simple substring lookup on the index. For more info see \ref tutorial1_2. \param pattern pattern to be searched in the index - \returns vector of matched results + \returns matched pattern fragment containing vector of occurences \throws ConcordiaException */ - std::vector simpleSearch( - const std::string & pattern) - throw(ConcordiaException); + MatchedPatternFragment simpleSearch(const std::string & pattern) + throw(ConcordiaException); SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) throw(ConcordiaException); @@ -160,7 +159,7 @@ public: /*! Loads HDD stored index files to RAM and generates suffix array based on RAM stored data structures. - For more info see \ref tutorial2. + For more info see \ref tutorial2. \throws ConcordiaException */ void loadRAMIndexFromDisk() throw(ConcordiaException); diff --git a/concordia/concordia_search_result.hpp b/concordia/concordia_search_result.hpp index da4c751..8e133e5 100644 --- a/concordia/concordia_search_result.hpp +++ b/concordia/concordia_search_result.hpp @@ -16,7 +16,7 @@ - list of longest matched fragments sorted in descending order by length - the best overlay - the score of the best overlay. - + For more info about concordia searching see \ref tutorial1_3. */ @@ -75,6 +75,24 @@ public: return _bestOverlayScore; } + friend std::ostream & operator << (std::ostream & o, + const ConcordiaSearchResult & result) { + o << "Best overlay {" << std::endl; + BOOST_FOREACH(MatchedPatternFragment fragment, + result.getBestOverlay()) { + o << fragment << std::endl; + } + o << "}" << std::endl; + o << "All fragments {" << std::endl; + BOOST_FOREACH(MatchedPatternFragment fragment, + result.getFragments()) { + o << fragment << std::endl; + } + o << "}"; + return o; + } + + private: void _checkPossibleOverlays( std::vector currentOverlay, diff --git a/concordia/concordia_searcher.cpp b/concordia/concordia_searcher.cpp index 5f69e44..8dafa54 100644 --- a/concordia/concordia_searcher.cpp +++ b/concordia/concordia_searcher.cpp @@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch( std::vector occurences = lcpSearch(T, markers, SA, currentPattern, lcpLength); - BOOST_FOREACH(SubstringOccurence occurence, occurences) { - result->addFragment(MatchedPatternFragment( - occurence.getId(), - occurence.getOffset(), - offset, - lcpLength / sizeof(INDEX_CHARACTER_TYPE))); + if (occurences.size() > 0) { + MatchedPatternFragment fragment(offset, + lcpLength / sizeof(INDEX_CHARACTER_TYPE)); + + BOOST_FOREACH(SubstringOccurence occurence, occurences) { + fragment.addOccurence(occurence); + } + result->addFragment(fragment); } } diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 7653a20..6749082 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() { IndexSearcher::~IndexSearcher() { } -std::vector IndexSearcher::simpleSearch( +MatchedPatternFragment IndexSearcher::simpleSearch( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, const std::string & pattern) throw(ConcordiaException) { - std::vector result; - int left; std::vector hash = hashGenerator->generateHash(pattern).getCodes(); @@ -30,6 +28,7 @@ std::vector IndexSearcher::simpleSearch( int size = sa_search(T->data(), (saidx_t) T->size(), (const sauchar_t *) patternArray, patternLength, SA->data(), (saidx_t) SA->size(), &left); + MatchedPatternFragment result(0, hash.size()); for (int i = 0; i < size; ++i) { saidx_t resultPos = SA->at(left + i); if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { @@ -40,12 +39,11 @@ std::vector IndexSearcher::simpleSearch( // removes these accidental results. saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); - result.push_back(MatchedPatternFragment( - Utils::getIdFromMarker(marker), - Utils::getOffsetFromMarker(marker), - 0, - hash.size())); - if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) { + + SubstringOccurence occurence; + occurence.enterDataFromMarker(marker); + result.addOccurence(occurence); + if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) { break; } } diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index fbf64f4..70cc95a 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -18,7 +18,7 @@ /*! Class for searching the index with a sentence. In all searches the sentence is first hashed and then used as a query. - + IndexSearcher performs the simpleSearch on its own, but uses a ConcordiaSearcher object to carry out concordiaSearch. @@ -42,10 +42,10 @@ public: \param markers markers array for the needs of searching \param SA suffix array for the needs of searching \param pattern string pattern to be searched in the index. - \returns vector of occurences of the pattern in the index + \returns matched pattern fragment, containing occurences of the pattern in the index \throws ConcordiaException */ - std::vector simpleSearch( + MatchedPatternFragment simpleSearch( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, diff --git a/concordia/matched_pattern_fragment.cpp b/concordia/matched_pattern_fragment.cpp index d925448..a120a80 100644 --- a/concordia/matched_pattern_fragment.cpp +++ b/concordia/matched_pattern_fragment.cpp @@ -1,14 +1,10 @@ #include "concordia/matched_pattern_fragment.hpp" MatchedPatternFragment::MatchedPatternFragment( - const SUFFIX_MARKER_TYPE & exampleId, - const SUFFIX_MARKER_TYPE & exampleOffset, const SUFFIX_MARKER_TYPE & patternOffset, const SUFFIX_MARKER_TYPE & matchedLength): Interval(patternOffset, patternOffset + matchedLength), - _exampleId(exampleId), - _exampleOffset(exampleOffset), _patternOffset(patternOffset), _matchedLength(matchedLength) { } @@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment( MatchedPatternFragment::~MatchedPatternFragment() { } +void MatchedPatternFragment::addOccurence( + const SubstringOccurence & occurence) { + _occurences.push_back(occurence); +} diff --git a/concordia/matched_pattern_fragment.hpp b/concordia/matched_pattern_fragment.hpp index 01d3ac9..bb90bfa 100644 --- a/concordia/matched_pattern_fragment.hpp +++ b/concordia/matched_pattern_fragment.hpp @@ -3,11 +3,15 @@ #include "concordia/common/config.hpp" #include "concordia/interval.hpp" +#include "concordia/substring_occurence.hpp" +#include +#include +#include /*! Class representing matched pattern fragment in concordia search. This fragment can be seen as a word interval of the pattern. - + This class holds information about: - where the pattern fragment was matched (example id and example offset) - where the fragment is located within the pattern @@ -17,32 +21,26 @@ class MatchedPatternFragment : public Interval { public: /*! Constructor. - \param exampleId id of the example where the pattern fragment was matched - \param exampleOffset offset of the matched fragment in the example \param patternOffset offset of the matched fragment in the pattern \param matchedLength length of the matched pattern */ - MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId, - const SUFFIX_MARKER_TYPE & exampleOffset, - const SUFFIX_MARKER_TYPE & patternOffset, + MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset, const SUFFIX_MARKER_TYPE & matchedLength); /*! Destructor. */ virtual ~MatchedPatternFragment(); - /*! Getter for example id. - \returns example id + /*! Getter for occurences. + \returns occurences */ - SUFFIX_MARKER_TYPE getExampleId() const { - return _exampleId; + std::vector getOccurences() const { + return _occurences; } - /*! Getter for example offset. - \returns example offset + /*! Adds an occurence to the list. + \param fragment occurence to be added */ - SUFFIX_MARKER_TYPE getExampleOffset() const { - return _exampleOffset; - } + void addOccurence(const SubstringOccurence & occurence); /*! Getter for pattern offset. \returns pattern offset @@ -65,10 +63,22 @@ public: return (_matchedLength > other.getMatchedLength()); } -private: - SUFFIX_MARKER_TYPE _exampleId; + friend std::ostream & operator << (std::ostream & o, + const MatchedPatternFragment & fragment) { + o << "fragment(patternOffset=" << fragment.getPatternOffset() + << ", matchedLength=" << fragment.getMatchedLength() << ") {" + << std::endl; + BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) { + o << "\t" << occurence << std::endl; + } - SUFFIX_MARKER_TYPE _exampleOffset; + o << "}"; + return o; + } + + +private: + std::vector _occurences; SUFFIX_MARKER_TYPE _patternOffset; diff --git a/concordia/substring_occurence.hpp b/concordia/substring_occurence.hpp index 74da8c6..b4de964 100644 --- a/concordia/substring_occurence.hpp +++ b/concordia/substring_occurence.hpp @@ -3,6 +3,7 @@ #include "concordia/common/config.hpp" #include +#include /*! Class representing occurence of a searched substring. @@ -65,6 +66,13 @@ public: */ void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker); + friend std::ostream & operator << (std::ostream & o, + const SubstringOccurence & occurence) { + return o << "occurence(exampleId=" << occurence.getId() + << ", offset=" << occurence.getOffset() << ")"; + } + + private: SUFFIX_MARKER_TYPE _id; diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 75c98a4..cd524b9 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -40,53 +40,51 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); - + concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Marysia posiada rysia",123)); concordia.refreshSAfromRAM(); - - /*The test index contains 3 sentences: + + /*The test index contains 3 sentences: 14: "Ala posiada kota" 51: "Ala posiada rysia" 123: "Marysia posiada rysia" - + Test word map: Ala -> 0 posiada -> 1 kota -> 2 rysia -> 3 Marysia -> 4 - + Test hashed index: n: 0 1 2 3 4 5 6 7 8 9 10 11 T[n]: 0 1 2 | 0 1 3 | 4 1 3 | - + Test suffix array: n: 0 1 2 3 4 5 6 7 8 9 10 11 - SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 - + SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + */ - - std::vector searchResult1 = concordia.simpleSearch("posiada rysia"); - std::vector searchResult2 = concordia.simpleSearch("posiada kota Ala"); + + MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia"); + MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala"); concordia.clearIndex(); - BOOST_CHECK_EQUAL(searchResult1.size(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51); - BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); - + BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1); + // Checking pattern spanning over 2 segments - BOOST_CHECK_EQUAL(searchResult2.size(), 0); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0); } BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) { - // modified stop words to avoid anonymization + // modified stop words to avoid anonymization Concordia concordia = Concordia(TestResourcesManager::getTempPath(), TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); std::vector testExamples; @@ -106,12 +104,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1); BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest"); - /*The test index contains 4 sentences: + /*The test index contains 4 sentences: 312: "xto xjest okno" 202: "czy xjest okno otwarte" 45: "chyba xto xjest xtutaj" 29: "xto xjest" - + Test word map: xto -> 0 xjest -> 1 @@ -120,42 +118,37 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) otwarte -> 4 chyba -> 5 xtutaj -> 6 - + Test hashed index: n: 0 1 2 3 4 5 6 7 8 9 10 11 12 T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1 - + Test suffix array: n: 0 1 2 3 4 5 6 7 8 9 10 11 12 SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10 - + */ - + Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - std::vector searchResult1 = concordia2.simpleSearch("xto xjest"); - std::vector searchResult2 = concordia2.simpleSearch("xjest okno"); + MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest"); + MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno"); concordia2.clearIndex(); - BOOST_CHECK_EQUAL(searchResult1.size(), 3); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0); - BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45); - BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29); - BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0); - BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0); - BOOST_CHECK_EQUAL(searchResult2.size(), 2); - BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202); - BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2); - BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312); - BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312); + BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1); } BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) @@ -166,17 +159,16 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312)); testExamples.push_back(Example("czy xjest żółte otwarte",202)); concordia.addAllExamples(testExamples); - + Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - std::vector searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); + MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); concordia2.clearIndex(); - BOOST_CHECK_EQUAL(searchResult1.size(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); - BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312); + BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2); } BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) @@ -187,15 +179,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Marysia posiada rysia",123)); concordia.refreshSAfromRAM(); - - /*The test index contains 3 sentences: + + /*The test index contains 3 sentences: 14: "Ala posiada kota" 51: "Ala posiada rysia" 123: "Marysia posiada rysia" */ // the below expectations assume 0.3 anubis threshold - + std::vector searchResult1 = concordia.anubisSearch("posiada rysia chyba"); BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51); @@ -235,9 +227,9 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) concordia.addExample(Example("Gosia chyba posiada rysia też",167)); concordia.addExample(Example("Ania od wczoraj posiada rysia",45)); concordia.refreshSAfromRAM(); - + boost::shared_ptr searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); - // best overlay: [0,2], [2,3], score = 0.829 + // best overlay: [0,2], [2,3], score = 0.829 BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1); @@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3); - BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7); /* - addFragment 45,2,0,2 - addFragment 51,1,0,2 - addFragment 123,1,0,2 - addFragment 45,3,1,1 - addFragment 51,2,1,1 - addFragment 123,2,1,1 - addFragment 167,1,2,1 + adding fragment: offset=0, length=2 + adding occurence: example id=167, offset=2 + adding occurence: example id=45, offset=3 + adding occurence: example id=51, offset=1 + adding occurence: example id=123, offset=1 + adding fragment: offset=1, length=1 + adding occurence: example id=167, offset=3 + adding occurence: example id=45, offset=4 + adding occurence: example id=51, offset=2 + adding occurence: example id=123, offset=2 + adding fragment: offset=2, length=1 + adding occurence: example id=167, offset=1 */ - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1); - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1); - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1); - - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1); concordia.clearIndex(); } @@ -322,9 +311,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) concordia.addTokenizedExample(ts, 14); concordia.refreshSAfromRAM(); - + boost::shared_ptr searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); - // best overlay: + // best overlay: + + // std::cout << *searchResult1 << std::endl; BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1); @@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2 Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2 Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1 + + Best overlay { + fragment(patternOffset=1, matchedLength=4) { + occurence(exampleId=321, offset=0) + } + fragment(patternOffset=5, matchedLength=4) { + occurence(exampleId=14, offset=7) + } + } + All fragments { + fragment(patternOffset=4, matchedLength=5) { + occurence(exampleId=14, offset=6) + } + fragment(patternOffset=1, matchedLength=4) { + occurence(exampleId=321, offset=0) + } + fragment(patternOffset=5, matchedLength=4) { + occurence(exampleId=14, offset=7) + } + fragment(patternOffset=2, matchedLength=3) { + occurence(exampleId=321, offset=1) + } + fragment(patternOffset=6, matchedLength=3) { + occurence(exampleId=14, offset=8) + } + fragment(patternOffset=3, matchedLength=2) { + occurence(exampleId=321, offset=2) + } + fragment(patternOffset=7, matchedLength=2) { + occurence(exampleId=14, offset=9) + } + fragment(patternOffset=8, matchedLength=1) { + occurence(exampleId=14, offset=10) + } + } + */ - - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14); - BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6); + + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14); + BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); @@ -373,13 +400,13 @@ BOOST_AUTO_TEST_CASE( Tokenize ) BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); - + std::vector sentences; sentences.push_back("Marysia, ma rysia;"); sentences.push_back("Testing complete;"); sentences.push_back("This, is (a) weird;! sentence <>"); std::vector tokenizedSentences = concordia.tokenizeAll(sentences); - + concordia.clearIndex(); @@ -387,7 +414,7 @@ BOOST_AUTO_TEST_CASE( Tokenize ) BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3); BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2); BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5); - + } BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) @@ -400,30 +427,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) concordia.addExample(Example("Marysia posiada rysia",123)); concordia.addExample(Example("Ala posiada kota i psa",542)); concordia.refreshSAfromRAM(); - - /*The test index contains 3 sentences: + + /*The test index contains 3 sentences: 14: "Ala posiada kota" 51: "Ala posiada rysia" 123: "Marysia posiada rysia" - + Test word map: Ala -> 0 posiada -> 1 kota -> 2 rysia -> 3 Marysia -> 4 - + Test hashed index: n: 0 1 2 3 4 5 6 7 8 9 10 11 T[n]: 0 1 2 | 0 1 3 | 4 1 3 | - + Test suffix array: n: 0 1 2 3 4 5 6 7 8 9 10 11 - SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 - + SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + */ - - + + BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1); @@ -446,7 +473,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace ) BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); - + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); @@ -455,7 +482,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace ) BOOST_CHECK_EQUAL(ts.getCodes().size(), 7); concordia.clearIndex(); - + } BOOST_AUTO_TEST_CASE( TokenizeOnly ) @@ -469,7 +496,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly ) BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); - + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); @@ -478,7 +505,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly ) BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens concordia.clearIndex(); - + } BOOST_AUTO_TEST_SUITE_END() diff --git a/scripts/concordia_json.zip b/scripts/concordia_json.zip new file mode 100644 index 0000000..647c4a2 Binary files /dev/null and b/scripts/concordia_json.zip differ diff --git a/scripts/responseExplained.txt b/scripts/responseExplained.txt new file mode 100644 index 0000000..dcea0f3 --- /dev/null +++ b/scripts/responseExplained.txt @@ -0,0 +1,22 @@ +{ + "status": "success", //status operacji + "result": { + "bestOverlayScore" : // Concordia podaje score znalezionego przez siebie dopasowania + "bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń + { // jeden fragment + "matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego + "matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty) + "occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego + "id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe) + "matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu + "matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive) + "sourceSegment": , // pełny tekst zdania źródłowego przykładu + "targetSegment": , // pełny tekst zdania docelowego przykładu + "targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego + [257, 264] + ] + }, .... // mogą być jeszcze kolejne fragmenty + + ] + } +} diff --git a/scripts/sampleRequest.json b/scripts/sampleRequest.json new file mode 100644 index 0000000..3a8dd5e --- /dev/null +++ b/scripts/sampleRequest.json @@ -0,0 +1,5 @@ +{ + "operation": "concordiaSearch", + "pattern":"Ala ma kota", + "tmId":1 +} diff --git a/scripts/sampleResponse.json b/scripts/sampleResponse.json new file mode 100644 index 0000000..b832c7f --- /dev/null +++ b/scripts/sampleResponse.json @@ -0,0 +1,43 @@ +{ + "status": "success", + "result": { + "bestOverlayScore": 0.5, + "bestOverlay": [{ + "matchedPatternStart": 0, + "matchedPatternEnd": 8, + "occurences": [{ + "id": 1782145, + "matchedExampleStart": 305, + "matchedExampleEnd": 314, + "sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;", + "targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '", + "targetFragments": [ + [257, 264] + ] + }, { + "id": 1782145, + "matchedExampleStart": 326, + "matchedExampleEnd": 335, + "sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;", + "targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '", + "targetFragments": [ + [300, 315] + ] + }] + }, { + "matchedPatternStart": 9, + "matchedPatternEnd": 47, + "occurences": [{ + "id": 1623941, + "matchedExampleStart": 54, + "matchedExampleEnd": 93, + "sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .", + "targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .", + "targetFragments": [ + [51, 85], + [96, 99] + ] + }] + }] + } +}