diff --git a/TODO.txt b/TODO.txt index 45e5880..810d6cd 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,5 +1,6 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- -- document the code (classes, cfg files) and update tutorial +- multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results. +IN PROGRESS - document the code (classes, cfg files) and update tutorial - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. - testy zużycia pamięci - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 3c6d4b6..d92b167 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -42,6 +42,17 @@ std::string _createLibraryVersion() { return version.str(); } +boost::shared_ptr + Concordia::tokenize(const std::string & sentence) + throw(ConcordiaException) { + boost::shared_ptr result = + _hashGenerator->generateHash(sentence); + _hashGenerator->serializeWordMap(); + return result; + +} + + // Sentences are written to disk and added to T. // SA is generated on command by other methods. boost::shared_ptr Concordia::addExample( @@ -50,6 +61,17 @@ boost::shared_ptr Concordia::addExample( return _index->addExample(_hashGenerator, _T, _markers, example); } +// Sentences are written to disk and added to T. +// SA is generated on command by other methods. 
+void Concordia::addTokenizedExample( + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id) + throw(ConcordiaException) { + _index->addTokenizedExample(_hashGenerator, _T, + _markers, tokenizedSentence, id); +} + + // Sentences are written to disk and added to T. // SA is generated on command by other methods. std::vector Concordia::addAllExamples( diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 9c707e9..901a893 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -52,6 +52,15 @@ public: */ std::string & getVersion(); + /*! Tokenizes the given sentence. + \param sentence sentence to be tokenized + \returns tokenized sentence object, + containing information about original word positions + \throws ConcordiaException + */ + boost::shared_ptr tokenize(const std::string & sentence) + throw(ConcordiaException); + /*! Adds an Example to the index. \param example example to be added \returns tokenized sentence object, @@ -61,6 +70,16 @@ public: boost::shared_ptr addExample(const Example & example) throw(ConcordiaException); + /*! Adds a tokenized example to the index. + \param tokenizedSentence tokenized sentence to be added + \param id of the sentence to be added + \throws ConcordiaException + */ + void addTokenizedExample( + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id) + throw(ConcordiaException); + /*! Adds multiple examples to the index. 
\param examples vector of examples to be added \returns vector of tokenized sentence objects, diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 23c4ca4..43a4d81 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -82,16 +82,33 @@ boost::shared_ptr ConcordiaIndex::addExample( return hashedPattern; } -boost::shared_ptr ConcordiaIndex::_addSingleExample( +void ConcordiaIndex::addTokenizedExample( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id) { + std::ofstream hashedIndexFile; + hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out| + std::ios::app|std::ios::binary); + std::ofstream markersFile; + markersFile.open(_markersFilePath.c_str(), std::ios::out| + std::ios::app|std::ios::binary); + _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, + T, markers, tokenizedSentence, id); + hashedIndexFile.close(); + markersFile.close(); +} + +void ConcordiaIndex::_addSingleTokenizedExample( std::ofstream & hashedIndexFile, std::ofstream & markersFile, boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, - const Example & example) { - boost::shared_ptr hashedPattern = - hashGenerator->generateHash(example.getSentence()); - std::vector hash = hashedPattern->getCodes(); + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id) { + std::vector hash = tokenizedSentence->getCodes(); int offset = 0; for (std::vector::iterator it = hash.begin(); @@ -102,7 +119,7 @@ boost::shared_ptr ConcordiaIndex::_addSingleExample( // append to markersFile SUFFIX_MARKER_TYPE marker = Utils::createMarker( - example.getId(), + id, offset, hash.size()); @@ -120,7 +137,20 @@ boost::shared_ptr ConcordiaIndex::_addSingleExample( SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; Utils::writeMarker(markersFile, sentenceBoundaryMA); 
markers->push_back(sentenceBoundaryMA); +} +boost::shared_ptr ConcordiaIndex::_addSingleExample( + std::ofstream & hashedIndexFile, + std::ofstream & markersFile, + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const Example & example) { + boost::shared_ptr hashedPattern = + hashGenerator->generateHash(example.getSentence()); + _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, + T, markers, hashedPattern, example.getId()); + return hashedPattern; } diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index 6d79cb9..f59469b 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -59,6 +59,27 @@ public: boost::shared_ptr > markers, const Example & example); + /*! Adds a tokenized example to the index. Hashed index + and markers array are appended with the example. + At the same time, HDD versions of these + two data structures are also appended with the same example. + The method does not return a value. + \param hashGenerator hash generator to be used to prepare the hash + of the example + \param T RAM-based hash index to be appended to + \param markers RAM-based markers array to be appended to + \param tokenizedSentence tokenized sentence whose codes + are added to the index + \param id of the sentence to be added + \throws ConcordiaException + */ + void addTokenizedExample( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id); + /*! Adds multiple examples to the index. Examples are first hashed using the hash generator passed to this method. 
Then, hashed index and markers array (also passed to this method) are appended @@ -87,6 +108,15 @@ public: boost::shared_ptr > T); private: + void _addSingleTokenizedExample( + std::ofstream & hashedIndexFile, + std::ofstream & markersFile, + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr tokenizedSentence, + SUFFIX_MARKER_TYPE id); + boost::shared_ptr _addSingleExample( std::ofstream & hashedIndexFile, std::ofstream & markersFile, diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 38cc33c..96fbebc 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -287,7 +287,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("New test product has a mistake", 321)); - concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); + boost::shared_ptr ts = concordia.tokenize("This is just testing and it has nothing to do with the above"); + concordia.addTokenizedExample(ts, 14); concordia.refreshSAfromRAM(); boost::shared_ptr searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); @@ -324,4 +325,20 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) concordia.clearIndex(); } + +BOOST_AUTO_TEST_CASE( Tokenize ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + boost::shared_ptr ts = concordia.tokenize(" Ala posiada kota"); + /* + 0,3 type: 1 value: ala + 4,11 type: 1 value: posiada + 12,16 type: 1 value: kota + */ + BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); + BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9); + BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16); + BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada"); +} 
BOOST_AUTO_TEST_SUITE_END()