separated tokenization and adding to index

rjawor 2015-08-01 17:03:39 +02:00
parent 5a57406875
commit 28704c2f43
6 changed files with 127 additions and 8 deletions


@ -1,5 +1,6 @@
---------------------------- Developer's private notes (language may vary, because it is sometimes more convenient) -----------------------------
- multiple indexes based on different hashes. One can be WordNet base forms, another POS tags, and so on. Develop a method of combining results.
IN PROGRESS - document the code (classes, cfg files) and update tutorial
- multiple translation memories: they can be stored in a single index, but a tm_id has to be added to the sentence metadata (e.g., instead of example length). When searching, results have to be filtered so that they come from the right translation memory.
- memory usage tests
- The example length in markers will probably be needed only by anubis (which, as it turns out, is slow). Think about what the markers could be used for, because the idea behind them seems good.


@ -42,6 +42,17 @@ std::string _createLibraryVersion() {
return version.str();
}
boost::shared_ptr<TokenizedSentence>
Concordia::tokenize(const std::string & sentence)
throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> result =
_hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap();
return result;
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
@ -50,6 +61,17 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
throw(ConcordiaException) {
_index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(
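With tokenization now split from indexing, client code can tokenize a sentence once and later add the resulting TokenizedSentence to the index under an explicit id. A minimal usage sketch of the new flow (the include paths, the config file name and the id value are illustrative assumptions, not taken from this commit):

#include <boost/shared_ptr.hpp>
#include "concordia/concordia.hpp"           // assumed include path
#include "concordia/tokenized_sentence.hpp"  // assumed include path

int main() {
    // The config file path is illustrative; the test suite builds it
    // via TestResourcesManager.
    Concordia concordia("concordia.cfg");

    // Step 1: tokenize only - the index is not modified yet.
    boost::shared_ptr<TokenizedSentence> ts =
        concordia.tokenize("Alice has a cat");

    // Step 2: add the already tokenized sentence under a chosen id.
    concordia.addTokenizedExample(ts, 56);

    // As before, the suffix array is rebuilt on demand.
    concordia.refreshSAfromRAM();
    return 0;
}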


@ -52,6 +52,15 @@ public:
*/
std::string & getVersion();
/*! Tokenizes the given sentence.
\param sentence sentence to be tokenized
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Adds an Example to the index.
\param example example to be added
\returns tokenized sentence object,
@ -61,6 +70,16 @@ public:
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns vector of tokenized sentence objects,


@ -82,16 +82,33 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
return hashedPattern;
}
void ConcordiaIndex::addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, tokenizedSentence, id);
hashedIndexFile.close();
markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@ -102,7 +119,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
// append to markersFile
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
id,
offset,
hash.size());
@ -120,6 +137,19 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(example.getSentence());
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, hashedPattern, example.getId());
return hashedPattern;
}
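For every token of a sentence, _addSingleTokenizedExample appends a marker built from the sentence id passed to addTokenizedExample, the token's offset, and the sentence length, and then closes the sentence with a boundary marker. A conceptual, unpacked view of one such record follows (the struct and its field width are hypothetical, for illustration only; the real code packs all three fields into a single SUFFIX_MARKER_TYPE with Utils::createMarker):

#include <stdint.h>

// Hypothetical, unpacked view of a single marker record (illustration only).
struct MarkerRecord {
    uint32_t sentenceId;  // id passed to addTokenizedExample
    uint32_t offset;      // position of the token within the sentence
    uint32_t length;      // number of codes in the sentence (hash.size())
};
// For a sentence of n codes, n such records are appended (one per token),
// followed by a sentence boundary marker equal to SUFFIX_MARKER_TYPE_MAX_VALUE.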


@ -59,6 +59,27 @@ public:
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
/*! Adds a tokenized example to the index. The example is appended
to the hashed index and the markers array.
At the same time, the HDD versions of these
two data structures are appended with the same example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param tokenizedSentence tokenized sentence to be added to the index
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
@ -87,6 +108,15 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
void _addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,


@ -287,7 +287,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
@ -324,4 +325,20 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.clearIndex();
}
BOOST_AUTO_TEST_CASE( Tokenize )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
}
BOOST_AUTO_TEST_SUITE_END()
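The Tokenize test above relies on each token keeping its start and end offsets in the original, untrimmed input. A small sketch of how those offsets can be used to recover the raw surface form of every token (the include paths are assumed; the accessors mirror the assertions in the test):

#include <iostream>
#include <string>
#include <boost/shared_ptr.hpp>
#include "concordia/concordia.hpp"           // assumed include path
#include "concordia/tokenized_sentence.hpp"  // assumed include path

void printTokens(Concordia & concordia, const std::string & sentence) {
    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(sentence);
    for (size_t i = 0; i < ts->getTokens().size(); ++i) {
        // getStart()/getEnd() index the original string, so the raw text of
        // the token can be sliced out even after lowercasing/normalization.
        std::cout << ts->getTokens().at(i).getValue()
                  << " <- "
                  << sentence.substr(ts->getTokens().at(i).getStart(),
                                     ts->getTokens().at(i).getEnd()
                                         - ts->getTokens().at(i).getStart())
                  << std::endl;
    }
}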