extended markers - length, bitwise operators
Former-commit-id: 948a7fc68bf0b2284ce631d877fc13fa3eaa4882
Commit: 2533fd5b44
Parent: fec63e561d
@@ -9,17 +9,17 @@ set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int")
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
set (INDEX_CHARACTER_TYPE_MAX_VALUE "ULONG_MAX")
# The above allows for (roughly) 2^32 = 4 294 967 295 words in the corpus.

# Suffix markers
set (SUFFIX_MARKER_TYPE "unsigned int")
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
set (SUFFIX_MARKER_DIVISOR 256)
# The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
# This allows storing 2^24 = 16 777 216 sentences no longer than 256 words.
set (SUFFIX_MARKER_TYPE "unsigned long")
set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
# The above settings assign 4 bytes to the sentence id and 2 bytes each to the suffix offset and the sentence length.
# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
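The capacity figures above can be checked with a short calculation. A minimal sketch (not part of this commit), assuming an 8-byte unsigned long SUFFIX_MARKER_TYPE and SUFFIX_MARKER_SENTENCE_BYTES = 2:

#include <cstdint>
#include <iostream>

int main() {
    const int markerBytes   = 8;   // assumed sizeof(unsigned long)
    const int sentenceBytes = 2;   // SUFFIX_MARKER_SENTENCE_BYTES
    const int idBytes = markerBytes - 2 * sentenceBytes;            // 4 bytes left for the id

    const uint64_t maxSentences      = 1ULL << (idBytes * 8);       // 2^32 = 4 294 967 296
    const uint64_t maxSentenceLength = 1ULL << (sentenceBytes * 8); // 2^16 = 65536

    std::cout << maxSentences << " sentences of up to "
              << maxSentenceLength << " words each" << std::endl;
    return 0;
}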
# ============================== #
# =============================== #
# Production paths
# ============================== #
TODO.txt
@@ -2,6 +2,8 @@ DONE 1. localized to_lower (use utf8case, fix the tests)
DONE 2. sentence anonymization
DONE 3. Sentence splitting (max 255 tokens)

- concordia-server
- think about empty hash examples (solution: do not search for phrases with an empty hash at all; throw an exception).
- disable stopWords
@@ -13,4 +15,8 @@ think about optimization:
- unordered_map tmMatchesMap
- LCP array

Anubis search is getting complicated! When creating the tmMatches object for an example, one has to supply the example id, the pattern length and the example length. We have the first two, but unfortunately there is nowhere to take the example length from. We only remember the suffix offset.
DONE Anubis search is getting complicated! When creating the tmMatches object for an example, one has to supply the example id, the pattern length and the example length. We have the first two, but unfortunately there is nowhere to take the example length from. We only remember the suffix offset.

DONE 1. Bitwise operators (and constants!) for the sizes of the index character and the markers
2. Run anubis search on the new markers carrying the sentence length
3. Multi-threading?
@@ -117,7 +117,7 @@ void AnubisSearcher::_collectResults(
saidx_t resultPos = SA->at(left + i);
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
result.push_back(new SubstringOccurence(
marker / SUFFIX_MARKER_DIVISOR,
marker % SUFFIX_MARKER_DIVISOR));
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker)));
}
}
@@ -23,8 +23,9 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@

#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
//Max sentence size is determined by the suffix marker divisor.
//The last bits in a sentence marker denote the offset, whose maximum value
//is the sentence size minus 2.
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
#define SUFFIX_MARKER_SENTENCE_BYTES @SUFFIX_MARKER_SENTENCE_BYTES@
//Max sentence size is determined by the SUFFIX_MARKER_SENTENCE_BYTES property.
//The sentence marker is built as follows: its first bytes store the
//sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES bytes store the suffix offset
//and the last SUFFIX_MARKER_SENTENCE_BYTES bytes store the sentence length.
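For illustration only (not part of the diff): with a 64-bit marker and SUFFIX_MARKER_SENTENCE_BYTES = 2, the layout described above puts the sentence id in the top 32 bits, the suffix offset in the next 16 bits and the sentence length in the lowest 16 bits. The shifts below mirror that layout and agree with the unit tests further down:

#include <cassert>
#include <cstdint>

int main() {
    // id = 3, offset = 5, length = 7
    const uint64_t marker = (3ULL << 32) | (5ULL << 16) | 7ULL;
    assert(marker == 12885229575ULL);
    assert((marker >> 32) == 3);          // drop offset and length -> id
    assert(((marker << 32) >> 48) == 5);  // drop id, then drop length -> offset
    assert(((marker << 48) >> 48) == 7);  // keep the lowest 16 bits -> length
    return 0;
}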
@@ -1,4 +1,5 @@
#include "concordia/common/utils.hpp"
#include <math.h>

Utils::Utils() {
}
@@ -71,3 +72,43 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array,
    }
}

SUFFIX_MARKER_TYPE Utils::getIdFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift right to erase the offset and length fields
    return marker >> SUFFIX_MARKER_SENTENCE_BYTES * 16;
}

SUFFIX_MARKER_TYPE Utils::getOffsetFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift left to erase the id
    SUFFIX_MARKER_TYPE result = marker << _idBytes * 8;
    // shift back right, going further to erase the length
    result = result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
    return result;
}

SUFFIX_MARKER_TYPE Utils::getLengthFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift left to erase the id and the offset
    SUFFIX_MARKER_TYPE result = marker <<
        (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
    // shift back
    return result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
}

SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
                                       SUFFIX_MARKER_TYPE offset,
                                       SUFFIX_MARKER_TYPE length) {
    // the id occupies the highest bits: shift it over the offset and length fields
    SUFFIX_MARKER_TYPE result = id << SUFFIX_MARKER_SENTENCE_BYTES * 16;
    // the offset sits above the length field
    result += offset << SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // the length occupies the lowest bits, no shift needed
    result += length;

    return result;
}

SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
    pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);

int Utils::_idBytes = sizeof(SUFFIX_MARKER_TYPE) -
    2 * SUFFIX_MARKER_SENTENCE_BYTES;
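A possible round-trip usage of the new helpers (illustration only, not part of the commit; assumes the generated config header is pulled in through concordia/common/utils.hpp and that the values fit the configured field widths):

#include "concordia/common/utils.hpp"
#include <cassert>

int main() {
    // pack example id 34, suffix offset 5 and sentence length 6 into one marker
    SUFFIX_MARKER_TYPE marker = Utils::createMarker(34, 5, 6);
    // the getters undo the packing
    assert(Utils::getIdFromMarker(marker) == 34);
    assert(Utils::getOffsetFromMarker(marker) == 5);
    assert(Utils::getLengthFromMarker(marker) == 6);
    return 0;
}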
@@ -45,9 +45,23 @@ public:
    template <typename T>
    static void printVector(boost::shared_ptr<std::vector<T> > vector);

    static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE getOffsetFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE getLengthFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE createMarker(SUFFIX_MARKER_TYPE id,
                                           SUFFIX_MARKER_TYPE offset,
                                           SUFFIX_MARKER_TYPE length);

    static SUFFIX_MARKER_TYPE maxSentenceSize;

private:
    static void _insertCharToSaucharArray(sauchar_t * array,
                                          INDEX_CHARACTER_TYPE character, int pos);

    static int _idBytes;
};

template <typename T>
@@ -5,6 +5,7 @@
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <climits>

ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
                               const string & markersFilePath)
@@ -91,8 +92,10 @@ void ConcordiaIndex::_addSingleExample(

    // append to markersFile
    SUFFIX_MARKER_TYPE marker = offset;
    marker += example.getId() * SUFFIX_MARKER_DIVISOR;
    SUFFIX_MARKER_TYPE marker = Utils::createMarker(
        example.getId(),
        offset,
        hash->size());

    Utils::writeMarker(markersFile, marker);
    markers->push_back(marker);
@@ -1,5 +1,6 @@
#include "concordia/example.hpp"

#include <climits>
#include <iostream>

Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
    throw(ConcordiaException):
@@ -7,7 +8,7 @@ Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
    _id(id) {
    // check if the example id exceeds space
    // reserved for it in the suffix marker
    if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
    if (id >= SUFFIX_MARKER_TYPE_MAX_VALUE >> 8) {
        throw ConcordiaException("Example id too large.");
    }
}
@@ -1,4 +1,5 @@
#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"

#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
@@ -30,7 +31,7 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
        result(new vector<INDEX_CHARACTER_TYPE>());
    boost::shared_ptr<vector<string> > tokenTexts =
        generateTokenVector(sentence);
    if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
    if (tokenTexts->size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }
    for (vector<string>::iterator it = tokenTexts->begin();
@@ -40,11 +40,9 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
        saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
        SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

        // TODO(rafalj): think about using bitwise operators
        // in the below code
        result.push_back(new SubstringOccurence(
            marker / SUFFIX_MARKER_DIVISOR,
            marker % SUFFIX_MARKER_DIVISOR));
            Utils::getIdFromMarker(marker),
            Utils::getOffsetFromMarker(marker)));
    }
}
@@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anubis_searcher.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"

using namespace std;
@@ -27,9 +28,8 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    T->push_back(3);
    T->push_back(2);

    SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
    for(int i=0;i<6;i++) {
        markers->push_back(marker++);
        markers->push_back(Utils::createMarker(34,i,6));
    }

    pattern->push_back(2);
@@ -1,6 +1,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <climits>

#include "concordia/example.hpp"
@@ -10,19 +11,19 @@ BOOST_AUTO_TEST_SUITE(exampleTest)

BOOST_AUTO_TEST_CASE( ExceedingId )
{
    Example example1("Test", 16777215);
    unsigned long maxId = (ULLONG_MAX >> 8) - 1;
    Example example1("Test", maxId);

    bool exceptionThrown = false;
    string message = "";
    try {
        Example example2("Test", 16777216);
        Example example2("Test", maxId+1);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }
    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
}
@@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
    HashGenerator hashGenerator = HashGenerator(config);

    stringstream ss;
    for (int i=0;i<257;i++) {
    for (int i=0;i<65537;i++) {
        ss << "xx" << i << " ";
    }
@@ -85,6 +85,33 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
    BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
}

BOOST_AUTO_TEST_CASE( MaxSentenceSize )
{
    BOOST_CHECK_EQUAL(Utils::maxSentenceSize, 65536);
}

//The below examples use the following marker:
//00000000|00000000|00000000|00000011|00000000|00000101|00000000|00000111

BOOST_AUTO_TEST_CASE( CreateMarker )
{
    BOOST_CHECK_EQUAL(Utils::createMarker(3,5,7), 12885229575);
}

BOOST_AUTO_TEST_CASE( GetIdFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getIdFromMarker(12885229575), 3);
}

BOOST_AUTO_TEST_CASE( GetOffsetFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getOffsetFromMarker(12885229575), 5);
}

BOOST_AUTO_TEST_CASE( GetLengthFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getLengthFromMarker(12885229575), 7);
}

BOOST_AUTO_TEST_SUITE_END()
@@ -1,4 +1,5 @@
#include "concordia/word_map.hpp"
#include <climits>

WordMap::WordMap() throw(ConcordiaException) {