extended markers - length, bitwise operators

Former-commit-id: 948a7fc68bf0b2284ce631d877fc13fa3eaa4882
2015-04-09 22:17:19 +02:00 · 2015-04-09 22:17:19 +02:00 · 2533fd5b44
commit 2533fd5b44
parent fec63e561d
15 changed files with 124 additions and 30 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,17 +9,17 @@ set (CONCORDIA_VERSION_MINOR 1)
 # Type of the characters in SA
 set (INDEX_CHARACTER_TYPE "unsigned int")
-set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
+set (INDEX_CHARACTER_TYPE_MAX_VALUE "UINT_MAX")  # <climits> limit matching "unsigned int"
 # The above allows for (roughly) 2^32 = 4 294 967 295 words in corpus.
 # Suffix markers
-set (SUFFIX_MARKER_TYPE "unsigned int")
+set (SUFFIX_MARKER_TYPE "unsigned long")
-set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
+set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULONG_MAX")  # <climits> limit matching "unsigned long"
-set (SUFFIX_MARKER_DIVISOR 256)
+set (SUFFIX_MARKER_SENTENCE_BYTES 2)
-# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
+# The above settings assign 4 bytes to sentence id and 2 bytes each for suffix offset and sentence length.
-# This allows to store 2^24 = 16 777 216 sentences no longer than 256 words.
+# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
-# ============================== #
+# =============================== #
 # Production paths
 # ============================== #
--- a/TODO.txt
+++ b/TODO.txt
@ -2,6 +2,8 @@ DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
 DONE 2. anonimizacja zdań
 DONE 3. Dzielenie zdań (max 255 tokenów)
 - concordia-server
 - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
 - wyłączyć stopWords
@ -13,4 +15,8 @@ zastanowić się nad optymalizacją:
 - unordered_map tmMatchesMap
 - LCP array
-Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
+DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
 DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
 2. Wykonać anubis search na nowych markerach z długością zdania
 3. Multi-threading?
--- a/concordia/anubis_searcher.cpp
+++ b/concordia/anubis_searcher.cpp
@ -117,7 +117,7 @@ void AnubisSearcher::_collectResults(
        saidx_t resultPos = SA->at(left + i);
        SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
        result.push_back(new SubstringOccurence(
-                            marker / SUFFIX_MARKER_DIVISOR,
+                            Utils::getIdFromMarker(marker),
-                            marker % SUFFIX_MARKER_DIVISOR));
+                            Utils::getOffsetFromMarker(marker)));
    }
 }
--- a/concordia/common/config.hpp.in
+++ b/concordia/common/config.hpp.in
@ -23,8 +23,9 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
 #define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
-#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
+#define SUFFIX_MARKER_SENTENCE_BYTES @SUFFIX_MARKER_SENTENCE_BYTES@
-//Max sentence size is determined by suffix marker divisor.
+//Max sentence size is determined by the SUFFIX_MARKER_SENTENCE_BYTES property.
-//The last bits in a sentence marker denote offset whose maximum value
+//The sentence marker is built as follows: its first bytes store the
-//is the sentence size minus 2.
+// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES bytes store the suffix offset
-#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
+// and the last SUFFIX_MARKER_SENTENCE_BYTES bytes store the sentence length.
--- a/concordia/common/utils.cpp
+++ b/concordia/common/utils.cpp
@ -1,4 +1,5 @@
 #include "concordia/common/utils.hpp"
 #include <math.h>
 Utils::Utils() {
 }
@ -71,3 +72,43 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array,
    }
 }
 // Returns the sentence id stored in the most significant bits of a marker.
 // The id sits above the offset and length fields, each of which is
 // SUFFIX_MARKER_SENTENCE_BYTES bytes wide.
 SUFFIX_MARKER_TYPE Utils::getIdFromMarker(SUFFIX_MARKER_TYPE marker) {
    // Combined width in bits of the offset and length fields
    // (2 fields * 8 bits per byte).
    const int lowFieldsBits = SUFFIX_MARKER_SENTENCE_BYTES * 16;
    // A single right shift discards both lower fields.
    return marker >> lowFieldsBits;
 }
 // Returns the suffix offset stored in the middle field of a marker,
 // between the sentence id (upper bits) and the sentence length (lower bits).
 SUFFIX_MARKER_TYPE Utils::getOffsetFromMarker(SUFFIX_MARKER_TYPE marker) {
    const int idBits = _idBytes * 8;
    const int fieldBits = SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // Push the id bits out of the top of the value...
    const SUFFIX_MARKER_TYPE idCleared = marker << idBits;
    // ...then slide the offset back down to bit 0; the extra fieldBits
    // of the shift drop the length field as well.
    return idCleared >> (idBits + fieldBits);
 }
 // Returns the sentence length stored in the least significant
 // SUFFIX_MARKER_SENTENCE_BYTES bytes of a marker.
 SUFFIX_MARKER_TYPE Utils::getLengthFromMarker(SUFFIX_MARKER_TYPE marker) {
    // Bits occupied by everything above the length field: id plus offset.
    const int upperBits = _idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // Shifting left drops the id and the offset off the top of the value...
    const SUFFIX_MARKER_TYPE lengthOnTop = marker << upperBits;
    // ...and shifting back by the same amount leaves only the length.
    return lengthOnTop >> upperBits;
 }
 // Packs a sentence id, a suffix offset and a sentence length into one marker.
 // Layout, from most to least significant bits: id | offset | length, where
 // offset and length each take SUFFIX_MARKER_SENTENCE_BYTES bytes.
 // No range checking is done: an offset or length wider than its field
 // spills into the neighbouring field, because the parts are simply added.
 SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
                                        SUFFIX_MARKER_TYPE offset,
                                        SUFFIX_MARKER_TYPE length) {
    // Width in bits of a single offset/length field.
    const int fieldBits = SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // The id goes above both lower fields.
    const SUFFIX_MARKER_TYPE idPart = id << (2 * fieldBits);
    // The offset goes directly above the length field.
    const SUFFIX_MARKER_TYPE offsetPart = offset << fieldBits;
    // The length occupies the lowest bits and needs no shift.
    return idPart + offsetPart + length;
 }
 // 2^(SUFFIX_MARKER_SENTENCE_BYTES * 8): the number of distinct values that
 // fit in one offset/length field of a marker. Computed with an integer
 // shift rather than floating-point pow(), so the value is exact and needs
 // no double-to-integer conversion.
 SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
                     static_cast<SUFFIX_MARKER_TYPE>(1)
                         << (SUFFIX_MARKER_SENTENCE_BYTES * 8);
 // Bytes of a marker left for the sentence id once the offset and length
 // fields (SUFFIX_MARKER_SENTENCE_BYTES each) have been reserved.
 int Utils::_idBytes = static_cast<int>(sizeof(SUFFIX_MARKER_TYPE)) -
                       2 * SUFFIX_MARKER_SENTENCE_BYTES;
--- a/concordia/common/utils.hpp
+++ b/concordia/common/utils.hpp
@ -45,9 +45,23 @@ public:
    template <typename T>
    static void printVector(boost::shared_ptr<std::vector<T> > vector);
    // Extracts the sentence id from the most significant bytes of a marker.
    static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);
    // Extracts the suffix offset from the middle field of a marker.
    static SUFFIX_MARKER_TYPE getOffsetFromMarker(SUFFIX_MARKER_TYPE marker);
    // Extracts the sentence length from the least significant bytes
    // of a marker.
    static SUFFIX_MARKER_TYPE getLengthFromMarker(SUFFIX_MARKER_TYPE marker);
    // Builds a marker laid out as id | offset | length, where offset and
    // length each occupy SUFFIX_MARKER_SENTENCE_BYTES bytes.
    static SUFFIX_MARKER_TYPE createMarker(SUFFIX_MARKER_TYPE id,
                                           SUFFIX_MARKER_TYPE offset,
                                           SUFFIX_MARKER_TYPE length);
    // 2^(SUFFIX_MARKER_SENTENCE_BYTES * 8); used as the upper limit on the
    // number of tokens in a sentence.
    static SUFFIX_MARKER_TYPE maxSentenceSize;
 private:
    static void _insertCharToSaucharArray(sauchar_t * array,
                                 INDEX_CHARACTER_TYPE character, int pos);
    // Number of bytes of a marker that hold the sentence id: the size of
    // SUFFIX_MARKER_TYPE minus the two SUFFIX_MARKER_SENTENCE_BYTES fields.
    static int _idBytes;
 };
 template <typename T>
--- a/concordia/concordia_index.cpp
+++ b/concordia/concordia_index.cpp
@ -5,6 +5,7 @@
 #include <boost/filesystem.hpp>
 #include <boost/foreach.hpp>
 #include <iostream>
 #include <climits>
 ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
                               const string & markersFilePath)
@ -91,8 +92,10 @@ void ConcordiaIndex::_addSingleExample(
        // append to markersFile
-        SUFFIX_MARKER_TYPE marker = offset;
+        SUFFIX_MARKER_TYPE marker = Utils::createMarker(
-        marker += example.getId() * SUFFIX_MARKER_DIVISOR;
+                                           example.getId(),
                                           offset,
                                           hash->size());
        Utils::writeMarker(markersFile, marker);
        markers->push_back(marker);
--- a/concordia/example.cpp
+++ b/concordia/example.cpp
@ -1,5 +1,6 @@
 #include "concordia/example.hpp"
-
+#include <climits>
 #include <iostream>
 Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
                                              throw(ConcordiaException):
@ -7,7 +8,7 @@ Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
                                              _id(id) {
    // check if the example id exceeds space
    // reserved for it in the suffix marker
-    if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
+    if (id >= SUFFIX_MARKER_TYPE_MAX_VALUE >> 8) {
        throw ConcordiaException("Example id too large.");
    }
 }
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@ -1,4 +1,5 @@
 #include "concordia/hash_generator.hpp"
 #include "concordia/common/utils.hpp"
 #include <boost/filesystem.hpp>
 #include <boost/archive/binary_oarchive.hpp>
@ -30,7 +31,7 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
                                  result(new vector<INDEX_CHARACTER_TYPE>());
    boost::shared_ptr<vector<string> > tokenTexts =
                        generateTokenVector(sentence);
-    if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
+    if (tokenTexts->size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }
    for (vector<string>::iterator it = tokenTexts->begin();
--- a/concordia/index_searcher.cpp
+++ b/concordia/index_searcher.cpp
@ -40,11 +40,9 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
            saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
            SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
            // TODO(rafalj): think about using bitwise operators
            // in the below code
            result.push_back(new SubstringOccurence(
-                marker / SUFFIX_MARKER_DIVISOR,
+                        Utils::getIdFromMarker(marker),
-                marker % SUFFIX_MARKER_DIVISOR));
+                        Utils::getOffsetFromMarker(marker)));
        }
    }
--- a/concordia/t/test_anubis_searcher.cpp
+++ b/concordia/t/test_anubis_searcher.cpp
@ -1,6 +1,7 @@
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include "concordia/anubis_searcher.hpp"
 #include "concordia/common/config.hpp"
 #include "concordia/common/utils.hpp"
 using namespace std;
@ -27,9 +28,8 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    T->push_back(3);
    T->push_back(2);
    SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
    for(int i=0;i<6;i++) {
-        markers->push_back(marker++);
+        markers->push_back(Utils::createMarker(34,i,6));
    }
    pattern->push_back(2);
--- a/concordia/t/test_example.cpp
+++ b/concordia/t/test_example.cpp
@ -1,6 +1,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include "tests/unit-tests/unit_tests_globals.hpp"
 #include <string>
 #include <climits>
 #include "concordia/example.hpp"
@ -10,19 +11,19 @@ BOOST_AUTO_TEST_SUITE(exampleTest)
 BOOST_AUTO_TEST_CASE( ExceedingId )
 {
-    Example example1("Test", 16777215);
+    unsigned long maxId = (ULLONG_MAX >> 8) - 1;
    Example example1("Test", maxId);
    bool exceptionThrown = false;
    string message = "";
    try {
-        Example example2("Test", 16777216);
+        Example example2("Test", maxId+1);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }    
    BOOST_CHECK_EQUAL(exceptionThrown, true);    
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);    
 }
--- a/concordia/t/test_hash_generator.cpp
+++ b/concordia/t/test_hash_generator.cpp
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
    HashGenerator hashGenerator = HashGenerator(config);
    stringstream ss;
-    for (int i=0;i<257;i++) {
+    for (int i=0;i<65537;i++) {
        ss << "xx" << i << " ";
    }
--- a/concordia/t/test_utils.cpp
+++ b/concordia/t/test_utils.cpp
@ -85,6 +85,33 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
    BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());      
 }
 // With SUFFIX_MARKER_SENTENCE_BYTES set to 2, one marker field spans
 // 16 bits, so maxSentenceSize is expected to be 2^16 = 65536.
 BOOST_AUTO_TEST_CASE( MaxSentenceSize )
 {
    BOOST_CHECK_EQUAL(Utils::maxSentenceSize, 65536);
 }
 //The below examples use the following marker:
 //00000000|00000000|00000000|00000011|00000000|00000101|00000000|00000111
 // Packing id=3, offset=5, length=7 must give
 // 3 * 2^32 + 5 * 2^16 + 7 = 12885229575.
 BOOST_AUTO_TEST_CASE( CreateMarker )
 {
    BOOST_CHECK_EQUAL(Utils::createMarker(3,5,7), 12885229575);
 }
 // The id field of marker 12885229575 (3 * 2^32 + 5 * 2^16 + 7) is 3.
 BOOST_AUTO_TEST_CASE( GetIdFromMarker )
 {
    BOOST_CHECK_EQUAL(Utils::getIdFromMarker(12885229575), 3);
 }
 // The offset field of marker 12885229575 (3 * 2^32 + 5 * 2^16 + 7) is 5.
 BOOST_AUTO_TEST_CASE( GetOffsetFromMarker )
 {
    BOOST_CHECK_EQUAL(Utils::getOffsetFromMarker(12885229575), 5);
 }
 // The length field of marker 12885229575 (3 * 2^32 + 5 * 2^16 + 7) is 7.
 BOOST_AUTO_TEST_CASE( GetLengthFromMarker )
 {
    BOOST_CHECK_EQUAL(Utils::getLengthFromMarker(12885229575), 7);
 }
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/word_map.cpp
+++ b/concordia/word_map.cpp
@ -1,4 +1,5 @@
 #include "concordia/word_map.hpp"
 #include <climits>
 WordMap::WordMap() throw(ConcordiaException) {