extended markers - length, bitwise operators
Former-commit-id: 948a7fc68bf0b2284ce631d877fc13fa3eaa4882
Commit: 2533fd5b44
Parent: fec63e561d
@@ -9,17 +9,17 @@ set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int")
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
set (INDEX_CHARACTER_TYPE_MAX_VALUE "ULONG_MAX")
# The above allows for (roughly) 2^32 = 4 294 967 295 words in the corpus.

# Suffix markers
set (SUFFIX_MARKER_TYPE "unsigned int")
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
set (SUFFIX_MARKER_DIVISOR 256)
# The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
# This allows storing 2^24 = 16 777 216 sentences no longer than 256 words.
set (SUFFIX_MARKER_TYPE "unsigned long")
set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
# The above settings assign 4 bytes to the sentence id and 2 bytes each to the suffix offset and the sentence length.
# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
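The capacity figures above can be checked with a short calculation. A minimal sketch (not part of this commit), assuming an 8-byte unsigned long SUFFIX_MARKER_TYPE and SUFFIX_MARKER_SENTENCE_BYTES = 2:

#include <cstdint>
#include <iostream>

int main() {
    const int markerBytes   = 8;   // assumed sizeof(unsigned long)
    const int sentenceBytes = 2;   // SUFFIX_MARKER_SENTENCE_BYTES
    const int idBytes = markerBytes - 2 * sentenceBytes;            // 4 bytes left for the id

    const uint64_t maxSentences      = 1ULL << (idBytes * 8);       // 2^32 = 4 294 967 296
    const uint64_t maxSentenceLength = 1ULL << (sentenceBytes * 8); // 2^16 = 65536

    std::cout << maxSentences << " sentences of up to "
              << maxSentenceLength << " words each" << std::endl;
    return 0;
}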
# ============================== #
# =============================== #
# Production paths
# ============================== #
TODO.txt
@@ -2,6 +2,8 @@ DONE 1. localized to_lower (use utf8case, fix the tests)
DONE 2. sentence anonymization
DONE 3. Sentence splitting (max 255 tokens)

- concordia-server
- think about empty hash examples (solution: do not search for phrases with an empty hash at all; throw an exception).
- disable stopWords
@@ -13,4 +15,8 @@ think about optimization:
- unordered_map tmMatchesMap
- LCP array

Anubis search is getting complicated! When creating the tmMatches object for an example, one has to supply the example id, the pattern length and the example length. We have the first two, but unfortunately there is nowhere to take the example length from. We only remember the suffix offset.
DONE Anubis search is getting complicated! When creating the tmMatches object for an example, one has to supply the example id, the pattern length and the example length. We have the first two, but unfortunately there is nowhere to take the example length from. We only remember the suffix offset.

DONE 1. Bitwise operators (and constants!) for the sizes of the index character and the markers
2. Run anubis search on the new markers carrying the sentence length
3. Multi-threading?
@@ -117,7 +117,7 @@ void AnubisSearcher::_collectResults(
saidx_t resultPos = SA->at(left + i);
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
result.push_back(new SubstringOccurence(
marker / SUFFIX_MARKER_DIVISOR,
marker % SUFFIX_MARKER_DIVISOR));
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker)));
}
}
@@ -23,8 +23,9 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@

#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
//Max sentence size is determined by the suffix marker divisor.
//The last bits in a sentence marker denote the offset, whose maximum value
//is the sentence size minus 2.
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
#define SUFFIX_MARKER_SENTENCE_BYTES @SUFFIX_MARKER_SENTENCE_BYTES@
//Max sentence size is determined by the SUFFIX_MARKER_SENTENCE_BYTES property.
//The sentence marker is built as follows: its first bytes store the
//sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES bytes store the suffix offset
//and the last SUFFIX_MARKER_SENTENCE_BYTES bytes store the sentence length.
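For illustration only (not part of the diff): with a 64-bit marker and SUFFIX_MARKER_SENTENCE_BYTES = 2, the layout described above puts the sentence id in the top 32 bits, the suffix offset in the next 16 bits and the sentence length in the lowest 16 bits. The shifts below mirror that layout and agree with the unit tests further down:

#include <cassert>
#include <cstdint>

int main() {
    // id = 3, offset = 5, length = 7
    const uint64_t marker = (3ULL << 32) | (5ULL << 16) | 7ULL;
    assert(marker == 12885229575ULL);
    assert((marker >> 32) == 3);          // drop offset and length -> id
    assert(((marker << 32) >> 48) == 5);  // drop id, then drop length -> offset
    assert(((marker << 48) >> 48) == 7);  // keep the lowest 16 bits -> length
    return 0;
}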
@@ -1,4 +1,5 @@
#include "concordia/common/utils.hpp"
#include <math.h>

Utils::Utils() {
}
@@ -71,3 +72,43 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array,
    }
}

SUFFIX_MARKER_TYPE Utils::getIdFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift right to erase the offset and length fields
    return marker >> SUFFIX_MARKER_SENTENCE_BYTES * 16;
}

SUFFIX_MARKER_TYPE Utils::getOffsetFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift left to erase the id
    SUFFIX_MARKER_TYPE result = marker << _idBytes * 8;
    // shift back right, going further to erase the length
    result = result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
    return result;
}

SUFFIX_MARKER_TYPE Utils::getLengthFromMarker(SUFFIX_MARKER_TYPE marker) {
    // shift left to erase the id and the offset
    SUFFIX_MARKER_TYPE result = marker <<
        (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
    // shift back
    return result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
}

SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
                                       SUFFIX_MARKER_TYPE offset,
                                       SUFFIX_MARKER_TYPE length) {
    // the id occupies the highest bits: shift it over the offset and length fields
    SUFFIX_MARKER_TYPE result = id << SUFFIX_MARKER_SENTENCE_BYTES * 16;
    // the offset sits above the length field
    result += offset << SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // the length occupies the lowest bits, no shift needed
    result += length;

    return result;
}

SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
    pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);

int Utils::_idBytes = sizeof(SUFFIX_MARKER_TYPE) -
    2 * SUFFIX_MARKER_SENTENCE_BYTES;
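A possible round-trip usage of the new helpers (illustration only, not part of the commit; assumes the generated config header is pulled in through concordia/common/utils.hpp and that the values fit the configured field widths):

#include "concordia/common/utils.hpp"
#include <cassert>

int main() {
    // pack example id 34, suffix offset 5 and sentence length 6 into one marker
    SUFFIX_MARKER_TYPE marker = Utils::createMarker(34, 5, 6);
    // the getters undo the packing
    assert(Utils::getIdFromMarker(marker) == 34);
    assert(Utils::getOffsetFromMarker(marker) == 5);
    assert(Utils::getLengthFromMarker(marker) == 6);
    return 0;
}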
@@ -45,9 +45,23 @@ public:
    template <typename T>
    static void printVector(boost::shared_ptr<std::vector<T> > vector);

    static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE getOffsetFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE getLengthFromMarker(SUFFIX_MARKER_TYPE marker);

    static SUFFIX_MARKER_TYPE createMarker(SUFFIX_MARKER_TYPE id,
                                           SUFFIX_MARKER_TYPE offset,
                                           SUFFIX_MARKER_TYPE length);

    static SUFFIX_MARKER_TYPE maxSentenceSize;

private:
    static void _insertCharToSaucharArray(sauchar_t * array,
                                          INDEX_CHARACTER_TYPE character, int pos);

    static int _idBytes;
};

template <typename T>
@@ -5,6 +5,7 @@
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <climits>

ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
                               const string & markersFilePath)
@@ -91,8 +92,10 @@ void ConcordiaIndex::_addSingleExample(

    // append to markersFile
    SUFFIX_MARKER_TYPE marker = offset;
    marker += example.getId() * SUFFIX_MARKER_DIVISOR;
    SUFFIX_MARKER_TYPE marker = Utils::createMarker(
        example.getId(),
        offset,
        hash->size());

    Utils::writeMarker(markersFile, marker);
    markers->push_back(marker);
@@ -1,5 +1,6 @@
#include "concordia/example.hpp"

#include <climits>
#include <iostream>

Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
    throw(ConcordiaException):
@@ -7,7 +8,7 @@ Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
    _id(id) {
    // check if the example id exceeds space
    // reserved for it in the suffix marker
    if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
    if (id >= SUFFIX_MARKER_TYPE_MAX_VALUE >> 8) {
        throw ConcordiaException("Example id too large.");
    }
}
@@ -1,4 +1,5 @@
#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"

#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
@@ -30,7 +31,7 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
        result(new vector<INDEX_CHARACTER_TYPE>());
    boost::shared_ptr<vector<string> > tokenTexts =
        generateTokenVector(sentence);
    if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
    if (tokenTexts->size() > Utils::maxSentenceSize) {
        throw ConcordiaException("Trying to add too long sentence.");
    }
    for (vector<string>::iterator it = tokenTexts->begin();
@@ -40,11 +40,9 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
        saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
        SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);

        // TODO(rafalj): think about using bitwise operators
        // in the below code
        result.push_back(new SubstringOccurence(
            marker / SUFFIX_MARKER_DIVISOR,
            marker % SUFFIX_MARKER_DIVISOR));
            Utils::getIdFromMarker(marker),
            Utils::getOffsetFromMarker(marker)));
    }
}
@@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anubis_searcher.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"

using namespace std;
@@ -27,9 +28,8 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
    T->push_back(3);
    T->push_back(2);

    SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
    for(int i=0;i<6;i++) {
        markers->push_back(marker++);
        markers->push_back(Utils::createMarker(34,i,6));
    }

    pattern->push_back(2);
@@ -1,6 +1,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <climits>

#include "concordia/example.hpp"
@@ -10,19 +11,19 @@ BOOST_AUTO_TEST_SUITE(exampleTest)

BOOST_AUTO_TEST_CASE( ExceedingId )
{
    Example example1("Test", 16777215);
    unsigned long maxId = (ULLONG_MAX >> 8) - 1;
    Example example1("Test", maxId);

    bool exceptionThrown = false;
    string message = "";
    try {
        Example example2("Test", 16777216);
        Example example2("Test", maxId+1);
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
    }
    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
}
@@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
    HashGenerator hashGenerator = HashGenerator(config);

    stringstream ss;
    for (int i=0;i<257;i++) {
    for (int i=0;i<65537;i++) {
        ss << "xx" << i << " ";
    }
@@ -85,6 +85,33 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
    BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
}

BOOST_AUTO_TEST_CASE( MaxSentenceSize )
{
    BOOST_CHECK_EQUAL(Utils::maxSentenceSize, 65536);
}

//The below examples use the following marker:
//00000000|00000000|00000000|00000011|00000000|00000101|00000000|00000111

BOOST_AUTO_TEST_CASE( CreateMarker )
{
    BOOST_CHECK_EQUAL(Utils::createMarker(3,5,7), 12885229575);
}

BOOST_AUTO_TEST_CASE( GetIdFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getIdFromMarker(12885229575), 3);
}

BOOST_AUTO_TEST_CASE( GetOffsetFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getOffsetFromMarker(12885229575), 5);
}

BOOST_AUTO_TEST_CASE( GetLengthFromMarker )
{
    BOOST_CHECK_EQUAL(Utils::getLengthFromMarker(12885229575), 7);
}

BOOST_AUTO_TEST_SUITE_END()
@@ -1,4 +1,5 @@
#include "concordia/word_map.hpp"
#include <climits>

WordMap::WordMap() throw(ConcordiaException) {