DNWA/11/wywolania/Data/stide_v1.1/Seq-code/seq_stream.cc
Jakub Stefko ab1d7e2546 ...
2021-01-28 18:33:55 +01:00

359 lines
14 KiB
C++
Executable File

#include <stdlib.h>
#include <string.h>
#include <iostream.h>
#include <fstream.h>
#include <stdio.h>
#include "../Utils/hash.h"
#include "seq_stream.h"
/********************************************************************
* Init() *
* Initializes an instance of Stream. *
* *
* Input: const Config &cfg Configuration information *
* const int intern internal stream identifier *
* const int extern external stream identifier *
* Output: none *
*******************************************************************/
void Stream::Init(const Config &cfg,
const int intern_id, const int extern_id) {
// initialize all the arrays
current_seq.Allocate(cfg.seq_len);
current_seq.Set(-1); // initialize the array to be empty
num_in_seq = -1;
num_pairs_read = 0;
num_anoms = 0;
num_seqs_fnd = 0;
int_sid = intern_id;
ext_sid = extern_id;
max_hdist = 0;
seq_hdist = 0;
lf.Allocate(cfg.lf_size);
lf.Set(0);
seq_lfc = 0;
max_lfc = 0;
ready = 0;
seq_len = cfg.seq_len;
}
/*********************************************************************
* Append() *
* This function puts the integer given into the current_seq array *
* as the last element. It flags ready according to whether *
* current_seq is full. Updates num_in_seq, ready, current_seq, *
* num_seqs_fnd, and num_pairs_read. *
* *
* Input: const int new_value The next value to be put into the *
* current_seq array *
* Output: none *
********************************************************************/
void Stream::Append(const int new_value)
{
// missing system call - zero the current sequence
if (new_value == -1) {
num_in_seq = -1;
ready = 0;
}
else {
num_pairs_read++;
if (num_in_seq < seq_len - 1) { // window not yet full
num_in_seq++;
current_seq[num_in_seq] = new_value;
if (num_in_seq == seq_len - 1) {
ready = 1;
++num_seqs_fnd;
}
}
else {
// Roll over current_seq array
for (int k = 0; k < num_in_seq; k++) {
current_seq[k] = current_seq[k + 1];
}
current_seq[num_in_seq] = new_value;
++num_seqs_fnd;
}
}
}
/********************************************************************
* AddToDB() *
* *
* Adds current_seq to the database if it isn't already there; *
* Returns 0 if it is already there, 1 if it is new. Updates *
* normal and db_size. *
* *
* Input: SeqForest &normal Forest of normal sequences *
* int &db_size Number of unique sequences in the *
* database *
* const int total_pairs_read Number of pairs read from the *
* entire input stream *
* const Config &cfg Configuration Information *
* Output: 0 if sequence isn't new, 1 if it is *
********************************************************************/
int Stream::AddToDB(SeqForest &normal, int &db_size, const int
total_pairs_read, const Config &cfg) const
{
int is_new;
// If there is not a tree with the same root as this sequence has,
// make a new tree with that root and flag trees_found
if (!normal.trees_found[current_seq[0]]) {
normal.trees[current_seq[0]].SetRoot(current_seq[0]);
normal.trees_found[current_seq[0]] = 1;
}
// Try to add the sequence. If it's already there, is_new will be
// set to 0, otherwise it will be set to 1.
is_new = normal.trees[current_seq[0]].InsertSeq(current_seq, 0,
seq_len-1);
db_size += is_new;
if ((is_new && cfg.verbose) || cfg.very_verbose) {
ReportNewSeq(cfg, total_pairs_read, db_size);
}
if (is_new)
return 1;
else
return 0;
}
/*********************************************************************
* CompareSeq() *
* Compares the current sequence in this stream to the database, *
* in the manner indicated by the configuration file. Reports *
* on anomalies if told to by the configuration file. Updates *
* num_anoms, seq_hdist, max_hdist, seq_lfc, and max_lfc. *
* *
* Input: const Config &cfg: Information from configuration file *
* const SeqForest &normal: DB of normal sequences *
* const int total_pairs_read: Number of pairs read from *
* all of the streams *
* Output: none *
********************************************************************/
void Stream::CompareSeq(const Config &cfg, const SeqForest &normal,
const int total_pairs_read)
{
int is_anom; // flag to indicate whether current_seq is an anomaly
is_anom = ComputeMisses(normal);
if ((is_anom) && (cfg.compute_hdist)) {
ComputeHDist(normal);
}
if (cfg.lf_size > 1) {
ComputeLF(is_anom, cfg.lf_size);
}
// if we're in verbose mode and either current_seq is an anomaly or
// its locality frame contains an anomaly, report it
if ((cfg.very_verbose) || (cfg.verbose && (is_anom || seq_lfc))) {
ReportSeq(cfg, total_pairs_read, is_anom);
}
}
/*********************************************************************
* ComputeMisses() *
* Compares the current sequence to the database sequences. If *
* there is an exact match, we return 0. Otherwise we return 1. *
* Updates num_anoms and seq_hdist. *
* *
* Input: const SeqForest &normal: DB of normal sequences *
* Output: 0 if there is an exact match *
* 1 if the sequence is anomalous *
********************************************************************/
int Stream::ComputeMisses(const SeqForest &normal)
{
if (normal.IsSeqInForest(current_seq, seq_len)) {
seq_hdist = 0;
return(0);
}
// We have an anomaly
++num_anoms;
return(1);
}
/*********************************************************************
* ComputeHDist() *
* Compares the current sequence in this stream to each sequence *
* in the database in turn, adding up the number of mismatches *
* between the two sequences. The smallest difference between *
* the current sequence and the database sequences is the minimum *
* Hamming distance for the current sequence. If this minimum *
* Hamming distance is greater than the largest minimum Hamming *
* distance encountered so far, then the variable max_hdist is *
* updated. Updates seq_hdist and max_hdist. *
* *
* Input: const SeqForest &normal: DB of normal sequences *
* *
* Output: none *
********************************************************************/
void Stream::ComputeHDist(const SeqForest &normal)
{
int misses_on_this_seq; // the number of mismatches between
// current_seq and the sequence we're
// comparing it with at the moment
seq_hdist = seq_len; // start with seq_hdist as high as
// possible
// We compare current_seq with each sequence in our database tree
for (int i = 0; i < normal.trees.Size(); i++) {
// Have we seen any sequences starting with element i? If not, we
// can go on to consider sequences starting with element i+1.
if (normal.trees_found[i]) {
misses_on_this_seq =
normal.trees[i].ComputeHDistForTree(current_seq, 0,
seq_len-1);
if (misses_on_this_seq < seq_hdist) {
seq_hdist = misses_on_this_seq;
}
}
}
if (seq_hdist > max_hdist) {
max_hdist = seq_hdist;
}
}
/*********************************************************************
* ComputeLF() *
* Computes the number of misses in current_seq's locality frame. *
* Updates lf, seq_lfc and max_lfc. *
* *
* Input: const int is_anom Flag to indicate whether *
* current_seq is an anomaly *
* const int lf_size Size of locality frame *
* Output: none *
********************************************************************/
void Stream::ComputeLF(const int is_anom, const int lf_size)
{
// When num_seqs_fnd is less than lf_size, the locality frame
// array is not full
if (num_seqs_fnd <= lf_size) {
lf[num_seqs_fnd-1] = is_anom;
seq_lfc += is_anom;
}
else {
// We're about to remove the first element of lf; since seq_lfc is
// the sum of the elements of lf, we should subtract lf[0] from
// seq_lfc to remove it from the sum.
seq_lfc -= lf[0];
// Now we add is_anom and seq_lfc is the sum of the new locality
// frame.
seq_lfc += is_anom;
// roll over the array
for (int i = 0; i < lf_size-1; i++) {
lf[i] = lf[i+1];
}
lf[lf_size-1] = is_anom;
}
if (seq_lfc > max_lfc) {
max_lfc = seq_lfc;
}
}
/*********************************************************************
* ReportSeq() *
* This function reports data about a sequence. Specifically, it *
* can report the external stream id, a number indicating where *
* the first element of the current sequence occurs in the input, *
* a number indicating how many pairs from this particular data *
* stream have been read prior to the first element of the *
* sequence, the minimum Hamming distance for the current *
* sequence, the locality frame count, the locality frame count, *
* and whether this particular sequence is itself an anomaly (it *
* could be that some other sequence in its locality frame is *
* anomalous). The configuration file determines which of those *
* possible data are reported and in what format. Updates no *
* values. *
* *
* Input: const Config &cfg Configuration information *
* const int total_pairs_read Total number of pairs read *
* from the input stream from any data *
* stream, not just this one *
* const int is_anom flag for whether the current *
* sequence is itself an anomaly *
* Output: none *
********************************************************************/
void Stream::ReportSeq(const Config &cfg, const int total_pairs_read,
const int is_anom) const
{
for (int i = 0; i < cfg.num_fvars; i++) {
switch (cfg.write_val[i]) {
case 'a':
printf(cfg.fmt_str[i], is_anom); break;
case 'c':
if (cfg.lf_size > 1) {
printf(cfg.fmt_str[i], seq_lfc);
}
break;
case 'h':
if (cfg.compute_hdist) {
printf(cfg.fmt_str[i], seq_hdist);
}
break;
case 'i':
printf(cfg.fmt_str[i], num_pairs_read); break;
case 'p':
printf(cfg.fmt_str[i], total_pairs_read); break;
case 's':
printf(cfg.fmt_str[i], ext_sid); break;
}
}
printf(cfg.fmt_str[cfg.num_fvars]);
}
/*********************************************************************
* ReportNewSeq() *
* This function reports on sequences which have been newly added *
* to the database. It can report the external stream *
* identifier, where the first element of the sequence occurs *
* both within the whole input stream and within its own data *
* stream, and the number of unique sequences in the database *
* after this sequence has been added. The configuration file *
* determines which of those possible data are reported and in *
* what format. Updates no values. *
* *
* Input: const Config &cfg Configuration information *
* const int total_pairs_read Total number of pairs read *
* from the input stream from any data *
* stream, not just this one *
* const int db_size Number of unique sequences *
* in the database *
* Output: none *
********************************************************************/
void Stream::ReportNewSeq(const Config &cfg, const int total_pairs_read,
const int db_size) const
{
for (int i = 0; i < cfg.num_fvars; i++) {
switch (cfg.write_val[i]) {
case 'd':
printf(cfg.fmt_str[i], db_size); break;
case 'i':
printf(cfg.fmt_str[i], num_pairs_read); break;
case 'p':
printf(cfg.fmt_str[i], total_pairs_read); break;
case 's':
printf(cfg.fmt_str[i], ext_sid); break;
}
}
printf(cfg.fmt_str[cfg.num_fvars]);
}