#include #include #include #include #include #include "../Utils/hash.h" #include "seq_stream.h" /******************************************************************** * Init() * * Initializes an instance of Stream. * * * * Input: const Config &cfg Configuration information * * const int intern internal stream identifier * * const int extern external stream identifier * * Output: none * *******************************************************************/ void Stream::Init(const Config &cfg, const int intern_id, const int extern_id) { // initialize all the arrays current_seq.Allocate(cfg.seq_len); current_seq.Set(-1); // initialize the array to be empty num_in_seq = -1; num_pairs_read = 0; num_anoms = 0; num_seqs_fnd = 0; int_sid = intern_id; ext_sid = extern_id; max_hdist = 0; seq_hdist = 0; lf.Allocate(cfg.lf_size); lf.Set(0); seq_lfc = 0; max_lfc = 0; ready = 0; seq_len = cfg.seq_len; } /********************************************************************* * Append() * * This function puts the integer given into the current_seq array * * as the last element. It flags ready according to whether * * current_seq is full. Updates num_in_seq, ready, current_seq, * * num_seqs_fnd, and num_pairs_read. * * * * Input: const int new_value The next value to be put into the * * current_seq array * * Output: none * ********************************************************************/ void Stream::Append(const int new_value) { // missing system call - zero the current sequence if (new_value == -1) { num_in_seq = -1; ready = 0; } else { num_pairs_read++; if (num_in_seq < seq_len - 1) { // window not yet full num_in_seq++; current_seq[num_in_seq] = new_value; if (num_in_seq == seq_len - 1) { ready = 1; ++num_seqs_fnd; } } else { // Roll over current_seq array for (int k = 0; k < num_in_seq; k++) { current_seq[k] = current_seq[k + 1]; } current_seq[num_in_seq] = new_value; ++num_seqs_fnd; } } } /******************************************************************** * AddToDB() * * * * Adds current_seq to the database if it isn't already there; * * Returns 0 if it is already there, 1 if it is new. Updates * * normal and db_size. * * * * Input: SeqForest &normal Forest of normal sequences * * int &db_size Number of unique sequences in the * * database * * const int total_pairs_read Number of pairs read from the * * entire input stream * * const Config &cfg Configuration Information * * Output: 0 if sequence isn't new, 1 if it is * ********************************************************************/ int Stream::AddToDB(SeqForest &normal, int &db_size, const int total_pairs_read, const Config &cfg) const { int is_new; // If there is not a tree with the same root as this sequence has, // make a new tree with that root and flag trees_found if (!normal.trees_found[current_seq[0]]) { normal.trees[current_seq[0]].SetRoot(current_seq[0]); normal.trees_found[current_seq[0]] = 1; } // Try to add the sequence. If it's already there, is_new will be // set to 0, otherwise it will be set to 1. is_new = normal.trees[current_seq[0]].InsertSeq(current_seq, 0, seq_len-1); db_size += is_new; if ((is_new && cfg.verbose) || cfg.very_verbose) { ReportNewSeq(cfg, total_pairs_read, db_size); } if (is_new) return 1; else return 0; } /********************************************************************* * CompareSeq() * * Compares the current sequence in this stream to the database, * * in the manner indicated by the configuration file. Reports * * on anomalies if told to by the configuration file. Updates * * num_anoms, seq_hdist, max_hdist, seq_lfc, and max_lfc. * * * * Input: const Config &cfg: Information from configuration file * * const SeqForest &normal: DB of normal sequences * * const int total_pairs_read: Number of pairs read from * * all of the streams * * Output: none * ********************************************************************/ void Stream::CompareSeq(const Config &cfg, const SeqForest &normal, const int total_pairs_read) { int is_anom; // flag to indicate whether current_seq is an anomaly is_anom = ComputeMisses(normal); if ((is_anom) && (cfg.compute_hdist)) { ComputeHDist(normal); } if (cfg.lf_size > 1) { ComputeLF(is_anom, cfg.lf_size); } // if we're in verbose mode and either current_seq is an anomaly or // its locality frame contains an anomaly, report it if ((cfg.very_verbose) || (cfg.verbose && (is_anom || seq_lfc))) { ReportSeq(cfg, total_pairs_read, is_anom); } } /********************************************************************* * ComputeMisses() * * Compares the current sequence to the database sequences. If * * there is an exact match, we return 0. Otherwise we return 1. * * Updates num_anoms and seq_hdist. * * * * Input: const SeqForest &normal: DB of normal sequences * * Output: 0 if there is an exact match * * 1 if the sequence is anomalous * ********************************************************************/ int Stream::ComputeMisses(const SeqForest &normal) { if (normal.IsSeqInForest(current_seq, seq_len)) { seq_hdist = 0; return(0); } // We have an anomaly ++num_anoms; return(1); } /********************************************************************* * ComputeHDist() * * Compares the current sequence in this stream to each sequence * * in the database in turn, adding up the number of mismatches * * between the two sequences. The smallest difference between * * the current sequence and the database sequences is the minimum * * Hamming distance for the current sequence. If this minimum * * Hamming distance is greater than the largest minimum Hamming * * distance encountered so far, then the variable max_hdist is * * updated. Updates seq_hdist and max_hdist. * * * * Input: const SeqForest &normal: DB of normal sequences * * * * Output: none * ********************************************************************/ void Stream::ComputeHDist(const SeqForest &normal) { int misses_on_this_seq; // the number of mismatches between // current_seq and the sequence we're // comparing it with at the moment seq_hdist = seq_len; // start with seq_hdist as high as // possible // We compare current_seq with each sequence in our database tree for (int i = 0; i < normal.trees.Size(); i++) { // Have we seen any sequences starting with element i? If not, we // can go on to consider sequences starting with element i+1. if (normal.trees_found[i]) { misses_on_this_seq = normal.trees[i].ComputeHDistForTree(current_seq, 0, seq_len-1); if (misses_on_this_seq < seq_hdist) { seq_hdist = misses_on_this_seq; } } } if (seq_hdist > max_hdist) { max_hdist = seq_hdist; } } /********************************************************************* * ComputeLF() * * Computes the number of misses in current_seq's locality frame. * * Updates lf, seq_lfc and max_lfc. * * * * Input: const int is_anom Flag to indicate whether * * current_seq is an anomaly * * const int lf_size Size of locality frame * * Output: none * ********************************************************************/ void Stream::ComputeLF(const int is_anom, const int lf_size) { // When num_seqs_fnd is less than lf_size, the locality frame // array is not full if (num_seqs_fnd <= lf_size) { lf[num_seqs_fnd-1] = is_anom; seq_lfc += is_anom; } else { // We're about to remove the first element of lf; since seq_lfc is // the sum of the elements of lf, we should subtract lf[0] from // seq_lfc to remove it from the sum. seq_lfc -= lf[0]; // Now we add is_anom and seq_lfc is the sum of the new locality // frame. seq_lfc += is_anom; // roll over the array for (int i = 0; i < lf_size-1; i++) { lf[i] = lf[i+1]; } lf[lf_size-1] = is_anom; } if (seq_lfc > max_lfc) { max_lfc = seq_lfc; } } /********************************************************************* * ReportSeq() * * This function reports data about a sequence. Specifically, it * * can report the external stream id, a number indicating where * * the first element of the current sequence occurs in the input, * * a number indicating how many pairs from this particular data * * stream have been read prior to the first element of the * * sequence, the minimum Hamming distance for the current * * sequence, the locality frame count, the locality frame count, * * and whether this particular sequence is itself an anomaly (it * * could be that some other sequence in its locality frame is * * anomalous). The configuration file determines which of those * * possible data are reported and in what format. Updates no * * values. * * * * Input: const Config &cfg Configuration information * * const int total_pairs_read Total number of pairs read * * from the input stream from any data * * stream, not just this one * * const int is_anom flag for whether the current * * sequence is itself an anomaly * * Output: none * ********************************************************************/ void Stream::ReportSeq(const Config &cfg, const int total_pairs_read, const int is_anom) const { for (int i = 0; i < cfg.num_fvars; i++) { switch (cfg.write_val[i]) { case 'a': printf(cfg.fmt_str[i], is_anom); break; case 'c': if (cfg.lf_size > 1) { printf(cfg.fmt_str[i], seq_lfc); } break; case 'h': if (cfg.compute_hdist) { printf(cfg.fmt_str[i], seq_hdist); } break; case 'i': printf(cfg.fmt_str[i], num_pairs_read); break; case 'p': printf(cfg.fmt_str[i], total_pairs_read); break; case 's': printf(cfg.fmt_str[i], ext_sid); break; } } printf(cfg.fmt_str[cfg.num_fvars]); } /********************************************************************* * ReportNewSeq() * * This function reports on sequences which have been newly added * * to the database. It can report the external stream * * identifier, where the first element of the sequence occurs * * both within the whole input stream and within its own data * * stream, and the number of unique sequences in the database * * after this sequence has been added. The configuration file * * determines which of those possible data are reported and in * * what format. Updates no values. * * * * Input: const Config &cfg Configuration information * * const int total_pairs_read Total number of pairs read * * from the input stream from any data * * stream, not just this one * * const int db_size Number of unique sequences * * in the database * * Output: none * ********************************************************************/ void Stream::ReportNewSeq(const Config &cfg, const int total_pairs_read, const int db_size) const { for (int i = 0; i < cfg.num_fvars; i++) { switch (cfg.write_val[i]) { case 'd': printf(cfg.fmt_str[i], db_size); break; case 'i': printf(cfg.fmt_str[i], num_pairs_read); break; case 'p': printf(cfg.fmt_str[i], total_pairs_read); break; case 's': printf(cfg.fmt_str[i], ext_sid); break; } } printf(cfg.fmt_str[cfg.num_fvars]); }