575 lines
23 KiB
C++
Executable File
575 lines
23 KiB
C++
Executable File
/*********************************************************************
|
|
* *
|
|
* STIDE: Sequence Time-Delay Embedding v1.1 *
|
|
* *
|
|
* Written by Steve Hofmeyr 7/21/96 *
|
|
* Revised by Julie Rehmeyer 3/98 *
|
|
* *
|
|
* Copyright (C) 1996, 1998 Regents of the University of New Mexico. *
|
|
* All Rights Reserved. *
|
|
* *
|
|
* This program is free software; you can redistribute it and/or *
|
|
* modify it under the terms of the GNU General Public License as *
|
|
* published by the Free Software Foundation; either version 2 of *
|
|
* the License, or (at your option) any later version. *
|
|
* *
|
|
* This program is distributed in the hope that it will be useful, *
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
* GNU General Public License for more details. *
|
|
* *
|
|
* You should have received a copy of the GNU General Public *
|
|
* License along with this program; if not, write to the Free *
|
|
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, *
|
|
* USA. *
|
|
* *
|
|
********************************************************************/
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <iostream.h>
|
|
#include <fstream.h>
|
|
#include "../Utils/arrays.h"
|
|
#include "../Utils/hash.h"
|
|
#include "seq_config.h"
|
|
#include "seq_stream.h"
|
|
#include "flexitree.h"
|
|
|
|
#define DBREV 1
|
|
|
|
int counter = 0;
|
|
|
|
Stream *GetReadyStream(Array<Stream> &streams, HashTableInt
|
|
&sid_table, int &num_streams_fnd, int
|
|
&total_pairs_read, const Config &cfg);
|
|
int ReadDB(SeqForest &db_forest, const string &db_name,
|
|
int &seq_len);
|
|
void WriteDB(const SeqForest &db_forest, const string &db_name, const
|
|
int db_size, const int seq_len);
|
|
void FinalReport(const Config &cfg, const SeqForest &normal, const int
|
|
num_streams_fnd, const int num_seqs_added, const
|
|
Array<Stream> &streams, const int db_size);
|
|
void WriteDBStats(const SeqForest &db_forest, ostream &out_stream,
|
|
const int db_size);
|
|
void OutputGraph(const SeqForest &db_forest, string db_name);
|
|
int GetPrimeLargerThan(const int n);
|
|
|
|
/*********************************************************************
|
|
* main() *
|
|
* Input: int argc: Number of command-line arguments *
|
|
* char *argv[]: array of strings containing *
|
|
* command-line arguments *
|
|
* Output: 0 if successful, -1 if unsuccessful *
|
|
*********************************************************************/
|
|
|
|
int main(int argc, char *argv[])
|
|
|
|
{
|
|
Config cfg((const int) argc, (const char **) argv);
|
|
// Declare configuration object and do
|
|
// the configuration on the basis of the
|
|
// command line arguments and the
|
|
// configuration file
|
|
Stream *active_stream; // This will point to the stream that
|
|
// currently has a sequence to be worked
|
|
// on (either added to the database or
|
|
// compared).
|
|
HashTableInt sid_table(GetPrimeLargerThan(cfg.max_streams));
|
|
// Hash table relating external stream ids to
|
|
// internal sids; make size of table
|
|
// smallest prime larger than the number
|
|
// of streams
|
|
SeqForest normal(cfg.max_elements); // Uninitialized forest of
|
|
// normal sequences
|
|
Array<Stream> streams(cfg.max_streams); // Array of stream objects,
|
|
// one for each data stream
|
|
// in input, which are
|
|
// allocated as needed
|
|
int num_streams_fnd = 0; // Number of data streams
|
|
// encountered to date
|
|
int total_pairs_read = cfg.pair_offset; // Number of pairs read from
|
|
// input to date from all
|
|
// the data streams combined
|
|
// -- can be offset using
|
|
// the "-n" switch
|
|
int db_size; // Total number of unique
|
|
// sequences in the database
|
|
int init_db_size = 0; // Number of unique
|
|
// sequences in the
|
|
// pre-existing database
|
|
|
|
|
|
|
|
|
|
// Read database into normal, if database exists
|
|
db_size = init_db_size = ReadDB(normal, cfg.db_name, cfg.seq_len);
|
|
|
|
if (cfg.add_to_db) {
|
|
while ((active_stream =
|
|
GetReadyStream(streams, sid_table, num_streams_fnd,
|
|
total_pairs_read, cfg))
|
|
!= NULL) {
|
|
active_stream->AddToDB(normal, db_size, total_pairs_read, cfg);
|
|
}
|
|
WriteDB(normal, cfg.db_name, db_size, cfg.seq_len);
|
|
if (cfg.output_graph) {
|
|
OutputGraph(normal,cfg.db_name);
|
|
}
|
|
|
|
}
|
|
else {
|
|
int i = 0;
|
|
while ((active_stream =
|
|
GetReadyStream(streams, sid_table, num_streams_fnd,
|
|
total_pairs_read, cfg))
|
|
!= NULL) {
|
|
active_stream->CompareSeq(cfg, normal, total_pairs_read);
|
|
}
|
|
}
|
|
FinalReport(cfg, normal, num_streams_fnd, db_size - init_db_size,
|
|
streams, db_size);
|
|
return(0);
|
|
}
|
|
|
|
/**********************************************************************
|
|
* GetReadyStream() *
|
|
* This function reads a pair from the input, appends the element *
|
|
* to the current sequence string in the appropriate data stream, *
|
|
* finds out if that data stream has a complete sequence to be *
|
|
* processed, continues until it has found such a data stream, and *
|
|
* returns a pointer to it. It updates num_streams_fnd, *
|
|
* total_pairs_read, sid_table, and streams. *
|
|
* *
|
|
* Input: Array<Stream> &streams: the array of streams that we have *
|
|
* found so far *
|
|
* HashTableInt &sid_table: hash table relating external sids *
|
|
* to internal sids *
|
|
* int &num_streams_fnd: the number of streams found so far; *
|
|
* int &total_pairs_read: the number of pairs read from the *
|
|
* input stream so far *
|
|
* const Config &cfg: configuration information *
|
|
* *
|
|
* Output: a pointer to the next stream that is ready for processing *
|
|
**********************************************************************/
|
|
|
|
Stream *GetReadyStream(Array<Stream> &streams, HashTableInt
|
|
&sid_table, int &num_streams_fnd, int
|
|
&total_pairs_read, const Config &cfg)
|
|
|
|
{
|
|
Stream *ready_stream = NULL;
|
|
int ext_sid;
|
|
int int_sid;
|
|
int sval;
|
|
|
|
cin >> ext_sid;
|
|
while (!cin.eof()) {
|
|
if (ext_sid == -1) {
|
|
break;
|
|
}
|
|
int_sid = sid_table.ExtToInt(ext_sid, num_streams_fnd);
|
|
cin >> sval;
|
|
++total_pairs_read;
|
|
|
|
// Update num_streams_fnd, if necessary
|
|
if (int_sid >= num_streams_fnd) {
|
|
if (int_sid > cfg.max_streams) {
|
|
cerr<<"ERROR: Too many streams to follow, aborting..."<<endl;
|
|
exit(-1);
|
|
}
|
|
// We need a new stream object
|
|
streams[num_streams_fnd].Init(cfg, int_sid, ext_sid);
|
|
num_streams_fnd = int_sid + 1;
|
|
}
|
|
streams[int_sid].Append(sval);
|
|
if (streams[int_sid].Ready()) {
|
|
ready_stream = &streams[int_sid];
|
|
break;
|
|
}
|
|
cin >> ext_sid;
|
|
}
|
|
|
|
return ready_stream;
|
|
}
|
|
|
|
/*********************************************************************
|
|
* ReadDB() *
|
|
* Reads the database from a file and returns the number of unique *
|
|
* sequences in the database. Checks for appropriate revision *
|
|
* number. If it is a revision DBREV database, the second line *
|
|
* will be "#DBseq_len: " followed by the sequence length. The *
|
|
* next line will contain a single number, giving the root of the *
|
|
* first tree. The following lines will contain the tree itself. *
|
|
* The first seq_len numbers make up the first sequence (so the *
|
|
* first number of the second line will be the same as the number *
|
|
* on the first line). The next number will be a negative number *
|
|
* between -(seq_len-1) and -2, indicating how far to backtrack in *
|
|
* the first sequence, and the following positive numbers give the *
|
|
* rest of the second sequence. So, for example, -3 would mean *
|
|
* backtrack 3 numbers, take the previous numbers including the *
|
|
* one you're on, and append the next two numbers. So after the *
|
|
* -3 you would find two positive numbers, followed by a negative *
|
|
* number (which you would use the same way as you used the -3, on *
|
|
* the most recent sequence). Each tree is terminated by the *
|
|
* number -1. So the sample input file *
|
|
* 3 *
|
|
* 3 4 2 9 10 3 -4 3 9 8 -2 3 -3 4 9 -1 *
|
|
* 2 *
|
|
* 2 3 4 5 6 7 -3 2 9 -1 *
|
|
* yields the sequences: *
|
|
* 3 4 2 9 10 3 *
|
|
* 3 4 2 3 9 8 *
|
|
* 3 4 2 3 9 3 *
|
|
* 3 4 2 3 4 9 *
|
|
* 2 3 4 5 6 7 *
|
|
* 2 3 4 5 2 9 *
|
|
* *
|
|
* Input: SeqForest &db_forest Forest of sequences *
|
|
* const string &db_name Name of database *
|
|
* int &seq_len User-specified sequence length *
|
|
* *
|
|
* Output: the number of unique sequences in the database *
|
|
* *
|
|
********************************************************************/
|
|
|
|
int ReadDB(SeqForest &db_forest, const string &db_name,
|
|
int &seq_len)
|
|
{
|
|
ifstream in_db_file(db_name.c_str()); // file to read the database from
|
|
int db_size = 0; // size of the database
|
|
int root; // the first element of the sequences
|
|
// we are reading in at the moment;
|
|
// i.e., the root of this tree
|
|
string buff;
|
|
int db_seq_len;
|
|
int rev_num;
|
|
|
|
if (!in_db_file.is_open()) {
|
|
cerr<<"WARNING: Cannot open database file " << db_name
|
|
<< " for input"<<endl<<"Creating a new file"<<endl;
|
|
return 0;
|
|
}
|
|
|
|
// Check to see if the first line contains "#DBrev:"
|
|
in_db_file>>buff;
|
|
if (buff == "#DBrev:") {
|
|
in_db_file>>rev_num;
|
|
if (rev_num > DBREV) {
|
|
cerr << "ERROR: The revision number is greater than " << DBREV
|
|
<< ". This version of STIDE is only capable of dealing "
|
|
<< "with databases through DBrev " << DBREV
|
|
<< ". Aborting..."<<endl;
|
|
exit(-1);
|
|
}
|
|
if (rev_num < DBREV) {
|
|
cerr << "ERROR: Revision number of database must be >= " << DBREV
|
|
<< endl;
|
|
exit(-1);
|
|
}
|
|
// Now we know that it is revision DBREV. Check sequence length of
|
|
// database against user-indicated sequence length
|
|
in_db_file>>buff;
|
|
// Now check to see if next line is "#DBseq_len: " followed by a
|
|
// number
|
|
if (buff != "#DBseq_len:") {
|
|
cerr << "ERROR: The second line of the database does not "
|
|
<< "contain the string \"#DBseq_len: \"" << endl
|
|
<< "followed by the sequence length of the database, as "
|
|
<< "required of revision " << DBREV
|
|
<< " databases. Aborting..."<< endl;
|
|
exit(-1);
|
|
}
|
|
in_db_file>>db_seq_len;
|
|
if (db_seq_len != seq_len) {
|
|
cerr << "WARNING: Database sequence length is " << db_seq_len
|
|
<< ", which does not match "
|
|
<< "sequence length specified" << endl
|
|
<< "by user (or by default if no specification was given), "
|
|
<< "which is " << seq_len << endl
|
|
<< "I will use the database sequence length. If that is "
|
|
<< "not what you intended, type Ctrl-C to abort." << endl;
|
|
seq_len = db_seq_len;
|
|
}
|
|
// Read next number into root
|
|
in_db_file >> root;
|
|
}
|
|
// Otherwise, we assume we have an old-style database, and let the
|
|
// user know that that's our assumption
|
|
else {
|
|
cerr << "WARNING: The string \"DBrev: \" is not in the first "
|
|
<< "line of the database." << endl
|
|
<< "I'm assuming that it's an older style of database, and "
|
|
<< "will read it in" << endl
|
|
<< "based on that assumption. If that is not what you want "
|
|
<< "me to do, type CTRL-C" << endl << endl;
|
|
// we have just read the first root into buff -- put it in root
|
|
// instead
|
|
root = atoi(buff.c_str());
|
|
}
|
|
|
|
while (!in_db_file.eof()) {
|
|
if (root == -1) break;
|
|
db_forest.trees_found[root]++;
|
|
in_db_file>>db_forest.trees[root];
|
|
db_size += db_forest.trees[root].NumLeaves();
|
|
in_db_file>>root;
|
|
}
|
|
in_db_file.close();
|
|
|
|
return db_size;
|
|
}
|
|
|
|
|
|
/*********************************************************************
|
|
* WriteDB() *
|
|
* Writes db_forest to the file db_name, with the format described *
|
|
* in the header of ReadDB(). Prints database statistics at the *
|
|
* end of the file. *
|
|
* *
|
|
* Input: const SeqForest &db_forest Forest of sequences in *
|
|
* database *
|
|
* const string &db_name Name of file in which to *
|
|
* put database. *
|
|
* const int db_size Number of unique sequences *
|
|
* in the database *
|
|
* const int seq_len Sequence length *
|
|
* *
|
|
* Output: none *
|
|
********************************************************************/
|
|
|
|
void WriteDB(const SeqForest &db_forest, const string &db_name, const
|
|
int db_size, const int seq_len)
|
|
{
|
|
|
|
ofstream out_db_file(db_name.c_str());
|
|
|
|
if (!out_db_file.is_open()) {
|
|
cerr << "ERROR: Cannot open database file " << db_name
|
|
<< "for output, aborting..." << endl ;
|
|
exit(-2);
|
|
}
|
|
out_db_file << "#DBrev: " << DBREV << endl;
|
|
out_db_file << "#DBseq_len: " << seq_len << endl;
|
|
|
|
for (int i = 0; i < db_forest.trees.Size(); i++) {
|
|
if (db_forest.trees_found[i]) {
|
|
out_db_file<<i<<endl;
|
|
out_db_file<<db_forest.trees[i]<<endl;
|
|
}
|
|
}
|
|
out_db_file<<" -1"<<endl;
|
|
// we can now write anything, so I will write the db stats
|
|
out_db_file<<"; DB STATS"<<endl;
|
|
WriteDBStats(db_forest, out_db_file, db_size);
|
|
out_db_file.close();
|
|
|
|
}
|
|
|
|
/*********************************************************************
|
|
* FinalReport() *
|
|
* Reports data at end of run. The number of streams, the number *
|
|
* of input pairs, and the number of sequences in the input are *
|
|
* always reported. If we have done a comparison run, we report *
|
|
* the number of anomalies, and the precentage of sequences that *
|
|
* were anomalous. Additionally, if asked for, the Hamming *
|
|
* distance or locality frame count is reported. If we have added *
|
|
* to the database, we report having done so and report the number *
|
|
* of sequences added. If database statistics are asked for, we *
|
|
* report the number of nodes, the number of unique sequences, the *
|
|
* number of branches, and the average database branch factor. *
|
|
* *
|
|
* Input: const Config &cfg: Configuration information *
|
|
* const SeqForest &normal: DB of normal sequences *
|
|
* const int num_streams_fnd: Total number of streams found*
|
|
* const int num_seqs_added: Number of unique sequences *
|
|
* added *
|
|
* const Array<Stream> &streams: Array of data streams *
|
|
* const int db_size: Number of unique sequences *
|
|
* in DB *
|
|
* *
|
|
* Output: none *
|
|
* *
|
|
*********************************************************************/
|
|
|
|
void FinalReport(const Config &cfg, const SeqForest &normal, const int
|
|
num_streams_fnd, const int num_seqs_added, const
|
|
Array<Stream> &streams, const int db_size)
|
|
{
|
|
int total_pairs = 0;
|
|
int total_seqs = 0;
|
|
int total_anoms = 0;
|
|
int total_max_lfc = 0;
|
|
int total_max_hdist = 0;
|
|
int db_nodes = 0;
|
|
int db_seqs = 0;
|
|
int db_branches = 0;
|
|
int j;
|
|
|
|
// Sum up number of pairs input and number of seqs from all the streams
|
|
for (j = 0; j < num_streams_fnd; j++) {
|
|
total_seqs += streams[j].GetNumSeqsFnd();
|
|
total_pairs += streams[j].GetNumPairsRead();
|
|
}
|
|
|
|
cout << endl;
|
|
cout << "Number of different streams in input = "
|
|
<< num_streams_fnd << endl;
|
|
cout << "Total number of input pairs = "
|
|
<< total_pairs << endl;
|
|
cout << "Total number of sequences in input = "
|
|
<< total_seqs << endl;
|
|
|
|
if (cfg.add_to_db) {
|
|
cout << "File added to database" << endl;
|
|
cout << "Number of new sequences added to the database: "
|
|
<< num_seqs_added << endl;
|
|
}
|
|
else {
|
|
cout << "Scan completed" << endl;
|
|
// Sum up number of anomalies from all the streams
|
|
for (j = 0; j < num_streams_fnd; j++) {
|
|
total_anoms += streams[j].GetNumAnoms();
|
|
}
|
|
|
|
cout << "Number of anomalies = "
|
|
<< total_anoms << endl;
|
|
cout << "Percentage anomalous = "
|
|
<< ((float)total_anoms * 100.0)/total_seqs << endl;
|
|
|
|
// If asked for, compute Hamming distances across streams and report
|
|
if (cfg.compute_hdist) {
|
|
for (j = 0; j < num_streams_fnd; j++) {
|
|
if (streams[j].GetMaxHDist() > total_max_hdist) {
|
|
total_max_hdist = streams[j].GetMaxHDist();
|
|
}
|
|
}
|
|
cout << "Largest minimum Hamming distance = "
|
|
<< total_max_hdist << endl;
|
|
}
|
|
|
|
// If asked for, compute lfc across streams and report
|
|
if (cfg.lf_size > 1) {
|
|
for (j = 0; j < num_streams_fnd; j++) {
|
|
if (streams[j].GetMaxLFC() > total_max_lfc) {
|
|
total_max_lfc = streams[j].GetMaxLFC();
|
|
}
|
|
}
|
|
cout << "Maximum lfc = " << total_max_lfc << endl;
|
|
}
|
|
}
|
|
|
|
// If asked for, compute db stats and report
|
|
if (cfg.write_db_stats) {
|
|
WriteDBStats(normal, cout, db_size);
|
|
}
|
|
}
|
|
|
|
|
|
/*********************************************************************
|
|
* WriteDBStats() *
|
|
* Computes and writes to standard output the number of nodes in *
|
|
* the database, the number of unique sequences, the number of *
|
|
* branches, and the average database branch factor. *
|
|
* *
|
|
* Input: const SeqForest &db_forest Forest of sequences in *
|
|
* database *
|
|
* ostream &out_stream Where to write info *
|
|
* const int db_size Number of unique sequences in the *
|
|
* database *
|
|
* *
|
|
* Output: none *
|
|
*********************************************************************/
|
|
|
|
void WriteDBStats(const SeqForest &db_forest, ostream &out_stream,
|
|
const int db_size)
|
|
{
|
|
int db_nodes = 0;
|
|
int db_branches = 0;
|
|
|
|
for (int i = 0; i < db_forest.trees.Size(); i++) {
|
|
if (db_forest.trees_found[i]) {
|
|
db_nodes += db_forest.trees[i].NumNodes();
|
|
db_branches += db_forest.trees[i].NumBranches();
|
|
}
|
|
}
|
|
|
|
out_stream << "Number of DB nodes = " << db_nodes << endl;
|
|
out_stream << "Number of unique sequences = "<<db_size << endl;
|
|
out_stream << "Number of branches (edges) = "<<db_branches << endl;
|
|
out_stream << "Average DB branch factor = "
|
|
<<((float)db_branches/(db_nodes - db_size))<<endl;
|
|
|
|
}
|
|
|
|
|
|
/*********************************************************************
|
|
* OutputGraph() *
|
|
* Writes a file db_name.dot containing input for the program Dot. *
|
|
* Running Dot on db_name.dot produces a PostScript file *
|
|
* containing a picture of the whole database tree. *
|
|
* *
|
|
* Input: const SeqForest &db_forest Forest of sequences in *
|
|
* database *
|
|
* const string db_name Filename to use *
|
|
* *
|
|
* Output: none *
|
|
*********************************************************************/
|
|
|
|
void OutputGraph(const SeqForest &db_forest, const string db_name)
|
|
{
|
|
char *dot_filename;
|
|
dot_filename = new char [strlen(db_name.c_str())+4];
|
|
strcpy(dot_filename, db_name.c_str());
|
|
ofstream output_file(strcat(dot_filename,".dot"));
|
|
|
|
output_file<<"digraph \""<<db_name<<"\" {"<<endl;
|
|
output_file<<" ratio=auto;"<<endl;
|
|
output_file<<" page=\"8.5,11\";"<<endl;
|
|
for (int i = 0; i < db_forest.trees.Size(); i++) {
|
|
if (db_forest.trees_found[i])
|
|
db_forest.trees[i].OutputGraph(output_file);
|
|
}
|
|
output_file<<"}"<<endl;
|
|
output_file.close();
|
|
}
|
|
|
|
|
|
/****************************************************************************
|
|
* GetPrimeLargerThan(int n) *
|
|
* Returns the smallest prime larger than the input integer. *
|
|
* Changes no values. *
|
|
* *
|
|
* Input: const int n *
|
|
* Output: smallest prime larger than n *
|
|
***************************************************************************/
|
|
|
|
int GetPrimeLargerThan(const int n)
|
|
{
|
|
int primes[n];
|
|
int primes_fnd = 1;
|
|
int curr_num = 3;
|
|
int is_prime = 1;
|
|
|
|
primes[0] = 2;
|
|
while(1) {
|
|
for (int i = 0; i < primes_fnd; i++) {
|
|
if ((curr_num % primes[i]) == 0) {
|
|
is_prime = 0;
|
|
break;
|
|
}
|
|
}
|
|
if (is_prime == 1) {
|
|
primes[primes_fnd++] = curr_num;
|
|
if (curr_num > n) {
|
|
break;
|
|
}
|
|
}
|
|
curr_num = curr_num + 2;
|
|
is_prime = 1;
|
|
}
|
|
return curr_num;
|
|
}
|
|
|
|
|
|
|