DNWA/11/wywolania/Data/stide_v1.1/Seq-code/stide.cc

575 lines
23 KiB
C++
Raw Normal View History

2021-01-28 18:33:55 +01:00
/*********************************************************************
* *
* STIDE: Sequence Time-Delay Embedding v1.1 *
* *
* Written by Steve Hofmeyr 7/21/96 *
* Revised by Julie Rehmeyer 3/98 *
* *
* Copyright (C) 1996, 1998 Regents of the University of New Mexico. *
* All Rights Reserved. *
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License as *
* published by the Free Software Foundation; either version 2 of *
* the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public *
* License along with this program; if not, write to the Free *
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, *
* USA. *
* *
********************************************************************/
#include <stdlib.h>
#include <string.h>
#include <iostream.h>
#include <fstream.h>
#include "../Utils/arrays.h"
#include "../Utils/hash.h"
#include "seq_config.h"
#include "seq_stream.h"
#include "flexitree.h"
#define DBREV 1
int counter = 0;
Stream *GetReadyStream(Array<Stream> &streams, HashTableInt
&sid_table, int &num_streams_fnd, int
&total_pairs_read, const Config &cfg);
int ReadDB(SeqForest &db_forest, const string &db_name,
int &seq_len);
void WriteDB(const SeqForest &db_forest, const string &db_name, const
int db_size, const int seq_len);
void FinalReport(const Config &cfg, const SeqForest &normal, const int
num_streams_fnd, const int num_seqs_added, const
Array<Stream> &streams, const int db_size);
void WriteDBStats(const SeqForest &db_forest, ostream &out_stream,
const int db_size);
void OutputGraph(const SeqForest &db_forest, string db_name);
int GetPrimeLargerThan(const int n);
/*********************************************************************
* main() *
* Input: int argc: Number of command-line arguments *
* char *argv[]: array of strings containing *
* command-line arguments *
* Output: 0 if successful, -1 if unsuccessful *
*********************************************************************/
int main(int argc, char *argv[])
{
Config cfg((const int) argc, (const char **) argv);
// Declare configuration object and do
// the configuration on the basis of the
// command line arguments and the
// configuration file
Stream *active_stream; // This will point to the stream that
// currently has a sequence to be worked
// on (either added to the database or
// compared).
HashTableInt sid_table(GetPrimeLargerThan(cfg.max_streams));
// Hash table relating external stream ids to
// internal sids; make size of table
// smallest prime larger than the number
// of streams
SeqForest normal(cfg.max_elements); // Uninitialized forest of
// normal sequences
Array<Stream> streams(cfg.max_streams); // Array of stream objects,
// one for each data stream
// in input, which are
// allocated as needed
int num_streams_fnd = 0; // Number of data streams
// encountered to date
int total_pairs_read = cfg.pair_offset; // Number of pairs read from
// input to date from all
// the data streams combined
// -- can be offset using
// the "-n" switch
int db_size; // Total number of unique
// sequences in the database
int init_db_size = 0; // Number of unique
// sequences in the
// pre-existing database
// Read database into normal, if database exists
db_size = init_db_size = ReadDB(normal, cfg.db_name, cfg.seq_len);
if (cfg.add_to_db) {
while ((active_stream =
GetReadyStream(streams, sid_table, num_streams_fnd,
total_pairs_read, cfg))
!= NULL) {
active_stream->AddToDB(normal, db_size, total_pairs_read, cfg);
}
WriteDB(normal, cfg.db_name, db_size, cfg.seq_len);
if (cfg.output_graph) {
OutputGraph(normal,cfg.db_name);
}
}
else {
int i = 0;
while ((active_stream =
GetReadyStream(streams, sid_table, num_streams_fnd,
total_pairs_read, cfg))
!= NULL) {
active_stream->CompareSeq(cfg, normal, total_pairs_read);
}
}
FinalReport(cfg, normal, num_streams_fnd, db_size - init_db_size,
streams, db_size);
return(0);
}
/**********************************************************************
* GetReadyStream() *
* This function reads a pair from the input, appends the element *
* to the current sequence string in the appropriate data stream, *
* finds out if that data stream has a complete sequence to be *
* processed, continues until it has found such a data stream, and *
* returns a pointer to it. It updates num_streams_fnd, *
* total_pairs_read, sid_table, and streams. *
* *
* Input: Array<Stream> &streams: the array of streams that we have *
* found so far *
* HashTableInt &sid_table: hash table relating external sids *
* to internal sids *
* int &num_streams_fnd: the number of streams found so far; *
* int &total_pairs_read: the number of pairs read from the *
* input stream so far *
* const Config &cfg: configuration information *
* *
* Output: a pointer to the next stream that is ready for processing *
**********************************************************************/
Stream *GetReadyStream(Array<Stream> &streams, HashTableInt
&sid_table, int &num_streams_fnd, int
&total_pairs_read, const Config &cfg)
{
Stream *ready_stream = NULL;
int ext_sid;
int int_sid;
int sval;
cin >> ext_sid;
while (!cin.eof()) {
if (ext_sid == -1) {
break;
}
int_sid = sid_table.ExtToInt(ext_sid, num_streams_fnd);
cin >> sval;
++total_pairs_read;
// Update num_streams_fnd, if necessary
if (int_sid >= num_streams_fnd) {
if (int_sid > cfg.max_streams) {
cerr<<"ERROR: Too many streams to follow, aborting..."<<endl;
exit(-1);
}
// We need a new stream object
streams[num_streams_fnd].Init(cfg, int_sid, ext_sid);
num_streams_fnd = int_sid + 1;
}
streams[int_sid].Append(sval);
if (streams[int_sid].Ready()) {
ready_stream = &streams[int_sid];
break;
}
cin >> ext_sid;
}
return ready_stream;
}
/*********************************************************************
* ReadDB() *
* Reads the database from a file and returns the number of unique *
* sequences in the database. Checks for appropriate revision *
* number. If it is a revision DBREV database, the second line *
* will be "#DBseq_len: " followed by the sequence length. The *
* next line will contain a single number, giving the root of the *
* first tree. The following lines will contain the tree itself. *
* The first seq_len numbers make up the first sequence (so the *
* first number of the second line will be the same as the number *
* on the first line). The next number will be a negative number *
* between -(seq_len-1) and -2, indicating how far to backtrack in *
* the first sequence, and the following positive numbers give the *
* rest of the second sequence. So, for example, -3 would mean *
* backtrack 3 numbers, take the previous numbers including the *
* one you're on, and append the next two numbers. So after the *
* -3 you would find two positive numbers, followed by a negative *
* number (which you would use the same way as you used the -3, on *
* the most recent sequence). Each tree is terminated by the *
* number -1. So the sample input file *
* 3 *
* 3 4 2 9 10 3 -4 3 9 8 -2 3 -3 4 9 -1 *
* 2 *
* 2 3 4 5 6 7 -3 2 9 -1 *
* yields the sequences: *
* 3 4 2 9 10 3 *
* 3 4 2 3 9 8 *
* 3 4 2 3 9 3 *
* 3 4 2 3 4 9 *
* 2 3 4 5 6 7 *
* 2 3 4 5 2 9 *
* *
* Input: SeqForest &db_forest Forest of sequences *
* const string &db_name Name of database *
* int &seq_len User-specified sequence length *
* *
* Output: the number of unique sequences in the database *
* *
********************************************************************/
int ReadDB(SeqForest &db_forest, const string &db_name,
int &seq_len)
{
ifstream in_db_file(db_name.c_str()); // file to read the database from
int db_size = 0; // size of the database
int root; // the first element of the sequences
// we are reading in at the moment;
// i.e., the root of this tree
string buff;
int db_seq_len;
int rev_num;
if (!in_db_file.is_open()) {
cerr<<"WARNING: Cannot open database file " << db_name
<< " for input"<<endl<<"Creating a new file"<<endl;
return 0;
}
// Check to see if the first line contains "#DBrev:"
in_db_file>>buff;
if (buff == "#DBrev:") {
in_db_file>>rev_num;
if (rev_num > DBREV) {
cerr << "ERROR: The revision number is greater than " << DBREV
<< ". This version of STIDE is only capable of dealing "
<< "with databases through DBrev " << DBREV
<< ". Aborting..."<<endl;
exit(-1);
}
if (rev_num < DBREV) {
cerr << "ERROR: Revision number of database must be >= " << DBREV
<< endl;
exit(-1);
}
// Now we know that it is revision DBREV. Check sequence length of
// database against user-indicated sequence length
in_db_file>>buff;
// Now check to see if next line is "#DBseq_len: " followed by a
// number
if (buff != "#DBseq_len:") {
cerr << "ERROR: The second line of the database does not "
<< "contain the string \"#DBseq_len: \"" << endl
<< "followed by the sequence length of the database, as "
<< "required of revision " << DBREV
<< " databases. Aborting..."<< endl;
exit(-1);
}
in_db_file>>db_seq_len;
if (db_seq_len != seq_len) {
cerr << "WARNING: Database sequence length is " << db_seq_len
<< ", which does not match "
<< "sequence length specified" << endl
<< "by user (or by default if no specification was given), "
<< "which is " << seq_len << endl
<< "I will use the database sequence length. If that is "
<< "not what you intended, type Ctrl-C to abort." << endl;
seq_len = db_seq_len;
}
// Read next number into root
in_db_file >> root;
}
// Otherwise, we assume we have an old-style database, and let the
// user know that that's our assumption
else {
cerr << "WARNING: The string \"DBrev: \" is not in the first "
<< "line of the database." << endl
<< "I'm assuming that it's an older style of database, and "
<< "will read it in" << endl
<< "based on that assumption. If that is not what you want "
<< "me to do, type CTRL-C" << endl << endl;
// we have just read the first root into buff -- put it in root
// instead
root = atoi(buff.c_str());
}
while (!in_db_file.eof()) {
if (root == -1) break;
db_forest.trees_found[root]++;
in_db_file>>db_forest.trees[root];
db_size += db_forest.trees[root].NumLeaves();
in_db_file>>root;
}
in_db_file.close();
return db_size;
}
/*********************************************************************
* WriteDB() *
* Writes db_forest to the file db_name, with the format described *
* in the header of ReadDB(). Prints database statistics at the *
* end of the file. *
* *
* Input: const SeqForest &db_forest Forest of sequences in *
* database *
* const string &db_name Name of file in which to *
* put database. *
* const int db_size Number of unique sequences *
* in the database *
* const int seq_len Sequence length *
* *
* Output: none *
********************************************************************/
void WriteDB(const SeqForest &db_forest, const string &db_name, const
int db_size, const int seq_len)
{
ofstream out_db_file(db_name.c_str());
if (!out_db_file.is_open()) {
cerr << "ERROR: Cannot open database file " << db_name
<< "for output, aborting..." << endl ;
exit(-2);
}
out_db_file << "#DBrev: " << DBREV << endl;
out_db_file << "#DBseq_len: " << seq_len << endl;
for (int i = 0; i < db_forest.trees.Size(); i++) {
if (db_forest.trees_found[i]) {
out_db_file<<i<<endl;
out_db_file<<db_forest.trees[i]<<endl;
}
}
out_db_file<<" -1"<<endl;
// we can now write anything, so I will write the db stats
out_db_file<<"; DB STATS"<<endl;
WriteDBStats(db_forest, out_db_file, db_size);
out_db_file.close();
}
/*********************************************************************
* FinalReport() *
* Reports data at end of run. The number of streams, the number *
* of input pairs, and the number of sequences in the input are *
* always reported. If we have done a comparison run, we report *
* the number of anomalies, and the precentage of sequences that *
* were anomalous. Additionally, if asked for, the Hamming *
* distance or locality frame count is reported. If we have added *
* to the database, we report having done so and report the number *
* of sequences added. If database statistics are asked for, we *
* report the number of nodes, the number of unique sequences, the *
* number of branches, and the average database branch factor. *
* *
* Input: const Config &cfg: Configuration information *
* const SeqForest &normal: DB of normal sequences *
* const int num_streams_fnd: Total number of streams found*
* const int num_seqs_added: Number of unique sequences *
* added *
* const Array<Stream> &streams: Array of data streams *
* const int db_size: Number of unique sequences *
* in DB *
* *
* Output: none *
* *
*********************************************************************/
void FinalReport(const Config &cfg, const SeqForest &normal, const int
num_streams_fnd, const int num_seqs_added, const
Array<Stream> &streams, const int db_size)
{
int total_pairs = 0;
int total_seqs = 0;
int total_anoms = 0;
int total_max_lfc = 0;
int total_max_hdist = 0;
int db_nodes = 0;
int db_seqs = 0;
int db_branches = 0;
int j;
// Sum up number of pairs input and number of seqs from all the streams
for (j = 0; j < num_streams_fnd; j++) {
total_seqs += streams[j].GetNumSeqsFnd();
total_pairs += streams[j].GetNumPairsRead();
}
cout << endl;
cout << "Number of different streams in input = "
<< num_streams_fnd << endl;
cout << "Total number of input pairs = "
<< total_pairs << endl;
cout << "Total number of sequences in input = "
<< total_seqs << endl;
if (cfg.add_to_db) {
cout << "File added to database" << endl;
cout << "Number of new sequences added to the database: "
<< num_seqs_added << endl;
}
else {
cout << "Scan completed" << endl;
// Sum up number of anomalies from all the streams
for (j = 0; j < num_streams_fnd; j++) {
total_anoms += streams[j].GetNumAnoms();
}
cout << "Number of anomalies = "
<< total_anoms << endl;
cout << "Percentage anomalous = "
<< ((float)total_anoms * 100.0)/total_seqs << endl;
// If asked for, compute Hamming distances across streams and report
if (cfg.compute_hdist) {
for (j = 0; j < num_streams_fnd; j++) {
if (streams[j].GetMaxHDist() > total_max_hdist) {
total_max_hdist = streams[j].GetMaxHDist();
}
}
cout << "Largest minimum Hamming distance = "
<< total_max_hdist << endl;
}
// If asked for, compute lfc across streams and report
if (cfg.lf_size > 1) {
for (j = 0; j < num_streams_fnd; j++) {
if (streams[j].GetMaxLFC() > total_max_lfc) {
total_max_lfc = streams[j].GetMaxLFC();
}
}
cout << "Maximum lfc = " << total_max_lfc << endl;
}
}
// If asked for, compute db stats and report
if (cfg.write_db_stats) {
WriteDBStats(normal, cout, db_size);
}
}
/*********************************************************************
* WriteDBStats() *
* Computes and writes to standard output the number of nodes in *
* the database, the number of unique sequences, the number of *
* branches, and the average database branch factor. *
* *
* Input: const SeqForest &db_forest Forest of sequences in *
* database *
* ostream &out_stream Where to write info *
* const int db_size Number of unique sequences in the *
* database *
* *
* Output: none *
*********************************************************************/
void WriteDBStats(const SeqForest &db_forest, ostream &out_stream,
const int db_size)
{
int db_nodes = 0;
int db_branches = 0;
for (int i = 0; i < db_forest.trees.Size(); i++) {
if (db_forest.trees_found[i]) {
db_nodes += db_forest.trees[i].NumNodes();
db_branches += db_forest.trees[i].NumBranches();
}
}
out_stream << "Number of DB nodes = " << db_nodes << endl;
out_stream << "Number of unique sequences = "<<db_size << endl;
out_stream << "Number of branches (edges) = "<<db_branches << endl;
out_stream << "Average DB branch factor = "
<<((float)db_branches/(db_nodes - db_size))<<endl;
}
/*********************************************************************
* OutputGraph() *
* Writes a file db_name.dot containing input for the program Dot. *
* Running Dot on db_name.dot produces a PostScript file *
* containing a picture of the whole database tree. *
* *
* Input: const SeqForest &db_forest Forest of sequences in *
* database *
* const string db_name Filename to use *
* *
* Output: none *
*********************************************************************/
void OutputGraph(const SeqForest &db_forest, const string db_name)
{
char *dot_filename;
dot_filename = new char [strlen(db_name.c_str())+4];
strcpy(dot_filename, db_name.c_str());
ofstream output_file(strcat(dot_filename,".dot"));
output_file<<"digraph \""<<db_name<<"\" {"<<endl;
output_file<<" ratio=auto;"<<endl;
output_file<<" page=\"8.5,11\";"<<endl;
for (int i = 0; i < db_forest.trees.Size(); i++) {
if (db_forest.trees_found[i])
db_forest.trees[i].OutputGraph(output_file);
}
output_file<<"}"<<endl;
output_file.close();
}
/****************************************************************************
* GetPrimeLargerThan(int n) *
* Returns the smallest prime larger than the input integer. *
* Changes no values. *
* *
* Input: const int n *
* Output: smallest prime larger than n *
***************************************************************************/
int GetPrimeLargerThan(const int n)
{
int primes[n];
int primes_fnd = 1;
int curr_num = 3;
int is_prime = 1;
primes[0] = 2;
while(1) {
for (int i = 0; i < primes_fnd; i++) {
if ((curr_num % primes[i]) == 0) {
is_prime = 0;
break;
}
}
if (is_prime == 1) {
primes[primes_fnd++] = curr_num;
if (curr_num > n) {
break;
}
}
curr_num = curr_num + 2;
is_prime = 1;
}
return curr_num;
}