#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
const char FILE_NAME[] = "Homo_sapiens.NCBI36.48.dna.chromosome.13.fa";
const char EST_FILE[] = "reads1000.fa";
char seq_name[100]; // To store name of sequence after '>' before '\n'
#define EMPTY -1
#define ANOTHERPRIME 8191
#define HASHLENGTH 13 // number of characters to use for hashing
#define REPEATNUM 20 // number of repeat words to calculate and display
#define MAX_EST_LENGTH 100
#define MAX_EST_NAME_LENGTH 40
#define COLLISION_THRESHOLD 50 // deal with repeats, maximum collision times
#define GAP_NUM_THRESHOLD 3
#define e 2.718281828459 // used by hashFunc
#define mu 0.33 // used by smith_waterman
#define delta 1.33 // used by smith_waterman
#define MATCH_PERCENT_THRESHOLD 95 // use this to determine mapped/found or not
/**** global variables*******/
long long int probeCounter = 0; // probe counts
int funcEvaNum = 0; // function evaluation time
int hashSize = 0; // hash table size
int estNum = 0; // EST number
int estSeqCount = 0, nameCount = 0, alignmentNum = 0, estFoundNum = 0, est_checker = 0;
int ind; // used by smith_waterman
int checker; // used by hashSearch
/**** function prototypes*****/
int charCount(FILE *in_file);
int hashCal(int a);
int encode(int b);
bool check(int item, int *hashTable, int h, int checkNum);
void Insert( int value, int i, int *hashTable, int *locTable);
int hashFunc(int *intTable, int i, int length);
int estCount(FILE *est_file);
void estChar_count_store(FILE *est_file, char *estCharTable, char *estName);
void reverse(char *estCharTable, char *rc_estCharTable);
char complement(char a);
void hashSearch(int value, int *hashTable, int *locTable, char *charTable, char *estCharTable, int count, int *estIntTable, int i, int rc);
void kewordsCal(int count, int *intTable, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable, int *keywordTable, int *keywordCount);
bool keyword_check(int item, int *hashTable, int h, int *keyword_countTable);
void keyword_Insert( int value, int i, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable);
int hash_value_location(int *keyword_hashTable, int value, int count);
void repeat_cal_main(int count, int *intTable, char *charTable);
double similarity_score(char a, char b);
double find_array_max(double array[], int length);
void print_alignment(double match_percentage, int tick, char *consensus_a, char *consensus_b, int *locTable, int h, int rc);
void mismatch_tolower(char *consensus_a, char *consensus_b, int tick);
void smith_waterman(char *seq_a, char *seq_b, int N_a, int N_b, int *locTable, int h, int rc);
/////////////////////////////////////////////////////////////////////////////
int main() {
clock_t startTime, finishTime;
double duration;
startTime = clock();
/***************************** The above three lines are for timing ************************/
FILE *in_file; // input file
in_file = fopen(FILE_NAME, "r");
if (in_file == NULL) {
printf("Cannot open %s\n", FILE_NAME);
exit(8);
}
int count = charCount(in_file); // Count number of characters
rewind(in_file); // Return to the beginning of the file
printf("\nNumber of characters in %s is %d\n", FILE_NAME, count);
printf("Allocating memory and saving data to charTable[count]...\n");
char *charTable = new char[count]; // Allocate memory for the characters in file
int ch; // character or EOF flag from input
while ( ( ch = fgetc(in_file)) != '\n' ) ; // skip the first line
for (int i = 0; i < count; i++) // Read each item into appropriate location
fscanf(in_file, "%c ", &charTable[i]);
for (int i = 0; i < count; i++)
charTable[i] = toupper(charTable[i]);
printf("Characters from file scanned, toUppered and saved in an array\n");
fclose(in_file); // Done with the file
// Now convert char to int and store it in another array
printf("\nEncoding char to int and storing them in a new array: intTable[count]...\n");
int *intTable = new int[count];
for (int i = 0; i < count; i++)
intTable[i] = encode((int)charTable[i]); // convert ATCG to corresponding numbers defined by encode(int)
printf("Encoding done. Integers stored.\n");
// Now let's deal with the hash table...
hashSize = hashCal(count); // get hash table size
printf("hashSize = %d\n", hashSize);
/*****calculate and print top repeat words******
repeat_cal_main(count, intTable, charTable);
***********************************************/
probeCounter = 0;
int *hashTable = new int[hashSize];
printf("\nHash Table Size = %d\n", hashSize);
printf("Initializing hash table hashTable[hashSize]...\n");
for (int i = 0; i < hashSize; i++)
hashTable[i] = EMPTY;
printf("Hash table initialized.\n");
printf("Initializing locTable[hashSize]...\n");
int *locTable = new int[hashSize]; // to keep track of the location on genome
printf("Location table initialized.\n");
printf("\nindexing...\n");
for (int i = 0; i <= count - HASHLENGTH; i++)
{
int value = hashFunc(intTable, i, HASHLENGTH);
Insert(value, i, hashTable, locTable);
}
printf("\nIndexing done.\n");
delete [] intTable; // done with intTable, release some memory
printf("\nProbe counts: %Ld\n", probeCounter);
printf("Total hash function evaluation times: %d\n", funcEvaNum);
/***************************** The below three lines are for timing ************************/
finishTime = clock();
duration = (double)(finishTime - startTime) / CLOCKS_PER_SEC;
printf("Indexing time: %3.2f seconds\n", duration);
////////////////////////////////////////////////////////////////////////////////////////////
clock_t est_startTime, est_finishTime;
double est_duration;
est_startTime = clock();
/***************************** The above three lines are for timing ************************/
// Now let's deal with the EST file
FILE *est_file; // input file
char estName[MAX_EST_NAME_LENGTH]; // store the name of the EST, assuming it's < 100 characters
char *estCharTable = new char[MAX_EST_LENGTH]; // store forward EST seq
char *rc_estCharTable = new char[MAX_EST_LENGTH]; // store reverse complement EST seq
est_file = fopen(EST_FILE, "r");
if (NULL == est_file) {
printf("Cannot open %s\n", EST_FILE);
exit(8);
}
estCount(est_file); // First count EST number and set global variable estNum
printf("******************************************************************************\n");
for (int j = 0; j < estNum; j++) // Now let's process one EST at a time!
{
est_checker = 0; // reset to 1 when print alignment, to make sure each EST gets counted once
checker = 1; // reset to not found
estChar_count_store(est_file, estCharTable, estName); // count and store characters in EST. Used for counting how many ESTs are mapped
reverse(estCharTable, rc_estCharTable); // perform and store reverse complement
int *estIntTable = new int[estSeqCount]; // forward sequence
for (int i = 0; i < estSeqCount; i++)
estIntTable[i] = encode((int)estCharTable[i]);
int *rc_estIntTable = new int[estSeqCount]; // reverse complement sequence
for (int i = 0; i < estSeqCount; i++)
rc_estIntTable[i] = encode((int)rc_estCharTable[i]);
// Now let's map the EST!!!
for (int i = 0; i < estSeqCount - HASHLENGTH; i++) //0~12,1~13,2~14...
{
if (checker == 1) // not found
{
// process forward sequence, rc = 0
hashSearch(hashFunc(estIntTable, i, HASHLENGTH), hashTable, locTable, charTable, estCharTable, count, estIntTable, i, 0);
// process reverse complement sequence, rc = 1
hashSearch(hashFunc(rc_estIntTable, i, HASHLENGTH), hashTable, locTable, charTable, rc_estCharTable, count, rc_estIntTable, i, 1);
}
}
delete [] estIntTable;
delete [] rc_estIntTable;
}
/***************************** The below two lines are for timing ************************/
est_finishTime = clock();
est_duration = (double)(est_finishTime - est_startTime) / CLOCKS_PER_SEC;
printf("******************************************************************************\n");
printf("Number of ESTs: %d\n", estNum);
//estFoundNum = estNum- estFoundNum;
printf("Number of mapped ESTs: %d\n", estFoundNum);
printf("Number of alignments: %d\n", alignmentNum);
printf("EST searching time: %3.2f seconds\n", est_duration);
printf("Efficiency: %7.0f ESTs/min", 60*estNum/est_duration);
////////////////////////////////////////////////////////////////////////////////////////////
fclose(est_file); // Done with the EST file
delete [] charTable; // Release memory
delete [] hashTable;
delete [] locTable;
printf("\n\nMemory released. Job done.\n\n");
return 0;
}
/////////////////////////////////////////////////////////////////////////////
int hashCal(int a)
{
/*
Prime number source:
http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
each number in the list is prime
each number is slightly less than twice the size of the previous
each number is as far as possible from the nearest two powers of two
*/
int goodPrimes[26] = {
53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613,
393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319,
201326611, 402653189, 805306457, 1610612741
};
for (int i = 0; i < 26; i++)
{
if (a < 0.6 * goodPrimes[i]) // Determines a good hash table size
{
hashSize = goodPrimes[i];
break;
}
}
return hashSize;
}
/////////////////////////////////////////////////////////////////////////////
int encode(int b)
{
if (b == 65) {return 2;} // A -> 2
else if (b == 67) {return 3;} // C -> 3
else if (b == 71) {return 4;} // G -> 4
else if (b == 84) {return 5;} // T -> 5
else {return 1;}
}
// Checking collision
bool check(int item, int *hashTable, int h, int checkNum)
{
if (checkNum < COLLISION_THRESHOLD)
{
probeCounter ++; // To keep track of the number of probes
return hashTable[h] != EMPTY;
}
else
probeCounter ++;
return 0;
}
// Adding an integer using double hashing …
void Insert( int value, int i, int *hashTable, int *locTable)
{
int item = value;
int h = value % hashSize;
int checkNum = 0;
int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME
while (check(item, hashTable, h, checkNum))
{
h = (h + h2) % hashSize;
checkNum++;
}
if (hashTable[h] == EMPTY)
{
hashTable[h] = item;
locTable[h] = i;
}
}
// Hashing function
int hashFunc(int *intTable, int i, int length)
{
long long int value = 0;
for (int j = 0; j < length; j++)
value = value * 5 + intTable[i + j];
funcEvaNum++;
return ( int) value;
}
// Count characters
int charCount(FILE *in_file)
{
int count = 0; // number of characters seen
// character or EOF flag from input
int ch;
if (( ch = fgetc(in_file)) == '>')
{
while ( ( ch = fgetc(in_file)) != '\n' ) // skip first line header from file after '>'
;
while (1)
{
ch = fgetc(in_file);
if (ch == '\n' || ch == ' ')
;
else if (ch == EOF)
break;
else
++count;
}
}
return count;
}
// Count the Number of EST in a file
int estCount(FILE *est_file)
{
int ch;
while(1)
{
ch = fgetc(est_file);
if (ch == '>')
estNum++;
else if (ch == EOF)
break;
}
printf("\nNumber of ESTs in %s is %d\n", EST_FILE, estNum);
rewind(est_file); // reset pointer position to the beginning of file
return estNum;
}
// Count and store a EST in an array
void estChar_count_store(FILE *est_file, char *estCharTable, char *estName)
{
int ch;
int seen = 0; // seen '>' how many times
int i = 0;
estSeqCount = 0, nameCount = 0;
while (seen != 2) // an EST is the sequence between two '>'s
{
if ((ch = fgetc(est_file)) == '>')
{
seen++;
if (seen == 1)
{
for (int i = 0; (ch = fgetc(est_file)) != '\n'; i++)
{
estName[i] = ch; // store EST name
nameCount++; // count EST name length
}
}
}
else if (ch == 'A' || ch == 'a' || ch == 'T' || ch == 't'
|| ch == 'C' || ch == 'c' || ch == 'G' || ch == 'g'
|| ch == 'N' || ch == 'n' || ch == '?')
{
estCharTable[i] = toupper(ch); // toUpper and save
i++;
estSeqCount++;
}
else if (ch == EOF)
break;
}
fseek (est_file, -1, SEEK_CUR); // reset pointer position to before '>'
printf(">");
for (int i = 0; i < nameCount; i++)
printf("%c", estName[i]);
printf("\nLength = %d\n", estSeqCount);
}
void reverse(char *estCharTable, char *rc_estCharTable)
{
for (int i = 0; i < estSeqCount; i++)
rc_estCharTable[i] = complement(estCharTable[estSeqCount - i - 1]);
}
char complement(char a)
{
if (a == 'A') {return 'T';} // A -> T
else if (a == 'C') {return 'G';} // C -> G
else if (a == 'G') {return 'C';} // G -> C
else if (a == 'T') {return 'A';} // T -> A
else return a;
}
//////////////////////////////////////////////////////////////////////////////////////
void hashSearch(int value, int *hashTable, int *locTable, char *charTable, char *estCharTable, int count, int *estIntTable, int i, int rc)
{
int item = value;
int h = value % hashSize;
int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME
while (1)
{
if (hashTable[h] == item)
{
smith_waterman(charTable + locTable[h] - i, estCharTable, estSeqCount + 5, estSeqCount, locTable, h, rc);
h = (h + h2) % hashSize;
while(hashTable[h] != EMPTY)
{
if (hashTable[h] == item)
{
smith_waterman(charTable + locTable[h] - i, estCharTable, estSeqCount + 5, estSeqCount, locTable, h, rc);
}
h = (h + h2) % hashSize;
}
checker = 0; // if found then there is no need to do further hash search
break;
}
else if (hashTable[h] == EMPTY) break;
else h = (h + h2) % hashSize;
}
}
////////////////////////////////////////////////////////////////////////////////////////////
void kewordsCal(int count, int *intTable, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable, int *keywordTable, int *keywordCount)
{
for (int i = 0; i < hashSize; i++)
keyword_hashTable[i] = EMPTY; // initializing keyword_hashTable
for (int i = 0; i < hashSize; i++)
keyword_countTable[i] = 0; // initializing keyword_countTable
printf("indexing for repeat words...\n");
for (int i = 0; i <= count - HASHLENGTH; i++)
{
int value = hashFunc(intTable, i, HASHLENGTH);
keyword_Insert(value, i, keyword_hashTable, keyword_countTable, keyword_locTable);
}
printf("Calculating top repeat words...\n");
for (int i = 0; i < REPEATNUM; i++)
{
int max = 0, max_location = 0;
for (int j = 0; j < hashSize; j++)
{
if (max < keyword_countTable[j])
{
max = keyword_countTable[j];
max_location = j;
}
}
keywordCount[i] = max;
keywordTable[i] = keyword_hashTable[max_location];
keyword_countTable[max_location] = 0;
}
}
/////////////////////////////////////////////////////////////////////////////
bool keyword_check(int item, int *keyword_hashTable, int h, int *keyword_countTable)
{
probeCounter++;
if (keyword_hashTable[h] == item)
keyword_countTable[h]++;
return keyword_hashTable[h] != item && keyword_hashTable[h] != EMPTY;
}
// Adding an integer using double hashing …
void keyword_Insert(int value, int i, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable)
{
int item = value;
int h = value % hashSize;
int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME
while (keyword_check(item, keyword_hashTable, h, keyword_countTable))
{
h = (h + h2) % hashSize;
}
//printf("%d [position] with item %d [value]\n", h, value);
if (keyword_hashTable[h] == EMPTY)
{
keyword_hashTable[h] = item; // insert into hash table
keyword_locTable[h] = i; // keep track of the location on the genome
}
}
/////////////////////////////////////////////////////////////////////////////
int hash_value_location(int *keyword_hashTable, int value, int count)
{
int i = 0;
for (i = 0; i < count; i++)
{
if (value == keyword_hashTable[i])
break;
}
return i;
}
void repeat_cal_main(int count, int *intTable, char *charTable)
{
// Now let's take a minute and calculate some common key words///////////////////////////
int *keyword_hashTable = new int[hashSize]; // for inserting values
int *keyword_countTable = new int[hashSize]; // to keep track of the number of collisions
int *keyword_locTable = new int[hashSize]; // to keep track of the location on genome
int *keywordTable = new int[REPEATNUM]; // to keep track of keywords
int *keywordCount = new int[REPEATNUM]; // to keep track of number of each keyword
kewordsCal(count, intTable, keyword_hashTable, keyword_countTable, keyword_locTable, keywordTable, keywordCount);
// print out top repeat words
printf("\nTop %d repeats:\n", REPEATNUM);
for (int j = 0; j < REPEATNUM; j++)
{
printf("%3d ", j + 1);
int hashValueLocation = hash_value_location(keyword_hashTable, keywordTable[j], hashSize);
for (int i = 0; i < HASHLENGTH; i++)
printf("%c", charTable[i + keyword_locTable[hashValueLocation]]);
printf("... Repeat times: %d\n", keywordCount[j]);
}
delete [] keyword_locTable;
delete [] keyword_hashTable; // release memory
delete [] keyword_countTable; // release memory
printf("\nRepeat calculation probe counts: %Ld\n", probeCounter);
// Done with calculating keywords/////////////////////////////////////////////////////////
}
// The code below was adapted from //https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm
void smith_waterman(char *seq_a, char *seq_b, int N_a, int N_b, int *locTable, int h, int rc)
{
// initialize H
double H[MAX_EST_LENGTH][MAX_EST_LENGTH];
for(int i=0;i<=N_a;i++){
for(int j=0;j<=N_b;j++){
H[i][j]=0;
}
}
double temp[4];
int I_i[MAX_EST_LENGTH][MAX_EST_LENGTH],I_j[MAX_EST_LENGTH][MAX_EST_LENGTH]; // Index matrices to remember the 'path' for backtracking
// here comes the actual algorithm
for(int i=1;i<=N_a;i++){
for(int j=1;j<=N_b;j++){
temp[0] = H[i-1][j-1]+similarity_score(seq_a[i-1],seq_b[j-1]);
temp[1] = H[i-1][j]-delta;
temp[2] = H[i][j-1]-delta;
temp[3] = 0;
H[i][j] = find_array_max(temp,4);
switch(ind){
case 0: // score in (i,j) stems from a match/mismatch
I_i[i][j] = i-1;
I_j[i][j] = j-1;
break;
case 1: // score in (i,j) stems from a deletion in sequence A
I_i[i][j] = i-1;
I_j[i][j] = j;
break;
case 2: // score in (i,j) stems from a deletion in sequence B
I_i[i][j] = i;
I_j[i][j] = j-1;
break;
case 3: // (i,j) is the beginning of a subsequence
I_i[i][j] = i;
I_j[i][j] = j;
break;
}
}
}
/*
// Print the matrix H to the console
cout<<"**********************************************"<<endl;
cout<<"The scoring matrix is given by "<<endl<<endl;
for(int i=1;i<=N_a;i++){
for(int j=1;j<=N_b;j++){
cout<<H[i][j]<<"\t";
}
cout<<endl;
}
*/
// search H for the maximal score
double H_max = 0;
int i_max=0,j_max=0;
for(int i=1;i<=N_a;i++){
for(int j=1;j<=N_b;j++){
if(H[i][j]>H_max){
H_max = H[i][j];
i_max = i;
j_max = j;
}
}
}
// Backtracking from H_max
int current_i=i_max,current_j=j_max;
int next_i=I_i[current_i][current_j];
int next_j=I_j[current_i][current_j];
int tick = 0;
int gap_num = 0;
double mismatch_num = 0;
double match_percentage;
char consensus_a[N_a+N_b+2],consensus_b[N_a+N_b+2];
while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=-1) && (next_i!=-1))
{
if(next_i==current_i)
{
gap_num++;
consensus_a[tick] = '-'; // deletion in A
}
else
consensus_a[tick] = seq_a[current_i-1]; // match/mismatch in A
if(next_j==current_j)
{
gap_num++;
consensus_b[tick] = '-'; // deletion in B
}
else
consensus_b[tick] = seq_b[current_j-1]; // match/mismatch in B
current_i = next_i;
current_j = next_j;
next_i = I_i[current_i][current_j];
next_j = I_j[current_i][current_j];
tick++;
//printf("tick = %d\n", tick);
}
mismatch_num = (tick - H_max - gap_num * (delta + 1)) / (mu + 1); // mismatch number
match_percentage = 100*(tick - gap_num - mismatch_num)/estSeqCount; // percentage of matches
if (match_percentage > MATCH_PERCENT_THRESHOLD & gap_num < GAP_NUM_THRESHOLD)
{
if (match_percentage != 100) // If there is mismatch, then tolower it!
mismatch_tolower(consensus_a, consensus_b, tick); // to increase the readability
print_alignment(match_percentage, tick, consensus_a, consensus_b, locTable, h, rc);
}
}
/////////////////////////////////////////////////////////////////////////////
double similarity_score(char a,char b){
double result;
if(a==b){
result=1;
}
else{
result=-mu;
}
return result;
}
/////////////////////////////////////////////////////////////////////////////
double find_array_max(double array[],int length){
double max = array[0]; // start with max = first element
ind = 0;
for(int i = 1; i<length; i++){
if(array[i] > max){
max = array[i];
ind = i;
}
}
return max; // return highest value in array
}
//////////////////////////////////////////////////////////////////////////////
void mismatch_tolower(char *consensus_a, char *consensus_b, int tick)
{
for(int i=tick-1;i>=0;i--)
{
if ((consensus_a[i] != consensus_b[i]) &
(consensus_a[i] != '-') & (consensus_b[i] != '-'))
{
consensus_a[i] = tolower(consensus_a[i]);
consensus_b[i] = tolower(consensus_b[i]);
}
}
}
//////////////////////////////////////////////////////////////////////////////
void print_alignment(double match_percentage, int tick, char *consensus_a, char *consensus_b, int *locTable, int h, int rc)
{
if (est_checker == 0)
{
estFoundNum++;
est_checker = 1; // reset to 1 to make sure each EST gets counted once, avoid repeat counts
}
alignmentNum++; // keep track of number of alignments printed
//*****************output********************
printf(" Genome: ");
for(int i=tick-1;i>=0;i--) printf("%c", consensus_a[i]);
printf("\n");
if (rc == 0)
printf(" EST: ");
else printf("*rc_EST: "); // reverse complement sequence, add "*rc_" for easy recognition
for(int j=tick-1;j>=0;j--) printf("%c", consensus_b[j]);
printf("\n");
printf("Mapped to %d ~ %d, ", locTable[h] + 1, locTable[h] + tick);
printf("percentage of matches = %2.0f%%\n", match_percentage);
}
/////////////////////////////////////////////////////////////////////////////
// above four functions are auxiliary functions used by smith_waterman: //
/////////////////////////////////////////////////////////////////////////////