open (INFILE, $ARGV[0]); # open file $Seq = ""; # initialize sequence while () { unless (/>/) { chomp; $Seq .= $_; # NOTE: assumes only one sequence } } $Len = length ($Seq); # length of sequence $End = $Len - 1; # number of 2-mers in sequence #loop over all 2-mers for ($i = 0; $i < $End ; $i++) { $Sing = substr ($Seq, $i, 1); # get single base $Two = substr ($Seq, $i, 2); # get two bases $Freq{$Sing}++; # increment counters $BiFreq{$Two}++; } #process last nucleotide $Sing = substr ($Seq, $End, 1); $Freq{$Sing}++; # compute nucleotide frequencies foreach $key (keys (%Freq)) { $Freq{$key} /= $Len; } # compute dinucleotide frequencies foreach $key (keys (%BiFreq)) { $BiFreq{$key} /= $End; } open (QFILE, $ARGV[1]); # open query sequence file $Query = ""; # initialize sequence while () { unless (/>/) { chomp; $Query .= $_; } } # get first nucleotide and initialize based on multinomial model $Sing = substr ($Query, 0, 1); $Ans = log ($Freq{$Sing}); # calculate number of 2-mers $End = length ($Query) - 1; # calculate 1-order model based on two-mer representation for ($i = 0; $i < $End; $i++) { $Two = substr ($Query, $i, 2); $Ans += log ($BiFreq{$Two}); # NOTE: log = natural log in perl } print "Log probability: $Ans\n"; # answer