#!/usr/bin/perl =pod =head1 RetrotransposonHunter RetrotransposonHunter - identifies potentially polymorphic DNA insertions for use in phylogeny or population genetics. =head1 DESCRIPTION RetrotransposonHunter is a generalized, stand-alone form of the AluHunter algorithm, which is described at U. The user first scans DNA sequences for insertions using U and any options he/she wishes. Then the user calls the stand-alone program, inputting a U output file> (.out file), the U that has been scanned for retrotransposons, and an U that has been formatted for BLAST searches with NCBI's makeblastdb. The result is a list of insertions that are absent in the outgroup genome. The program requires the latest BLAST. =head1 AVAILABILITY The code is available from the project website at http://www.aluhunter.com =head1 AUTHOR Written by Christina Bergey, Department of Anthropology, New York University. E-mail: cmb433@nyu.edu Website: www.christinabergey.com =cut use feature ':5.10'; use strict; use warnings; use Getopt::Long; use File::Basename; my $prog = $0; my $usage = <$prog -r repeatmasker.out -s source.fasta -g genome_blast_db > results.dat [-help] [-verbose] -repeatmasker RepeatMasker .out file -source SOURCE FASTA FILE (THE INPUT TO REPEATMASKER) -genome OUTGROUP GENOME (TO SCREEN INSERTIONS AGAINST) -flank-length LENGTH OF FLANKING SEQUENCE TO BLAST AGAINST GENOME -blast-path PATH TO NCBI's blastn PROGRAM EOQ # Get options my $help; my $verbose = ""; my $rm_out; my $source_fasta; my $genome; my $flank_length = 200; my $blast_path = ""; my $word_size = 20; my $e_val = "1e-15"; my $ok = GetOptions( 'repeatmasker|r=s' => \$rm_out, 'source|s=s' => \$source_fasta, 'genome|g=s' => \$genome, 'flank_length|f:i' => \$flank_length, 'blast_path|b:s' => \$blast_path, 'word_size|w:i' => \$word_size, 'e_value|e:s' => \$e_val, 'verbose' => \$verbose, 'help' => \$help ); my $all_def = defined $rm_out && defined $source_fasta && defined $genome; if ($help || !$ok || !$all_def) { print $usage; exit; } if ($verbose) { print STDERR "INPUT:\n"; print STDERR "\tRepeatMasker .out file: $rm_out\n"; print STDERR "\tSource FASTA file: $source_fasta\n"; print STDERR "\tGenome: $genome\n"; print STDERR "\tFlank length: $flank_length\n"; print STDERR "\tBLAST path: $blast_path\n"; print STDERR "\tWord size: $word_size\n"; print STDERR "\tE-value: $e_val\n\n"; } if ($blast_path) { $blast_path .= "/blastn"; $blast_path =~ s/\/+/\//g; } else { $blast_path = "blastn"; } # Open RM .out file. open (RM, "<$rm_out") or die "$! Could not open RepeatMasker .out file [$rm_out]. Aborting.\n"; while () { # Skip beginning header lines if ($_ !~ /\d/) { next; } # Get rid of beginning whitespace s/^\s+//; # Split line into component parts my @rm_parts = split /\s+/, $_; my $sw_score = $rm_parts[0]; my $perc_div = $rm_parts[1]; my $perl_del = $rm_parts[2]; my $perc_ins = $rm_parts[3]; my $query_seq = $rm_parts[4]; my $query_begin = $rm_parts[5]; my $query_end = $rm_parts[6]; my $query_left = $rm_parts[7]; my $pos_neg = $rm_parts[8]; my $match_repeat = $rm_parts[9]; my $repeat_family = $rm_parts[10]; my $repeat_begin = $rm_parts[11]; my $repeat_end = $rm_parts[12]; my $repeat_left = $rm_parts[13]; my $rm_id = $rm_parts[14]; print "Ins. #$rm_id: $query_seq "; print "[$query_begin - $query_end]\n"; # Grab flanks of this insertion from source FASTA. open (FASTA, "<$source_fasta") or die "$! Could not open FASTA source file [$source_fasta]. Aborting.\n"; my $fasta_seq = ""; my $in_seq = 0; # Search FASTA for correct source sequence while () { if (m/\Q$query_seq\E/) { $in_seq = 1; next; } elsif (/^>/ && $in_seq == 1) { $in_seq = 0; last; } elsif ($in_seq) { chomp; $fasta_seq .= $_; } } # Grab insertion's sequence, flanks' sequence my $ins_length = $query_end - $query_begin; my $ins_seq = substr $fasta_seq, $query_begin - 1, $ins_length + 1; my $left_flank_start = $query_begin - $flank_length; my $right_flank_end = $query_end + $flank_length; # Check to see if we overflowed out of the source sequence if ($left_flank_start < 1) { $left_flank_start = 1; } if ($right_flank_end > (length $fasta_seq)) { $right_flank_end = (length $fasta_seq) - 1; } my $left_flank = substr $fasta_seq, $left_flank_start - 1, $query_begin - $left_flank_start; my $right_flank = substr $fasta_seq, $query_end, $right_flank_end - $query_end; close FASTA; # Write flanks to temporary FASTA files that will serve as BLAST inputs my $blastinLString = ">Left Flank - Insertion " . $rm_id . "\n" . $left_flank; my $blastinRString = ">Right Flank - Insertion " . $rm_id . "\n" . $right_flank; open (BLASTINL, "+>blastinL.fasta") || die "ERROR Unable to open FASTA file for modification: $!\n"; print BLASTINL $blastinLString; close (BLASTINL); open (BLASTINR, "+>blastinR.fasta") || die "ERROR Unable to open FASTA file for modification: $!\n"; print BLASTINR $blastinRString; close (BLASTINR); # Call BLAST # Delete old BLAST results unlink("blastoutL.dat"); unlink("blastoutR.dat"); my $blast_cmd_L = $blast_path . " -query blastinL.fasta -db $genome -out blastoutL.dat -outfmt 6 "; my $blast_cmd_R = $blast_path . " -query blastinR.fasta -db $genome -out blastoutR.dat -outfmt 6 "; if ($e_val) { $blast_cmd_L .= "-evalue $e_val "; $blast_cmd_R .= "-evalue $e_val "; } if ($word_size) { $blast_cmd_L .= "-word_size $word_size "; $blast_cmd_R .= "-word_size $word_size "; } my $blast_result_L; my $blast_result_R; eval { if ($verbose) { print STDERR "BLAST commands:\n"; print STDERR "\t" . $blast_cmd_L . "\n"; } $blast_result_L = system($blast_cmd_L); if ($verbose) { print STDERR "\t" . $blast_cmd_R . "\n"; } $blast_result_R = system($blast_cmd_R); }; if ($@ || $blast_result_L == -1 || $blast_result_R == -1) { die "RetrotransposonHunter encountered an error when calling BLAST. Is blastn in your \$PATH? Aborting.\n"; } # Parse BLAST results to determine whether insertion is present in the outgroup genome open (L_BLAST, '); my @R_results = (); # Test to see if either flank had no matches. if (scalar @L_results == 0 || scalar @R_results == 0) { print STDERR "RESULT: Hit zero or one flank, not both.\n\n"; print "\tHit zero or one flank, not both.\n"; } else { # Compute size of smallest gap between L and R flanks. my $smallGap = -1; my $numSmGaps = 0; my $gap; my $source; my $leftStart; my $leftEnd; my $rightStart; my $rightEnd; GAPFINDER: foreach my $left_hit (@L_results){ if ($left_hit =~ m/\w+\t(\S+)\t[\d|\.]+\t\d+\t\d+\t\d+\t\d+\t\d+\t(\d+)\t(\d+)\t/){ $source = $1; $leftStart = $2; $leftEnd = $3; } else { next GAPFINDER; } foreach my $right_hit (@R_results){ if ($right_hit =~ m/\w+\t(\S+)\t[\d|\.]+\t\d+\t\d+\t\d+\t\d+\t\d+\t(\d+)\t(\d+)\t/){ $source = $1; $rightStart = $2; $rightEnd = $3; # Compute gap size, making sure target isn't reverse comp'd wrt outgroup. if ($leftEnd <= $rightStart) { # If all is good. ---L---| |---R--- $gap = $rightStart - $leftEnd; } elsif ($rightStart <= $leftEnd) { # outgroup is reverse comp to target. ---R---| |---L--- $gap = $leftEnd - $rightStart; } # Keep track of smallest gap size, number of small gaps we've found so far. if ($gap < $smallGap || $smallGap == -1) { $smallGap = $gap; # We found a smaller gap if ($gap < 1000) { $numSmGaps++; # This gap is not ridiculously huge if ($numSmGaps > 1) { # Too many possible places for the Alu to be # Might as well stop looking at the BLAST results # and jump to assigning it a code. last GAPFINDER; } } } # end if } # end regex if } # end foreach (right flank results) } # end foreach (left flank results) if ($verbose) { print STDERR "\t\tDone with this insertion. Smallest gap is ", $smallGap, "bp.\n"; print STDERR "\t\tNumber of small gaps: ", $numSmGaps, "\n"; } # Interpret results, and display whether insertion is present, absent, etc. if ($numSmGaps > 1) { print STDERR "RESULT: Ambiguous. Multiple possible matches in genome.\n\n"; print "\tAmbiguous. Multiple possible matches in genome.\n"; } elsif ($gap <= ($ins_length * .10)) { print STDERR "RESULT: No insertion present in genome.\n\n"; print "\tNo insertion present in genome.\n"; } elsif ($gap < ($ins_length * .90)) { print STDERR "RESULT: Insertion found in genome, but truncated.\n\n"; print "\tInsertion found in genome, but truncated.\n"; } elsif ($gap >= ($ins_length * .90) && $gap <= ($ins_length * 1.10)) { print STDERR "RESULT: Insertion found in genome.\n"; print "\tInsertion found in genome.\n\n"; } elsif ($gap > ($ins_length * 1.10)) { print STDERR "RESULT: Flanks found in outgroup, but very far apart.\n\n"; print "\tFlanks found in outgroup, but very far apart.\n"; } else { print STDERR "RESULT: Some error processing this insertion.\n\n"; print "\tSome error processing this insertion.\n"; } } } close RM; unlink ("blastoutL.dat"); unlink ("blastoutR.dat"); unlink ("blastinL.fasta"); unlink ("blastinR.fasta"); exit(0);