VCF2R
#!/usr/bin/env perl
use strict;
use warnings;
$| = 1; #Do not buffer output to std error
######Copyright 2015 Matthew F. Hockin Ph.D.
#All rights reserved
#The University of Utah
######mhockin@gmail.com
use Getopt::Long;
use File::Basename;
###########GLOBALS###########
my %SVbyChr;
ExecuteScript();
sub ExecuteScript{
my ($usr_FieldsAry_ref, $usr_InfoAry_ref, $usr_ChrAry_ref, $usr_RegionAry_ref, $usrChr_SegmentsHsh_ref, $deparse, $fileOut) = ParseCommand();
my $VCF_filePath = shift @ARGV;
die "VCF data file or file path must bare listed as LAST item on command line for VCF2R\n" unless ($VCF_filePath);
my ($VCFfile, $dir, $ext) = fileparse($VCF_filePath);
open (my $VCF_in_FH, '<', $dir.$VCFfile.$ext) or die "Cannot open $VCFfile.$ext using this path $dir\nPlease ensure this is correct\n";
my ($valid_VCF_fieldsAry_ref, $valid_VCF_infoAry_ref, $lastFile_POS) = LoadHeader($VCF_in_FH);
my $output_Field_IndxHsh_ref = ParseUserRequest($valid_VCF_fieldsAry_ref, $valid_VCF_infoAry_ref, $usr_FieldsAry_ref, $usr_InfoAry_ref);
open (my $OUT_file_FH, '>', $fileOut) or die "Cannot create output file $fileOut in current directory- please check permissions\n";
my $header_Write_test = 0; #flag to detect when file header is written- prevents duplicate headers
my $time_Start = time;
my %finished_Chr;
LINE: while(my @returnVals = LoadData($VCF_in_FH, $lastFile_POS, $usrChr_SegmentsHsh_ref)){
my ($SVbyChr_Hsh_ref, $chr);
($SVbyChr_Hsh_ref, $chr, $lastFile_POS) = @returnVals;
my $timeNow = localtime();
print STDERR "Were done loading Chromosome $chr at $timeNow\n";
my $aryIndxChr_byRegion_ref = FindChrRegionCoords($usrChr_SegmentsHsh_ref, $chr, $SVbyChr_Hsh_ref);
my $outputHsh_ref = GenerateOUTPUT($aryIndxChr_byRegion_ref, $output_Field_IndxHsh_ref, $SVbyChr_Hsh_ref);
my ($write_HeaderAry_ref, $fieldLen);
($write_HeaderAry_ref, $fieldLen, $header_Write_test) = Parse_OUTPUT($OUT_file_FH, $outputHsh_ref, $valid_VCF_fieldsAry_ref,
$valid_VCF_infoAry_ref, $deparse, $header_Write_test);
WriteOUT($OUT_file_FH, $write_HeaderAry_ref, $fieldLen, $outputHsh_ref, $deparse);
if (eof $VCF_in_FH == 1){
print STDERR "END OF INPUT FILE REACHED. Perl file position \"tell\" is $lastFile_POS\n";
my $elapsed = time - $time_Start;
print STDERR "Total processing time for $VCFfile was $elapsed seconds!\n";
last;
}
if (%$usrChr_SegmentsHsh_ref){
my $done = 0;
$finished_Chr{$chr} = '';
for (keys %$usrChr_SegmentsHsh_ref){
$done++ unless (exists $finished_Chr{$_});
}
if ($done == 0){
print STDERR "finishing file early, we found all requested Chr\n";
last;
}
}
}
close $VCF_in_FH;
close $OUT_file_FH;
return;
}
sub ParseCommand{
my ($pullFields, $pullInfo, $pullChrs, $pullRegions, $fileOut, $deparse);
GetOptions('field=s', => \$pullFields,
'info=s', => \$pullInfo,
'chr=s', => \$pullChrs,
'region=s' => \$pullRegions,
'out=s' => \$fileOut,
'deparse' => \$deparse);
my @Fields = split(/,/ , uc $pullFields) if $pullFields;
my @Info = split(/,/ , uc $pullInfo) if $pullInfo;
my @Chrs;
if ($pullChrs =~ m/-/){
$pullChrs =~ /\A(\d+)-(\d+)/;
@Chrs = ($1 .. $2);
if ($pullChrs =~ m/,/){
$pullChrs =~ m/\A\d+-\d+,(.+)/;
my @listedChr = split (/,/ , $1);
push @Chrs, @listedChr;
}
}else{
my @Chrs = split(/,/ , $pullChrs) if $pullChrs;
}
my @Regions = split(/-/ , $pullRegions) if $pullRegions;
my %usrChr_Segments; #container for chr and region information
if (@Chrs){
for (0..$#Chrs) {
my $region_Coords = $Regions[$_] ||= 0;
push @{$usrChr_Segments{$Chrs[$_]}}, $region_Coords;
}
}
return(\@Fields, \@Info, \@Chrs, \@Regions, \%usrChr_Segments, $deparse, $fileOut);
}
sub LoadHeader{
my $VCF_file = shift;
my ($lastFile_POS, @infoHeader, @fieldsHeader);
LINE: while (my $line = <$VCF_file>){
next LINE if ($line =~ m/^#{2}/);
if ($line =~ m/^#{1}/){
chomp $line;
$line =~ s/#//;
push @fieldsHeader, split (/\t/, $line); #record field identifiers from VCF global header
$lastFile_POS= tell($VCF_file);
next LINE;
}
my @currentLine = split (/\t/, $line);
my @infoData = split (/;/, $currentLine[7]);
foreach (@infoData){
$_ =~ m/([a-zA-Z]+)=/g;
push @infoHeader, $1;
}
return(\@fieldsHeader, \@infoHeader, $lastFile_POS);
}
}
sub LoadData{
my ($VCF_file, $lastFile_POS, $usrChr_SegmentsHsh_ref) = @_;
my (%singleLoci, %SVbyChr, @fieldsHeader, @infoHeader, $chr, $chr_Regex, $enable_RegEx_Fail);
seek($VCF_file, $lastFile_POS, 0);
LINE: while (my $line = <$VCF_file>){
chomp $line;
my @currentLine = split (/\t/ , $line);
if (%$usrChr_SegmentsHsh_ref){
unless (exists $$usrChr_SegmentsHsh_ref{$currentLine[0]}){
next LINE unless ($enable_RegEx_Fail)
}
}
$enable_RegEx_Fail = 1 unless ($enable_RegEx_Fail);
$chr = $currentLine[0] unless ($chr);
$chr_Regex = qr/\A$currentLine[0]\z/i unless ($chr_Regex);
$currentLine[0] =~ m/$chr_Regex/ ? $lastFile_POS = tell($VCF_file) : return(\%SVbyChr, $chr, $lastFile_POS);
my @infoData = split (/;/ , $currentLine[7]);
s/[a-zA-Z]+=//g for @infoData; #remove header from info fields retain data
$currentLine[7] = ''; #null value saves hash memory- actual values retained under diff hash key- DO NOT DELETE index pos will be wrong later
if (exists $singleLoci{$currentLine[0]}{$currentLine[1]}){ #warn of variant calls at redundant Chr, Pos (should not exist)
warn "Chr $currentLine[0] at $currentLine[1] has previously been called- and will NOT be RECORDED!\nThe consensus seq record for this variant is $currentLine[3]\n";
$lastFile_POS = tell($VCF_file);
next LINE;
}
my $pos = $currentLine[1];
$singleLoci{$chr}{$pos} = ''; #record in local (temporary) hash to check for redundant chr and pos fields
my %posInfo;
$posInfo{Summary} = \@currentLine;
$posInfo{Info} = \@infoData;
$posInfo{POS} = $pos;
push @{$SVbyChr{$chr}}, \%posInfo;
}
return(\%SVbyChr, $chr, $lastFile_POS);
}
sub ParseUserRequest{
my ($valid_VCF_FieldsAry_ref, $valid_VCF_InfoAry_ref, $usr_FieldsAry_ref, $usr_InfoAry_ref) = @_;
my %validQuery;
my %summary_keys = map {$valid_VCF_FieldsAry_ref->[$_] => $_} (0 .. $#$valid_VCF_FieldsAry_ref);
my %info_keys = map {$valid_VCF_InfoAry_ref->[$_] => $_} (0 .. $#$valid_VCF_InfoAry_ref);
my @usr_PullRequests = (@$usr_FieldsAry_ref, @$usr_InfoAry_ref);
foreach (my $i = 0; $i < @usr_PullRequests; $i++){
unless (exists $summary_keys{$usr_PullRequests[$i]} || exists $info_keys{$usr_PullRequests[$i]}){
die "Command line option \"--$usr_PullRequests[$i]\" is not valid VCF field to query.\nPlease check VCF file header and INFO field for other valid queries\n\n";
}
}
for (@$usr_FieldsAry_ref){
$validQuery{Summary}{$_} = $summary_keys{$_};
}
for (@$usr_InfoAry_ref){
$validQuery{Info}{$_} = $info_keys{$_};
}
delete $validQuery{Summary}{CHROM}; #default output Fields-remove to silently prevent user inadvertently invoking redundant output (see line 92)
delete $validQuery{Summary}{POS};
return (\%validQuery);
}
sub FindChrRegionCoords{ #Feeds BinarySearch to determine array bounds of given bp coordinates
my ($usrChrSegments_Hsh_ref, $chr, $SVbyChr_Hsh_ref) = @_;
my @indxChr_Regions;
if (%$usrChrSegments_Hsh_ref){
my @coordinates_ThisChr = split (/,/ , $$usrChrSegments_Hsh_ref{$chr}->[0]) unless ($$usrChrSegments_Hsh_ref{$chr}->[0] == 0);
if (@coordinates_ThisChr){
die "Unbounded chr Regions found! REQUIRE \"--region start,end\" coordinate PAIRS on command line!\n" if(@coordinates_ThisChr %2 !=0);
for (my $i = 0; $i < $#coordinates_ThisChr; $i = $i+2){
my ($bpStart, $bpEnd) = @coordinates_ThisChr[$i, $i+1];
my $return_Ary_ref = BinarySearchChrPOS($chr, $bpStart, $bpEnd, $SVbyChr_Hsh_ref);
unshift @$return_Ary_ref, $chr;
push @indxChr_Regions, $return_Ary_ref;
}
}else{
my ($region_Start, $region_End) = (0 , $#{$$SVbyChr_Hsh_ref{$chr}});
push @indxChr_Regions, [$chr, $region_Start, $region_End];
}
}else{
my ($region_Start, $region_End) = (0 , $#{$$SVbyChr_Hsh_ref{$chr}});
push @indxChr_Regions, [$chr, $region_Start, $region_End];
}
return(\@indxChr_Regions);
}
sub BinarySearchChrPOS{
my ($chr, $start, $end, $SVbyChr_Hsh_ref) = @_;
my @realCoordRange = ($$SVbyChr_Hsh_ref{$chr}[0]->{POS}, $$SVbyChr_Hsh_ref{$chr}[$#{$$SVbyChr_Hsh_ref{$chr}}]->{POS});
if ($start < $realCoordRange[0]){
print STDERR "Requested start position $start on $chr is lower than any called bp position for this Chromosome, using the first called position $realCoordRange[0] instead!\n";
$start = $realCoordRange[0];
}elsif ($start > $realCoordRange[1]){
die "Requested start position $start on $chr is higher than any called bp position for this Chromosome! INVALID coordinate RANGE on --region!\n";
}
if ($end >$realCoordRange[1]){
print STDERR "Requested start position $end on $chr is higher than any called bp position for this Chromosome, using the last called position $realCoordRange[1] instead!\n";
$end = $realCoordRange[1]
}elsif ($end < $realCoordRange[0]){
die "Requested end position $end on $chr is lower than any called bp position for this Chromosome! INVALID coordinate RANGE on --region!\n";
}
my @bpCoordRange = ($start, $end);
my $foundIndex; #used as ref for array
foreach my $coord (@bpCoordRange){
my $maxIndex = $#{$$SVbyChr_Hsh_ref{$chr}}; #whats the last array entry (highest BP coordinate) for current Chr data set.
my $stepSize = my $currentIndex = int($maxIndex * 0.5); #start in middle
push @$foundIndex, Binary_algorithim($chr, $coord, $stepSize, $currentIndex, $maxIndex, $SVbyChr_Hsh_ref);
}
return $foundIndex;
}
sub Binary_algorithim{
my ($chr, $coord, $stepSize, $currentIndex, $maxIndex, $SVbyChr_Hsh_ref) = @_;
my $counter = 0;
LINE: while ($coord != $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}){
while ($coord < $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}){
$stepSize = int(0.5 * $stepSize);
$stepSize == 0 ? $stepSize = 1: $stepSize = $stepSize;
$currentIndex = $currentIndex - $stepSize;
print STDERR "binary search on Chr $chr looking for bp POS $coord. At index position $currentIndex\ with bp coord $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}\n";
}
while ($coord > $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}){
$stepSize = int(0.5 * $stepSize);
$stepSize == 0 ? $stepSize = 1: $stepSize = $stepSize;
$currentIndex = $currentIndex + $stepSize;
print STDERR "binary search on Chr $chr looking for bp POS $coord at index position $currentIndex\ with bp coord $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}\n";
}
$counter ++ if ($stepSize == 1);
if ($counter >=5){
warn "We could not find any variant calls for bp $coord on chromosome $chr- despite $counter attempts at local search.\nUsing variant called at bp $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS} on $chr instead!!!!!\n";
print STDERR "Search complete.\n\n";
return $currentIndex;
}
if ($coord < $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS} ){
$stepSize = int(0.5 * $stepSize);
$stepSize == 0 ? $stepSize = 1: $stepSize = $stepSize;
$currentIndex = $currentIndex - $stepSize;
print STDERR "binary search reseeded on Chr $chr looking for bp POS $coord at index position $currentIndex\ with bp coord $$SVbyChr_Hsh_ref{$chr}[$currentIndex]->{POS}\n";
next LINE;
}
}
print STDERR "Search complete.\n\n";
return $currentIndex;
}
sub GenerateOUTPUT{
my ($ary_IndxChr_byRegion_ref, $output_Field_IndxHsh_ref, $SVbyChr_Hsh_ref) = @_;
my %output;
if (@$ary_IndxChr_byRegion_ref){
for my $i (0 .. $#$ary_IndxChr_byRegion_ref){
my ($chr, $regionStart, $regionEnd) = @{$ary_IndxChr_byRegion_ref->[$i]}[0..2];
for my $coordPos ($regionStart .. $regionEnd){
push @{$output{CHROM}} , $chr;
push @{$output{POS}} , ${$$SVbyChr_Hsh_ref{$chr}}[$coordPos]->{POS};
for my $fieldType (keys %$output_Field_IndxHsh_ref){
while (my ($outputField_name, $outputField_index) = each %{$output_Field_IndxHsh_ref->{$fieldType}}){
push @{$output{$outputField_name}} , ${$$SVbyChr_Hsh_ref{$chr}}[$coordPos]->{$fieldType}[$outputField_index];
}
}
}
}
}
else{
my @chrAry;
push @chrAry, keys %SVbyChr;
for my $chr (@chrAry){
my ($start, $end) = (0, $#{$$SVbyChr_Hsh_ref{$chr}}); #grab ary index coords for entire chromosome
for my $coordPos ($start .. $end){
push @{$output{CHROM}} , $chr;
push @{$output{POS}} , ${$$SVbyChr_Hsh_ref{$chr}}[$coordPos]->{POS};
for my $fieldType (keys %$output_Field_IndxHsh_ref){
while (my ($outputField_name, $outputField_index) = each %{$output_Field_IndxHsh_ref->{$fieldType}}){
push @{$output{$outputField_name}} , ${$$SVbyChr_Hsh_ref{$chr}}[$coordPos]->{$fieldType}[$outputField_index];
}
}
}
}
}
return(\%output);
}
sub Parse_OUTPUT{
my ($outFile, $outputHsh_ref, $valid_VCF_fieldsAry_ref, $valid_VCF_infoAry_ref, $deparse, $header_EXISTS) = @_;
my @fields_VCF = @$valid_VCF_fieldsAry_ref;
my @info_VCF = @$valid_VCF_infoAry_ref;
my $index = 0;
for (@fields_VCF){ #splice info header out replace with selected user info sub fields
if ($_ =~ m/\AINFO\Z/i){
splice (@fields_VCF, $index, 1);
splice (@fields_VCF, $index, 0, @info_VCF);
}
$index++;
}
#check ouputHsh_ref for data consistency prior to parsing header to include subfield indicies
my $counter = 0;
my ($fieldLen, $priorLen);
for (@fields_VCF){ #confirm identical output lengths or die due to corrupt data ouput
if (exists $$outputHsh_ref{$_}){
$fieldLen = $#{$$outputHsh_ref{$_}};
$priorLen = $fieldLen if ($counter == 0);
$counter++;
}
if ($counter > 0){
die "INTERNAL ERROR uneven array lengths at MAIN:sub:WriteOutputFile.\n" if ($priorLen != $fieldLen);
$priorLen = $fieldLen;
}
}
my (@write_Header, @header); #begin parsing header-- find fields that need to be expanded- include suffix for internal field #
LINE: for my $field (@fields_VCF){
if ($field eq "CHROM" || $field eq "POS"){
push @header, $field;
next LINE;
}
if (exists $$outputHsh_ref{$field}){
if ($deparse) {
if (${$outputHsh_ref}{$field}[0] =~ m/[,]/g){ #does field need to be expanded
my @expanded_Field = split (/,/ , ${$outputHsh_ref}{$field}[0]); #get all entries for this data field
my @expanded_Header;
for my $number_Suffix(1 .. @expanded_Field){
push @expanded_Header, ($field . $number_Suffix);
}
push @header, @expanded_Header;
push @write_Header, $field;
}else {
push @header, $field;
push @write_Header, $field;
}
}else{
push @header, $field;
push @write_Header, $field;
}
}
}
my $ chr = $$outputHsh_ref{CHROM}[0];
print STDERR "Generating $fieldLen lines for output on Chromosome $chr with the selected fields\n@header\n";
if ($header_EXISTS == 0){
print $outFile join ("\t", @header) . "\n"; #full VCF ordered header for all output Fields see 232
$header_EXISTS++;
}
return (\@write_Header, $fieldLen, $header_EXISTS)
}
sub WriteOUT{
my ($outFile, $write_HeaderAry_ref, $fieldLen, $outputHsh_ref, $deparse) = @_;
for my $line_Num (0 .. $fieldLen){
my @line; #container for ouput per line
push @line , ${$outputHsh_ref}{CHROM}[$line_Num];
push @line , ${$outputHsh_ref}{POS}[$line_Num];
for my $usr_Field(@$write_HeaderAry_ref){
my $parse_Line = ${$outputHsh_ref}{$usr_Field}[$line_Num];
if ($deparse){
if ($parse_Line =~ m/[,]/g){
my @parsed_Line = split (/,/ , $parse_Line);
push @line, @parsed_Line;
}
else{
push @line , $parse_Line;
}
}
else{
push @line, $parse_Line;
}
}
print $outFile join ("\t", @line) . "\n";
}
print STDERR "File write succesful for chromosome block\n\n";
return;
}