根据SNP的位置从基因组提取上下游序列

代码如下:

#!/usr/bin/perl -w

use strict;

die "perl $0 <vcf> <genome>" if(@ARGV == 0);

#Author:[email protected]

my $vcf=shift;
my $genome=shift;

my%hash;
my $id;
open GENOME,$genome or die $!;
while(<GENOME>){
    chomp;
    if(/^>/){
        $id=$_;
        $id=~s/>//;
        $id=~s/ //g;
    }else{
        $hash{$id} .= $_;
    }
}
close GENOME;

my@temp;
my$pos;
my$start;
my$end;
my$len;
my$seq;
my$fetchseq;
my($refindelseq,$altindelseq,$upseq,$downseq,$downstart,$refindellen,$upend,$upstart);
open VCF,$vcf or die $!;
while(<VCF>){
    chomp;
    next if(/^Chr/);
    @temp=split/\t/;
    if(exists $hash{$temp[0]}){
       $seq=$hash{$temp[0]};
       $pos=$temp[1];
       $refindelseq=$temp[3];
       $altindelseq=$temp[4];
       $refindellen=length($refindelseq);
       $upstart=$pos-1-100;
       $upend=$pos-1;
       $upseq=substr($seq,$upstart,100);
       $downstart=$pos+$refindellen-1;
       $downseq=substr($seq,$downstart,100);
       $end=$pos+100+$refindellen-1;
       print "$_\t>$temp[0]_$upstart\_$end\t$upseq\[$refindelseq/$altindelseq\]$downseq\n"
    }
}
close VCF;

猜你喜欢

转载自www.cnblogs.com/raisok/p/11457083.html