gene prediction commend 3

若使用本方法,请引用此文,谢谢!

JoF | Free Full-Text | Genome Re-Annotation and Transcriptome Analyses of Sanghuangporus sanghuang

# 进入conda环境
sudo su
密码
conda activate training

05-整合预测结果

# 新建文件夹
mkdir evm && cd evm

 第一步:准备输入文件

# Augustus
/media/aa/DATA/SZQ2/bj/my/genome/5.1216/10.gene_prediction/braker/augustus.hints.gff3
# transdecoder
/media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/5_1216_db.assemblies.fasta.transdecoder.genome.gff3
# maker
/media/aa/DATA/SZQ2/bj/my/genome/5.1216/10.gene_prediction/maker3/maker.gff3
# assembler-5_1216_dtransdecoder
/media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/5_1216_db.pasa_assemblies.gff3

 整合文件

/media/aa/DATA/SZQ2/Augustus/scripts/add_name_to_gff3.pl < /media/aa/DATA/SZQ2/bj/my/genome/5.1216/10.gene_prediction/braker/augustus.hints.gff3 --out=augustus.hints2.gff3 --filter
/root/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/misc/augustus_GFF3_to_EVM_GFF3.pl augustus.hints2.gff3 >augustus.hints3.gff3
cat /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/5_1216_db.assemblies.fasta.transdecoder.genome.gff3 augustus.hints3.gff3 /media/aa/DATA/SZQ2/bj/my/genome/5.1216/10.gene_prediction/maker3/maker.gff3 > gene_predictions.gff3
cat /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/5_1216_db.pasa_assemblies.gff3 > transcript_alignments.gff3
perl -p -i -e 's/^#.*//s' gene_predictions.gff3 transcript_alignments.gff3

第二步:创建权重文件

cp /media/aa/DATA/SZQ2/bj/my/genome/3.15188A/10.gene_prediction/evm/weights.txt weights.txt
vi weights.txt
ABINITIO_PREDICTION      Augustus       8
ABINITIO_PREDICTION      maker       6
TRANSCRIPT      assembler-3_15188_db      9
OTHER_PREDICTION  transdecoder  10

第三步:分割原始数据

用于后续并行. 为了降低内存消耗,–segmentsSize设置的大小需要少于1Mb(这里是100k), --overlapSize的不能太小,如果数学好,可用设置成基因平均长度加上2个标准差,数学不好,就设置成10K吧

# 创建索引
ln -s /media/aa/DATA/SZQ2/bj/my/genome/5.1216/03plion_primary/pilon02.fasta genome.fasta
# 将输入数据分隔成小份数据
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/partition_EVM_inputs.pl --genome genome.fasta --gene_predictions gene_predictions.gff3 --transcript_alignments transcript_alignments.gff3 --segmentSize 500000 --overlapSize 10000 --partition_listing partitions_list.out

第四步:创建并行运算命令并且执行

# 对小份数据进行EVM并行运算
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/write_EVM_commands.pl --genome genome.fasta --gene_predictions gene_predictions.gff3 --transcript_alignments transcript_alignments.gff3 --output_file_name evm.out --weights `pwd`/weights.txt --partitions partitions_list.out >  commands.list
ParaFly -c commands.list -CPU 15

第五步:合并并行结果

~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/recombine_EVM_partial_outputs.pl --partitions partitions_list.out --output_file_name evm.out

第六步:结果转换成GFF3

~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/convert_EVM_outputs_to_GFF3.pl  --partitions partitions_list.out --output evm.out  --genome genome.fasta 
find . -regex ".*evm.out.gff3" -exec cat {} \; | bedtools sort -i - > EVM.all.gff
# 当前权重设置下,EVM的结果更加严格,需要按照实际情况调整,增加其他证据。
cd ..

06-注释过滤

# 新建文件夹
mkdir evm_pasa && cd evm_pasa
# 整合
cat ../evm/*/evm.out.gff3 > evm_out.gff3
gff3_clear.pl --prefix evm evm_out.gff3 > evm.gff3
cat ../evm/transcript_alignments.gff3 > evidence.gff3
# 对EVM得到的gene models进行过滤:
# 把/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/evm_genes_filtering.pl中my $Pfam_db = "/opt/biosoft/hmmer-3.1b2/Pfam-AB.hmm"替换成:my $Pfam_db = "/media/aa/DATA/SZQ2/Pfam-A.hmm"(已完成)
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/evm_genes_filtering.pl evm.gff3 evidence.gff3 0.3 ../evm/genome.fasta 1e-6 0.3 8 > evm.filter.gff3

 PASA 对 EVM 结果进行更新

# 对EVM生成的GFF3格式的结果文件按染色体位置进行排序,并重命名基因ID
gff3_clear.pl --prefix evm evm.filter.gff3 > evm.gff3
# 使用 PASA 对上一步生成的 evm.gff3 文件进行更新。
cp /media/aa/DATA/SZQ2/PASApipeline-v2.5.2/pasa_conf/pasa.annotationCompare.Template.txt annotationCompare.config
vim annotationCompare.conf
DATABASE=5_1216_db
# 检验GFF3文件的兼容性
/media/aa/DATA/SZQ2/PASApipeline-v2.5.2/misc_utilities/pasa_gff3_validator.pl evm.gff3
# 进入rnaseq环境,注意查看路径是否有变动!!
exit
conda activate rnaseq
# 写入路径(已完成)
echo 'PATH=$PATH:/media/aa/DATA1/software/fasta-36.3.8g/bin/' >> ~/.bashrc
source ~/.bashrc
# 运行3次Launch_PASA_pipeline.pl操作
/media/aa/DATA/SZQ2/PASApipeline-v2.5.2/Launch_PASA_pipeline.pl -c annotationCompare.config -A -g ../evm/genome.fasta -t /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta.clean -T -u /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta -L --annots evm.gff3
first_update_gff3=`ls *gene_structures_post_PASA_updates*.gff3 -t | head -n 1`
/media/aa/DATA/SZQ2/PASApipeline-v2.5.2/Launch_PASA_pipeline.pl -c annotationCompare.config -A -g ../evm/genome.fasta -t /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta.clean -T -u /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta -L --annots $first_update_gff3
second_update_gff3=`ls *gene_structures_post_PASA_updates*.gff3 -t | head -n 1`
/media/aa/DATA/SZQ2/PASApipeline-v2.5.2/Launch_PASA_pipeline.pl -c annotationCompare.config -A -g ../evm/genome.fasta -t /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta.clean -T -u /media/aa/DATA/SZQ2/bj/my/genome/5.1216/08.RNA-seq_analysis/zonghe/transcripts.fasta -L --annots $second_update_gff3
third_update_gff3=`ls *gene_structures_post_PASA_updates*.gff3 -t | head -n 1`
# 整理
gff3_clear.pl --prefix evmPasa $third_update_gff3 > evmPasa.gff3

导出基因序列和相关统计信息

# 按染色体顺序排序和重命名
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/sort_and_rename_pasa_updated_gff3.pl evmPasa.gff3 genePrefix > genome.gff3
# 按染色体顺序排序和重命名
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/gff3ToGtf.pl ../evm/genome.fasta genome.gff3 > genome.gtf
# 进入training环境
sudo su
密码
conda activate training
# 得到protein.fasta用于后续注释
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/gff3_file_to_proteins.pl genome.gff3 ../evm/genome.fasta prot > protein.fasta
# 得到CDS
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/gff3_file_to_proteins.pl genome.gff3 ../evm/genome.fasta CDS > CDS.fasta
# 得到cDNA
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/gff3_file_to_proteins.pl genome.gff3 ../evm/genome.fasta cDNA > cDNA.fasta
# 得到gene
~/anaconda3/envs/training/opt/evidencemodeler-1.1.1/EvmUtils/gff3_file_to_proteins.pl genome.gff3 ../evm/genome.fasta gene > gene.fasta
# 整理文件
perl -p -i -e 's/^(>\S+).*/$1/' protein.fasta CDS.fasta cDNA.fasta gene.fasta
# 得到bestGeneModels
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/bestGeneModels.pl genome.gff3 > bestGeneModels.gff3 2> geneModelsStatistic
# 得到bestprotein
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/bestProtein.pl protein.fasta > bestprotein.gff3 2> protein_statistic
# 退出
cd ..

使用BUSCO进行基因组完整性分析

# 新建文件夹
mkdir BUSCO && cd BUSCO
# 建立索引
ln -s /media/aa/DATA/SZQ2/bj/my/genome/5.1216/03plion_primary/pilon02.fasta genome.fasta
# 去除可变剪接
mkdir GFF3_files
/media/aa/DATA/SZQ2/Zhanmengtao_bin-master/bestGeneModels.pl ../evm_pasa/genome.gff3 > GFF3_files/genome.gff3.gff3
# 粘贴结果
touch GFF3_files.result.txt
# 提取蛋白序列
mkdir protein_files
for i in `ls GFF3_files/*.gff3`
do
    x=${i/*\//}
    x=${x/.gff3/}
    echo "gff3_to_protein.pl genome.fasta $i > protein_files/proteins.$x.fasta"
done > command.gff3_to_protein.list
ParaFly -c command.gff3_to_protein.list -CPU 9
gff3_to_protein.pl genome.fasta  ../evm_pasa/evmPasa.gff3 > evmPasae.gff3.fasta
mkdir busco_out
# 进入pfam_scan环境,注意查看路径是否有变动!!
exit
conda activate pfam_scan
# BUSCO分析
for i in `ls protein_files/proteins.*.fasta`
do
    x=${i/*\//}
    x=${x/proteins./}
    x=${x/.fasta/}
    echo "cd busco_out; busco -i ../$i -c 8 -o $x -m proteins -l /home/aa/anaconda3/envs/pfam_scan/databases/basidiomycota_odb10 --offline"
done > command.BUSCO.list
ParaFly -c command.BUSCO.list -CPU 10
# 结果在short_summary.specific.basidiomycota_odb10.genome.gff3.txt
# 统计结果并画柱形图
grep C: */*/short_summary*.txt | perl -pe 's/.*odb10.(.+?).txt/$1/; $str = " " x (10 - length($1)); s/^/$str/;' > BUSCO.summary.txt
cd busco_out
ln -s */short_summary* .
generate_plot.py -wd ./ -rt specific
perl -p -i -e 's/my_family, colour/my_family, face="italic", colour/ if m/axis.text.y/; s/size=1/size=0.4/; s/\%BUSCO/\% BUSCO/;' busco_figure.R
cat busco_figure.R | R --vanilla --slave
cd ../../

猜你喜欢

转载自blog.csdn.net/weixin_58269397/article/details/130148968