python:脚本:1:AdapterRemoval_statistics_sample.py

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/genome_denovo/article/details/78606132

#!/usr/bin/env python
#coding:utf8
#author:zhounan

from __future__ import division
import os, string, sys

# Summarise AdapterRemoval results for one or more samples.
#
# For every sample name given on the command line the script reads
# "<name>.summary" (raw read / base counts) and "<name>.settings" (the
# AdapterRemoval report) from the current directory and appends a block of
# statistics to "each_sample_adapter_summary.xls".

# Row windows inside the "[Length distribution]" table, counted from its first
# data row (the line after the column header).  They reproduce the fixed
# offsets the script has always used: rows 0-14, rows 15-50 and rows 15-100.
# NOTE(review): this presumes the first data row is length 0, so the last two
# windows include lengths 50 and 100 themselves -- confirm against the
# AdapterRemoval version in use.
_ADAPTER_ONLY_END = 15
_KEPT_START = 15
_SHORT_50_END = 51
_SHORT_100_END = 101
# Index of the tab-separated column that is summed for every row.
# Presumably the "All" column of a paired-end report -- verify for other modes.
_COUNT_COLUMN = 5

_OUTPUT_NAME = 'each_sample_adapter_summary.xls'


def _open_text(path, mode='r'):
    """Open *path* as text; force UTF-8 on Python 3, native str on Python 2."""
    if sys.version_info[0] >= 3:
        return open(path, mode, encoding='utf-8')
    return open(path, mode)


def _read_lines(path):
    """Return the whitespace-stripped lines of *path*, closing the file."""
    with _open_text(path) as handle:
        return [line.strip() for line in handle]


def _percent(part, total):
    """Return part/total as a percentage, or 0.0 when *total* is zero."""
    if not total:
        return 0.0
    return float(part) / total * 100


def _table_rows(lines, start):
    """Return the consecutive table rows beginning at index *start*.

    The table ends at the first blank line, at the next "[section]" header
    or at the end of the file, whichever comes first.
    """
    rows = []
    for line in lines[start:]:
        if not line or line.startswith('['):
            break
        rows.append(line)
    return rows


def _sum_counts(rows):
    """Sum the count column over the given table rows."""
    return sum(int(row.split('\t')[_COUNT_COLUMN]) for row in rows)


def _parse_settings(lines, path):
    """Extract the retained totals and length-class counts from a settings file.

    Returns a tuple (retained_reads, retained_bases, adapter_only,
    short_50, short_100).  Raises ValueError when either "Number of retained
    ..." line is missing, instead of silently reusing a previous sample's
    values.
    """
    retained_reads = None
    retained_bases = None
    adapter_only = 0
    short_50 = 0
    short_100 = 0
    for index, line in enumerate(lines):
        key, _, value = line.partition(': ')
        if key == 'Number of retained reads':
            retained_reads = int(value)
        elif key == 'Number of retained nucleotides':
            retained_bases = int(value)
        elif line == '[Length distribution]':
            # index + 1 is the column header; data rows start at index + 2.
            # Slicing clips to the rows that exist, so libraries whose longest
            # read is shorter than the window no longer raise IndexError.
            rows = _table_rows(lines, index + 2)
            adapter_only += _sum_counts(rows[:_ADAPTER_ONLY_END])
            short_50 += _sum_counts(rows[_KEPT_START:_SHORT_50_END])
            short_100 += _sum_counts(rows[_KEPT_START:_SHORT_100_END])
    if retained_reads is None:
        raise ValueError("%s: 'Number of retained reads' line not found" % path)
    if retained_bases is None:
        raise ValueError(
            "%s: 'Number of retained nucleotides' line not found" % path)
    return retained_reads, retained_bases, adapter_only, short_50, short_100


def _sample_report(sample):
    """Build the report lines for one sample from its two input files."""
    summary = _read_lines('%s.summary' % sample)
    # The second line of the summary file carries the raw read count in
    # column 1 and the raw base count in column 2 (tab separated).
    fields = summary[1].split('\t')
    raw_reads = int(fields[1])
    raw_bases = int(fields[2])

    settings_path = '%s.settings' % sample
    (kept_reads, kept_bases, adapter_only,
     short_50, short_100) = _parse_settings(_read_lines(settings_path),
                                            settings_path)

    # Percentages are written with three decimals; the former '%d' conversion
    # truncated them to whole numbers.
    return [
        '%s样品接头统计结果:\n' % sample,
        '原始reads总数:\t%d\n' % raw_reads,
        '原始碱基总数:\t%d\n' % raw_bases,
        '去除接头后保留的reads数:\t%d\n' % kept_reads,
        '去除接头后保留的碱基数:\t%d\n' % kept_bases,
        '保留的reads数占原有的reads数的百分比:\t%.3f\n' % _percent(kept_reads, raw_reads),
        '保留的碱基数占原有的碱基数的百分比:\t%.3f\n' % _percent(kept_bases, raw_bases),
        '全是接头的reads数(包括discard reads数加上去除接头后reads长度小于15bp的read数):\t%d\n' % adapter_only,
        '全是接头的reads数占原始reads总数的百分比:\t%.3f\n' % _percent(adapter_only, raw_reads),
        '去除接头后reads长度小于50bp的read数:\t%d\n' % short_50,
        '去除接头后reads长度小于50bp的read数占原始reads总数的百分比:\t%.3f\n' % _percent(short_50, raw_reads),
        '去除接头后reads长度小于100bp的read数:\t%d\n' % short_100,
        '去除接头后reads长度小于100bp的read数占原始reads总数的百分比:\t%.3f\n\n' % _percent(short_100, raw_reads),
    ]


def main(argv):
    """Run the script for the command line *argv* (as in ``sys.argv``).

    ``argv[1]`` is a comma-separated list of sample names; empty entries are
    ignored.  Prints the usage text and exits when the argument count is wrong.
    """
    if len(argv) != 2:
        print(u'python AdapterRemoval_statistics_sample.py name,name,name')
        print(u'每个样本名需要用逗号进行分隔,只有一个样品也要加逗号,*.settings文件名需要与样本名对应')
        print(u'在使用AdapterRemoval的时候要加入--basename name这个参数')
        print(u'需要做Q20 Q30的summary统计,因为要统计原始的碱基数目')
        sys.exit()
    # A list (not a lazy filter object) so the code also works on Python 3.
    samples = [entry for entry in argv[1].split(',') if entry.strip()]
    report = []
    for sample in samples:
        report.extend(_sample_report(sample))
    with _open_text(_OUTPUT_NAME, 'w') as output:
        output.writelines(report)


if __name__ == '__main__':
    main(sys.argv)

cython
pypy

猜你喜欢

转载自blog.csdn.net/genome_denovo/article/details/78606132