NCBI batch download genomes

illustrate

  • Use `ftplib` to download genomes from NCBI.

to be improved

  • At present, it can only handle the case where the target folder does not contain subfolders. If there is a subfolder, the script only prints a reminder and does not descend into it.
  • At present, if there are multiple versions of annotations, the first version is downloaded by default.
  • The above deficiencies do not affect downloading the core sequence files.

premise

  1. A *.txt file in the working folder that stores the GenBank accession numbers, one per line.
  2. installed Python3.
  3. At present, the destination folder is hard-coded in the script; change it as needed, for example so that files are downloaded to the current path.

#!/usr/bin/python
#author: W.-S. Zheng
#date 20180429
# -*- coding: UTF-8 -*-
#usage: download genomes from ncbi database

import glob
from ftplib import FTP
import os
import sys


class MyPathes:
    """Snapshot of one FTP directory: its path, files, folders and links.

    On construction the directory is listed over the given FTP connection,
    every entry is classified by the first character of its ``LIST`` line
    and the full paths of the sub-folders are computed.

    Attributes:
        cur: remote directory path that was listed.
        ftp: FTP connection used for the listing; its working directory is
            changed to ``cur``.
        folders: names of the entries whose listing line starts with ``d``.
        links: names of the entries whose listing line starts with ``l``.
        files: names of all remaining entries.
        nextPathes: ``cur + '/' + folder`` for every name in ``folders``.
        content: raw ``LIST`` lines as received from the server.
    """

    def __init__(self, cur_path, ftp):
        self.cur = cur_path
        self.ftp = ftp
        self.folders = []
        self.files = []
        self.nextPathes = []
        self.links = []

        self.content = self.getContet()
        self.sort()
        self.createPathes()

    def getContet(self):
        """Change to ``self.cur`` on the server and return its LIST lines."""
        dir_content = []
        self.ftp.cwd(self.cur)
        self.ftp.retrlines('LIST', callback=dir_content.append)
        return dir_content

    @staticmethod
    def _entry_name(line):
        """Return the name column of one listing line.

        A Unix-style listing has eight metadata columns followed by the
        name, so splitting at most eight times keeps names that contain
        spaces intact. Lines with fewer columns fall back to their last
        whitespace-separated token.
        """
        return line.split(None, 8)[-1]

    def sort(self):
        """Classify ``self.content`` into ``folders``, ``links`` and ``files``.

        The lists are rebuilt from scratch, so calling this method again
        does not duplicate entries.
        """
        self.folders, self.links, self.files = [], [], []
        for line in self.content:
            # Blank lines and the "total N" summary some servers send first
            # do not describe an entry.
            if not line.strip() or line.startswith('total '):
                continue
            name = self._entry_name(line)
            if line.startswith('d'):
                self.folders.append(name)
            elif line.startswith('l'):
                # Symlinks are listed as "name -> target"; keep the name.
                self.links.append(name.partition(' -> ')[0])
            else:
                self.files.append(name)

    def oneDes(self):
        """Return True when the directory holds exactly one sub-folder."""
        return len(self.folders) == 1

    def createPathes(self):
        """Rebuild and return the list of sub-folder paths below ``cur``."""
        self.nextPathes = [self.cur + '/' + folder for folder in self.folders]
        return self.nextPathes

# Convert an accession number into its directory path on the FTP server.
def acc2path(acc):
    """Return the FTP directory that holds the versions of an assembly.

    The nine digits that follow the underscore are cut into three groups
    of three, e.g. ``GCA_000001405.15`` becomes
    ``/genomes/all/GCA/000/001/405``.

    Args:
        acc: assembly accession such as ``GCA_000001405.15``. Surrounding
            whitespace (for instance the newline left by ``readlines``)
            is ignored and the ``.version`` suffix is optional.

    Returns:
        The absolute remote path as a string. The tree is chosen from the
        accession prefix (``GCA`` or ``GCF``); any other prefix falls back
        to ``GCA``, which was the only tree supported before.

    Raises:
        IndexError: if ``acc`` contains no underscore.
    """
    parts = acc.strip().split('_')
    digits = parts[1].split('.')[0]

    prefix = parts[0].upper()
    if prefix not in ('GCA', 'GCF'):
        prefix = 'GCA'

    return '/'.join(
        ['/genomes/all', prefix, digits[0:3], digits[3:6], digits[6:9]]
    )

# Connect to the FTP server.
def con():
    """Open an FTP session to the NCBI server, log in and return it."""
    host, port = "ftp.ncbi.nlm.nih.gov", 21
    session = FTP()
    session.connect(host, port)
    # No user name or password is passed to the login call.
    session.login()
    return session

# Download a single file.
def downloadFile(filename, connection=None):
    """Download ``filename`` from the server into the local working folder.

    The remote file is looked up relative to the connection's current
    directory and is written locally under the same name. An existing
    local file of that name is overwritten.

    Args:
        filename: name of the remote file to retrieve.
        connection: object providing ``retrbinary`` (such as an
            ``ftplib.FTP`` session). When omitted, the module-level
            ``ftp`` connection is used, as before.
    """
    client = ftp if connection is None else connection
    # Mode 'wb' truncates an existing file instead of raising
    # FileExistsError, so the former handler for that error could never
    # run and has been dropped. The with-block guarantees the local file
    # is flushed and closed even when the transfer fails.
    with open(filename, 'wb') as local_file:
        client.retrbinary("RETR " + filename, local_file.write)

# Prepare the local folder for a download.
def downloadFolder(dir_folder):
    """Create the local folder ``dir_folder`` and make it the working folder.

    When the folder cannot be created because the name already exists, a
    notice is printed and the function still changes into it.
    """
    already_present = False
    try:
        os.mkdir(dir_folder)
    except FileExistsError:
        already_present = True

    if already_present:
        print('File existed')

    os.chdir(dir_folder)



# Read the GenBank accession numbers.
# They are stored in a *.txt file in the current folder; the first match
# returned by glob is used.
accession_files = glob.glob('*.txt')
print(accession_files)
if not accession_files:
    sys.exit('No *.txt file with accession numbers found in the current folder')

# Strip line endings and skip blank lines so that an empty line in the
# list does not break the accession-to-path conversion.
with open(accession_files[0], 'r') as handle:
    accessions = [line.strip() for line in handle if line.strip()]
print(accessions)

# Local folder that receives one sub-folder per downloaded assembly.
DEST_DIR = 'E://'

# Download one accession at a time. Only works when the assembly folder
# does not contain further sub-folders.
for acc in accessions:
    # downloadFolder() changes into the assembly folder, so go back to the
    # destination root before every accession.
    os.chdir(DEST_DIR)

    # Translate the accession number into a remote path.
    root = acc2path(acc)
    print(root)

    # One connection per accession, closed when the block is left.
    # The name ``ftp`` must stay at module level because downloadFile()
    # reads it from there.
    with con() as ftp:
        # Parse the content of the accession's directory.
        x1 = MyPathes(root, ftp)

        # Only continue when exactly one version folder is available.
        if not x1.folders:
            print("No version available!")
            continue
        if not x1.oneDes():
            print("More than one version available!")
            continue

        # Create the local folder and change into it.
        downloadFolder(x1.folders[0])
        # Enter the version folder on the server; the path is absolute,
        # so the same connection can be reused.
        x2 = MyPathes(x1.nextPathes[0], ftp)

        # Start downloading, printing a countdown of the remaining files.
        print('start!')
        prog = len(x2.files)
        for remote_name in x2.files:
            downloadFile(remote_name)
            sys.stdout.write("%i=" % (prog))
            sys.stdout.flush()
            prog = prog - 1

        # Sub-folders of the version folder are not downloaded.
        if not x2.nextPathes:
            print('Done')
        else:
            print('Must go deeper')

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325168901&siteId=291194637