# 小爬虫 - 从 PhysioNet 上下载 MIT-BIH Arrhythmia Database 的 ECG 数据


import urllib.request
import os


def url_open(url):
    """Fetch ``url`` and return the raw response body.

    A desktop-browser ``User-Agent`` header is attached to the request
    before it is sent.

    Args:
        url: Address to fetch, passed to ``urllib.request.Request``.

    Returns:
        bytes: The complete, undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urllib.error.HTTPError`` for HTTP error statuses).
    """
    req = urllib.request.Request(url)
    # Adjacent string literals are joined at compile time. A backslash
    # continuation inside a single literal would embed the following line's
    # indentation into the header value.
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36',
    )
    # Close the response (and its underlying connection) as soon as the body
    # has been read instead of leaving that to the garbage collector.
    with urllib.request.urlopen(req) as response:
        return response.read()


def save_file(file_url):
    """Download ``file_url`` and save it in the current working directory.

    The local file name is the last ``/``-separated component of the URL.
    The content is fetched *before* the local file is opened, so a failed
    request does not leave an empty file behind.

    Args:
        file_url: URL of the file to download.

    Raises:
        Exception: Whatever ``url_open`` raises for a failed request is
            propagated unchanged.
        OSError: If the local file cannot be created or written.
    """
    # Derive the local file name from the URL.
    filename = file_url.split('/')[-1]
    # Download first; only touch the file system once the data is in hand.
    content = url_open(file_url)
    with open(filename, 'wb') as f:
        f.write(content)


def download_file(folder="files"):
    """Download the ``.hea`` header files of the MIT-BIH Arrhythmia Database.

    Creates ``folder`` if it does not exist and makes it the process working
    directory (``save_file`` writes into the working directory). It then
    requests ``<n>.hea`` for every ``n`` in 100..234 from the PhysioNet base
    URL, skipping any request that fails. Finally it deletes every empty
    file found under the download folder.

    Note:
        The working directory is left set to ``folder`` when the function
        returns.

    Args:
        folder: Directory that receives the downloaded files. It is created
            (including parents) when missing.

    Raises:
        OSError: If ``folder`` cannot be created or entered, or if an empty
            file cannot be inspected or removed during cleanup.
    """
    # Create the target folder if it doesn't exist, then work inside it.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # Resolved after chdir, so the cleanup below scans exactly the directory
    # save_file wrote into, whatever relative form ``folder`` had.
    target_dir = os.getcwd()

    # Base URL of the database.
    url = "https://physionet.org/physiobank/database/mitdb/"

    for record in range(100, 235):
        # URL of the ECG signal header file '<record>.hea'.
        file_url = url + str(record) + '.hea'
        try:
            save_file(file_url)
        except Exception:
            # Best effort: presumably not every number in the range is a
            # published record, so a failed request is skipped. Unlike a
            # bare ``except``, this lets KeyboardInterrupt and SystemExit
            # through so the run can still be aborted.
            continue

    # Discard empty files left in the download folder. Paths are joined with
    # ``root`` so files in nested directories resolve correctly on any OS.
    for root, _dirs, files in os.walk(target_dir):
        for name in files:
            path = os.path.join(root, name)
            if os.path.getsize(path) == 0:
                os.remove(path)


# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_file()

# 猜你喜欢

# 转载自 blog.csdn.net/qq_23869697/article/details/80151289