学习Python的日子 爬虫(3)

一、使用正则得到尚硅谷老师的图片链接和简介信息,并且把图片保存下来,老师简介保存到文本中;

要求:

1)杨老师的信息图片<img src="pics/ygf.jpg"> 图片保存的名字叫ygf.jpg,其他老师类似;并且都保存到当前目录下的image目录

2)杨老师的信息保存文件名叫“ygf.txt”,其他老师类似;并且保存到当前目录下的text目录

import re,os
import requests

def downloader(teacher_tuple):
    """Download one teacher's photo and save the cleaned-up bio text.

    Args:
        teacher_tuple: ``(image_src, bio_html)`` pair. ``image_src`` is the
            image path relative to the site root (e.g. ``"pics/ygf.jpg"``);
            ``bio_html`` is the raw HTML fragment containing the bio.

    Side effects:
        Writes the photo to ``./image/<file name>`` and the bio, with HTML
        tags and all whitespace removed, to ``./text/<stem>.txt``. Both
        directories are created if they do not exist.
    """
    image_src = teacher_tuple[0]
    image_bytes = requests.get("http://www.atguigu.com/" + image_src).content

    # Write into the directory that is actually created; the previous code
    # created "./image" but opened "./images/...", which fails on a fresh run.
    image_dir = "./image"
    os.makedirs(image_dir, exist_ok=True)
    # Take the last path segment so nested paths ("a/b/c.jpg") also work.
    filename = image_src.rsplit("/", 1)[-1]
    with open(os.path.join(image_dir, filename), "wb") as f:
        f.write(image_bytes)
    print(filename,"写入完成")

    text_dir = "./text"
    os.makedirs(text_dir, exist_ok=True)
    textname = filename.split(".")[0]
    # Strip tags first, then every run of whitespace.
    content = re.sub(r"<.*?>", "", teacher_tuple[1])
    content = re.sub(r"\s+", "", content)
    # Explicit UTF-8: the bios are Chinese text and the platform default
    # encoding is not guaranteed to be able to represent them.
    with open(os.path.join(text_dir, textname + ".txt"), "w", encoding="utf-8") as f:
        f.write(content)
    print(textname+".txt 写入完成")


def html_dealer(html):
    """Extract teacher photo paths and bios from the page and save each pair.

    Args:
        html: Full HTML text of the teacher listing page.

    Image paths and bio fragments are collected by separate regular
    expressions and paired by position; every pair is passed to
    ``downloader``. If the two lists differ in length, only the pairs that
    exist are processed and a warning is printed, instead of failing with an
    ``IndexError`` part-way through.
    """
    image_list = re.findall(
        r'<div class="teacher_content.*\s*<img src="(.*?)">', html
    )

    content_list = re.findall(r'</div></div>([\s\S]*?)</div>', html)
    # Second pattern picks up bios the first one misses (presumably the
    # final entry, whose markup differs -- verify against the live page).
    content_list.extend(re.findall(r'老师</p>([\s\S]*?)<p', html))

    if len(image_list) != len(content_list):
        print(
            "warning: matched %d images but %d bios; pairing by position"
            % (len(image_list), len(content_list))
        )

    # zip stops at the shorter list, so a missing bio no longer crashes.
    for teacher_tuple in zip(image_list, content_list):
        downloader(teacher_tuple)


def main():
    """Fetch the teacher listing page and hand its decoded HTML to html_dealer."""
    response = requests.get("http://www.atguigu.com/teacher.shtml")
    # bytes.decode() with no argument decodes as UTF-8.
    html_dealer(response.content.decode())


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()


猜你喜欢

转载自blog.csdn.net/qq_42240071/article/details/80965276
今日推荐