一、使用正则得到尚硅谷老师的图片链接和简介信息,并且把图片保存下来,老师简介保存到文本中;
要求:
1)杨老师的信息图片<img src="pics/ygf.jpg"> 图片保存的名字叫ygf.jpg,其他老师类似;并且都保存到当前目录下的image目录
2)杨老师的信息保存文件名叫“ygf.txt”,其他老师类似;并且保存到当前目录下的text目录
import re,os
import requests
def downloader(teacher_tuple):
    """Download one teacher's photo and save the cleaned-up biography text.

    The photo is written to ``./image/<name>`` and the biography to
    ``./text/<stem>.txt``, where ``<name>`` is the last path component of the
    image link and ``<stem>`` is that name without its extension. Both
    directories are created on demand.

    Args:
        teacher_tuple: ``(image_src, intro_html)``. ``image_src`` is the image
            path relative to the site root (e.g. ``"pics/ygf.jpg"``);
            ``intro_html`` is the raw HTML fragment holding the biography.

    Raises:
        Any exception raised by ``requests.get`` or by the file writes is
        propagated unchanged; nothing is caught here.
    """
    image_src, intro_html = teacher_tuple[0], teacher_tuple[1]
    image_dir = "./image"
    text_dir = "./text"

    image_bytes = requests.get("http://www.atguigu.com/" + image_src).content

    # Write into the very directory that is created here; creating one
    # directory and writing to a differently named one fails on a clean run.
    os.makedirs(image_dir, exist_ok=True)
    # Take the last path component so links with any number of "/" work.
    filename = image_src.rsplit("/", 1)[-1]
    with open(os.path.join(image_dir, filename), "wb") as f:
        f.write(image_bytes)
    print(filename,"写入完成")

    # Strip tags first, then every run of whitespace, leaving plain text.
    content = re.sub(r"<.*?>", "", intro_html)
    content = re.sub(r"\s+", "", content)

    os.makedirs(text_dir, exist_ok=True)
    textname = os.path.splitext(filename)[0]
    # Explicit UTF-8 so the Chinese text does not depend on the platform's
    # default locale encoding.
    with open(os.path.join(text_dir, textname + ".txt"), "w", encoding="utf-8") as f:
        f.write(content)
    print(textname+".txt 写入完成")
def html_dealer(html):
    """Extract each teacher's image link and biography HTML and download them.

    Image links and biography fragments are collected with regular
    expressions, paired up in document order, and every pair is passed to
    ``downloader`` as an ``(image_src, intro_html)`` tuple.

    Args:
        html: The decoded HTML text of the teacher listing page.

    Returns:
        None.
    """
    image_pattern = re.compile(r'<div class="teacher_content.*\s*<img src="(.*?)">')
    content_pattern = re.compile(r'</div></div>([\s\S]*?)</div>')
    # Presumably catches the final teacher's biography, whose markup the
    # pattern above does not match; its results are appended after the others.
    last_content_pattern = re.compile(r'老师</p>([\s\S]*?)<p')

    image_list = image_pattern.findall(html)
    content_list = content_pattern.findall(html)
    content_list.extend(last_content_pattern.findall(html))

    # Pairing by index would raise IndexError when fewer biographies than
    # images were matched; report the shortfall and process what can be paired.
    if len(content_list) < len(image_list):
        print(
            "warning: found %d images but only %d descriptions; "
            "unpaired images are skipped" % (len(image_list), len(content_list))
        )

    for teacher_tuple in zip(image_list, content_list):
        downloader(teacher_tuple)
def main():
    """Fetch the teacher listing page and pass its decoded HTML to ``html_dealer``."""
    page_url = "http://www.atguigu.com/teacher.shtml"
    response = requests.get(page_url)
    # bytes.decode() with no argument decodes the body as UTF-8.
    html_dealer(response.content.decode())
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()