由于之前有 Java 基础和 Web 开发基础,所以把廖雪峰老师的 Python 3 教程( https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000 )看到面向对象部分之后,就开始学习 Python 爬虫。
记录下今天的第一个爬虫例子
爬的是知乎的其中一个话题,但是感觉图片并没有爬完,目前还不清楚原因,有待以后深入研究。
贴下收获以及代码:
import os
import re
import urllib.request

# Simple image crawler: fetch one Zhihu question page, collect every https
# image URL found in its HTML and download each one into ``targetPath``.

# Default directory the downloaded images are written to.
targetPath = "D:\\python\\download\\images"

# An absolute https URL ending in ".png", ".gif" or ".jpg".  Quotes and angle
# brackets are excluded so a match cannot run from one HTML attribute into the
# next, and the literal dot keeps the match from stopping at the letters
# "png"/"jpg" that merely occur in the middle of a path.
_IMAGE_URL_PATTERN = re.compile(r'https://[^\s"\'<>]+?\.(?:png|gif|jpg)')


def saveFile(path, target_dir=None):
    """Return the local file path an image URL should be saved to.

    The target directory is created (including missing parents) if it does
    not exist yet.  The file name is the part of *path* after the last '/'.

    Args:
        path: URL (or slash-separated path) of the image.
        target_dir: Directory to save into; defaults to ``targetPath``.

    Returns:
        The destination path ``<target_dir>/<file name>``.
    """
    if target_dir is None:
        target_dir = targetPath
    # makedirs also creates missing parent directories and does not fail
    # when the directory already exists.
    os.makedirs(target_dir, exist_ok=True)
    file_name = path.rsplit('/', 1)[-1]
    destination = os.path.join(target_dir, file_name)
    print(destination)
    return destination


def _fetch_page(url, headers):
    """Download *url* and return its body decoded to text."""
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        # Decode the bytes properly; str(bytes) would yield the "b'...'" repr
        # with escaped newlines, which corrupts URL matching.
        return response.read().decode(charset, errors='replace')


def _find_image_urls(html):
    """Return the distinct image URLs in *html*, in order of first appearance."""
    return list(dict.fromkeys(_IMAGE_URL_PATTERN.findall(html)))


def main():
    """Fetch the question page and download every image it references."""
    url = "https://www.zhihu.com/question/36006897"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    html = _fetch_page(url, headers)
    for image_url in _find_image_urls(html):
        print(image_url)
        try:
            urllib.request.urlretrieve(image_url, saveFile(image_url))
        except Exception as exc:
            # Best effort: one failed image must not stop the rest, but
            # report why it failed and still let Ctrl-C through.
            print('挂掉了.....', exc)


if __name__ == '__main__':
    main()