Crawling Baidu Tieba images with Python

import urllib.request
import urllib.parse
import re
import os

"""
Crawl Baidu Tieba (download the images posted in the threads of one forum)
"""

def getSubUrl(url):
    """
    Collect the thread sub-URLs on a forum list page
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    html = urllib.request.urlopen(urllib.request.Request(url, headers=headers)).read().decode('utf-8')
    subUrl = re.compile(r'href="(/p/.*?)"').findall(html)
    for i in subUrl:
        getImages("http://tieba.baidu.com" + i)

def getImages(url):
    """
    Download the images on a thread page and store them in a folder
    """
    print(url + " started")
    html = urllib.request.urlopen(url).read().decode('utf-8')
    picture_url_list = re.compile('class="BDE_Image" src="(.*?)"').findall(html)
    if not os.path.exists('e:/photos'):
        os.mkdir('e:/photos')
        print("Created directory e:/photos")
    #os.chdir('e:/photos')
    # create a separate directory for each thread
    dir = 'e:/photos/' + url.split("/p/")[1]
    if not os.path.exists(dir):
        os.mkdir(dir)
        print("Created directory " + dir)
    os.chdir(dir)
    # download the images
    for i in range(len(picture_url_list)):
        picture_name = url.split("/p/")[1] + str(i) + '.jpg'
        try:
            urllib.request.urlretrieve(picture_url_list[i], picture_name)
            print("Downloaded: " + picture_url_list[i])
        except Exception:
            print("Failed to download " + picture_url_list[i])
    # remove the folder again if nothing could be saved
    if not os.listdir(dir):
        os.chdir('e:/photos')
        os.rmdir(dir)
        print("Removed empty folder: " + dir)

if __name__ == '__main__':
    kw = input("Enter the Tieba (forum) name: ")
    startPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    for i in range(startPage, endPage + 1):
        # each forum list page holds 50 threads, so page i starts at pn = (i - 1) * 50
        url = "http://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw}) + "&pn=" + str(i * 50 - 50)
        getSubUrl(url)
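The original getImages switches the working directory with os.chdir() and then writes relative filenames, so a run depends on where the process happens to be. As a minimal sketch of an alternative, the helper below redoes only the download step with absolute paths built by os.path.join; the name download_thread_images, the save_root parameter, and the shortened User-Agent string are illustrative assumptions, while the BDE_Image regex and the /p/<thread id> folder naming come from the script above.

import os
import re
import urllib.request

def download_thread_images(thread_url, save_root='e:/photos'):
    """Sketch: save every BDE_Image picture of one thread into its own folder, without os.chdir."""
    # illustrative, shortened User-Agent; reuse the full string from the script above if needed
    headers = {"User-Agent": "Mozilla/5.0"}
    req = urllib.request.Request(thread_url, headers=headers)
    html = urllib.request.urlopen(req).read().decode('utf-8')
    picture_urls = re.findall(r'class="BDE_Image" src="(.*?)"', html)

    # one folder per thread, named after the thread id, built as an absolute path
    thread_id = thread_url.split("/p/")[1]
    thread_dir = os.path.join(save_root, thread_id)
    os.makedirs(thread_dir, exist_ok=True)

    for i, pic_url in enumerate(picture_urls):
        target = os.path.join(thread_dir, thread_id + str(i) + '.jpg')
        try:
            urllib.request.urlretrieve(pic_url, target)
            print("Downloaded: " + pic_url)
        except Exception:
            print("Failed to download " + pic_url)

    # drop the folder again if nothing was saved
    if not os.listdir(thread_dir):
        os.rmdir(thread_dir)
        print("Removed empty folder: " + thread_dir)

Called as download_thread_images("http://tieba.baidu.com" + i) from getSubUrl, it produces the same e:/photos/<thread id>/ layout as getImages but never touches the process-wide working directory.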
Reposted from blog.csdn.net/weixin_42591674/article/details/84766495