Scraping Baidu Tieba

Goals:
1. Get the post title, total page count, comments, and images
2. Write the images to files
3. Print each piece of information along the way (for test tracing)
4. Do all of the above from just a post id (so it works for any post)
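The whole crawl hangs off one URL scheme: a post lives at http://tieba.baidu.com/p/<post id>, and its pages are addressed with a ?pn= query parameter. A minimal sketch of that scheme (the helper page_urls is illustrative, not part of the scripts below; 4252370485 is the example post id used throughout):

siteURL = "http://tieba.baidu.com/p/"

def page_urls(num, total_pages):
    """Yield the URL of every page of post `num` (illustrative helper)."""
    base = siteURL + str(num)
    for page in range(1, total_pages + 1):
        yield base + '?pn=' + str(page)

# list(page_urls(4252370485, 2))
# -> ['http://tieba.baidu.com/p/4252370485?pn=1',
#     'http://tieba.baidu.com/p/4252370485?pn=2']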
First version:
# -*- coding: utf-8 -*-
import random
import re
import os
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

# Base URL of a Tieba post; the post id is appended to it
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    Clean an HTML fragment: strip tags and normalize whitespace.
    :param x: list of matched strings (the result of re.findall)
    :return: cleaned string
    """
    # Join the list into a single string first, otherwise re.sub raises
    # "expected string or bytes-like object".
    x = ''.join(x)
    removeImg = re.compile(r'<img.*?>| {7}|&nbsp;')  # drop <img> tags, 7-space runs and &nbsp;
    removeAddr = re.compile(r'<a.*?>|</a>')  # drop hyperlink tags
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')  # turn line-breaking tags into \n
    replaceTD = re.compile(r'<td>')  # turn table cells <td> into \t
    replaceBR = re.compile(r'<br><br>|<br>|</br>|</br></br>')  # turn single or double <br> into \n
    removeExtraTag = re.compile(r'<.*?>')  # drop any remaining tags
    removeNoneLine = re.compile(r'\n+')  # collapse runs of blank lines
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # strip leading/trailing whitespace


def getSource(url):
    """
    Fetch the page source, rotating through 18 User-Agent strings
    to reduce the chance of being blocked.
    :param url:
    :return: page source as text
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]
    # Pick one of the 18 at random
    index = random.randint(0, len(user_agents) - 1)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def saveImage(imageURL, path, title, name, pageName):
    """
    Download one image and write it to a file.
    :param imageURL, path, title, name, pageName:
    :return:
    """
    try:
        # The image URL itself cannot be used as a filename because it
        # contains '/', so the caller passes in a safe name instead.
        proDir = os.path.split(os.path.realpath(__file__))[0]
        fileName = name + '.jpg'
        filePath = os.path.join(proDir, "photo", title, path, pageName)
        urllib.request.urlretrieve(imageURL, filePath + fileName)
    except Exception as e:
        print(e)


def getTitle(url):
    """
    Get the post title and print it.
    :param url:
    :return: title text
    """
    result = getSource(url)
    pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    items = re.findall(pattern, result)
    text = replace(items)
    print('Post title ------ ' + text)
    return text


def getPageNumber(url):
    """
    Get the total page count of the post and print it.
    :param url:
    :return: page count as a string
    """
    result = getSource(url)
    # A regex only returns the first occurrence of the pattern here,
    # so BeautifulSoup is used instead.
    soup = BeautifulSoup(result, 'lxml')
    items = soup.find_all('span', attrs={'class': 'red'})
    item = items[-1].get_text()
    page = replace(item)
    print('Total pages ------ ' + page)
    return page


def getContent(url):
    """
    Get the comments and print them.
    :param url:
    :return: list of (author, content) tuples
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    number = 1
    for item in items:
        data = 'Floor ' + str(number) + '\tAuthor: ' + item[0] + '\nContent: ' + item[1] + '\n'
        print(data)
        number += 1
    return items


def makeDir(path, title):
    """
    Create one directory per page under photo/<title>/.
    :param path, title:
    :return: path
    """
    proDir = os.path.split(os.path.realpath(__file__))[0]
    filePath = os.path.join(proDir, "photo", title)
    # strip() removes stray whitespace such as newlines
    path = path.strip()
    # os.makedirs raises if the directory already exists, hence the check;
    # os.chdir then switches the working directory to the new folder
    if not os.path.exists(os.path.join(filePath, path)):
        os.makedirs(os.path.join(filePath, path))
        os.chdir(os.path.join(filePath, path))
        print('Creating folder', path)
    else:
        print('Folder', path, 'already exists...')
    return path


def getImage(url):
    """
    Collect the image links on one page into a list.
    :param url:
    :return: list of image URLs
    """
    result = getSource(url)
    # BeautifulSoup is a better fit than a regex here.
    # find_all() returns a list, find() returns a single element.
    # 'class' clashes with the Python keyword, so use class_="..."
    # or an attrs dict as below.
    soup = BeautifulSoup(result, 'lxml')
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    for item in items:
        print('Found an image, link ------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print('\n', number, 'images on this page!')
    else:
        print('No images here...')
    return images


def getAllPage(Num, siteURL=siteURL):
    """
    Walk every page of the post.
    :param Num, siteURL:
    :return:
    """
    siteURL = siteURL + str(Num)
    # Get the post title
    title = getTitle(siteURL)
    # Get the page count
    numbers = getPageNumber(siteURL)
    # Visit every page
    for page in range(1, int(numbers) + 1):
        # Build the page URL
        url = siteURL + '?pn=' + str(page)
        print('\n\nFetching page', page, '...')
        # Get the comments
        print('\nFetching comments...')
        getContent(url)
        # One folder per page: page1, page2, page3, ...
        path = makeDir(path='page' + str(page), title=title)
        # Get the images
        print('\nFetching images...')
        images = getImage(url)
        print(images)
        print('\nSaving images...')
        number = 1
        # Save the images, reading the links collected above
        for detailURL in images:
            pageName = str(page) + str(number)
            name = 'page' + str(page) + 'num' + str(number)
            saveImage(detailURL, path, title, name, pageName)
            time.sleep(0.1)
            number += 1
        print('\n\nPage', page, 'done')
    print('\n\nAll done!')


def main():
    """Main entry point. Num is the post id: the trailing digits of the post URL."""
    Num = 4252370485
    getAllPage(Num)


if __name__ == "__main__":
    main()
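One weak spot in this version: urllib.request.urlretrieve has no timeout and no HTTP status check, so a stalled connection can hang the crawl and a failed request can silently save an error page as a .jpg. A hedged sketch of an alternative built on the requests library the script already imports (download_image is my name, not part of the script above):

import requests

def download_image(image_url, file_path, timeout=10):
    """Download one image with a timeout and a status check (sketch only)."""
    try:
        r = requests.get(image_url, timeout=timeout)
        r.raise_for_status()  # raise on 4xx/5xx instead of saving an error page
        with open(file_path, 'wb') as f:
            f.write(r.content)
        return True
    except requests.RequestException as e:
        print('Failed to download', image_url, '-', e)
        return False

Swapping this in for the urlretrieve call inside saveImage would leave the rest of the flow unchanged.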
After optimization: duplicated work has been removed and related steps merged.
# -*- coding: utf-8 -*-
"""
Scraping Baidu Tieba (optimized)
Goals:
1. Get the post title, total page count, comments, and images
2. Write the images to files
3. Print each piece of information along the way (for test tracing)
4. Do all of the above from just a post id (so it works for any post)
"""
import random
import re
import os
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

# Base URL of a Tieba post; the post id is appended to it
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    Clean an HTML fragment: strip tags and normalize whitespace.
    (Kept from the first version, although this version no longer calls it.)
    :param x: list of matched strings (the result of re.findall)
    :return: cleaned string
    """
    # Join the list into a single string first, otherwise re.sub raises
    # "expected string or bytes-like object".
    x = ''.join(x)
    removeImg = re.compile(r'<img.*?>| {7}|&nbsp;')  # drop <img> tags, 7-space runs and &nbsp;
    removeAddr = re.compile(r'<a.*?>|</a>')  # drop hyperlink tags
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')  # turn line-breaking tags into \n
    replaceTD = re.compile(r'<td>')  # turn table cells <td> into \t
    replaceBR = re.compile(r'<br><br>|<br>|</br>|</br></br>')  # turn single or double <br> into \n
    removeExtraTag = re.compile(r'<.*?>')  # drop any remaining tags
    removeNoneLine = re.compile(r'\n+')  # collapse runs of blank lines
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # strip leading/trailing whitespace


def getSource(url):
    """
    Fetch the page source.
    :param url:
    :return: page source as text
    """
    # 18 User-Agent strings to rotate through, to reduce the chance of being blocked
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]
    # Pick one of the 18 at random: randint(0, 17)
    index = random.randint(0, len(user_agents) - 1)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def get_Title_Numbers(url):
    """
    Get the post title and total page count, and print both.
    :param url:
    :return: title, page
    """
    result = getSource(url)
    # The title could be found with a regex, but the page count could not;
    # now that the two are merged, BeautifulSoup handles both.
    soup = BeautifulSoup(result, 'lxml')
    titles = soup.find_all('h1', attrs={'class': 'core_title_txt'})
    title = titles[0].get_text()
    print('Post title ------ ' + title)
    pages = soup.find_all('span', attrs={'class': 'red'})
    page = pages[-1].get_text()
    print('Total pages ------ ' + page)
    return title, page


def makeDir(title):
    """
    Create the output directory photo/<title>/.
    Earlier this failed when the folder already existed; the existence
    check below handles that case.
    :param title:
    :return: filePath with a trailing separator
    """
    proDir = os.path.split(os.path.realpath(__file__))[0]
    filePath = os.path.join(proDir, "photo", title)
    if not os.path.exists(filePath):
        # os.makedirs raises if the directory already exists, hence the check;
        # os.chdir then switches the working directory to the new folder
        os.makedirs(filePath)
        os.chdir(filePath)
        print('Creating folder', title)
    else:
        print('Folder', title, 'already exists...')
    return filePath + os.sep


def get_Content_Images(url, page, path):
    """
    Get the comments and images on one page: print the comments,
    save the images.
    :param url, page, path:
    :return:
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    print('\nFetching comments...')
    number = 1
    for item in items:
        data = 'Floor ' + str(number) + '\tAuthor: ' + item[0] + '\nContent: ' + item[1] + '\n'
        print(data)
        number += 1
    soup = BeautifulSoup(result, 'lxml')
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    print('\nFetching images...')
    for item in items:
        print('Found an image, link ------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print('\n', number, 'images on this page!')
    else:
        print('Hmm, no images here...')
    print('\nSaving images...')
    number = 1
    # Save the images, reading the links collected above
    for detailURL in images:
        name = 'page' + str(page) + 'num' + str(number)
        fileName = name + '.jpg'
        urllib.request.urlretrieve(detailURL, path + fileName)
        time.sleep(0.1)
        number += 1
    print('\n\nPage', page, 'images done!')


def getAllPage(Num, siteURL=siteURL):
    """
    Walk every page of the post.
    :param Num, siteURL:
    :return:
    """
    siteURL = siteURL + str(Num)
    # Fetching the title and the page count is now a single call
    title, numbers = get_Title_Numbers(siteURL)
    # Create the output folder
    path = makeDir(title)
    # Visit every page, from the first to the last
    for page in range(1, int(numbers) + 1):
        # Build the page URL; the ?pn= pattern comes from inspecting the address bar
        url = siteURL + '?pn=' + str(page)
        print('\n\nFetching page', page, '...')
        # Every page needs a fresh getSource(url), so fetching the comments
        # and the images is also merged into a single call
        get_Content_Images(url, page, path)
    print('\n\nAll done!')


def shuru():
    """
    Read the post id from standard input.
    :return: x
    """
    print("Enter a Baidu Tieba post id:")
    x = input("Post id: ")
    return x


def main():
    """Main entry point. Num is the post id: the trailing digits of the post URL."""
    # e.g. 4252370485
    Num = 4252370485
    getAllPage(Num)


if __name__ == "__main__":
    main()
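Note that the optimized version defines shuru() but main() still hard-codes the post id, while goal 4 calls for driving everything from an entered id. A minimal sketch of wiring the two together (using the shuru and getAllPage defined above):

def main():
    """Read the post id from the user instead of hard-coding it."""
    num = shuru().strip()
    if num.isdigit():
        getAllPage(int(num))
    else:
        print('A post id is digits only, e.g. 4252370485')

if __name__ == "__main__":
    main()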