"""
level1:
贴吧图片爬虫
输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名
贴吧名_xx.jpg
"""
from urllib import request
from urllib import parse
from urllib import error
from lxml import etree
import string
def tiebaSpider(url, beginPage, endPage):
    """Download the thumbnail images from each listing page of a tieba.

    url: fully encoded base search url ("http://tieba.baidu.com/f?kw=...")
    beginPage: first page number to crawl (inclusive)
    endPage: last page number to crawl (exclusive, matching range())

    Images are written to ./images/<kw>_<n>.jpg, where ``kw`` is the
    module-level tieba name read in the main block (global dependency
    kept for backward compatibility).
    """
    import os  # local import: os is not among this script's top-level imports
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    # Make sure the output directory exists before the first write.
    os.makedirs('./images', exist_ok=True)
    # Running image index across ALL pages: the original restarted ``i`` on
    # every page, so page N+1 silently overwrote the files of page N.
    saved = 0
    for page in range(beginPage, endPage):
        filename = "第" + str(page) + "页.html"
        # Build the full page url by appending the page-number parameter.
        fullurl = url + "&pn=" + str(page)
        print("正在下载" + filename)
        req = request.Request(fullurl, headers=headers)
        response = request.urlopen(req)
        html = etree.HTML(response.read())
        # Thumbnail urls in the listing grid.
        results = html.xpath('//div/a[@class="grbm_ele_a grbm_ele_big"]/img/@src')
        # BUG FIX: use a dedicated variable for the image url instead of
        # rebinding ``url`` (the original clobbered the page url, breaking
        # every listing page after the first).
        for img_url in results:
            images = request.urlopen(img_url).read()
            with open('./images/%s_%s.jpg' % (kw, saved), 'wb') as file:
                file.write(images)
            saved += 1
# 模拟 main 函数
if __name__ == "__main__":
proxy = {"http": "118.31.220.3:8080"}
proxy_support = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_support)
request.install_opener(opener)
kw = input("请输入需要爬取的贴吧:")
# 输入起始页和终止页,str转成int类型
beginPage = int(input("请输入起始页:"))
endPage = int(input("请输入终止页:"))
url = "http://tieba.baidu.com/f?"
key = parse.urlencode({"kw" : kw})
# 组合后的url示例:http://tieba.baidu.com/f?kw=lol
url = url + key
url = parse.quote(url, safe=string.printable)
tiebaSpider(url, beginPage, endPage)
# 正则方法:亲测可用 (regex variant below — tested and working)
"""
5、输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名 贴吧名_xx.jpg
"""
"""
level1:
贴吧图片爬虫
输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名
贴吧名_xx.jpg
"""
from urllib import request
from urllib import parse
from urllib import error
import string,re
def tiebaSpider(url, beginPage, endPage):
    """Regex variant: download the thumbnail images from each tieba listing page.

    url: fully encoded base search url ("http://tieba.baidu.com/f?kw=...")
    beginPage: first page number to crawl (inclusive)
    endPage: last page number to crawl (exclusive, matching range())

    Images are written to ./zhende/<kw>_<n>.jpg, where ``kw`` is the
    module-level tieba name read in the main block (global dependency
    kept for backward compatibility).
    """
    import os  # local import: os is not among this script's top-level imports
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    # Compile the image regex once, outside the page loop (loop-invariant).
    img_pattern = re.compile(r'<a .*?><img .*? bpic="(.*?)" .*?/></a>', re.I | re.S | re.M)
    # Make sure the output directory exists before the first write.
    os.makedirs('./zhende', exist_ok=True)
    # Running image index across ALL pages: the original restarted ``i`` on
    # every page, so page N+1 silently overwrote the files of page N.
    saved = 0
    for page in range(beginPage, endPage):
        filename = "第" + str(page) + "页.html"
        # Build the full page url by appending the page-number parameter.
        fullurl = url + "&pn=" + str(page)
        print("正在下载" + filename)
        req = request.Request(fullurl, headers=headers)
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", 'ignore')
        # Extract the thumbnail urls from the raw HTML.
        image = img_pattern.findall(resHtml)
        # BUG FIX: use a dedicated variable for the image url instead of
        # rebinding ``url`` (the original clobbered the page url, breaking
        # every listing page after the first).
        for img_url in image:
            images = request.urlopen(img_url).read()
            with open('./zhende/%s_%s.jpg' % (kw, saved), 'wb') as file:
                file.write(images)
            saved += 1
# 模拟 main 函数
if __name__ == "__main__":
proxy = {"http": "118.31.220.3:8080"}
proxy_support = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_support)
request.install_opener(opener)
kw = input("请输入需要爬取的贴吧:")
# 输入起始页和终止页,str转成int类型
beginPage = int(input("请输入起始页:"))
endPage = int(input("请输入终止页:"))
url = "http://tieba.baidu.com/f?"
key = parse.urlencode({"kw" : kw})
# 组合后的url示例:http://tieba.baidu.com/f?kw=lol
url = url + key
url = parse.quote(url, safe=string.printable)
tiebaSpider(url, beginPage, endPage)
# 附上爬取头像吧图片 (bonus below: crawler that saves each thread's images via requests + xpath)
"""
/**
* _ooOoo_
* o8888888o
* 88" . "88
* (| -_- |)
* O\ = /O
* ___/`---'\____
* . ' \\| |// `.
* / \\||| : |||// \
* / _||||| -:- |||||- \
* | | \\\ - /// | |
* | \_| ''\---/'' | |
* \ .-\__ `-` ___/-. /
* ___`. .' /--.--\ `. . __
* ."" '< `.___\_<|>_/___.' >'"".
* | | : `- \`.;`\ _ /`;.`/ - ` : | |
* \ \ `-. \_ __\ /__ _/ .-` / /
* ======`-.____`-.___\_____/___.-`____.-'======
* `=---='
* .............................................
* 佛曰:bug泛滥,我已瘫痪!
*/
"""
import requests
from lxml import etree
import os
import sys
# Optional proxy pool.
# NOTE(review): ``proxies`` is defined but never passed to any requests.get()
# call below, so it currently has no effect; pass ``proxies=proxies`` to the
# requests calls to actually use it.
proxies = {
    # BUG FIX: the scheme was "http//..." (missing colon), which requests
    # would reject as a malformed proxy url if it were ever used.
    "http": "http://115.225.88.99:8118",
}
# Old-IE user agent so Tieba serves the legacy, easily parsed markup.
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
# Tieba (forum) name and the 1-based page range to crawl.
teiba_name = input("请输入贴吧名字:")
strat_page = int(input("请输入开始爬取页数:"))
end_page = int(input("请输入爬取结束页数:"))
# Base listing url; ``word`` holds its query parameters ('kw' plus, later, 'pn').
url = "https://tieba.baidu.com/f?"
word = {'kw': teiba_name}
# Main crawl loop: walk the requested range of tieba listing pages, open
# every thread on each page, and download every BDE_Image found in the
# thread's posts into ./touxiang/.
while strat_page <= end_page:
    # Map the 1-based user page number to Tieba's 'pn' offset
    # (50 threads per listing page).
    urlpage = strat_page * 50 - 50
    # Advance to the next listing page for the following iteration.
    strat_page += 1
    # Store the page offset in the query-parameter dict.
    word['pn'] = urlpage
    # Request the tieba listing page.
    response = requests.get(url, word, headers=headers)
    # Body of the listing page.
    html = response.text
    # Parse the HTML.
    html = etree.HTML(html)
    # Match the thread-title containers on the listing page.
    results = html.xpath('//div[contains(@class,"threadlist_title pull_left j_th_tit")]')
    # Prefix for thread links (hrefs in the listing are site-relative).
    tieba_url = "https://tieba.baidu.com"
    # 1-based index of the current thread on this listing page.
    title_page = 0
    for result in results:
        # Count the current thread.
        title_page += 1
        # Relative link of the current thread.
        html_url = result.xpath('./a/@href')[0]
        # Title text of the current thread.
        html_title = result.xpath('./a')[0].text
        # Absolute thread url, ready for a 'pn' query parameter.
        full_tieba = tieba_url + html_url + '?'
        # Start from page 1 of the thread.
        page_now = {'pn': 1}
        # Fetch the first page of the thread to learn its total page count.
        response = requests.get(full_tieba, params=page_now, headers=headers)
        text = response.content.decode("utf-8",errors='ignore')
        html = etree.HTML(text)
        # Pager element: its last <span> holds the total number of pages.
        total_pages = html.xpath("//li[@style='margin-left:8px']/span[last()]")
        print(total_pages)
        total_pages = total_pages[0].text
        # Number of images saved so far for the current thread (used in file names).
        a = 0
        for each_page in range(1, int(total_pages) + 1):
            # Query parameter for the current page of the thread.
            page_now2 = {'pn': each_page}
            # Fetch the current page of the thread.
            response = requests.get(full_tieba, params=page_now2, headers=headers)
            text = response.content.decode("utf-8", errors='ignore')
            html = etree.HTML(text)
            # Urls of the images posted in the thread body on this page.
            res_image = html.xpath('//div[contains(@class,"p_content")]//img[@class="BDE_Image"]/@src')
            if len(res_image) != 0:
                # Download every image found on this page.
                for each_image in res_image:
                    try:
                        # Advance the per-thread image counter.
                        a += 1
                        response1 = requests.get(each_image, headers=headers)
                        # Raw image bytes.
                        html = response1.content
                        # Recover the 1-based listing page number from the 'pn' offset.
                        page_num = int(word['pn']) / 50 + 1
                        print("正在抓取中。。。当前页数:", int(page_num), "图片数:", len(res_image), "标题数:", title_page, html_title,
                              "标题内容页数:",
                              each_page)
                        new_path = "./touxiang/" + word['kw'] + "第" + str(int(page_num)) + "页" + str(
                            title_page) + "标题下" + str(each_page) + "图" + str(a) + ".jpg"
                        # Create the output folder on first use, then write the image.
                        parent_path = os.path.dirname(new_path)
                        if not os.path.exists(parent_path):
                            os.makedirs(parent_path)
                        with open(new_path, "wb") as file_o:
                            file_o.write(html)
                    except Exception as e:
                        # NOTE(review): swallows every error (network, disk)
                        # so one bad image does not abort the crawl — but it
                        # also hides real failures; consider logging ``e``.
                        pass