import re
from urllib import request, parse
from tools import Tools
import xlwt
'''
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw  forum (tieba) name
tab post type ("good" = featured threads)
pn  page offset of the thread list
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933  thread detail path
pn  page number inside a thread
'''
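# The query string above can also be assembled with urllib.parse instead of
# manual concatenation; a minimal sketch using the sample values from the URL
# above (build_list_url is an illustrative helper, not part of the spider):
def build_list_url(kw, pn=0, tab='good'):
    # parse.urlencode percent-encodes non-ASCII forum names as well
    return 'https://tieba.baidu.com/f?' + parse.urlencode(
        {'kw': kw, 'tab': tab, 'cid': '', 'pn': pn})
# e.g. build_list_url('nba') == 'https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0'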
class BDTBSpider(object):
    def __init__(self):
        self.url = 'https://tieba.baidu.com'
        self.html = ''
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
    # Send a request and record the decoded HTML source
    def get_html(self, url):
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        self.html = response.read().decode('utf-8', 'ignore')
    # Parse the thread list page
    def parse_link(self):
        pattern = re.compile('<div class="threadlist_title.*?<a rel="noreferrer".*?href="(.*?)".*?title="(.*?)"', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            print('Crawling "{}", please wait...'.format(info[1]))
            # Build the full thread detail URL
            url = self.url + info[0]
            self.get_html(url)
            # Create a workbook for this thread
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet('data')
            sheet.write(0, 0, 'Nickname')
            sheet.write(0, 1, 'Badge title')
            sheet.write(0, 2, 'User level')
            sheet.write(0, 3, 'Post content')
            sheet.write(0, 4, 'Client')
            sheet.write(0, 5, 'Floor')
            sheet.write(0, 6, 'Post date')
            self.count = 1
            # Parse every page of the thread into the sheet
            self.parse_detail(sheet)
            # Save one .xls file per thread, named after its title
            workbook.save(info[1] + '.xls')
        # Find the position of the 'class="next pagination-item' marker first,
        # then regex only the slice just before it for the next list page
        index = self.html.find('class="next pagination-item')
        next_html = self.html[index-80:index]
        next_pat = re.compile('<a href="(.*?)"')
        next_link = re.search(next_pat, next_html)
        # if next_link:
        #     link = 'http:' + next_link.group(1)
        #     self.get_html(link)
        #     self.parse_link()
        # else:
        #     print('No next page')
    # Parse a thread detail page
    def parse_detail(self, sheet):
        print('Crawling the next page, please wait...')
        # Regex that pulls each post's data out of self.html
        pattern = re.compile('<li class="d_name".*?>(.*?)</li>.*?class="d_badge_title ">(.*?)</div>.*?class="d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<div class="post-tail-wrap">(.*?)</div>', re.S)
        res = re.findall(pattern, self.html)
        # Iterate over the matched posts
        for info in res:
            nickname = Tools.strip_char(info[0])
            content = Tools.strip_char(info[3])
            # msg is a tuple holding the cleaned (client, floor, date) values
            msg = Tools.get_client_floor_date(info[4])
            sheet.write(self.count, 0, nickname)
            sheet.write(self.count, 1, info[1])
            sheet.write(self.count, 2, info[2])
            sheet.write(self.count, 3, content)
            sheet.write(self.count, 4, msg[0])
            sheet.write(self.count, 5, msg[1])
            sheet.write(self.count, 6, msg[2])
            self.count += 1
        # Locate the next-page link (the page's label is the Chinese text 下一页)
        index = self.html.find('下一页')
        if index != -1:
            next_html = self.html[index-40:index]
            pattern = re.compile('<a href="(.*?)"')
            next_href = re.search(pattern, next_html).group(1)
            print(next_href)
            # Build the absolute URL
            url = self.url + next_href
            self.get_html(url)
            self.parse_detail(sheet)
        else:
            print('No next page')
    def start(self, name):
        # parse.quote() makes non-ASCII forum names safe in the query string
        self.get_html('https://tieba.baidu.com/f?kw=%s&tab=good&cid=&pn=0' % parse.quote(name))
        self.parse_link()
if __name__ == '__main__':
    bdtb = BDTBSpider()
    name = input('Enter the tieba (forum) name: ')
    bdtb.start(name)
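# The tools module imported above is not included in these notes. Below is a
# minimal sketch of what Tools might look like; the regexes are assumptions
# inferred from how the methods are used (strip markup from a snippet, and
# pull the client, floor and date text out of the post-tail block), not the
# original course code.
#
# class Tools(object):
#     @staticmethod
#     def strip_char(s):
#         # Drop tags, HTML entities and surrounding whitespace (assumed behavior)
#         s = re.sub(re.compile(r'<.*?>|&\w+;', re.S), '', s)
#         return s.replace('\n', '').strip()
#
#     @staticmethod
#     def get_client_floor_date(tail_html):
#         # Keep the non-empty text fragments between tags (assumed behavior)
#         parts = [p.strip() for p in re.split(r'<.*?>', tail_html) if p.strip()]
#         while len(parts) < 3:
#             parts.insert(0, '')
#         # Last three fragments: client, floor, post date
#         return tuple(parts[-3:])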
===================================================================
#coding:utf-8
import re
from urllib import request
import time
from tools import Tools, DBManager
class QSBKSpider(object):
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/text/page/1'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        self.html = ''
    def get_html(self):
        count = 1
        while True:
            try:
                req = request.Request(url=self.url, headers=self.headers)
                response = request.urlopen(req)
                self.html = response.read().decode('utf-8')
            except Exception as e:
                count += 1
                if count > 10:
                    print('Failed to fetch the current page!')
                    break
                print('%s, retrying connection (attempt %s)...' % (e, count))
            else:
                break
            # Pause before the next retry
            time.sleep(1)
    # Parse the page data
    def parse_data(self):
        # Regex for author, gender/age, content, vote count and comment count
        pattern = re.compile('<div.*?class="author.*?<h2>(.*?)</h2>.*?<div class="articleGender.*?>(.*?)</div>.*?<div class="content.*?<span>(.*?)</span.*?<i class="number.*?>(.*?)</i.*?<i.*?>(.*?)</i>', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            data = list(info)
            data[0] = Tools.strip_char(info[0])
            data[2] = Tools.strip_char(info[2])
            DBManager.insert_data(data)
        # Matching the next-page link directly is fragile, so locate the
        # 'class="next"' marker first
        index = self.html.find('class="next"')
        # If the marker exists, there is a next page
        if index != -1:
            # Slice out the string just before the marker
            s = self.html[index-90:index]
            # Then run the regex on that short slice only
            pattern = re.compile('href="(.*?)"')
            next_href = re.search(pattern, s)
            page = next_href.group(1).split('/')[-2]
            print('Crawling page {}'.format(page))
            self.url = 'https://www.qiushibaike.com' + next_href.group(1)
            self.get_html()
            # Recurse to parse the new page
            self.parse_data()
        else:
            print('No next page..')
        # next_pat = re.compile('<span class="next">(.*?)</span>', re.S)
        # res = re.search(next_pat, self.html)
        # if res:
        #     print('Processing the next page...')
        #     # Regex for the next-page link
        #     next_pat = re.compile('<span class="dots".*?<a.*?<a.*?<a href="(.*?)".*?>.*?<span.*?>', re.S)
        #     res = re.search(next_pat, self.html)
        #     # Build the full link
        #     self.url = 'https://www.qiushibaike.com' + res.group(1)
        #     print(self.url)
        #     self.get_html()
        #     # Recurse to parse the data
        #     self.parse_data()
        #
        # else:
        #     print('No more pages')
    def start(self):
        self.get_html()
        self.parse_data()
if __name__ == '__main__':
    # Connect to the database
    DBManager.connet_db()
    # Run the spider
    qsbk = QSBKSpider()
    qsbk.start()
    # Close the database
    DBManager.close_db()
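# tools.DBManager is likewise not included in these notes. A minimal sketch of
# a compatible interface is below, backed by sqlite3 from the standard library
# (an assumption; the original may use MySQL). The method names, including the
# connet_db spelling, mirror the call sites above, and the five columns match
# the five regex groups inserted per joke.
#
# import sqlite3
# class DBManager(object):
#     conn = None
#     @classmethod
#     def connet_db(cls):
#         cls.conn = sqlite3.connect('qsbk.db')
#         cls.conn.execute('CREATE TABLE IF NOT EXISTS jokes '
#                          '(author TEXT, age TEXT, content TEXT, votes TEXT, comments TEXT)')
#     @classmethod
#     def insert_data(cls, data):
#         cls.conn.execute('INSERT INTO jokes VALUES (?,?,?,?,?)', tuple(data))
#         cls.conn.commit()
#     @classmethod
#     def close_db(cls):
#         cls.conn.close()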
=====================================================
#coding:utf-8
import re
from urllib import request, parse
from tools import Tools
import xlwt
'''
1. init function
2. spider start function
3. request function
4. parse function
'''
class NHSpider(object):
    def __init__(self):
        # Base URL; max_time is a timestamp cursor appended per request
        self.url = 'http://neihanshequ.com/joke/?is_json=0&app_name=neihanshequ_web&max_time='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # Holds the HTML source
        self.html = ''
        # Create the workbook up front
        self.workbook = xlwt.Workbook(encoding='utf-8')
        # Add a data sheet
        self.sheet = self.workbook.add_sheet('内涵段子')
        # Write the header row
        self.sheet.write(0, 0, 'Avatar')
        self.sheet.write(0, 1, 'Nickname')
        self.sheet.write(0, 2, 'Post time')
        self.sheet.write(0, 3, 'Joke content')
        self.sheet.write(0, 4, 'Likes')
        self.sheet.write(0, 5, 'Dislikes')
        self.sheet.write(0, 6, 'Comments')
        self.sheet.write(0, 7, 'Shares')
        # Row counter used when writing data
        self.count = 1
    # Request function
    def get_html(self, max_time='1520381421'):
        # Append max_time to build the full URL
        url = self.url + max_time
        req = request.Request(
            url,
            headers=self.headers
        )
        response = request.urlopen(req)
        # Record the fetched HTML source
        self.html = response.read().decode('utf-8')
    # Parse function
    def parse_data(self):
        # Regex for the joke entries: avatar, name, time, content, then the
        # digg, bury, repin, share and comment counts (nine groups in total)
        data_pat = re.compile('<div.*?class="detail-wrapper.*?<img.*?src="(.*?)".*?class="name">(.*?)</span.*?<span.*?>(.*?)</span.*?<h1.*?>(.*?)</h1>.*?class="digg">(.*?)</span.*?class="bury">(.*?)</span.*?class="repin">(.*?)</span.*?class="share">(.*?)</span.*?class="comment.*?>(.*?)</span>', re.S)
        res = re.findall(data_pat, self.html)
        for msg in res:
            print('Crawling joke #%s' % self.count)
            date = Tools.strip_char(msg[2])
            content = Tools.strip_char(msg[3])
            self.sheet.write(self.count, 0, msg[0])   # avatar
            self.sheet.write(self.count, 1, msg[1])   # nickname
            self.sheet.write(self.count, 2, date)     # post date
            self.sheet.write(self.count, 3, content)  # joke content
            self.sheet.write(self.count, 4, msg[4])   # likes (digg)
            self.sheet.write(self.count, 5, msg[5])   # dislikes (bury)
            self.sheet.write(self.count, 6, msg[8])   # comments: class="comment" is the ninth group
            self.sheet.write(self.count, 7, msg[7])   # shares
            # Advance the row counter
            self.count += 1
        # Regex for the next max_time cursor
        max_pat = re.compile("max_time: '(.*?)'", re.S)
        res = re.search(max_pat, self.html)
        # As long as a max_time cursor exists there is more data; a bound such
        # as self.count > 300 would stop after 300 jokes instead (see the
        # sketch after this class)
        if res:
            # Request the next batch using the new max_time
            self.get_html(res.group(1))
            # Parse it; a function calling itself like this is recursion
            self.parse_data()
        else:
            self.workbook.save('内涵段子.xls')
    # Spider start function
    def start(self):
        # Fetch the first page with the default max_time
        self.get_html()
        self.parse_data()
if __name__ == '__main__':
    nh = NHSpider()
    nh.start()
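# parse_data above calls itself once per page, so a very long crawl can run
# into CPython's default recursion limit (about 1000 frames). The cap hinted
# at in the comments, or an iterative rewrite, avoids that. A minimal sketch
# of the loop form; parse_page is a hypothetical method that would hold only
# the per-page extraction from parse_data:
#
# def parse_all(self, max_items=300):
#     max_pat = re.compile("max_time: '(.*?)'", re.S)
#     while self.count <= max_items:
#         self.parse_page()
#         res = re.search(max_pat, self.html)
#         if not res:
#             break
#         self.get_html(res.group(1))
#     self.workbook.save('内涵段子.xls')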
============================================================
#coding:utf-8
import re
# math.ceil() is used when computing the page count
import math
from urllib import request, parse
# xlwt writes Excel spreadsheets
import xlwt
from random import choice
# 1. Create a workbook object
# workbook = xlwt.Workbook(encoding='utf-8')
# # 2. Add a sheet
# sheet = workbook.add_sheet('python职位表')
# # 3. Write data into the sheet
# sheet.write(0, 0, 'Job title')
# sheet.write(0, 1, 'Location')
# sheet.write(0, 2, 'Company')
# sheet.write(0, 3, 'Salary')
# sheet.write(0, 4, 'Post date')
# # 4. Save
# workbook.save('python职位信息.xls')
'''
https://sou.zhaopin.com/jobs/searchresult.ashx?kw=python&sm=0&p=1
https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京%2B上海%2B广州%2B深圳%2B杭州&kw=python&p=1
jl  city/cities to search
kw  search keyword
p   page number
'''
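# In the second URL above, %2B is the percent-encoding of the '+' that joins
# the city names, and the longer escapes are the UTF-8 encoding of the names
# themselves; parse.quote/parse.urlencode produce both automatically:
#
# from urllib import parse
# parse.quote('北京+上海')   # -> '%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7'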
'''
1. init function
   kw search keyword, base url, request headers, self.html to hold the source,
   total_page for the total page count
2. start function
   spider entry point
3. get_html function
   fetch the html for a url, decode it to str, assign it to self.html
4. parse_total function
   extract the total job count from the html with a regex and compute the page
   count; math.ceil() rounds up (see the sketch below)
5. parse_info function
   fetch each page, extract the job fields with a regex, do some light
   cleaning, and write the rows into a spreadsheet
6. filter function
   strip the leftover markup from the matched data
'''
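# math.ceil() versus floor division, as used in parse_total below: with 60
# results per page, 1234 jobs need 21 pages, not 20.
#
# import math
# math.ceil(1234 / 60)   # -> 21: the last, partial page still counts
# 1234 // 60             # -> 20: floor division would drop the last 34 jobs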
# Zhaopin (zhaopin.com) job-listing spider
class ZLSpider(object):
    def __init__(self, kw, citys):
        data = {
            'jl': '+'.join(citys),
            'kw': kw,
        }
        # Record the search keyword
        self.kw = kw
        # URL-encode the parameters
        data = parse.urlencode(data)
        # Build the full search URL
        self.url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + data
        # Pool of User-Agent headers to rotate through
        self.UserAgents = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Mozilla/5.0 (Windows NT 10; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
            'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13'
        ]
        # Holds the HTML source
        self.html = ''
    # Strip leftover tags and &nbsp; entities from the matched data
    def fillter_data(self, info):
        # Tuple to list so items can be replaced
        rs_list = list(info)
        rs_list[0] = re.sub(re.compile('<.*?>|&nbsp;'), '', info[0])
        rs_list[1] = re.sub(re.compile('<.*?>'), '', info[1])
        rs_list[-1] = re.sub(re.compile('<.*?>|&nbsp;'), '', info[-1])
        return rs_list
    # Request function
    def get_html(self, url):
        # Build the request with a randomly chosen User-Agent
        req = request.Request(url, headers={
            'User-Agent': choice(self.UserAgents)})
        # Send the request, receive the response
        response = request.urlopen(req)
        # Decode the body (this site uses utf-8; others may use gbk/gb2312)
        self.html = response.read().decode('utf-8')
    # Parse the total job count and compute the total page count
    def parse_total(self):
        print(self.url)
        # Regex for the total count shown in the results header
        pattern = re.compile('<span.*?em>(.*?)</em>', re.S)
        rs = re.search(pattern, self.html)
        # Total job count as an int
        total_zw = int(rs.group(1))
        # 60 results per page; math.ceil() rounds up, so a final partial page
        # still counts (total_zw // 60 would round down)
        total_page = math.ceil(total_zw / 60)
        print('Found {} jobs across {} pages'.format(total_zw, total_page))
        # Zhaopin only serves the first 90 pages of results
        self.total_page = min(total_page, 90)
    # Parse the job info on every page
    def parse_info(self):
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet(self.kw + '职位表')
        sheet.write(0, 0, 'Job title')
        sheet.write(0, 1, 'Company')
        sheet.write(0, 2, 'Min monthly salary')
        sheet.write(0, 3, 'Max monthly salary')
        sheet.write(0, 4, 'Location')
        sheet.write(0, 5, 'Post date')
        # Row number for writing into the sheet
        count = 1
        # Only the first 10 pages are crawled here; widen the range (up to
        # self.total_page) for a full run
        for page in range(1, 11):
            print('Crawling page {}, please wait...'.format(page))
            # Append the page number to the URL
            url = self.url + '&p={}'.format(page)
            self.get_html(url)
            # Regex for one job row in the results table
            pattern = re.compile('<table.*?class="newlist.*?<td class="zwmc.*?<a.*?>(.*?)</a>.*?class="gsmc.*?<a.*?>(.*?)</a>.*?class="zwyx.*?>(.*?)</td>.*?class="gzdd.*?>(.*?)</td.*?class="gxsj.*?>(.*?)</td>', re.S)
            res = re.findall(pattern, self.html)
            for s in res:
                rs_list = self.fillter_data(s)
                sheet.write(count, 0, rs_list[0])
                sheet.write(count, 1, rs_list[1])
                # Split the monthly salary into min and max
                # (see the standalone sketch after this script)
                if '-' in rs_list[2]:
                    max_money = rs_list[2].split('-')[1]
                    min_money = rs_list[2].split('-')[0]
                else:
                    # '面议' means the salary is negotiable
                    max_money = min_money = '面议'
                sheet.write(count, 2, min_money)
                sheet.write(count, 3, max_money)
                sheet.write(count, 4, rs_list[3])
                sheet.write(count, 5, rs_list[4])
                # Next row
                count += 1
        workbook.save(self.kw + '智联职位信息.xls')
    # Spider entry point
    def start(self):
        self.get_html(self.url)
        self.parse_total()
        self.parse_info()
if __name__ == '__main__':
    kw = input('Enter the job title to search for: ')
    citys = []
    while len(citys) < 5:
        city = input('Enter a city to search (up to 5, q to finish): ')
        if city == 'q':
            break
        citys.append(city)
    zl = ZLSpider(kw, citys)
    zl.start()
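# The salary handling in parse_info keeps both ends of a range like
# '8000-12000' and falls back to '面议' (negotiable) when no range is given.
# The same logic as a standalone function, for quick testing:
#
# def split_salary(zwyx):
#     if '-' in zwyx:
#         return zwyx.split('-')[0], zwyx.split('-')[1]
#     return '面议', '面议'
#
# split_salary('8000-12000')   # -> ('8000', '12000')
# split_salary('面议')          # -> ('面议', '面议')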
===================================================
#coding:utf-8
# Networking package
from urllib import request, parse
# Regular expressions
import re
import os
# Send a request and return the response body
def get_html(url):
    '''
    Send a request to the given url and return the response data.
    :param url: request address
    :return: the html source as a str
    '''
    # Build the Request object
    req = request.Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
    )
    # Send the request, receive the response
    response = request.urlopen(req)
    # Decode the body
    html = response.read().decode('utf-8')
    # Return the html source
    return html
# Extract the detail urls and category titles with a regex
def get_detail(html):
    '''
    Extract the detail urls and category titles with a regex.
    :param html: page source
    :return: None
    '''
    # 1. Prepare the regex
    pattern = re.compile('<div class="il_img.*?<a href="(.*?)" title="(.*?)"')
    # 2. Extract the data
    res = re.findall(pattern, html)
    # 3. Iterate over the matches
    for info in res:
        link = info[0]
        title = info[1]
        path = 'images/' + title
        if not os.path.exists(path):
            # makedirs also creates the parent 'images' directory on first run
            # (os.mkdir would fail if 'images' did not exist yet)
            os.makedirs(path)
        # Build the detail url
        detail_url = 'http://www.ivsky.com' + link
        # Extract the image srcs on the detail page and download them
        get_img_src(detail_url, path)
# Extract the image addresses with a regex and download the images
def get_img_src(url, path):
    # Fetch the detail page html
    html = get_html(url)
    pattern = re.compile('<div class="il_img.*?<img src="(.*?)"')
    res = re.findall(pattern, html)
    for src in res:
        print(src)
        # Use the last path segment as the file name
        name = src.split('/')[-1]
        # Download the image
        request.urlretrieve(src, path + '/' + name)
# Spider main function
def main():
    url = 'http://www.ivsky.com/tupian/ziranfengguang/'
    html = get_html(url)
    get_detail(html)
# Run only when this file is executed directly
if __name__ == '__main__':
    main()
================================================
#coding:utf-8
# Networking package
from urllib import request, parse
# Regular expressions
import re
import os
# Scenery image spider class
class IvskySpider(object):
    def __init__(self):
        # Spider start address
        self.url = 'http://www.ivsky.com/tupian/ziranfengguang/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        self.html = ''
    def get_html(self):
        # Build the request object
        req = request.Request(self.url, headers=self.headers)
        # Send the request
        response = request.urlopen(req)
        # Decode the html
        html = response.read().decode('utf-8')
        # Assign it to the instance attribute instead of returning it
        self.html = html
    # Parse the detail links and titles
    def parse_detail(self):
        # 1. Prepare the regex
        pattern = re.compile(
            '<div class="il_img.*?<a href="(.*?)" title="(.*?)"')
        # 2. Extract the data
        res = re.findall(pattern, self.html)
        # 3. Iterate over the matches
        for info in res:
            link = info[0]
            title = info[1]
            path = 'images/' + title
            if not os.path.exists(path):
                # makedirs also creates the parent 'images' directory
                os.makedirs(path)
            # Build the detail url
            detail_url = 'http://www.ivsky.com' + link
            # Record the url and save path on the instance
            self.url = detail_url
            self.path = path
            # Parse and download this category's images
            self.parse_src_download()
    # Parse each image link on the detail page and save the image
    def parse_src_download(self):
        # Fetch the detail page html (self.url was set in parse_detail)
        self.get_html()
        pattern = re.compile('<div class="il_img.*?<img src="(.*?)"')
        res = re.findall(pattern, self.html)
        for src in res:
            print(src)
            # Use the last path segment as the file name
            name = src.split('/')[-1]
            # Download the image
            request.urlretrieve(src, self.path + '/' + name)
    # Spider entry point
    def start(self):
        self.get_html()
        self.parse_detail()
if __name__ == '__main__':
    ivsky = IvskySpider()
    ivsky.start()
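# request.urlretrieve is a legacy interface that the urllib docs warn may be
# deprecated; an equivalent download written with urlopen, should it ever be
# needed (download is an illustrative helper, not part of the spider):
#
# def download(src, filename):
#     with request.urlopen(src) as resp, open(filename, 'wb') as f:
#         f.write(resp.read())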