Python web scraping: data parsing, downloading the novel 神级龙卫, batch-downloading 4K beauty wallpapers, grabbing free resume templates, and more (10 worked examples)

Python web scraping: data parsing


  • What data parsing is for

    • It is what turns a generic crawler into a focused crawler: one that extracts only the data we care about
  • Ways to implement data parsing

    • regular expressions
    • bs4
    • xpath
    • pyquery
  • The general principle behind data parsing

    • Question 1: where does the data a focused crawler wants actually live?
      • It is stored inside the relevant HTML tags, so parsing is always the same two steps (see the sketch below):
        1. locate the tag
        2. extract its text or an attribute
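
To make those two steps concrete before looking at the individual tools, here is a minimal sketch (not from the original post) that locates a tag and extracts its text and an attribute with bs4; the HTML snippet and names are invented for the example.

from bs4 import BeautifulSoup

# Invented fragment standing in for page source fetched by a crawler
html = '<div class="item"><a href="/detail/1">First article</a></div>'
soup = BeautifulSoup(html, 'lxml')

a = soup.find('a')   # step 1: locate the tag
print(a.string)      # step 2a: extract its text      -> First article
print(a['href'])     # step 2b: extract an attribute  -> /detail/1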

Regular expressions (rarely used)

How do we download an image?

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

# How to download an image

# Method 1: requests

# image URL
url = 'https://pic.qiushibaike.com/system/pictures/12389/123894177/medium/KPFH2OYHBAL1925C.jpg'

img_data = requests.get(url=url, headers=headers).content  # .content returns the raw bytes

with open('./img.jpg', 'wb') as fp:
    fp.write(img_data)

# Method 2: urllib
# Drawback: urlretrieve offers no easy way to spoof the User-Agent
from urllib import request

url = 'https://pic.qiushibaike.com/system/pictures/12389/123894177/medium/KPFH2OYHBAL1925C.jpg'

request.urlretrieve(url, filename='./qiutu.jpg')
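
The drawback noted for the second method can be worked around with plain urllib: building a Request object with a headers dict is standard library usage, so the sketch below (not from the original post) shows the same download with a spoofed User-Agent.

from urllib import request

# Sketch: urllib download with a custom User-Agent header
url = 'https://pic.qiushibaike.com/system/pictures/12389/123894177/medium/KPFH2OYHBAL1925C.jpg'
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req) as resp, open('./qiutu_ua.jpg', 'wb') as fp:
    fp.write(resp.read())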

Downloading image data from qiushibaike.com

import requests
import re
import os

from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

# create the output folder
dirName = './imgLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)

"""
Download every image on pages 1-3:
https://www.qiushibaike.com/imgrank/page/1/
https://www.qiushibaike.com/imgrank/page/2/
https://www.qiushibaike.com/imgrank/page/3/
1. Use a generic crawler to fetch the source of the first 3 pages
"""

"""
Target markup:
<div class="thumb">

<a href="/article/123898296" target="_blank">
<img src="//pic.qiushibaike.com/system/pictures/12389/123898296/medium/PK7O8YIG418SSOS1.jpg" alt="糗事#123898296" class="illustration" width="100%" height="auto">
</a>
</div>

ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
"""

# Define a reusable URL template; the template itself never changes
url = 'https://www.qiushibaike.com/imgrank/page/%d/'
for page in range(1, 4):
    new_url = format(url % page)
    # print(new_url)
    page_text = requests.get(url=new_url, headers=headers).text  # page source for this page number
    # print(page_text)
    # On top of the generic crawl, do the focused part: parse the image URLs out of each page's source
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src = re.findall(ex, page_text, re.S)  # the regex fails on newlines without re.S; re.S lets '.' match them
    for src in img_src:
        src = 'https:' + src
        img_name = src.split('/')[-1]
        img_path = dirName + '/' + img_name  # ./imgLibs/xxx.jpg
        print(src)
        print()
        try:
            request.urlretrieve(src, filename=img_path)
            print(img_name, 'downloaded successfully!')
        except:
            print(src, "404 Not Found!")
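
The inline comment above says the regex "fails on newlines" without re.S. A tiny self-contained sketch (with an invented HTML snippet) makes that visible: by default . does not match a newline, so the pattern cannot span lines.

import re

# Invented snippet: the img tag sits on a different line than the opening div
html = '''<div class="thumb">
<img src="//example.com/a.jpg" alt="demo">
</div>'''
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

print(re.findall(ex, html))        # []                       -> '.' stops at newlines
print(re.findall(ex, html, re.S))  # ['//example.com/a.jpg']  -> re.S makes '.' match newlines too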

bs4 parsing (commonly used)

  • How it works

    • Instantiate a BeautifulSoup object and load the page source to be parsed into it
    • Call the BeautifulSoup object's methods and attributes to locate tags and extract data
  • Installation

    • pip install bs4
    • pip install lxml
  • Instantiating BeautifulSoup:

    • BeautifulSoup(fp, 'lxml'): loads the contents of a locally stored HTML file into the BeautifulSoup object
    • BeautifulSoup(page_text, 'lxml'): loads page source fetched from the web into the BeautifulSoup object
  • Locating tags

    • soup.div: locates the first occurrence of that tag name
    • Locating by attribute:
      • soup.find('div', class_='bg-gj-w')
      • soup.find('a', id='result_more_appli_ABC') returns the matched tag
      • soup.find_all('div') finds every div and returns a list
    • Locating with CSS selectors:
      • always returns a list
      • print(soup.select('.bg-gj-w'))  # class selector
      • print(soup.select('#result_more_appli_ABC'))  # id selector
      • print(soup.select('.class > ul > li'))  # hierarchical selector: each > means exactly one level down
      • print(soup.select('.class li'))  # hierarchical selector: a space means any number of levels down
  • Extracting text

    • res.string: only the tag's own (direct) text
    • res.text: all text, including that of nested tags
  • Extracting attributes

    • res['href']
from bs4 import BeautifulSoup

fp = open('jay.html', 'r', encoding='utf-8')

soup = BeautifulSoup(fp, 'lxml')
# print(soup)
# print(soup.div)
# print(soup.find('div', class_='bg-gj-w'))
# print(soup.find('a', id='result_more_appli_ABC'))
# print(soup.find_all('div'))  # every div, returned as a list
#
# print(soup.select('.bg-gj-w'))  # class selector
# print(soup.select('#result_more_appli_ABC'))  # id selector
# print(soup.select('.class > ul > li'))  # hierarchical selector, '>' is one level
# print(soup.select('.class li'))  # hierarchical selector, a space is any number of levels

res = soup.select('#sogou_feedback')[0]
print(res)
print(res.string)   # direct text only
print(res.text)     # all text
print(res['href'])  # attribute
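
Since jay.html is a local file the reader probably does not have, the following self-contained sketch (with an invented snippet, not from the original post) runs the same kinds of lookups and also shows the .string vs .text difference listed above.

from bs4 import BeautifulSoup

# Invented snippet for demonstration only
html = '<div class="song"><a href="http://example.com" id="feng">hello <span>world</span></a></div>'
soup = BeautifulSoup(html, 'lxml')

a = soup.find('a', id='feng')
print(a.text)                    # hello world -> all text, including nested tags
print(a.string)                  # None        -> .string only works when there is a single direct text node
print(a['href'])                 # http://example.com
print(soup.select('.song > a'))  # CSS selector, returns a list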

Using bs4 to download the novel Romance of the Three Kingdoms (三国演义)

https://www.shicimingju.com/book/sanguoyanyi.html

"""
爬取三国演义整篇内容(章节名称和章节列表)
https://www.shicimingju.com/book/sanguoyanyi.html
"""
import requests
from bs4 import BeautifulSoup

fp = open('sangup.txt', 'w', encoding='utf-8')

headers = {
    
    
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
main_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url=main_url, headers=headers).text
# 解析出章节名称和章节详情页的url
soup = BeautifulSoup(page_text, 'lxml')

a_list = soup.select('.book-mulu > ul > li > a')  # 返回的列表中存储的是一个个的a标签
for a in a_list:
    title = a.string
    detail_url = 'https://www.shicimingju.com' + a['href']
    print(detail_url)
    detail_page_text = requests.get(detail_url, headers=headers).text
    # 解析详情页中的章节内容
    soup = BeautifulSoup(detail_page_text, 'lxml')
    content = soup.find('div', class_='chapter_content').text
    fp.write(title+':'+content+'\n')
    print(title, '下载成功!')

fp.close()
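
A possible refinement, not part of the original script: reuse one requests.Session so the connection is kept alive across the hundred-plus chapter requests, and tolerate a missing content div instead of crashing. fetch_chapter is a hypothetical helper name.

import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update(headers)  # reuses the headers dict defined above

def fetch_chapter(detail_url):
    page = session.get(detail_url).text
    soup = BeautifulSoup(page, 'lxml')
    div = soup.find('div', class_='chapter_content')
    return div.text if div else ''  # return an empty string instead of crashing on a missing div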

Downloading the novel 神级龙卫

import requests
from bs4 import BeautifulSoup

fp = open('shenji.txt', 'w', encoding='utf-8')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
main_url = 'http://m.ajnnan.com/17_17514/all.html'

page_text = requests.get(url=main_url, headers=headers)
page_text.encoding = 'utf-8'
page_text = page_text.text

# parse out the chapter titles and the detail-page URL of each chapter
soup = BeautifulSoup(page_text, 'lxml')

a_list = soup.select('#chapterlist > p > a')  # the returned list holds the individual a tags
for a in a_list[1:]:
    title = a.string
    detail_url = 'http://m.ajnnan.com' + a['href']
    # print(detail_url, title)
    detail_page_text = requests.get(detail_url, headers=headers)
    detail_page_text.encoding = 'utf-8'
    detail_page_text = detail_page_text.text
    # parse the chapter body out of the detail page; the body is split across several pages,
    # so follow the "next page" (pb_next) link twice and concatenate the pieces
    soup = BeautifulSoup(detail_page_text, 'lxml')
    content = soup.find('div', id='chaptercontent').text
    next_url = 'http://m.ajnnan.com' + soup.find('a', id='pb_next')['href']
    # print(next_url)
    detail_page_text1 = requests.get(next_url, headers=headers)
    detail_page_text1.encoding = 'utf-8'
    # print(detail_page_text1)
    detail_page_text1 = detail_page_text1.text
    soup1 = BeautifulSoup(detail_page_text1, 'lxml')
    content = content + soup1.find('div', id='chaptercontent').text
    next_url2 = 'http://m.ajnnan.com' + soup1.find('a', id='pb_next')['href']
    detail_page_text2 = requests.get(next_url2, headers=headers)
    detail_page_text2.encoding = 'utf-8'
    detail_page_text2 = detail_page_text2.text
    soup2 = BeautifulSoup(detail_page_text2, 'lxml')
    content = content + soup2.find('div', id='chaptercontent').text
    fp.write(title + ':' + content + '\n')
    print(title, 'downloaded successfully!')

fp.close()
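
The chapter body is spread over several pages, and the script above copy-pastes the "follow pb_next" block three times. A hedged refactor could follow the link in a loop instead; the stop condition below assumes continuation pages share the chapter's URL prefix (e.g. ..._2.html), which should be verified against the real site, and fetch_full_chapter is a hypothetical helper name.

def fetch_full_chapter(first_url):
    # first_url e.g. 'http://m.ajnnan.com/17_17514/12345.html' (hypothetical shape)
    prefix = first_url.replace('http://m.ajnnan.com', '').rsplit('.html', 1)[0]
    content, url = '', first_url
    while True:
        resp = requests.get(url, headers=headers)  # reuses the headers dict from the script above
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'lxml')
        content += soup.find('div', id='chaptercontent').text
        nxt = soup.find('a', id='pb_next')
        if nxt is None or not nxt['href'].startswith(prefix):
            break                                  # the next link leaves this chapter -> stop
        url = 'http://m.ajnnan.com' + nxt['href']
    return content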

XPath parsing (the most heavily used)

In the browser: press F12 -> find the tag in the Elements panel -> right-click -> Copy -> Copy XPath

  • How it works

    • Instantiate an etree object and load the page source to be parsed into it
    • Call the etree object's xpath method with different XPath expressions to locate tags and extract data
  • Installation:

    • pip install lxml
  • Instantiating the etree object:

    • etree.parse('test.html')
    • etree.HTML(page_text)
  • XPath expressions

    • a leading /: the expression must walk down from the root tag, level by level
    • a leading //: the expression may start matching anywhere in the document
    • a non-leading /: one level down
    • a non-leading //: any number of levels down
    • locating by attribute: //tagName[@attrName="value"]
    • locating by index: //tagName[index], where indexing starts at 1
  • Extracting text

    • /text(): only the tag's own (direct) text
    • //text(): all text under the tag
  • Extracting attributes

    • /@attrName
from lxml import etree

parser = etree.HTMLParser(encoding='utf-8')  # the page is not well-formed XML; the HTML parser fixes the parse errors
tree = etree.parse('jay.html', parser=parser)
# print(tree)
# print(tree.xpath('/html/body/div/p'))  # absolute path
# print(tree.xpath('//p'))  # every p
# print(tree.xpath('/html/body//p'))  # every p anywhere under html/body
#
# print(tree.xpath('//div[@class="song"]'))  # locate by attribute
# print(tree.xpath('//li[7]'))  # locate by index; XPath indexing starts at 1

print(tree.xpath('//a[@class="feng"]/text()')[0])  # xpath returns a list; take the element
print(tree.xpath('//a[@class="feng"]/@href')[0])   # xpath returns a list; take the element
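
jay.html is again a local file, so here is a self-contained sketch (invented HTML, not from the original post) that exercises the expressions from the list above using etree.HTML instead of etree.parse.

from lxml import etree

# Invented snippet, used only to demonstrate the expressions listed above
html = '''<html><body>
<div class="song"><a href="http://example.com" class="feng">click me</a></div>
<ul><li>one</li><li>two</li></ul>
</body></html>'''
tree = etree.HTML(html)

print(tree.xpath('//div[@class="song"]'))          # locate by attribute
print(tree.xpath('//li[2]/text()'))                # ['two'] -> indexing starts at 1
print(tree.xpath('//a[@class="feng"]/text()')[0])  # direct text: click me
print(tree.xpath('//a[@class="feng"]/@href')[0])   # attribute: http://example.com
print(tree.xpath('//div//text()'))                 # all text under the div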

Extracting the text posts from Qiushibaike

https://www.qiushibaike.com/text/

"""
获取糗事百科中的文字内容
https://www.qiushibaike.com/text/
"""
import requests
from lxml import etree

headers = {
    
    
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

url = 'https://www.qiushibaike.com/text/'
page_text = requests.get(url=url, headers=headers).text

# 解析内容
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@class="col1 old-style-col1"]/div')
# print(len(div_list))
for div in div_list:
    author = div.xpath('./div[1]/a[2]/h2/text()')[0]  # 实现局部解析,前边一定要写个.  否则只写/就是全局的啦
    content = div.xpath('./a[1]/div/span//text()')
    content = ''.join(content)

    print(author, content)
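
The script above only reads the first page. If the text section paginates the same way the image-ranking section did, a loop over a URL template would cover several pages; the /text/page/%d/ pattern below is an assumption modeled on the earlier example and should be confirmed in the browser.

# Sketch: crawl several pages instead of just the first one (URL pattern is an assumption)
url_template = 'https://www.qiushibaike.com/text/page/%d/'
for page in range(1, 4):
    page_text = requests.get(url=url_template % page, headers=headers).text
    tree = etree.HTML(page_text)
    for div in tree.xpath('//div[@class="col1 old-style-col1"]/div'):
        author = div.xpath('./div[1]/a[2]/h2/text()')[0].strip()
        content = ''.join(div.xpath('./a[1]/div/span//text()')).strip()
        print(author, content)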

Batch-downloading 4K beauty wallpapers

http://pic.netbian.com/4kmeinv/

"""
### 获取4K美女图片
http://pic.netbian.com/4kmeinv/
解决中文乱码问题
第一页:http://pic.netbian.com/4kmeinv/   需要单独处理
第二页:http://pic.netbian.com/4kmeinv/index_2.html

"""
import requests
import os

from urllib import request
from lxml import etree

dirName = 'meinvLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)

headers = {
    
    
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 11):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = format(url%page)
    # print(new_url)
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        img_src = 'http://pic.netbian.com' + a.xpath('./img/@src')[0]
        img_name = a.xpath('./b/text()')[0]
        img_name = img_name.encode('iso-8859-1').decode('gbk')  # 通用方式解决中文乱码问题
        # print(img_src, img_name)  # 无中文乱码
        imgPath = dirName + '/' + img_name + '.jpg'
        request.urlretrieve(img_src, filename=imgPath)
        print(img_name+'.jpg', '下载成功!')
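
An alternative to re-encoding every extracted string: set the response's encoding once before reading .text, which is standard requests behaviour. Treat 'gbk' as an assumption about this site's charset and confirm it against the page's <meta charset> tag.

# Alternative mojibake fix: tell requests the page is GBK-encoded before reading .text,
# so no per-string re-encoding is needed ('gbk' is an assumption; verify against the page)
response = requests.get(url=new_url, headers=headers)
response.encoding = 'gbk'
page_text = response.text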

Exercise: download and save the free resume templates

https://sc.chinaz.com/jianli/free.html (free resume templates to download and save)

"""
获取免费简历模板
https://sc.chinaz.com/jianli/free.html
"""
import requests
import os
from lxml import etree

dirName = 'jianliLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)

headers = {
    
    
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

url = 'https://sc.chinaz.com/jianli/free_%d.html'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

for page in range(1, 3):
    if page == 1:
        new_url = 'https://sc.chinaz.com/jianli/free.html'
    else:
        new_url = format(url%page)

    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//*[@id="main"]/div/div/p/a')
    for a in a_list:
        url_ = 'https:' + a.xpath('./@href')[0]
        name = a.xpath('./text()')[0]
        name = name.encode('iso-8859-1').decode('utf-8')  # 通用方式解决中文乱码问题
        # print(url_, name)
        new_page_text = requests.get(url=url_, headers=headers).text
        tree = etree.HTML(new_page_text)
        download_url = tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
        res = requests.get(url=download_url)
        path = dirName + '/' + name + '.rar'
        with open(path, 'wb') as fp:  # 以二进制的形式写,不需要指定编码格式,获取到的content是bytes
            fp.write(res.content)
        print(name, '下载成功!')
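
Two optional refinements, not part of the original exercise: pause briefly between downloads so the server is not hammered, and stream the archive to disk in chunks instead of holding it in memory. This sketch would replace the res = requests.get(...) / fp.write(...) lines inside the loop.

import time

res = requests.get(url=download_url, headers=headers, stream=True)
with open(path, 'wb') as fp:
    for chunk in res.iter_content(chunk_size=8192):  # stream the archive in 8 KB chunks
        fp.write(chunk)
time.sleep(1)  # crude rate limiting between downloads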

Reposted from blog.csdn.net/qq_31910669/article/details/111499567