Python: from getting started to a crawler case implementation~

Foreword

Hello everyone, this is the Demon King~

Environment used:

  • Python 3.8
  • Pycharm

Modules used:

  • requests >>> pip install requests
  • parsel >>> pip install parsel

Parsing methods: css, xpath, re
Request modules: requests, selenium

How to install Python third-party modules:

  1. Press win + R, type cmd and click OK, then enter the installation command pip install module-name (e.g. pip install requests) and press Enter
  2. Click Terminal in PyCharm and enter the same installation command there
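
To check that a module installed correctly, a quick test is to import it with the interpreter you plan to use (the exact version number will of course vary):

# a quick sanity check: if these imports succeed, the modules are installed
import requests
import parsel

print(requests.__version__)  # e.g. 2.27.1 -- the installed version will differ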

How to configure the Python interpreter in PyCharm?

  1. Select File >>> Settings >>> Project >>> Python Interpreter
  2. Click the gear icon and select Add
  3. Add the path of your Python installation
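
To confirm PyCharm is really running the interpreter you just added, a one-line check is enough:

import sys

print(sys.executable)  # prints the full path of the interpreter executing this script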

Python learning directions and application areas:

  1. Website development
    e.g. Douban, Meituan, YouTube; forums, official websites, back-end information management systems…
  2. Crawlers / data collection
    If it is visible in a browser it can be crawled (Baidu Wenku content comes in a fairly messy format); only data a web page publicly discloses should be collected (no personal information, respect copyright [non-commercial use only], no adult content)
    Typical uses: collecting data and content in bulk, flash-sale (snap-up) scripts, auto-sending bullet comments, auto-comment and auto-like scripts, sending emails automatically
  3. Data analysis
    Turn the collected data into visual charts so that data trends, price ranges, customer demographics… can be seen clearly
  4. Artificial intelligence (you can find a job, but the education requirements are high: a master's degree in a related major)
    face recognition, video face swapping, small robots, speech recognition, voice assistants...
  5. Office automation
    Suited to accounting, finance, and clerical work that regularly processes a lot of data.
  6. Game development

Building a simple crawler case requires a little front-end knowledge plus:

  1. Basic data types: defining and using strings
  2. Data containers: list and dict (dictionary)
  3. Using for loops (a short dict/for-loop sketch follows the string and list demos below)
  4. Simple use of requests
  5. Use of parsing methods
import re

"""
# 什么样的数据才是字符串数据 字符串是什么样子的?
a = 'python'  # 单引号 双引号 三引号(也可以作为多行注释)
b = "hello"
print(c)
"""

"""
列表 [] 数据容器 存储数据内容
    列表取值 根据他索引位置提取内容
    列表切片
lis = [1, 2, 3, 4, 5, 6, 7, 8]
# 提取lis里面 元素 4 怎么取
print(lis[3])
print(lis[-5])
# 提取列表里面 1 2 3 4   顾头不顾尾
print(lis[0:4:1])  # 步长默认是 1
# 提取列表 1 3 5 7   1 2 3 4 5 7
print(lis[0:7:2])
# 提取列表 2 4 6
print(lis[1:6:2])
lis = ['1', '2', '3', '4', '5', '6', '7', '8']
# 如果说想要提取  1 2 3 4 5 6 7 8 都提取出来 一个一个提取
# 如果想要获取数据 1,2,3,4,5,6,7,8  列表转字符串
# for i in lis:
#     print(i)
print(str(lis))
string = ','.join(lis)  # 把列表 转成字符串
print(string)
print('1,2,3,4,5,6,7,8')
"""
# # Import the data-request module
# import requests   # requests: code someone else has already written that you can use directly
# # Import the parsing module
# import parsel
# # Import the file-handling module
# import os
# import re
#
# # Structure of a proxy ip dictionary
# # proxies_dict = {
# #     "http": "http://" + ip:port,
# #     "https": "http://" + ip:port,
# # }
#
#
# def get_proxies():
#     proxies_url = 'http://tiqu.pyhttp.taolop.com/getip?count=1&neek=15790&type=2&yys=0&port=1&sb=&mr=1&sep=0&ts=1&time=4'
#     json_data = requests.get(url=proxies_url).json()
#     # print(json_data)
#     proxies_dict = {
#         "http": "http://" + json_data['data'][0]['ip'] + ':' + str(json_data['data'][0]['port']),
#         "https": "http://" + json_data['data'][0]['ip'] + ':' + str(json_data['data'][0]['port']),
#     }
#     return proxies_dict
#
#
# proxies_dict = get_proxies()
# list_url = 'https://www.qbiqu.com/0_1/'  # table-of-contents page of the novel
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
# }
# response = requests.get(list_url, headers=headers, proxies=proxies_dict)
# response.encoding = 'gbk'  # this site uses the gbk encoding
# # print(response.text)
# href = re.findall('<dd><a href="(.*?)">.*?</a></dd>', response.text)
# name = re.findall('<div id="fmimg"><img alt="(.*?)"', response.text)[0]  # the novel's name
# # print(href)
# for index in href:
#     # https://www.qbiqu.com/0_1/2.html
#     index_url = 'https://www.qbiqu.com' + index
#     # print(index_url)
#     # A crawler just imitates a browser sending requests to the server
#     # url = 'https://www.qbiqu.com/0_1/1.html'  # a custom variable holding the url string
#     # requests: the request module; get: the request method; index_url: the address to request
#     response = requests.get(index_url, headers=headers, proxies=proxies_dict)
#     response.encoding = 'gbk'  # gbk encoding
#     # print(response.text)
#     # re.sub(r'[/\*?":<>|]', '', title)  # strip characters that are illegal in file names
#     """
#     Parsing the data:
#         css xpath re
#
#     When to use css and xpath: when you cannot extract straight from the raw string.
#     css selectors extract data by tag attributes.
#     xpath extracts data by tag nodes.
#         Use them when the data you get back still contains tags.
#
#     Both css and xpath can extract across tags.
#
#     Use re (regular expressions) when you cannot extract by tags; re works directly on string data.
#
#     css and xpath are like raw noodles (you cannot eat them uncooked).
#     re is like instant noodles (you can eat it straight from the packet).
#
#     A standalone sketch of these three parsing styles follows right after this commented-out block.
#     """
#     selector = parsel.Selector(response.text)  # turn response.text (a string) into a parseable object
#     # h1::text extracts the text inside the h1 tag; get() takes the first result
#     title = selector.css('.bookname h1::text').get()
#     # title_1 = selector.xpath('//*[@class="bookname"]/h1/text()').get()
#
#     # getall() gets every match
#     content_list = selector.css('#content::text').getall()
#     content = ''.join(content_list)
#     # filename = 'data\\'
#     # if not os.path.exists(filename):
#     #     os.mkdir(filename)
#     # Saving files: a relative path (saved next to the code) vs an absolute path (a specific drive and folder)
#     # mode: w writes data (overwrites the file), a appends (does not overwrite)
#     with open(name + '.txt', mode='a', encoding='utf-8') as f:  # file path/name, save mode, encoding
#         f.write(title)  # write the chapter title
#         f.write('\n')
#         f.write(content)
#         f.write('\n')
#
#     print('Saving: ', title)
#
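
# To make the css / xpath / re comparison in the comments above concrete, here is a small
# self-contained sketch; the HTML fragment and the tag names in it are made up for illustration:
"""
import re
import parsel

html = '<div class="bookname"><h1>Chapter 1</h1></div><div id="content">First line.<br>Second line.</div>'
selector = parsel.Selector(html)

# css: select by tag / class / id attributes, ::text extracts the text node
print(selector.css('.bookname h1::text').get())                      # Chapter 1
# xpath: select by tag nodes, /text() extracts the text node
print(selector.xpath('//div[@class="bookname"]/h1/text()').get())    # Chapter 1
# re: works directly on the raw string, no tag structure needed
print(re.findall('<h1>(.*?)</h1>', html))                             # ['Chapter 1']
# getall() returns every match instead of just the first one
print(selector.css('#content::text').getall())                        # ['First line.', 'Second line.']
"""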

import requests
import parsel
import concurrent.futures


def get_response(html_url):
    """Send a GET request to the given url and return the response object."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers)
    response.encoding = response.apparent_encoding  # guess the page encoding from its content
    return response


def get_list_url(html_url):
    """Extract the chapter ids from the novel's table-of-contents page."""
    html_data = get_response(html_url).text
    # regex version; the slice drops the first few links at the top of the chapter list
    href = re.findall(r'<a href="/biquge_\d+/(\d+).html"', html_data)[11:]
    # css version of the same extraction:
    # selector = parsel.Selector(html_data)
    # href = selector.css('#list a::attr(href)').getall()[9:]
    return href


def get_name(html_url):
    """Extract the novel's name from the table-of-contents page."""
    html_data = get_response(html_url).text
    selector = parsel.Selector(html_data)
    name = selector.css('#info h1::text').get()
    return name


def get_content(list_url):
    """Extract the chapter title and the chapter text from a chapter page."""
    html_data = get_response(list_url).text
    selector = parsel.Selector(html_data)
    title = selector.css('.bookname::text').get()
    content_list = selector.css('#booktxt p::text').getall()
    content = '\n'.join(content_list)
    novel_content = [title, content]
    # print(html_data)
    # print(title)
    # print(content_list)
    # print(content)
    return novel_content


def save(name, title, content):
    """Append one chapter (title plus text) to <novel name>.txt."""
    with open(name + '.txt', mode='a', encoding='utf-8') as f:  # mode 'a' appends, so chapters accumulate
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    print(title)


def main(html_url):
    """Download every chapter of the novel at html_url and save it to a txt file."""
    href = get_list_url(html_url)
    name = get_name(html_url)
    for index in href:
        for page in range(1, 3):  # the loop assumes each chapter is split across pages 1 and 2 on this site
            index_url = f'https://www.biqugeso.org/biquge_132699/{index}_{page}.html'
            print(index_url)
            content = get_content(index_url)
            save(name, content[0], content[1])



if __name__ == '__main__':
    url = 'https://www.biqugeso.org/biquge_132699/'
    main(url)
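
concurrent.futures is imported at the top of the script but never actually used. As a rough sketch (not part of the original script), the chapter loop could be handed to a thread pool like this; note that with threads the chapters end up saved in completion order rather than reading order:

def download_chapter(name, index):
    """Download and save both pages of one chapter (hypothetical helper)."""
    for page in range(1, 3):
        index_url = f'https://www.biqugeso.org/biquge_132699/{index}_{page}.html'
        content = get_content(index_url)
        save(name, content[0], content[1])


def main_threaded(html_url):
    """A multi-threaded variant of main() using a thread pool."""
    href = get_list_url(html_url)
    name = get_name(html_url)
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        for index in href:
            executor.submit(download_chapter, name, index)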

Even better with the accompanying video:

Python basics: from getting started to a crawler case implementation~

Epilogue

Well, that's the end of this article!

If you have more suggestions or questions, feel free to comment or private message me! Let's work hard together (ง •_•)ง

If you liked it, follow me, or like and comment on my article!!!

Origin blog.csdn.net/python56123/article/details/124148001