Crawling the novel *A Record of a Mortal's Journey to Immortality* with Python + Selenium + Chrome

I have just started learning web scraping. Following an online project-walkthrough video, I learned how to crawl images from a website. I then thought of *A Record of a Mortal's Journey to Immortality*, a novel I had been reading (the phone app used to let me read it for free but now charges, while the web version on a computer can still be viewed), and decided to try crawling it.

Problems encountered

I first tried applying the video's code directly, but found that it could only crawl static pages — it had no way to scrape dynamically loaded pages. Searching online, I found two approaches for dynamic pages: one is to reverse-engineer the network traffic to find the URL that the page's script actually requests, and then request that URL directly from code; the other is to use the selenium module to simulate browser behavior and then extract the data.
Comparing the two approaches: the first is the fastest, but as a beginner I found it too difficult, so I gave it up and chose the second method.

Full steps to complete the code

1. Installing the selenium module and ChromeDriver

I searched Baidu for instructions and finally settled on the method from one blog post (I can no longer find the link), so I will briefly describe it:

  1. Check your own Chrome version, then go to the ChromeDriver download site and find the matching version to download. Taking my own browser as an example: my Chrome is version 80.0.3987.122, but the versions listed on the site confused me — I didn't know which one to download. I later learned that, per Google's official site, you only need to match the major version number. (Screenshot omitted.)
    I chose the first download listed above (checking the notes, it had the most recent update date).
  2. Put chromedriver.exe in the same directory as chrome.exe, then add that path to the system PATH environment variable;
  3. Install the selenium module in PyCharm, then test it with `from selenium import webdriver` and `driver = webdriver.Chrome()`. I got an error at first, but after restarting my computer it worked.

2. Code walkthrough

I wrote all of the code directly inside main().
1. Get the URLs of all chapter pages

def getHtml(http_url):
    """Download a page with requests and return its HTML as text.

    Args:
        http_url: URL of the page to fetch.

    Returns:
        The response body decoded as text.
    """
    # Present a real desktop Chrome User-Agent so the site serves the normal page.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"}
    response = requests.get(http_url, headers=header)
    return response.text

def main():
    # Fetch the table-of-contents page and extract every chapter URL.
    novel_url = "https://www.booktxt.net/1_1562"
    content = getHtml(novel_url)
    # Chapter hrefs look like "5xxxxx.html"; the dot is escaped so the
    # regex matches a literal "." rather than any character.
    pattern = re.compile(r'"5(.*?)\.html"')
    novel_num = pattern.findall(content)
    novel_url_list = [novel_url + "/5" + element for element in novel_num]

2. Configure Chrome in headless mode (no visible browser window)

# Configure headless browsing (no visible browser window).
chrome_options = Options()
chrome_options.add_argument('--headless')

# Open only one browser instance (no need to recreate it inside the loop).
# Use the `options=` keyword: `chrome_options=` is deprecated in newer Selenium.
driver = webdriver.Chrome(options=chrome_options)

3. Iterate over each chapter URL, scrape the content, and save it

count = 1  # Chapter counter, prefixed to each saved file name for easy sorting.
for url in novel_url_list:

    driver.get(url)  # Load the chapter page in the browser.
    # Grab the chapter body text via its id attribute.
    content_text = driver.find_element_by_id("content").text
    # Grab the chapter title via an XPath expression.
    head = driver.find_element_by_xpath("//div[@class = 'bookname']/h1").text

    # Save the chapter content to a file.
    dir_path = "./凡人修仙"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Strip characters that are illegal in file names, because the chapter
    # title becomes the name of the .txt file.
    head = re.sub(r'[\\/:*?"<>|]', '', head)
    fileName = dir_path + "/" + str(count) + head + ".txt"
    # `with` guarantees the file is closed even if the write fails.
    with open(fileName, "w", encoding="utf-8") as file:
        file.write(content_text)

    print("当前下载章章数为:%d" % count)  # Progress output for monitoring.
    # print("网址为%s" % url)
    count += 1

4. Screenshot of the final result

Here Insert Picture Description

3. Complete code

import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def getHtml(http_url, timeout=30):
    """Download a page over HTTP and return its body as text.

    Args:
        http_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, an unresponsive server would hang the crawl forever.

    Returns:
        The response body decoded as text by requests.
    """
    # Present a real desktop Chrome User-Agent so the site does not reject us.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"}
    response = requests.get(http_url, headers=header, timeout=timeout)

    return response.text


def main():
    """Crawl every chapter of the novel and save each one as a .txt file."""
    # --- 1. Collect all chapter URLs from the table-of-contents page. ---
    novel_url = "https://www.booktxt.net/1_1562"
    content = getHtml(novel_url)
    # Chapter hrefs look like "5xxxxx.html"; the dot is escaped so the regex
    # matches a literal "." instead of any character.
    pattern = re.compile(r'"5(.*?)\.html"')
    novel_num = pattern.findall(content)
    novel_url_list = [novel_url + "/5" + element for element in novel_num]
    # print(novel_url_list)

    # --- 2. Start one headless Chrome, reused for every chapter. ---
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # `options=` is the current keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(options=chrome_options)

    # Create the output directory once, before the loop.
    dir_path = "./凡人修仙"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    try:
        # --- 3. Visit each chapter page, scrape it, and save it. ---
        for count, url in enumerate(novel_url_list, start=1):
            driver.get(url)
            try:
                content_text = driver.find_element_by_id("content").text
                head = driver.find_element_by_xpath("//div[@class = 'bookname']/h1").text
                # print(head)
            except Exception:
                # Missing page: save a placeholder so chapter numbering stays intact.
                head = "第" + str(count) + "无此章节"
                content_text = ""

            # Remove characters that are illegal in Windows file names,
            # because the chapter title becomes the name of the .txt file.
            head = re.sub(r'[\\/:*?"<>|]', '', head)
            fileName = dir_path + "/" + str(count) + head + ".txt"
            # `with` closes the file even if the write raises.
            with open(fileName, "w", encoding="utf-8") as file:
                file.write(content_text)

            print("当前下载章章数为:%d" % count)
            print("网址为%s" % url)
    finally:
        # Always release the browser process, even if the crawl fails midway.
        driver.quit()

    return

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Released two original articles · won praise 0 · Views 41

Guess you like

Origin blog.csdn.net/u014610970/article/details/104630346