BS4 in practice: crawling the text of the Romance of the Three Kingdoms


# Requirement: crawl every chapter title and chapter body of the novel Romance of the Three Kingdoms from http://www.shicimingju.com/book/sanguoyanyi.html
import requests
from bs4 import BeautifulSoup
if __name__ == "__main__":
    # Scrape the table of contents of the novel, then fetch every chapter's
    # detail page and append "<title>:<content>" lines to ./sanguo.txt.

    # UA camouflage: present the request as coming from a desktop browser.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        )
    }

    # Fetch the index (home) page that lists all chapters.
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url=url, headers=headers).text

    # Load the page source into a BeautifulSoup object so it can be queried.
    soup = BeautifulSoup(page_text, 'lxml')

    # Each <li> under the ".book-mulu" container holds one chapter link whose
    # text is the chapter title and whose href is the detail-page path.
    li_list = soup.select('.book-mulu > ul > li')

    # The context manager guarantees the output file is flushed and closed
    # even if a request or a parse step raises midway through the crawl.
    with open("./sanguo.txt", 'w', encoding='utf-8') as fp:
        for li in li_list:
            # TODO: `.string` is None when the anchor wraps nested tags;
            # confirm the site's markup keeps the title as plain text.
            title = li.a.string
            # The href is site-relative, so prefix it with the host.
            detail_url = 'http://www.shicimingju.com' + li.a['href']

            # Request the chapter's detail page and parse it.
            detail_page_text = requests.get(
                url=detail_url, headers=headers
            ).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                # Layout differs from what we expect; skip rather than crash
                # so the remaining chapters are still saved.
                print(title, "Crawl failed: chapter content not found")
                continue

            # `.text` is a property (not a method) returning all nested text.
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, "Crawl success")


Guess you like

Origin www.cnblogs.com/huahuawang/p/12692354.html