Crawling a classic novel from the Shicimingju poetry site: Romance of the Three Kingdoms

from bs4 import BeautifulSoup
import urllib.request
import time

def get_request(url):
    """Build a GET request for *url* that carries a desktop-browser User-Agent.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``urllib.request.Request`` ready to be passed to ``urlopen``.
    """
    # Replaces urllib's default "Python-urllib/x.y" agent string; presumably
    # needed so the target site serves the page as it would to a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)

def get_content(request):
    """Send *request* and return the response body decoded as UTF-8 text.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when
    # reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf8')

def parse_content(content):
    """Find the chapter index in the book page and download every chapter.

    Args:
        content: HTML text of the book's table-of-contents page.

    Raises:
        ValueError: If the page contains no chapter-index ``<div>``.
    """
    # Build the soup object.
    soup = BeautifulSoup(content, 'lxml')
    # Locate the container that holds all chapter links.
    # NOTE(review): the pasted source read "MULU-Book", which looks like a
    # word-swapped/re-cased "book-mulu"; confirm against the live page markup.
    odiv = soup.find('div', class_="book-mulu")
    if odiv is None:
        raise ValueError('chapter index <div class="book-mulu"> not found in page')
    get_text(odiv)
def get_string(href):
    """Download one chapter page and return its text.

    Args:
        href: Absolute URL of the chapter page.

    Returns:
        The text content of the page's ``<div class="chapter_content">``.

    Raises:
        ValueError: If the page contains no chapter-content ``<div>``.
    """
    # Build the request and fetch the page through the shared helpers so the
    # User-Agent header and UTF-8 decoding match the index-page fetch.
    content = get_content(get_request(href))
    # Build the soup object.
    soup = BeautifulSoup(content, 'lxml')
    # Find the chapter body.
    odiv = soup.find('div', class_="chapter_content")
    if odiv is None:
        raise ValueError('no <div class="chapter_content"> found at %s' % href)
    # Return the chapter text with all markup stripped.
    return odiv.text
def get_text(odiv):
    """Download every chapter linked from *odiv* into a single text file.

    For each ``<a>`` inside *odiv*, the chapter page is fetched with
    ``get_string`` and its title and text are written to
    ``'Three Kingdoms .txt'`` in the current working directory (the file is
    overwritten if it already exists).

    Args:
        odiv: BeautifulSoup tag containing the chapter links.
    """
    # Collect every chapter link in the index container.
    oa_list = odiv.find_all('a')

    # ``with`` guarantees the file is closed even if a download fails midway.
    with open('Three Kingdoms .txt', 'w', encoding='utf8') as fp:
        for oa in oa_list:
            # Chapter title is the link text.
            title = oa.string
            print("Downloading %s ----" % title)
            # The link's href is joined onto the site root; assumes the page
            # uses site-relative hrefs starting with "/" - TODO confirm.
            href = 'http://www.shicimingju.com' + oa['href']
            # Fetch the chapter page and extract its text.
            text = get_string(href)
            # Write the title on its own line, followed by the chapter text.
            fp.write(title + '\n' + text)
            print('download ends')
            # Pause between chapters to avoid hammering the server.
            time.sleep(2)




def main():
    """Fetch the book's index page and download all of its chapters."""
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Construct the request object.
    request = get_request(url)
    # Fetch the response body.
    content = get_content(request)
    # Parse the page with bs4 and download the chapters it links to.
    parse_content(content)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

You may also like

Source: www.cnblogs.com/zhangshuntao123/p/11626727.html