Getting Started with a Python Web Crawler

import requests
import re

# Simple crawler: download every chapter of a novel and persist it to a .txt file.
# NOTE(review): string literals below were reconstructed from a garbled scrape
# (spaces had been injected inside quotes) — verify URLs/patterns against the site.

# 1. Download the novel's table-of-contents page.
novel_url = 'http://www.jingcaiyuedu.com/book/15205/list.html'
response = requests.get(novel_url)
# The server does not declare the charset, so set the encoding explicitly
# before reading .text to avoid mojibake.
response.encoding = 'utf-8'
html = response.text  # page HTML as a str
# print(html)

# 2. Extract the title and the chapter links (non-greedy matches).
title = re.findall(r'<meta name="keywords" content="《(.*?)》', html)[0]
# The page contains two <dl id="list"> blocks; the second holds the chapter list.
# re.S makes '.' match newlines in case the <dl> spans several lines.
dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[1]
# Each match is a (relative_url, chapter_title) tuple.
chapter_info_list = re.findall(r'<a.*?href="(.*?)".*?>(.*?)</a>', dl)
# print(chapter_info_list)

# Data persistence: write to a txt file named after the novel.
# The context manager guarantees the file is closed (original never closed it).
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    # 3. Loop through each chapter and extract its content.
    for chapter_url, chapter_title in chapter_info_list:
        # Handle relative URLs by prefixing the site root.
        if 'http' not in chapter_url:
            chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url
        # Download the chapter page.
        chapter_response = requests.get(chapter_url)
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text
        # Extract the body text between the a1()/a2() <script> markers;
        # re.S because the content spans multiple lines.
        chapter_content = re.findall(
            r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>',
            chapter_html, re.S)[0]
        # Clean the data: strip leftover HTML entities/tags and spaces.
        # NOTE(review): the scrape mangled the first replace target; '&nbsp;'
        # is the conventional cleanup here — confirm against the live page.
        chapter_content = chapter_content.replace('&nbsp;', '')
        chapter_content = chapter_content.replace('<br/>', '')
        chapter_content = chapter_content.replace('<br>', '')
        chapter_content = chapter_content.replace(' ', '')
        # Write one chapter: title line, then content line.
        fb.write(chapter_title)
        fb.write('\n')
        fb.write(chapter_content)
        fb.write('\n')
        print(chapter_url)  # progress indicator

 

Recommended reading

Origin http://43.154.161.224:23101/article/api/json?id=324482846&siteId=291194637