import requests
import re

# Scrape a novel from jingcaiyuedu.com: download the chapter-list page,
# extract every chapter link, then fetch each chapter and append its text
# to a local "<title>.txt" file.
# TODO: turn the hard-coded book id into a parameter / loop over many books.

# 1. Download the novel's homepage (the chapter-list page).
novel_url = 'http://www.jingcaiyuedu.com/book/15205/list.html'
response = requests.get(novel_url)
# Specify the character encoding explicitly so response.text decodes
# correctly (fixed: 'utf -8' with an embedded space is not a valid codec
# name and would raise LookupError on .text access).
response.encoding = 'utf-8'
html = response.text  # decoded page source as a string

# 2. Extract the novel title and the chapter URLs (non-greedy matches).
title = re.findall(r'<meta name="keywords" content="《(.*?)》', html)[0]
# The page contains two <dl id="list"> blocks; index [1] picks the second,
# which holds the full chapter list.  re.S lets .*? span line breaks,
# since the <dl> block covers many lines of HTML.
dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[1]
# Each match is a (href, link text) pair: (chapter url, chapter title).
chapter_info_list = re.findall(r'<a.*?href="(.*?)".*?>(.*?)</a>', dl)

# Data persistence: write everything to "<title>.txt".
# `with` guarantees the file is closed (the original never closed it).
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    # 3. Loop through each chapter and extract its content.
    for chapter_url, chapter_title in chapter_info_list:
        # Handle relative URLs by prefixing the site root.
        if 'http' not in chapter_url:
            chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url
        # Download the chapter page.
        chapter_response = requests.get(chapter_url)
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text
        # Extract the body text between the a1()/a2() marker scripts;
        # re.S because the chapter body spans multiple lines.
        chapter_content = re.findall(
            r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>',
            chapter_html,
            re.S,
        )[0]
        # Clean the data: drop <br> tags and stray spaces.
        # NOTE(review): one of the original replace() literals was garbled
        # to a bare space — it may originally have been '&nbsp;'; confirm
        # against the live page markup.
        chapter_content = chapter_content.replace('<br/>', '')
        chapter_content = chapter_content.replace('<br>', '')
        chapter_content = chapter_content.replace(' ', '')
        # Write "title\ncontent\n" to the output file.
        fb.write(chapter_title)
        fb.write('\n')
        fb.write(chapter_content)
        fb.write('\n')
        # Progress indicator so long runs show they are alive.
        print(chapter_url)