A first entry-level crawler, written with process-oriented thinking. Modules used: requests, os, re.
This first crawler is just for experiencing the fun of crawling, so the access url is hard-coded, the bs4 module is not used, there is only minimal function encapsulation, and nothing is written as a class. The script will be improved in the future. Just sharing the fun of a first simple crawler XD.
The idea is:
1. Parse the url (requests module)
2. Get the page source text
3. Filter the text with regular expressions (re module)
4. Download the information
5. Clean the information
6. Store the information
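Before the full script below, a minimal sketch of those six steps against a hypothetical page (the url and the `<p>` pattern are placeholders of mine, not the target site's markup):

import re
import requests

url = 'http://example.com/page'                      # placeholder url (assumption)
rsp = requests.get(url)                              # step 1: request the url
rsp.encoding = 'utf-8'
html = rsp.text                                      # step 2: get the source text
paras = re.findall(r'<p>(.*?)</p>', html, re.S)      # step 3: regex filter
text = '\n'.join(paras)                              # step 4: collect the information
text = text.replace('<br />', '')                    # step 5: clean it
with open('page.txt', 'w', encoding='utf-8') as f:   # step 6: store it
    f.write(text)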
import requests
import re
import os


def url2text(url):
    # Fetch a page and return its decoded html text.
    temp_rsp = requests.get(url)
    temp_rsp.encoding = 'utf-8'
    return temp_rsp.text


def save_path(path, title):
    # Create a folder named after the novel and switch into it.
    temp_path = path + title
    if not os.path.exists(temp_path):
        os.mkdir(temp_path)
    os.chdir(temp_path)


def chap_item_get(url_list):
    # Clean up the (url, title) pairs: strip titles and make relative
    # chapter urls absolute.
    c_url = []
    c_tit = []
    c_list = []
    for temp_url, temp_tit in url_list:
        temp_tit = temp_tit.lstrip()
        if 'book' not in temp_url:
            temp_url = 'http://www.8wenku.com%s' % temp_url
        c_url.append(temp_url)
        c_tit.append(temp_tit)
    c_list.append(c_url)
    c_list.append(c_tit)
    return c_list


def chap_download(item_list):
    # Download every entry whose title looks like real chapter content; the
    # keywords cover 章/过场/幕间 plus prologue/epilogue/chapter/postscript.
    list_leng = len(item_list[0])
    for i in range(list_leng):
        chp_tit = item_list[1][i]
        print(chp_tit)
        if ('章' in chp_tit or 'rolo' in chp_tit or 'pilo' in chp_tit or
                'PILO' in chp_tit or 'pillo' in chp_tit or '过场' in chp_tit or
                '幕间' in chp_tit or 'postscript' in chp_tit or 'hap' in chp_tit):
            chp_text = url2text(item_list[0][i])
            # The pattern anchors on site boilerplate that precedes the
            # chapter body, then grabs everything up to the closing div.
            chp_cont = re.findall(
                r' will do everything for you!<br><br />(.*?)</div>',
                chp_text, re.S)[0]
            chp_cont = chp_cont.replace('<br />', '')
            with open('%s.txt' % chp_tit, 'w', encoding='utf-8') as fb:
                fb.write(chp_tit)
                fb.write(chp_cont)
            print('%s crawled successfully' % chp_tit)
            print('=' * 60)
        else:
            print('%s skipped!!' % chp_tit)
            print('=' * 60)


def download_novel(path, url):
    html_text = url2text(url)
    title = re.findall(r'<h2 class="tit">《(.*?)》</h2>', html_text)[0]
    url_list = re.findall(r'<a target="_blank" href="(.*?)">(.*?)</a>', html_text)
    save_path(path, title)
    item_list = chap_item_get(url_list)
    chap_download(item_list)


url = 'http://www.8wenku.com/book/1498'
path = input('Please enter the save path:')
download_novel(path, url)
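Since the planned improvement is to move off raw regex, here is a sketch of how the same chapter-link extraction could look with bs4 (the target='_blank' selector mirrors the regex above, but it is my assumption about the page markup, not something verified against the site):

from bs4 import BeautifulSoup
import requests

rsp = requests.get('http://www.8wenku.com/book/1498')
rsp.encoding = 'utf-8'
soup = BeautifulSoup(rsp.text, 'html.parser')
# (href, title) pairs, equivalent to the url_list regex in download_novel
url_list = [(a.get('href'), a.get_text(strip=True))
            for a in soup.find_all('a', target='_blank')]

This would drop straight into download_novel in place of the re.findall call for url_list.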