Getting Started Crawler (Function Encapsulation) - Python

A first entry-level crawler, written with process-oriented thinking. Modules used: requests, os, re.

This first crawler exists only to experience the fun of crawling: it accesses a hard-coded url, does not use the bs4 module, and is wrapped in simple functions rather than a class. The script will be improved in the future — just sharing the fun of a first simple crawler XD.

The idea: 1. request the url (requests module); 2. get the page source text; 3. filter the text with regular expressions (re module); 4. download the information; 5. clean the information; 6. store the information.

import os
import re

import requests

def url2text(url):
	"""Fetch *url* with requests and return its body decoded as UTF-8 text."""
	response = requests.get(url)
	# The site serves UTF-8; force it so .text decodes correctly.
	response.encoding = 'utf-8'
	return response.text
	
def save_path(path, title):
	"""Create the directory ``path + title`` (if missing) and chdir into it.

	Note: the two parts are joined by plain string concatenation, so
	*path* should end with a path separator.
	"""
	temp_path = path + title
	# makedirs(..., exist_ok=True) also creates missing parent directories
	# and avoids the race between an exists() check and mkdir().
	os.makedirs(temp_path, exist_ok=True)
	os.chdir(temp_path)

def chap_item_get(url_list):
	"""Turn (href, title) pairs into ``[urls, titles]`` parallel lists.

	Links whose href contains 'book' (index-page links) are dropped;
	every kept href is made absolute and its title left-stripped.
	"""
	urls = []
	titles = []
	for href, title in url_list:
		if 'book' in href:
			continue
		urls.append('http://www.8wenku.com%s' % href)
		titles.append(title.lstrip())
	return [urls, titles]
	
def chap_download(item_list):
	"""Download every chapter whose title matches a chapter keyword.

	item_list is ``[urls, titles]`` as produced by chap_item_get; each
	kept chapter is written to '<title>.txt' in the current directory.
	"""
	# Substrings that mark a title as a real chapter / prologue /
	# epilogue / interlude link (mixed Chinese and English markers).
	keywords = ('章', 'rolo', 'pilo', 'PILO', 'pillo', '过场', '幕间',
		'postscript', 'hap')
	urls, titles = item_list
	for chp_url, chp_tit in zip(urls, titles):
		print(chp_tit)
		if any(key in chp_tit for key in keywords):
			# Fetch first so a failed request does not leave behind an
			# open, empty file.
			chp_text = url2text(chp_url)
			chp_cont = re.findall(r' will do everything for you!<br><br />(.*?)</div>', chp_text, re.S)[0]
			chp_cont = chp_cont.replace('<br />', '')
			# 'with' guarantees the file is closed even if a write fails.
			with open('%s.txt' % chp_tit, 'w', encoding='utf-8') as fb:
				fb.write(chp_tit)
				fb.write(chp_cont)
			print('%s crawled successfully' % chp_tit)
			print('=' * 60)
		else:
			print('%s skipped!!' % chp_tit)
			print('=' * 60)
		
# Script configuration: the book index page to crawl.
url = 'http://www.8wenku.com/book/1498'
# Save directory entered by the user; it is concatenated with the book
# title, so it should end with a path separator.
path = input('Please enter the save path:')

def download_novel(path, url):
	"""Crawl the novel index at *url* and save its chapters under *path*."""
	# Steps 1-2: fetch the index page source.
	page = url2text(url)
	# Step 3: pull the book title and every chapter link out of the HTML.
	title = re.findall(r'<h2 class="tit">《(.*?)》</h2>', page)[0]
	links = re.findall(r'<a target="_blank" href="(.*?)">(.*?)</a>', page)
	# Step 4: create and enter the per-book save directory.
	save_path(path, title)
	# Steps 5-6: filter the links, then download and store each chapter.
	chap_download(chap_item_get(links))
		
# Entry point: kick off the crawl with the configured path and url.
download_novel(path,url)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324642901&siteId=291194637