Python crawler in practice: crawling the Maoyan movie TOP100 list with pyquery (1)

Last time I used BeautifulSoup to crawl the Maoyan TOP100 movies; this time I tackled the same task again with the recently learned pyquery, and it feels nicer to use than bs4.
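
For anyone who has not used pyquery yet, here is a minimal sketch of its jQuery-style selection (the HTML snippet below is made up purely for illustration), which is the main reason it feels so convenient:

from pyquery import PyQuery as pq

html = '<dd><p class="name">Farewell My Concubine</p><p class="score">9.5</p></dd>'
doc = pq(html)               # build a PyQuery document from an html string
print(doc('.name').text())   # CSS selectors, jQuery style -> Farewell My Concubine
print(doc('.score').text())  # -> 9.5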


The full code is shared below; feedback and discussion are welcome.

from pyquery import PyQuery as pq
import requests
import os
import time

begin = time.perf_counter() # Start the program run timer.

file_path = 'D:\\python3.6\\scrapy\\maoyan' # Target folder, checked later to see whether it already exists
file_name = 'maoyan.txt' # Custom file name
file = file_path + '\\' + file_name # Full path of the file, for later use

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
	
start = "http://maoyan.com/board/4" # Get the start page of the url
flage='?offset=' # url change flag
node='dd' # The target url corresponds to the target node in html
step=10 # url change step length
length=100 # url change max
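# With these settings the crawled urls follow the pattern:
#   page 1:  http://maoyan.com/board/4
#   page 2:  http://maoyan.com/board/4?offset=10
#   ...
#   page 10: http://maoyan.com/board/4?offset=90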


def create_file(file_path, file): # Check for the target folder and create/clear the target file
	
	if not os.path.exists(file_path): # The folder does not exist yet
		os.makedirs(file_path) # Create the custom folder
		open(file, 'w', encoding='utf-8').close() # Create the custom file
	# "w" opens in write-only mode: if the file does not exist it is created; if it already exists its contents are cleared first, then the file is opened
	
	else: # The folder already exists
		with open(file, 'w', encoding='utf-8') as f: # Open the file inside the target folder
			f.seek(0)
	# f.seek(offset[, whence]) moves the file pointer to offset relative to whence: 0 means the beginning of the file (the default), 1 the current position, 2 the end of the file
			f.truncate()
	# truncate() clears the file's contents; note it only works on files opened in a writable mode such as "r+", "rb+", "w", "wb" or "wb+"

create_file(file_path,file)

for n in range(0, length, step): # Step through the url offsets, crawl the target content of each page and store it locally
	item = {} # Create an empty dictionary to store the crawled content
	if n == 0: # First page: use the start url as-is
		url = start
		i = n + 1 # i is the number of the page being crawled
	else: # Pages after the first: append the offset to the url
		url = start + flag + str(n)
		i = n // step + 1
	r = requests.get(url, headers=headers) # Fetch the html of each page
	doc = pq(r.text) # Parse it with the PyQuery class
	page = doc.find(node)  # Find all <dd> nodes in the page
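	# Each <dd> node on the board page roughly looks like this (simplified, assumed structure):
	# <dd>
	#   <i class="board-index">1</i>
	#   <p class="name"><a href="...">movie title</a></p>
	#   <p class="star">starring ...</p>
	#   <p class="releasetime">release date ...</p>
	#   <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
	# </dd>
	# which is why the class selectors .name / .star / .releasetime / .score and the <i> child are used below.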
				
	for data in page.items():  # Iterate over every <dd> node on the page, pick out the target values and write them to the local file
		# print(data, type(data)) #  Check the type of data
		# print('1'*50) #  Debug separator
		index = data.children('i').text()
		name = data.find('.name').text()
		star = data.find('.star').text()
		releasetime = data.find('.releasetime').text()
		score = data.find('.score').text()
		
		item['index'] = index
		item['name'] = name
		item['star'] = star
		item['releasetime'] = releasetime
		item['score'] = score
		
		with open(file, 'a', encoding='utf-8') as f: # Open the target file in append mode
			f.write(str(item) + '\n')   # Note the newline '\n', so each dict is written to its own line in the txt file
	print('Page %d crawled!' % i)
	
end = time.perf_counter() # Stop the program run timer.
print("Crawling finished, time elapsed: %f seconds" % (end - begin))
