Last time I crawled the Maoyan (Cat's Eye) Top 100 movies with BeautifulSoup; this time I took another shot using pyquery, which I recently learned, and it feels nicer to work with than bs4.
The code is shared below — feedback and discussion are welcome.
"""Crawl the Maoyan Top 100 movie board with pyquery and append each movie
record (as a dict, one per line) to a local text file.

Fixes over the original blog-post script:
- ``import them`` was a garbled ``import os``.
- ``time.clock()`` was removed in Python 3.8; use ``time.perf_counter()``.
- ``flage`` was defined but ``flag`` was used (NameError).
- ``page(n)`` was undefined; the offset must be ``str(n)``.
- The folder path literal contained an unescaped apostrophe (SyntaxError).
- ``create_file`` leaked an open file handle and omitted the encoding.
- The page counter used true division and became a float.
- A single dict was mutated and rewritten for every movie; each record now
  gets a fresh dict.
- The crawl is wrapped in ``main()`` behind a ``__main__`` guard, and the
  third-party imports (requests, pyquery) are local to ``main()`` so the
  module can be imported without them installed.
"""
import os
import time

begin = time.perf_counter()  # program run-time measurement starts at import

# Output location. NOTE(review): hard-coded Windows path kept from the
# original post — adjust for your own machine.
file_path = r"D:\python3.6\scrapy\cat's eye"   # target folder
file_name = 'maoyan.txt'                        # output file name
file = file_path + '\\' + file_name             # full path of the output file

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
start = "http://maoyan.com/board/4"  # first page of the board
flag = '?offset='  # query-string marker whose value changes per page
node = 'dd'        # html node that holds one movie entry
step = 10          # offset increment per page
length = 100       # maximum offset (10 pages x 10 movies)


def create_file(file_path, file):
    """Ensure *file_path* exists and *file* is an empty file.

    Creates the folder when it is missing, then (re)creates/truncates the
    file so later code can open it in append mode.
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # 'w' truncates an existing file or creates a new one; the context
    # manager closes the handle immediately (the original leaked it).
    with open(file, 'w', encoding='utf-8'):
        pass


def main():
    """Fetch every board page, extract the movie fields, append to *file*."""
    # Third-party imports are local so importing this module does not
    # require requests/pyquery to be installed.
    import requests
    from pyquery import PyQuery as pq

    create_file(file_path, file)
    for n in range(0, length, step):  # walk the offset: 0, 10, ..., 90
        if n == 0:
            url = start  # the home page takes no offset parameter
        else:
            url = start + flag + str(n)
        i = n // step + 1  # 1-based page counter (integer, not float)

        r = requests.get(url, headers=headers)  # fetch the page html
        doc = pq(r.text)       # parse with pyquery
        page = doc.find(node)  # all movie entries on this page
        for data in page.items():
            # A fresh dict per movie; the original reused one dict.
            record = {
                'index': data.children('i').text(),
                'name': data.find('.name').text(),
                'star': data.find('.star').text(),
                'releasetime': data.find('.releasetime').text(),
                'score': data.find('.score').text(),
            }
            with open(file, 'a', encoding='utf-8') as f:
                # '\n' so each dict lands on its own line in the txt file
                f.write(str(record) + '\n')
        print('第%d页爬取完毕!' % (i))

    end = time.perf_counter()
    print("爬取完毕,耗时:%f" % (end - begin))


if __name__ == "__main__":
    main()