Python crawler in practice: using Beautiful Soup to crawl the Maoyan movie TOP100 list (part 1)

I recently finished studying Cui Dashen's first crawler case: scraping the Maoyan movie TOP100 ranking.

But he did it with regular expressions, and as a programming novice I found those dense regular expressions hard to follow.
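For comparison, the original tutorial pulls every field out of the raw page HTML with one long pattern and re.findall; it looks roughly like the sketch below (written from memory and assuming the class names used on the page, so the exact pattern in the tutorial may differ):

import re

pattern = re.compile(
	r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a.*?>(.*?)</a>'
	r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
	r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
items = re.findall(pattern, html)  # html is the raw page source fetched with requests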

So for this exercise I used the Beautiful Soup I learned later instead, and added some extra practice of my own along the way.

The code is shared directly below. My Beautiful Soup usage is not particularly polished yet and could be optimized; later I plan to practice with pyquery, and also try out data storage along the way.

I'll write up my hands-on notes in a later post.

import requests
from bs4 import BeautifulSoup
import os
import time

start = time.clock() # start timing the program run
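# note: time.clock() was deprecated in Python 3.3 and removed in 3.8; on newer versions use time.perf_counter() instead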

file_path = 'D:\\python3.6\\scrapy\\maoyan' # define the target folder (the original used a Chinese folder name), so it can be checked and created later
file_name = 'maoyan.txt' # custom file name
file = file_path+'\\'+file_name # full path of the output file for later use (os.path.join(file_path, file_name) would be more portable)

url = "http://maoyan.com/board/4" # Get the start page of the url
 
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}


def Create_file(file_path,file): # define a function that checks for and creates the target folder and file
	
	if not os.path.exists(file_path): # the folder does not exist yet
		os.makedirs(file_path) # create the custom folder
		with open(file, 'w', encoding='utf-8') as fp: # create the custom file
			pass
	# 'w' opens in write mode: the file is created if it does not exist; if it already exists it is truncated first and then opened
	
	else: # the folder already exists
		with open(file, 'w', encoding='utf-8') as f: # open the file in the target folder
			f.seek(0)
	# f.seek(offset[, whence]) moves the file pointer to offset relative to whence: 0 = start of file (the default), 1 = current position, 2 = end of file
			f.truncate()
	# clear the contents of the file; note: truncate() only works on files opened in a writable mode such as 'r+', 'rb+', 'w', 'wb', 'wb+'

def get_all_pages(start):
# Define a function to get the target content of all pages
	
	pages=[]	
	for n in range(0,100,10):
	# the offset steps by 10 from 0 to 90; note that the int n must be converted to str when building the URL
	# traverse all the URLs and collect the target content of each page
		if n==0:
			url=start
		else:
			url=start+'?offset='+str(n)
			
		r = requests.get(url, headers=headers)
		soup = BeautifulSoup(r.content, 'lxml')
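		# note: the 'lxml' parser needs the third-party lxml package; the built-in 'html.parser' also works here as a drop-in alternative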
		page= soup.find_all(name='dd')
		# get all the dd nodes of the page
		
		pages.extend(page)
		# extend pages with this page's list of dd nodes, so each dd node can be traversed one by one below
		
	return pages
	# return the dd nodes of all pages, collected in a single list

	
Create_file(file_path,file)

text = get_all_pages(url)

for film in text:
# traverse every element in the list text, i.e. the content of each dd node
# this for loop ought to be refactored into its own function (a sketch of that follows after the listing)
	dict ={}
	# create an empty dict (note: this name shadows the built-in dict; something like info would be better)
	
	# print(type(film)) # Confirm that the film attribute is tag, so you can use tag-related methods to process film
	# print('*'*50) # You can separate and check the output content for easy comparison

	dict['Index']=film.i.string # the string of the first <i> tag under film, i.e. the ranking index
	
	comment1 = film.div.div.div # drill down three levels of div to reach the <div class="movie-item-info"> node
	
	name= comment1.find_all(name='p')[0].string
	star = comment1.find_all(name='p')[1].string
	releasetime = comment1.find_all(name='p')[2].string
	
	dict['name']=name
	dict['star']=str.strip(star)
	dict['releasetime']=releasetime
		
	comment2 = comment1.find_next_sibling()
	# get the next sibling of that node, i.e. the div that holds the score
	# print(comment2) # check whether comment2 is the target content
	sco1=comment2.i.string
	sco2=comment2.i.find_next_sibling().string
	
	# print(type(sco1)) # sco1 is actually a NavigableString, not a Tag, so string methods work on it directly
	# print(sco1) # check whether sco1 is the target output

	score = sco1.string + str.strip(sco2) # merge the integer and fraction parts into one score string
	dict['score']=score
	
	print(dict) # Check if dict is the target output
	
	with open(file, 'a', encoding='utf-8') as f: # open the target file in append mode
		f.write(str(dict)+'\n') # note the newline '\n', so that each dict ends up on its own line in the txt file
end = time.clock() # stop the run timer
print('Crawling completed','\n','Time taken:',end-start) # report the elapsed time
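As the comment in the loop notes, the per-film parsing could be pulled out into its own function. A minimal sketch of that refactor (the helper name parse_film is my own, not from the original code; it mirrors the loop body above):

def parse_film(film): # parse one dd node into a dict of film details
	info = {}
	info['Index'] = film.i.string # ranking index from the first <i> tag
	details = film.div.div.div # the <div class="movie-item-info"> node
	paragraphs = details.find_all(name='p')
	info['name'] = paragraphs[0].string
	info['star'] = paragraphs[1].string.strip()
	info['releasetime'] = paragraphs[2].string
	number = details.find_next_sibling() # the sibling div that holds the score
	info['score'] = number.i.string + number.i.find_next_sibling().string.strip()
	return info

The main loop then shrinks to something like: for film in text: print(parse_film(film)), with the same file write as before.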


Run result, for reference:
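Each line printed to the console and appended to maoyan.txt is a dict of this shape (values elided here rather than copied from a real run):

{'Index': '1', 'name': '...', 'star': '...', 'releasetime': '...', 'score': '...'}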


