I recently finished working through master Cui's first crawler case study: scraping the Maoyan movie Top 100 ranking.
He did it with regular expressions, though, and as a programming novice I honestly could not follow those dazzling patterns.
So I redid the case with BeautifulSoup, which I picked up later, and added a few hands-on extras of my own; a short comparison of the two styles follows below.
The full code is shared right after that. My BeautifulSoup usage is still rough and can be optimized; later I plan to redo the exercise with pyquery and practice data storage along the way (a quick sketch of both follows after the main code).
I will share more hands-on notes afterwards.
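For a feel of the difference, here is a toy comparison on a single field. The regex below is only illustrative of the style the original case uses, not the exact pattern from the book; the BeautifulSoup line is the approach my script takes.

import re
from bs4 import BeautifulSoup

html = '<dd><i class="board-index board-index-1">1</i></dd>'  # toy snippet of one ranking entry

# regex style (illustrative only, not the book's exact pattern)
m = re.search(r'board-index.*?>(\d+)</i>', html, re.S)
print(m.group(1))          # -> 1

# BeautifulSoup style: navigate the tree instead of decoding a pattern
soup = BeautifulSoup(html, 'lxml')
print(soup.dd.i.string)    # -> 1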
import requests
from bs4 import BeautifulSoup
import os
import time

start = time.perf_counter()                # start timing the whole run (time.clock() was removed in Python 3.8+)

file_path = r'D:\python3.6\scrapy\maoyan'  # target folder, checked/created below
file_name = 'maoyan.txt'                   # custom file name
file = os.path.join(file_path, file_name)  # full path of the output file, for later use
url = "http://maoyan.com/board/4"          # start page of the ranking
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

def Create_file(file_path, file):
    # Create the target folder and file if they do not exist; empty the file if it does
    if not os.path.exists(file_path):
        os.makedirs(file_path)                     # create the custom folder
        open(file, 'w', encoding='utf-8').close()  # "w": create the file, or truncate it if it already exists
    else:
        with open(file, 'w', encoding='utf-8') as f:
            f.seek(0)     # f.seek(offset[, whence]): whence 0 = start of file (default), 1 = current position, 2 = end
            f.truncate()  # clear the file; only valid on files opened writable, e.g. "r+", "rb+", "w", "wb", "wb+"

def get_all_pages(start):
    # Collect the dd nodes from all 10 pages of the ranking
    pages = []
    for n in range(0, 100, 10):  # offset step values; note the int n must be converted to str for the URL
        if n == 0:
            url = start
        else:
            url = start + '?offset=' + str(n)
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'lxml')
        page = soup.find_all(name='dd')  # every dd node on this page holds one film
        pages.extend(page)               # flatten into one list so each dd node can be traversed below
    return pages                         # list of the dd nodes of all pages

Create_file(file_path, file)
text = get_all_pages(url)

for film in text:  # each element is one dd node, a bs4 Tag, so Tag methods apply
    # this loop could be refactored into its own function
    info = {}                      # dict for one film's fields (named info to avoid shadowing the builtin dict)
    info['Index'] = film.i.string  # string of film's first i child: the ranking index
    comment1 = film.div.div.div    # walk down three nested div levels to the <div class="movie-item-info"> node
    name = comment1.find_all(name='p')[0].string
    star = comment1.find_all(name='p')[1].string
    releasetime = comment1.find_all(name='p')[2].string
    info['name'] = name
    info['star'] = star.strip()
    info['releasetime'] = releasetime
    comment2 = comment1.find_next_sibling()        # the sibling div that holds the score
    sco1 = comment2.i.string                       # integer part of the score
    sco2 = comment2.i.find_next_sibling().string   # fraction part of the score
    score = str(sco1) + str(sco2).strip()          # merge the two parts into one score string
    info['score'] = score
    print(info)                                    # check that the dict matches the target output
    with open(file, 'a', encoding='utf-8') as f:
        f.write(str(info) + '\n')                  # append a newline so each record lands on its own line in the txt

end = time.perf_counter()
print('Crawling completed', '\n', 'Time-consuming:', end - start)
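As for the pyquery and data-storage practice mentioned at the top, here is a minimal sketch of what I have in mind. It assumes the p order inside each dd and the integer/fraction class names match what the soup version above relies on (not verified against the live page), and maoyan_pq.txt is a hypothetical output file for this sketch; writing JSON lines instead of str(dict) makes the file parseable later.

import json
import requests
from pyquery import PyQuery as pq

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
r = requests.get("http://maoyan.com/board/4", headers=headers)
doc = pq(r.text)

with open('maoyan_pq.txt', 'a', encoding='utf-8') as f:  # hypothetical output file
    for dd in doc('dd').items():                 # one dd per film, as in the soup version
        info = {
            'Index': dd('i').eq(0).text(),       # ranking index: first i child
            'name': dd('p').eq(0).text(),        # assumes the same p order as above: name, star, releasetime
            'star': dd('p').eq(1).text(),
            'releasetime': dd('p').eq(2).text(),
            'score': dd('i.integer').text() + dd('i.fraction').text(),  # class names assumed from the score markup
        }
        f.write(json.dumps(info, ensure_ascii=False) + '\n')  # one JSON record per line, Chinese kept readable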
Output of a run, for reference: