I. Thematic web crawler design scheme
1. Thematic web crawler name
Name: Crawling the popular attractions of Quanzhou from Mafengwo
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame

url = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'  # search-results page for "Quanzhou"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}  # browser-like request header
r = requests.get(url, headers=headers)  # request the website
r.encoding = r.apparent_encoding  # unify the encoding
data = r.text
soup = BeautifulSoup(data, 'html.parser')  # parse the page with BeautifulSoup
print(soup.prettify())  # display the website structure
title = []
middle = []
for i in soup.find_all(class_="title"):  # put the popular attractions into the empty list
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # put the attraction profiles into the empty list
    middle.append(k.get_text().strip())
data = [title, middle]
print(data)
2. Analysis of the content and data characteristics crawled by the thematic web crawler
This crawler mainly collects the names, rankings, and bee-review counts of the popular attractions in Quanzhou.
3. Overview of thematic web crawler design scheme (including implementation ideas and technical difficulties)
Implementation idea: fetch the HTML page listing Quanzhou's popular attractions, crawl the data with requests, parse the page with BeautifulSoup, and use pandas to store and read the results as an Excel file.
Technical difficulties: crawling the data, and cleaning the crawled data.
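Since data cleaning is named as a difficulty, a minimal pandas sketch is given below; the file path and column names ('Attraction Name' and so on) are assumptions carried over from the rest of this report, not output the crawler is guaranteed to produce.

# Minimal cleaning sketch (assumed path and columns)
import pandas as pd
df = pd.read_excel(r'D:\python\quanzhou.xlsx')  # read the stored crawl results
df = df.drop_duplicates()                       # remove rows that were crawled twice
df = df.dropna(subset=['Attraction Name'])      # drop rows missing an attraction name
print(df.head())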
II. Analysis of the structural characteristics of the theme page
1. Structural characteristics of the theme page
2. HTML page analysis
Inspecting the tags around the attraction names shows that each title appears under a <p class="title"> element, and the bee-review count appears under a <font> tag; a selector-based cross-check is sketched below.
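As a cross-check on this page analysis, the same elements can also be located with CSS selectors. The sketch below assumes soup already holds the parsed search page, as in the code from part I; it is an alternative lookup, not the method used in the main program.

# Selector-based lookup (assumes soup is the parsed page)
titles = soup.select('p.title')                # attraction names: <p class="title">
counts = soup.select('font[color="#474747"]')  # bee-review counts: <font color="#474747">
for t, c in zip(titles, counts):
    print(t.get_text(strip=True), c.get_text(strip=True))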
III. Web crawler program design
The main body of the crawler program includes the following parts; each part is given with source code and detailed comments, followed by a screenshot of its output.
1. Data crawling and collection
def get(url, list, num):  # define a function to get the information
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}  # disguise the crawler as a browser, or the page cannot be crawled
    r = requests.get(url, timeout=30, headers=headers)  # send the request with a 30 s timeout
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")  # parse the HTML
    list1 = soup.find_all('p', class_='title')  # attraction names: data under <p class="title">
    list2 = soup.find_all('font', color="#474747")  # bee-review counts: data under <font color="#474747">
    print("{:^10}\t{:^30}\t{:^10}".format('Ranking', 'Attraction Name', 'Bee Reviews'))
    for i in range(num):
        list.append([i + 1, list1[i].string, list2[i].string])  # add each row to the list
        print("{:^10}\t{:^30}\t{:^10}".format(i + 1, list1[i].string, list2[i].string))
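Because crawling itself is listed above as a technical difficulty, here is a hedged sketch of a fetch helper with basic error handling; the helper name fetch_html and the retry count are illustrative assumptions, not part of the original program.

# Hypothetical helper: fetch a page with retries and a browser-like header
import requests

def fetch_html(url, retries=3):
    headers = {'User-Agent': 'Mozilla/5.0'}  # minimal browser disguise
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=30, headers=headers)
            r.raise_for_status()              # raise on HTTP errors such as 403 or 404
            r.encoding = r.apparent_encoding  # unify the encoding
            return r.text
        except requests.RequestException as e:
            print('Attempt', attempt + 1, 'failed:', e)
    return None  # all retries failed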
2. Data cleaning and processing
def check_file(file_path):
    quanzhou = pd.DataFrame(pd.read_excel(r'D:\python\quanzhou.xlsx'))
    print('\n==== The null values in each column are as follows ====')
    print(quanzhou.isnull())       # count null values
    print(quanzhou.duplicated())   # find duplicate rows
    print(quanzhou.isna().head())  # count missing values; False means the cell is not null
    print(quanzhou.describe())     # describe the data
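One step check_file does not perform is converting the scraped bee-review counts, which arrive as strings, into numbers before they are plotted. A sketch under that assumption, using the column name assumed elsewhere in this report:

# Assumed step: coerce the 'Bee Reviews' column from text to numbers
quanzhou = pd.read_excel(r'D:\python\quanzhou.xlsx')
quanzhou['Bee Reviews'] = pd.to_numeric(quanzhou['Bee Reviews'], errors='coerce')  # non-numeric strings become NaN
quanzhou = quanzhou.dropna(subset=['Bee Reviews'])  # drop rows that could not be converted
print(quanzhou.dtypes)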
3. Data visualization and persistence
def chart():
    plt.rcParams['font.sans-serif'] = ['SimHei']  # set a font that can render Chinese labels
    filename = r'D:\python\quanzhou.xlsx'
    colnames = ['Ranking', 'Attraction Name', 'Bee Reviews']
    df = pd.read_excel(filename)
    X = df.loc[1:8, 'Attraction Name']
    Y = df.loc[1:8, 'Bee Reviews']
    plt.bar(X, Y)
    plt.title("Bar chart of popular attractions in Quanzhou")
    plt.show()  # display the chart
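To persist the chart rather than only display it, one option (an addition, not in the original code) is matplotlib's savefig, called before plt.show(); the file name below is illustrative.

plt.savefig(r'D:\python\quanzhou_bar.png', dpi=150, bbox_inches='tight')  # write the figure to disk before plt.show()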
def create_file(file_path, msg):  # create the file and save the crawled data in Excel format
    view = r'D:\python\quanzhou.xlsx'
    df = pd.DataFrame(msg, columns=('Ranking', 'Attraction Name', 'Bee Reviews'))
    df.to_excel(view)
    print('Excel file created')
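A small design note on create_file: pandas writes .xlsx files through an engine such as openpyxl, and passing index=False keeps the automatic row index out of the spreadsheet; a hedged variant of the save line:

df.to_excel(view, index=False)  # omit the DataFrame index column in the Excel file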
4. Complete program code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import scipy as sp
from numpy import genfromtxt
import matplotlib
from pandas import DataFrame
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
url = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'  # search-results page for "Quanzhou"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}  # browser-like request header
r = requests.get(url, headers=headers)  # request the website
r.encoding = r.apparent_encoding  # unify the encoding
data = r.text
soup = BeautifulSoup(data, 'html.parser')  # parse the page with BeautifulSoup
print(soup.prettify())  # display the website structure
title = []
middle = []
for i in soup.find_all(class_="title"):  # put the popular attractions into the empty list
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # put the attraction profiles into the empty list
    middle.append(k.get_text().strip())
data = [title, middle]
print(data)

def get(url, list, num):  # define a function to get the information
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}  # disguise the crawler as a browser, or the page cannot be crawled
    r = requests.get(url, timeout=30, headers=headers)  # send the request with a 30 s timeout
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")  # parse the HTML
    list1 = soup.find_all('p', class_='title')  # attraction names: data under <p class="title">
    list2 = soup.find_all('font', color="#474747")  # bee-review counts: data under <font color="#474747">
    print("{:^10}\t{:^30}\t{:^10}".format('Ranking', 'Attraction Name', 'Bee Reviews'))
    for i in range(num):
        list.append([i + 1, list1[i].string, list2[i].string])  # add each row to the list
        print("{:^10}\t{:^30}\t{:^10}".format(i + 1, list1[i].string, list2[i].string))

def create_file(file_path, msg):  # create the file and save the crawled data in Excel format
    view = r'D:\python\quanzhou.xlsx'
    df = pd.DataFrame(msg, columns=('Ranking', 'Attraction Name', 'Bee Reviews'))
    df.to_excel(view)
    print('Excel file created')

def check_file(file_path):
    quanzhou = pd.DataFrame(pd.read_excel(r'D:\python\quanzhou.xlsx'))
    print('\n==== The null values in each column are as follows ====')
    print(quanzhou.isnull())       # count null values
    print(quanzhou.duplicated())   # find duplicate rows
    print(quanzhou.isna().head())  # count missing values; False means the cell is not null
    print(quanzhou.corr())         # correlation coefficients between the numeric columns
    print(quanzhou.describe())     # describe the data

def chart():
    plt.rcParams['font.sans-serif'] = ['SimHei']  # set a font that can render Chinese labels
    filename = r'D:\python\quanzhou.xlsx'
    colnames = ['Ranking', 'Attraction Name', 'Bee Reviews']
    df = pd.read_excel(filename)
    X = df.loc[1:8, 'Attraction Name']
    Y = df.loc[1:8, 'Bee Reviews']
    plt.bar(X, Y)
    plt.title("Bar chart of popular attractions in Quanzhou")
    plt.show()  # display the chart

def main():
    list = []
    url = "http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E"
    get(url, list, 8)
    create_file(r'D:\python\quanzhou.xlsx', list)
    check_file(r'D:\python\quanzhou.xlsx')
    chart()

main()
IV. Conclusion
1. After analyzing and visualizing the subject data, what conclusions can be obtained?
Python makes it possible to collect the desired data automatically, which saves a great deal of manual work and improves efficiency.
2. Make a simple summary of the completion of the program design task.
Through this assignment I have mastered new skills, learned more knowledge, and deepened my understanding of Python.