Crawling popular attractions in Quanzhou

One, thematic web crawler design

1. Thematic web crawler name

Name: Crawling popular Quanzhou attractions on Mafengwo

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame

url = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'  # Search-result page for Quanzhou
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}  # Pretend to be a browser
r = requests.get(url, headers=headers)  # Request the website
r.encoding = r.apparent_encoding  # Unify the encoding
data = r.text
soup = BeautifulSoup(data, 'html.parser')  # Parse the page with BeautifulSoup
print(soup.prettify())  # Display the page structure

title = []
middle = []
for i in soup.find_all(class_="title"):  # Collect the popular attraction names
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # Collect the attraction profiles
    middle.append(k.get_text().strip())
data = [title, middle]
print(data)

 

2. Analysis of the content and data characteristics of thematic web crawlers

This crawler mainly collects the basic information, rankings, and bee rating counts of the popular attractions in Quanzhou.

3. Overview of thematic web crawler design scheme (including implementation ideas and technical difficulties)

Implementation idea: obtain the HTML page of popular Quanzhou attractions, crawl the data with requests, parse the page with BeautifulSoup, and store and read the data with pandas.

Technical difficulties: crawling the data (the site rejects requests that do not carry a browser User-Agent) and cleaning the crawled data.

Two, analysis of the structural characteristics of the theme page

1. Structural characteristics of the theme page

2. HTML page analysis

Inspecting the tags of the attraction entries shows that the attraction name sits under p class="title", and the bee rating count sits under a font tag with color="#474747".
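As a quick check, these two selectors can be tried directly on the soup object built in the first code block above; a minimal sketch, assuming the page structure just described:

names = soup.find_all('p', class_='title')  # attraction name tags
ratings = soup.find_all('font', color="#474747")  # bee rating count tags
print(names[0].get_text().strip(), ratings[0].string)  # first attraction and its rating count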

Three, web crawler programming

The main body of the crawler program includes the following parts; each part gives the source code with detailed comments, followed by a screenshot of its output.

1. Data crawling and collection

def get(url, lst, num):  # Get attraction information from the search page
    # Pretend to be a browser, otherwise the site will not return the page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}

    r = requests.get(url, timeout=30, headers=headers)  # Time out the request after 30s
    r.raise_for_status()  # Raise an exception on a bad HTTP status
    r.encoding = r.apparent_encoding

    soup = BeautifulSoup(r.text, "html.parser")  # Parse the HTML

    list1 = soup.find_all('p', class_='title')  # Attraction names are in <p class="title"> tags
    list2 = soup.find_all('font', color="#474747")  # Bee rating counts are in <font color="#474747"> tags

    print("{:^10}\t{:^30}\t{:^10}".format('Ranking', 'Attraction Name', 'Bee Ratings'))
    for i in range(num):
        lst.append([i + 1, list1[i].string, list2[i].string])  # Add each row to the result list
        print("{:^10}\t{:^30}\t{:^10}".format(i + 1, list1[i].string, list2[i].string))
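A minimal call of the function above, assuming the Mafengwo search URL from earlier and the top 8 results:

rows = []
get('http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E', rows, 8)  # rows now holds [rank, name, rating] lists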

2. Data cleaning and processing

def check_file(file_path):
    quanzhou = pd.DataFrame(pd.read_excel(file_path))
    print('\n==== The null values in each column are as follows: ====')
    print(quanzhou.isnull())  # Count null values
    print(quanzhou.duplicated())  # Find duplicate rows
    print(quanzhou.isna().head())  # Check missing values; False means the cell is not null

    print(quanzhou.describe())  # Summary statistics of the data
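check_file only reports the problems it finds; a minimal sketch of actually cleaning the data, assuming the same Excel path and a hypothetical clean_file helper:

def clean_file(file_path):  # hypothetical helper, not part of the original program
    quanzhou = pd.read_excel(file_path)
    quanzhou = quanzhou.drop_duplicates()  # drop duplicate rows
    quanzhou = quanzhou.dropna()  # drop rows with missing values
    quanzhou.to_excel(file_path, index=False)  # write the cleaned data back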

3. Data visualization and persistence

def chart():
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Set a font that can display Chinese labels

    filename = r'D:\python\quanzhou.xlsx'
    df = pd.read_excel(filename)

    X = df.loc[1:8, 'Attraction Name']
    Y = df.loc[1:8, 'Bee Ratings']

    plt.bar(X, Y)
    plt.title("Histogram of popular Quanzhou attractions")
    plt.show()  # The parentheses are required, otherwise nothing is displayed
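To persist the chart as well as display it, the figure can be saved before plt.show() is called; the image path below is an assumption:

plt.savefig(r'D:\python\quanzhou.png')  # hypothetical output path, next to the Excel file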

def create_file(file_path, msg):
    # Create an Excel file and store the crawled rows in it
    df = pd.DataFrame(msg, columns=['Ranking', 'Attraction Name', 'Bee Ratings'])
    df.to_excel(file_path)
    print('Excel file created')

4. Complete program code

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import scipy as sp
from numpy import genfromtxt
import matplotlib
from pandas import DataFrame
import matplotlib.pyplot as plt
from scipy.optimize import leastsq

url = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'  # Search-result page for Quanzhou
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}  # Pretend to be a browser
r = requests.get(url, headers=headers)  # Request the website
r.encoding = r.apparent_encoding  # Unify the encoding
data = r.text
soup = BeautifulSoup(data, 'html.parser')  # Parse the page with BeautifulSoup
print(soup.prettify())  # Display the page structure

title = []
middle = []
for i in soup.find_all(class_="title"):  # Collect the popular attraction names
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # Collect the attraction profiles
    middle.append(k.get_text().strip())
data = [title, middle]
print(data)

def get(url, lst, num):  # Get attraction information from the search page
    # Pretend to be a browser, otherwise the site will not return the page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}

    r = requests.get(url, timeout=30, headers=headers)  # Time out the request after 30s
    r.raise_for_status()  # Raise an exception on a bad HTTP status
    r.encoding = r.apparent_encoding

    soup = BeautifulSoup(r.text, "html.parser")  # Parse the HTML

    list1 = soup.find_all('p', class_='title')  # Attraction names are in <p class="title"> tags
    list2 = soup.find_all('font', color="#474747")  # Bee rating counts are in <font color="#474747"> tags

    print("{:^10}\t{:^30}\t{:^10}".format('Ranking', 'Attraction Name', 'Bee Ratings'))
    for i in range(num):
        lst.append([i + 1, list1[i].string, list2[i].string])  # Add each row to the result list
        print("{:^10}\t{:^30}\t{:^10}".format(i + 1, list1[i].string, list2[i].string))

def create_file(file_path, msg):
    # Create an Excel file and store the crawled rows in it
    df = pd.DataFrame(msg, columns=['Ranking', 'Attraction Name', 'Bee Ratings'])
    df.to_excel(file_path)
    print('Excel file created')

def check_file(file_path):
    quanzhou = pd.DataFrame(pd.read_excel(file_path))
    print('\n==== The null values in each column are as follows: ====')
    print(quanzhou.isnull())  # Count null values
    print(quanzhou.duplicated())  # Find duplicate rows
    print(quanzhou.isna().head())  # Check missing values; False means the cell is not null

    print(quanzhou.corr())  # Correlation coefficients between numeric columns
    print(quanzhou.describe())  # Summary statistics of the data

def chart():
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Set a font that can display Chinese labels

    filename = r'D:\python\quanzhou.xlsx'
    df = pd.read_excel(filename)

    X = df.loc[1:8, 'Attraction Name']
    Y = df.loc[1:8, 'Bee Ratings']

    plt.bar(X, Y)
    plt.title("Histogram of popular Quanzhou attractions")
    plt.show()

def main():
    lst = []
    url = "http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E"
    get(url, lst, 8)  # Crawl the top 8 attractions
    create_file(r'D:\python\quanzhou.xlsx', lst)  # Save to Excel
    check_file(r'D:\python\quanzhou.xlsx')  # Inspect the saved data
    chart()  # Draw the histogram

main()

Four, conclusions

1. After analyzing and visualizing the subject data, what conclusions can be obtained?

Python can fetch the data you want automatically, which saves a great deal of manual work and improves efficiency.

2. Make a simple summary of the completion of the program design task.

Through this assignment I mastered new skills, broadened my knowledge, and deepened my understanding of Python.

 

 
