对热门景点进行爬虫（泉州）

一、主题式网络爬虫设计方案

1.主题式网络爬虫名称

名称：爬取马蜂窝泉州热门景点

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
# Exploratory script: download the mafengwo search page for "泉州", dump its
# structure, and collect the text of every "title" and "middle" element.
ur1='http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# A timeout keeps a stalled connection from hanging the script forever.
r=requests.get(ur1,headers=headers,timeout=30)
r.encoding=r.apparent_encoding  # decode with the encoding detected from the body
data=r.text
soup=BeautifulSoup(data,'html.parser')
print(soup.prettify())  # print the parsed document to inspect its structure
title=[]
midlle=[]
for i in soup.find_all(class_="title"):  # stripped text of every class="title" element
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # stripped text of every class="middle" element
    midlle.append(k.get_text().strip())
data=[title,midlle]  # NOTE: rebinds `data` from the raw HTML text to the two lists
print(data)

2.主题式网络爬虫爬取的内容与数据特征分析

这次爬虫主要是爬取马蜂窝泉州热门景点的各景点信息，排名，以及蜂评数

3.主题式网络爬虫设计方案概述（包括实现思路与技术难点）

实现思路：获取马蜂窝泉州热门景点页面的HTML，使用requests爬取数据，使用BeautifulSoup解析页面，使用pandas进行数据的存储与读取。

技术难点：爬取数据，数据清洗

二、主题页面的结构特征分析

1.主题页面的结构特征

2.HTML页面解析

查看页面标签可以发现：景点名称在 <p class="title"> 标签下，蜂评数在 <font> 标签下。

三、网络爬虫程序设计

爬虫程序主体要包括以下各部分，要附源代码及较详细注释，并在每部分程序后面提供输出结果的截图。

1.数据的爬取和采集

def get(url,list,num): #fetch a page and collect up to `num` ranked attractions
    """Download `url`, extract attraction names and review counts, and append them to `list`.

    Names are read from ``<p class="title">`` tags and review counts from
    ``<font color="#474747">`` tags; the two sequences are paired by position.
    Each appended row is ``[rank, name, review_count]`` with a 1-based rank, and
    every row is also printed as a tab-separated table line.

    Args:
        url: Address of the page to download.
        list: Output list that rows are appended to (mutated in place). The
            name shadows the builtin but is kept for existing callers.
        num: Maximum number of rows to collect. Fewer rows are produced when
            the page contains fewer matching tags.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Browser-like User-Agent header (the original value had a stray "+" in "Mozilla").
    headers = {'user-agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}

    r = requests.get(url,timeout = 30,headers=headers)  # give up after 30 seconds
    r.raise_for_status()
    r.encoding = r.apparent_encoding

    soup = BeautifulSoup(r.text,"html.parser")

    name_tags = soup.find_all('p',class_='title')
    count_tags = soup.find_all('font',color="#474747")

    row_format = "{:^10}\t{:^30}\t{:^10}\t"
    print(row_format.format('排名','景点名字','蜂评数'))

    # Never index past the shorter result set; the page may hold fewer than `num` items.
    row_count = min(num,len(name_tags),len(count_tags))
    for i in range(row_count):
        # get_text() always returns a str, whereas .string is None for tags
        # with more than one child, which would make the format call fail.
        name = name_tags[i].get_text(strip=True)
        count = count_tags[i].get_text(strip=True)
        list.append([i+1,name,count])
        print(row_format.format(i+1,name,count))

2.数据清洗和处理

def check_file(file_path):
    """Print data-quality diagnostics for the Excel workbook at `file_path`.

    Prints, in order: the per-cell null mask, the per-row duplicate flags, the
    null mask of the first rows, and summary statistics.

    Args:
        file_path: Path of the Excel file to inspect. The original ignored this
            argument and always read a hard-coded path.
    """
    quanzhou = pd.read_excel(file_path)  # read_excel already returns a DataFrame
    print('\n====各列的空值情况如下：====')
    print(quanzhou.isnull())  # True marks a missing cell
    print(quanzhou.duplicated())  # True marks a row that repeats an earlier row
    print(quanzhou.isna().head())  # same null mask, first rows only; all False means nothing is missing

    print(quanzhou.describe())  # summary statistics

3.数据的可视化与持久化

def chart(filename=r'D:\python\quanzhou.xlsx'):
    """Show a bar chart of review counts per attraction read from an Excel file.

    Args:
        filename: Path of the workbook to plot. Defaults to the location the
            rest of the program writes to, so ``chart()`` keeps working.
    """
    plt.rcParams['font.sans-serif']=['SimHei']  # a font able to render the Chinese labels

    df = pd.read_excel(filename)

    # .loc slices by label and includes both ends.
    # NOTE(review): with a default 0-based index this skips the row labelled 0 - confirm that is intended.
    X=df.loc[1:8,'景点名字']
    Y=df.loc[1:8,'蜂评数']

    plt.bar(X,Y)
    plt.title("泉州热门景点柱状图")
    plt.show()  # the original wrote `plt.show` without parentheses, so nothing was displayed

def create_file(file_path,msg): #save the scraped rows as an Excel workbook
    """Write the scraped rows to an Excel file.

    Args:
        file_path: Destination path of the workbook. The original ignored this
            argument and always wrote to a hard-coded path.
        msg: Rows of three values each, in the column order 排名, 景点名字, 蜂评数.
    """
    df = pd.DataFrame(msg,columns=['排名','景点名字','蜂评数'])
    df.to_excel(file_path)
    print('创建excel完成')

4.完整程序代码

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import scipy as sp
from numpy import genfromtxt
import matplotlib
from pandas import DataFrame
import matplotlib.pyplot as plt
from scipy.optimize import leastsq

# Exploratory script: download the mafengwo search page for "泉州", dump its
# structure, and collect the text of every "title" and "middle" element.
ur1='http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E&seid=8517A6C2-4C2D-453A-83F4-4C281B0E91E9'
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# A timeout keeps a stalled connection from hanging the script forever.
r=requests.get(ur1,headers=headers,timeout=30)
r.encoding=r.apparent_encoding  # decode with the encoding detected from the body
data=r.text
soup=BeautifulSoup(data,'html.parser')
print(soup.prettify())  # print the parsed document to inspect its structure
title=[]
midlle=[]
for i in soup.find_all(class_="title"):  # stripped text of every class="title" element
    title.append(i.get_text().strip())
for k in soup.find_all(class_="middle"):  # stripped text of every class="middle" element
    midlle.append(k.get_text().strip())
data=[title,midlle]  # NOTE: rebinds `data` from the raw HTML text to the two lists
print(data)

def get(url,list,num): #fetch a page and collect up to `num` ranked attractions
    """Download `url`, extract attraction names and review counts, and append them to `list`.

    Names are read from ``<p class="title">`` tags and review counts from
    ``<font color="#474747">`` tags; the two sequences are paired by position.
    Each appended row is ``[rank, name, review_count]`` with a 1-based rank, and
    every row is also printed as a tab-separated table line.

    Args:
        url: Address of the page to download.
        list: Output list that rows are appended to (mutated in place). The
            name shadows the builtin but is kept for existing callers.
        num: Maximum number of rows to collect. Fewer rows are produced when
            the page contains fewer matching tags.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Browser-like User-Agent header (the original value had a stray "+" in "Mozilla").
    headers = {'user-agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'}

    r = requests.get(url,timeout = 30,headers=headers)  # give up after 30 seconds
    r.raise_for_status()
    r.encoding = r.apparent_encoding

    soup = BeautifulSoup(r.text,"html.parser")

    name_tags = soup.find_all('p',class_='title')
    count_tags = soup.find_all('font',color="#474747")

    row_format = "{:^10}\t{:^30}\t{:^10}\t"
    print(row_format.format('排名','景点名字','蜂评数'))

    # Never index past the shorter result set; the page may hold fewer than `num` items.
    row_count = min(num,len(name_tags),len(count_tags))
    for i in range(row_count):
        # get_text() always returns a str, whereas .string is None for tags
        # with more than one child, which would make the format call fail.
        name = name_tags[i].get_text(strip=True)
        count = count_tags[i].get_text(strip=True)
        list.append([i+1,name,count])
        print(row_format.format(i+1,name,count))

def create_file(file_path,msg): #save the scraped rows as an Excel workbook
    """Write the scraped rows to an Excel file.

    Args:
        file_path: Destination path of the workbook. The original ignored this
            argument and always wrote to a hard-coded path.
        msg: Rows of three values each, in the column order 排名, 景点名字, 蜂评数.
    """
    df = pd.DataFrame(msg,columns=['排名','景点名字','蜂评数'])
    df.to_excel(file_path)
    print('创建excel完成')

def check_file(file_path):
    """Print data-quality diagnostics for the Excel workbook at `file_path`.

    Prints, in order: the per-cell null mask, the per-row duplicate flags, the
    null mask of the first rows, the correlation matrix of the numeric columns,
    and summary statistics.

    Args:
        file_path: Path of the Excel file to inspect. The original ignored this
            argument and always read a hard-coded path.
    """
    quanzhou = pd.read_excel(file_path)  # read_excel already returns a DataFrame
    print('\n====各列的空值情况如下：====')
    print(quanzhou.isnull())  # True marks a missing cell
    print(quanzhou.duplicated())  # True marks a row that repeats an earlier row
    print(quanzhou.isna().head())  # same null mask, first rows only; all False means nothing is missing

    # Restrict to numeric columns first: recent pandas versions raise when
    # corr() meets a text column such as the attraction name.
    print(quanzhou.select_dtypes(include='number').corr())  # pairwise correlation coefficients
    print(quanzhou.describe())  # summary statistics

def chart(filename=r'D:\python\quanzhou.xlsx'):
    """Show a bar chart of review counts per attraction read from an Excel file.

    Args:
        filename: Path of the workbook to plot. Defaults to the location the
            rest of the program writes to, so ``chart()`` keeps working.
    """
    plt.rcParams['font.sans-serif']=['SimHei']  # a font able to render the Chinese labels

    df = pd.read_excel(filename)

    # .loc slices by label and includes both ends.
    # NOTE(review): with a default 0-based index this skips the row labelled 0 - confirm that is intended.
    X=df.loc[1:8,'景点名字']
    Y=df.loc[1:8,'蜂评数']

    plt.bar(X,Y)
    plt.title("泉州热门景点柱状图")
    plt.show()  # the original wrote `plt.show` without parentheses, so nothing was displayed

def main():
    """Run the pipeline: scrape the top 8 attractions, save them, inspect the file, plot it."""
    rows = []  # filled in place by get(); renamed from `list` to stop shadowing the builtin
    url = "http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E"
    excel_path = r'D:\python\quanzhou.xlsx'  # raw string: the backslashes are literal path separators
    get(url,rows,8)
    create_file(excel_path,rows)
    check_file(excel_path)
    chart()
    # The original also called chart2(), which is not defined anywhere in this
    # program and raised NameError; the call has been removed.

main()

四、结论

1.经过对主题数据的分析与可视化，可以得到哪些结论？

通过python可以获取自己想要的数据，可以节省很多工作量，提高效率。

2.对本次程序设计任务完成的情况做一个简单的小结。

通过这次作业，我掌握了新的技巧，学到了更多的知识，也加深了对Python的了解。

对热门景点进行爬虫（泉州）

猜你喜欢