Final assignment|Python crawler data collection visual analysis project full version

Please add a picture description

This is my data visualization final assignment for this semester. If you find it helpful, please give it a follow!

Work effect display:

Histogram:
insert image description here
Column Chart:
insert image description here
Line Chart:
insert image description here
Line Chart:
insert image description here
Line Chart:
insert image description here
Dataset:
insert image description here
Three tables in total: Perform data analysis
insert image description here

code:

The first data set is collected with a web crawler.
Data collection code:

import bs4
import pandas as pd
import requests

def head(url, timeout=10):
    """Download *url* while presenting a desktop-browser User-Agent.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get``.

    Returns:
        The response object returned by ``requests.get``.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
    }
    # Without an explicit timeout a stalled connection would block forever.
    reqs = requests.get(url, headers=header, timeout=timeout)
    return reqs
def run():
    """Scrape the world population ranking and save it as an Excel workbook.

    Pages ``rk_1.html`` through ``rk_12.html`` are downloaded.  For every page
    the ranking (elements with class ``xh``), the country name (class ``cty``)
    and the population, growth-rate and density cells are appended to parallel
    lists, which are finally written to ``世界人口数据1.xlsx``.
    """
    bank = []  # world ranking
    country = []  # country name
    num = []  # population
    index = []  # growth rate
    p = []  # population density

    for page in range(1, 13):
        url = "https://web.phb123.com/city/renkou/rk_" + str(page) + ".html"
        # Fetch through head() so the browser User-Agent header is sent.
        r = head(url)
        html = bs4.BeautifulSoup(r.text.encode("utf-8"), "html.parser")
        # Ranking
        for rank in html.find_all(class_="xh"):
            bank.append(rank.get_text('td'))
        # Country name
        for name in html.find_all(class_="cty"):
            country.append(name.text.strip())
        # Look the <td> cells up once per page; each table row is read as five
        # consecutive cells, with population, growth rate and density at
        # offsets 2, 3 and 4.
        cells = html.find_all('td')
        # At most 20 rows per page, fewer when the page is short, so a partly
        # filled page no longer raises IndexError.
        row_count = min(20, len(cells) // 5)
        for row in range(row_count):
            first = 5 * row
            num.append(cells[first + 2].text)
            index.append(cells[first + 3].text)
            p.append(cells[first + 4].text)

    data = {
        "排行榜": bank,
        "国家": country,
        "人口": num,
        "增长率": index,
        "密度": p
    }
    df = pd.DataFrame(data)
    df.to_excel("世界人口数据1.xlsx", index=False)

# Start the crawl only when this file is executed directly as a script.
if __name__ == '__main__':
    run()

Data processing:

import pandas as pd

# Load the raw Excel data
df = pd.read_excel('世界人口数据1.xlsx')

# Pre-processing, step 1: drop rows that have no country name
df = df.dropna(subset=['国家'])

# Pre-processing, step 2: keep only rows whose population, growth rate and
# density are all non-zero.
# NOTE(review): if the population/density cells are read back as text, the
# comparisons with the integer 0 would never exclude anything -- confirm the
# column dtypes.
non_zero = (df['人口'] != 0) & (df['增长率'] != '0.00%') & (df['密度'] != 0)
df = df[non_zero]

# Store the cleaned data in a new Excel file
df.to_excel('处理后的世界人口数据.xlsx', index=False)

data visualization:

import re
import openpyxl
from pyecharts.charts import Map, Bar, Pie, Page, Line, Geo, Tab
from pyecharts.charts import Line
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

from pyecharts.charts import Page, Pie

# Convert the worksheet into a list of dictionaries, one per data row
############ table data extraction ########################
data = openpyxl.load_workbook('处理后的世界人口数据.xlsx')
table = data['Sheet1']
row_Num = table.max_row
col_Num = table.max_column
# Row 1 holds the column headers; they become the dictionary keys
key = [table.cell(1, col).value for col in range(1, col_Num + 1)]
# Rows 2..max_row are the records; each cell is paired with its header
s = [
    {key[col - 1]: table.cell(row, col).value for col in range(1, col_Num + 1)}
    for row in range(2, row_Num + 1)
]

from pyecharts import options as opts

def zhuzhuangtu(data):
    """Render a horizontal bar chart of country counts per population band.

    Every record's ``人口`` value is converted to text, stripped of all
    characters other than ASCII letters and digits, and parsed with ``int``.
    The record's ``国家`` is then filed under the band that contains the
    number.  The chart is written to ``横向柱状图.html``.

    Args:
        data: Iterable of dicts providing the ``人口`` and ``国家`` keys.

    Raises:
        ValueError: If a cleaned ``人口`` value cannot be parsed as an integer.
    """
    # (exclusive upper bound, axis label) of each band in ascending order.
    # The bands are contiguous, so every population below two billion lands in
    # exactly one band and each label describes the count shown next to it.
    bands = [
        (10000000, "0-1kw"),
        (20000000, "1kw-2kw"),
        (30000000, "2kw-3kw"),
        (40000000, "3kw-4kw"),
        (50000000, "4kw-5kw"),
        (60000000, "5kw-6kw"),
        (70000000, "6kw-7kw"),
        (80000000, "7kw-8kw"),
        (90000000, "8kw-9kw"),
        (500000000, "9kw-5e"),
        (1000000000, "5e-10e"),
        (2000000000, "10e-20e"),
    ]
    bar_x_axis_data = [label for _, label in bands]
    countries_by_band = [[] for _ in bands]

    # File each country name under the first band whose upper bound exceeds
    # its population; values of two billion or more match no band.
    for record in data:
        population = int(re.sub(r"[^a-zA-Z0-9]", "", str(record["人口"])))
        for slot, (upper, _) in enumerate(bands):
            if population < upper:
                countries_by_band[slot].append(record["国家"])
                break

    bar_y_axis_data = [len(names) for names in countries_by_band]

    # Tooltip text: "<band>: <count>" followed by one country per line.
    # NOTE(review): pyecharts documents the tooltip ``formatter`` as a template
    # string or JsCode; a Python callable may not reach the rendered page --
    # confirm against the pyecharts version in use.
    def tooltip_formatter(params):
        index = params[0]["dataIndex"]
        segment = bar_x_axis_data[index]
        lines = [f"{segment}: {params[0]['value']}", *countries_by_band[index]]
        return "<br/>".join(lines) + "<br/>"

    # Horizontal bar chart
    (
        Bar()
        .add_xaxis(bar_x_axis_data)
        .add_yaxis("全球人口分布图", bar_y_axis_data, color="#CCCC66")
        .set_global_opts(
            title_opts=opts.TitleOpts(title="全球国家人口数量分布柱状图"),
            xaxis_opts=opts.AxisOpts(name="分段"),
            yaxis_opts=opts.AxisOpts(
                name="数量",
                axislabel_opts=opts.LabelOpts(formatter="{value}"),
                splitline_opts=opts.SplitLineOpts(is_show=True),
            ),
            legend_opts=opts.LegendOpts(pos_right="center", pos_top="top"),
            toolbox_opts=opts.ToolboxOpts(is_show=True),
            datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=100),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger="axis",
                axis_pointer_type="shadow",
                formatter=tooltip_formatter,
            ),
        )
        .reversal_axis()
        .set_series_opts(
            label_opts=opts.LabelOpts(
                position="right",
                formatter="{b}: {c}",
                font_size=12,
                font_weight="bold",
            )
        )
        .render("横向柱状图.html")
    )

def pie_charts(data) -> Pie:
    """Build a pie chart of population per country.

    Args:
        data: Sequence of dicts providing the ``国家`` and ``人口`` keys.  The
            ``人口`` value is converted to text, stripped of everything except
            ASCII letters and digits, and parsed with ``int``.

    Returns:
        The configured ``Pie`` chart; nothing is rendered here.
    """
    names = [row['国家'] for row in data]
    populations = []
    for row in data:
        cleaned = re.sub(r"[^a-zA-Z0-9]", "", str(row["人口"]))
        populations.append(int(cleaned))
    # One [name, value] pair per slice
    slices = [[name, count] for name, count in zip(names, populations)]

    chart = Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    chart.add("", slices, radius=['40%', '75%'])
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="各国人口分布饼状图", pos_left='center', pos_top='20px'),
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="middle", pos_left="right"),
        toolbox_opts=opts.ToolboxOpts(),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"),
        # The colour scale spans the smallest to the largest population
        visualmap_opts=opts.VisualMapOpts(
            min_=min(populations),
            max_=max(populations),
            orient="horizontal",
            pos_left="center",
            pos_bottom="bottom",
            range_color=['#e0ffff', '#006edd'],
        ),
    )
    chart.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{d}%'))
    return chart

def create_page(data):
    """Render the pie chart for *data* inside a tabbed page.

    The page is written to ``可视化分析.html``.

    Args:
        data: Records forwarded unchanged to ``pie_charts``.
    """
    chart = pie_charts(data)
    page = Tab()
    page.add(chart, "饼状图")
    page.render("可视化分析.html")

def zhexian(data):
    """Render a smoothed line chart of population density per country.

    The chart is written to ``折线图.html``.

    Args:
        data: Sequence of dicts providing the ``国家`` and ``密度`` keys.
    """
    country = [i['国家'] for i in data]
    p = [i['密度'] for i in data]

    c = (
        Line()
        .add_xaxis(country)
        .add_yaxis("人口密度", p, is_smooth=True, label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="全球国家人口密度分析"),
            xaxis_opts=opts.AxisOpts(
                name="国家",
                # Rotate the country labels so they do not overlap
                axislabel_opts=opts.LabelOpts(formatter="{value}", rotate=270),
            ),
            yaxis_opts=opts.AxisOpts(
                name="人口密度",
                axislabel_opts=opts.LabelOpts(formatter="{value}"),
                splitline_opts=opts.SplitLineOpts(is_show=True),
            ),
            legend_opts=opts.LegendOpts(pos_right="center", pos_top="top"),
            tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
            toolbox_opts=opts.ToolboxOpts(is_show=True),
        )
    )
    c.render("折线图.html")


# zhexian(data)


# Script entry point: print the parsed rows, then render the tabbed pie-chart
# page.  The bar-chart and line-chart calls are left disabled.
if __name__ == '__main__':
    print(s)
    # zhuzhuangtu(s)
    create_page(s[:15])  # only the first 15 records are charted
#     zhexian(s[:30])


Second Dataset: Data Visualization:


# 首先处理数据转化为字典格式
###########表格数据提取########################
import openpyxl

# NOTE(review): the crawler's to_excel call does not name a second sheet, so
# 'Sheet2' is presumably added to the workbook by hand -- confirm.
data = openpyxl.load_workbook('世界人口数据1.xlsx')
table = data['Sheet2']
row_Num = table.max_row
col_Num = table.max_column
# Header row -> dictionary keys
key = [table.cell(1, col).value for col in range(1, col_Num + 1)]
# One dictionary per data row, pairing every cell with its column header
s = [
    {key[col - 1]: table.cell(row, col).value for col in range(1, col_Num + 1)}
    for row in range(2, row_Num + 1)
]


import plotly.graph_objects as go
import pandas as pd

# Build a DataFrame from the extracted rows
data = pd.DataFrame(s)

# Line chart: one trace per indicator, each plotted against the year column
fig = go.Figure()
for column in ('人口', '出生率', '死亡率', '国民总收入'):
    trace = go.Scatter(x=data['年份'], y=data[column], mode='lines+markers', name=column)
    fig.add_trace(trace)

# Layout and styling: horizontal legend above the plot, transparent backgrounds
fig.update_layout(
    title='人口、出生率、死亡率和国民总收入趋势',
    xaxis={'title': '年份'},
    yaxis={'title': '值'},
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Save as an HTML file
fig.write_html('中国历史人口数据分析.html')

The third data set data visualization:


# 首先处理数据转化为字典格式
###########表格数据提取########################
import openpyxl

# NOTE(review): 'Sheet3' is not created by the crawler's to_excel call;
# presumably it is added to the workbook by hand -- confirm.
data = openpyxl.load_workbook('世界人口数据1.xlsx')
table = data['Sheet3']
row_Num = table.max_row
col_Num = table.max_column
# The first row supplies the dictionary keys
key = [table.cell(1, col).value for col in range(1, col_Num + 1)]
# Every following row becomes one {header: cell value} dictionary
s = [
    {key[col - 1]: table.cell(row, col).value for col in range(1, col_Num + 1)}
    for row in range(2, row_Num + 1)
]

import plotly.graph_objects as go

# Pull the two plotted columns out of the extracted rows
ages = [row['年龄'] for row in s]
medical_expenses = [row['医疗消费'] for row in s]

# Bar chart of medical spending (y) against age (x)
bars = go.Bar(x=ages, y=medical_expenses)
fig = go.Figure(data=bars)

# Chart layout and styling
fig.update_layout(
    title='医疗消费与年龄的关系',
    xaxis={'title': '年龄'},
    yaxis={'title': '医疗消费'},
    barmode='group',
    showlegend=False,
)

# Write the chart to an HTML file
fig.write_html('医疗水平消费.html')

If you need a dataset, you can add me on WeChat to get it

Program Design Report:

Program structure description document:
Three data sets are analyzed. Their sources are as follows:
The ranking of countries by world population was collected with a web crawler.
The remaining two data sets were downloaded from the Internet: one is a data set of China's population over recent decades,
and the other is a medical consumption level data set.
The first data set: population data set of world countries Document description:
detailed program design
(1) flow chart of program writing ideas:
(2) design code implementation:
Crawler technology: Python
Data storage: xlsx file
Data visualization: pyecharts module
1. Crawl and store the required data with Python.
Content of the target webpage:
Import the related modules:
import bs4
import pandas as pd
import requests
Data crawling process: disguise the request by sending a browser User-Agent request header:
def head(url):
    """Request *url* with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to request.

    Returns:
        The response object returned by ``requests.get``.
    """
    header = {
        # The header name must not carry leading whitespace.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    reqs = requests.get(url, headers=header)
    return reqs
Use the BeautifulSoup module to locate and process the required tags and store the data:
def run():
    """Scrape pages 1-7 of the population ranking and save them to Excel.

    Ranking (class ``xh``), country name (class ``cty``) and the population,
    growth-rate and density cells of each page are collected into parallel
    lists and written to ``世界人口数据.xlsx``.
    """
    bank = []  # world ranking
    country = []  # country
    num = []  # population base
    index = []  # growth rate
    p = []  # density

    for page in range(1, 8):
        url = "https://web.phb123.com/city/renkou/rk_" + str(page) + ".html"
        r = requests.get(url)
        html = bs4.BeautifulSoup(r.text.encode("utf-8"), "html.parser")
        # Ranking
        ranklist = html.find_all(class_="xh")
        for rank in ranklist:
            bank.append(rank.get_text('td'))
        # Country name
        namelist = html.find_all(class_="cty")
        for name in namelist:
            country.append(name.text.strip())
        # Each table row is read as five consecutive <td> cells; 20 rows are
        # read per page.
        cells = html.find_all('td')
        for row in range(0, 20):
            num.append(cells[2 + 5 * row].text)  # population base
            index.append(cells[3 + 5 * row].text)  # growth rate
            p.append(cells[4 + 5 * row].text)  # population density

    data = {
        "排行榜": bank,
        "国家": country,
        "人口": num,
        "增长率": index,
        "密度": p
    }
    df = pd.DataFrame(data)
    # DataFrame.to_json takes no ``encoding`` argument and would write JSON
    # text rather than a spreadsheet, so the table is saved with to_excel.
    df.to_excel("世界人口数据.xlsx", index=False)

2. Visualize the collected data with the pyecharts module.
Import the related modules:
import re
import xlrd
from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Pie, Page, Line, Geo
First, read the data saved in the xls file and convert it into dictionary format.
# Table data extraction
# NOTE(review): the crawler listing saves under a Chinese file name; this
# English name looks like a translation artifact -- confirm the path.
data = xlrd.open_workbook('world population data.xls')
table = data.sheet_by_name('Sheet1')
row_Num = table.nrows
col_Num = table.ncols
s = []
# Row 0 holds the column headers used as dictionary keys
key = table.row_values(0)
for j in range(1, row_Num):
    d = {}
    values = table.row_values(j)
    for x in range(col_Num):
        # Assign each cell value to the key taken from the header row
        d[key[x]] = values[x]
    # Add the row dictionary to the list
    s.append(d)
# Visual data analysis: histogram

Line chart effect display:

Pie chart effect display:

(3) Troubleshooting:
1. Difficulty: crawling data across multiple pages. Page turning was solved by observing the URL pattern of the site:

Page turning is completed by changing the page number in the URL:
for i in range(1,8):
    url = "https://web.phb123.com/city/renkou/rk_"+str(i)+".html"
    r = requests.get(url)
    html = bs4.BeautifulSoup(r.text.encode("utf-8"), "html.parser")
2. Difficulty: locating and extracting the target data from specific tags:

Filter the required content by tag and class, and add it to the lists:
# Get the ranking
ranklist = html.find_all(class_="xh")
for rank in ranklist:
    bank.append(rank.get_text('td'))
# Get the country name
namelist = html.find_all(class_="cty")
for name in namelist:
    country.append(name.text.strip())
# Get population base
for i in range(0,20):
    numlist = html.find_all('td')[2+5*i].text
    num.append(numlist)
# growth rate
for i in range(0, 20):
    indexlist = html.find_all('td')[3 + 5 * i].text
    index.append(indexlist)
# population density
for i in range(0, 20):
    plist = html.find_all('td')[4 + 5 * i].text
    p.append(plist)
3. Difficulty: writing the list data into the xls file. The lists are combined into a dictionary, which is then written to the storage file:

data = {
    "ranking list": bank,
    "Country": country,
    "Population": num,
    "Growth Rate": index,
    "Density": p
}
df = pd.DataFrame(data)
df.to_json("World Population Data.xls", encoding="utf-8")

The second data set: a data set of China's population in recent decades

Line chart effect display:

The third data set: medical consumption level data set

Histogram effect display:

Guess you like

Origin blog.csdn.net/weixin_54174102/article/details/131398345