一 前言
本篇文章旨在进行 python 爬虫学习实战,内容仅供个人学习使用,不得将本文内容用于商业运营和非法营业;
学习完本篇内容,读者将获得基本爬取网站的思路,python基本语法和其它第三方库的使用的整合熟练能力;
二 预备知识
前提知识,python基础语法,request基本操作,psycopg2 基本操作,BeautifulSoup 基本操作,time 基本使用;除了基本语法知识追寻者没有时间写完,其余都能在知识追寻者的专栏找到
三 网页分析
福州市天气地址 https://www.tianqi.com/fuzhou/
3.1 页面数据与建表
如上图中可以很详细的知道需要的数据 时间 ,星期,天气情况,温度,风向;以此为基准我们需要的建表语句如下(其中市区知识追寻者这边就指定了福州市,根据地址后缀可以随意切换城市,等于学会爬取福州市的天气预报,天气网站的所有市区的天气预报都一个模式)
CREATE TABLE "public"."weather" (
"id" serial8,
"city" varchar(255) COLLATE "pg_catalog"."default",
"date" timestamp(6),
"week_day" varchar(255) COLLATE "pg_catalog"."default",
"temperature" varchar(255) COLLATE "pg_catalog"."default",
"wind" varchar(255) COLLATE "pg_catalog"."default",
"weather" varchar(255) COLLATE "pg_catalog"."default",
CONSTRAINT "weather_pkey" PRIMARY KEY ("id")
)
;
ALTER TABLE "public"."weather"
OWNER TO "postgres";
COMMENT ON COLUMN "public"."weather"."id" IS '主键';
COMMENT ON COLUMN "public"."weather"."city" IS '城市';
COMMENT ON COLUMN "public"."weather"."date" IS '日期';
COMMENT ON COLUMN "public"."weather"."week_day" IS '星期';
COMMENT ON COLUMN "public"."weather"."temperature" IS '温度';
COMMENT ON COLUMN "public"."weather"."wind" IS '风向';
COMMENT ON COLUMN "public"."weather"."weather" IS '天气';
COMMENT ON TABLE "public"."weather" IS '天气预报';
3.2 html文档分析
打开开发者工具,或者F12,查看html代码,使用页面元数点击3.1中图片的所示的网页位置;
可以找到很清晰的页面结构
首先 第一个标签 ul 属性 class 是 week ;ul 标签 里面有 7 个 li 标签 ;每个 li 标签 里面有 b 标签 (存储时间)和 span 标签(存储星期);后面的标签都是类似的结构;
基本思路如下:
- 使用 BeautifulSoup 查找 ul 标签 通过属性区分
- 查找ul 底下的所有li标签
- 提取 li 标签文本或者查找li标签底下的所有 b 和 span标签 ,然后提取标签文本
四 详细步骤
4.1 公共代码部分
公共代码就是使用requests请求url并将响应转化为html文本;这边解析库使用的不够完美,读者可以使用lxml,由于知识追寻者虚拟环境安装问题麻烦就从简了;其次就是数据库的插入数据操作;
# -*- coding: utf-8 -*-
import psycopg2
import requests
from bs4 import BeautifulSoup
import time
""" 数据库插入语句 """
def insert(params):
# 获得连接
conn = psycopg2.connect(database="zszxz", user="postgres", password="", host="ip", port="5432")
# 获得游标对象,一个游标对象可以对数据库进行执行操作
cursor = conn.cursor()
# sql语句 建表
sql ="""INSERT INTO weather (city, date,week_day,temperature,wind,weather)
VALUES (%(city)s, %(date)s,%(week_day)s,%(temperature)s,%(wind)s,%(weather)s)"""
# 执行语句
cursor.execute(sql,params)
print("successfully:"+params['week_day'])
# 事物提交
conn.commit()
# 关闭数据库连接
conn.close()
# Fuzhou page on tianqi.com; swap the path suffix to scrape another city.
url = 'https://www.tianqi.com/fuzhou/'
# Browser-like User-Agent so the site serves the normal desktop page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
}
request = requests.get(url, headers=headers)
request.encoding = 'UTF-8'  # force UTF-8 so Chinese text decodes correctly
html = request.text
# Parse with the stdlib html.parser backend (no lxml dependency needed).
soup = BeautifulSoup(html, 'html.parser')
4.2 获取时间和星期
- 查找属性 class 为week 的 ul 标签
- 查找ul标签下面所有li标签
- 使用两个列表分别存储日期和星期
- 遍历li标签获得b标签内的文本和span标签内的文本
- 引入时间模块将其处理为规范的时间格式
- 将 时间,星期储存进列表,打印
# Dates and weekdays: <ul class="week"> holds seven <li>, each carrying
# <b>MM月DD日</b> (the date) and <span>星期X</span> (the weekday).
week_ul = soup.find(name='ul', attrs={'class': 'week'})
# The page shows only month/day; prefix the current year.
year = str(time.localtime().tm_year)
# Parallel lists: one entry per day, same index order.
date_list = []
week_list = []
for item in week_ul.find_all(name='li'):
    raw_date = item.find(name='b').text
    # Normalise "MM月DD日" into "YYYY-MM-DD".
    date_list.append(year + '-' + raw_date.replace('月', '-', 1).replace('日', '', 1))
    week_list.append(item.find(name='span').text)
print(date_list)
print(week_list)
输出
['2020-02-15', '2020-02-16', '2020-02-17', '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21']
['星期六', '星期日', '星期一', '星期二', '星期三', '星期四', '星期五']
4.2 获取天气情况
- 查找属性 class 为txt txt2 的 ul 标签
- 查找ul标签下面所有li标签并提取文本储存进列表
# Weather descriptions: <ul class="txt txt2"> has one <li> per day.
weather_ul = soup.find(name='ul', attrs={'class': 'txt txt2'})
# Collect each day's description text in page order.
weather_list = [day.text for day in weather_ul.find_all(name='li')]
print(weather_list)
输出
['小雨到中雨', '小雨到中雨', '多云', '晴', '晴', '晴', '阴']
4.3 获取温度
提取思路类似
# Temperatures live under <div class="zxt_shuju">: within each day's <li>,
# <b> carries the low and <span> the high.
temp_container = soup.find(name='div', attrs={'class': 'zxt_shuju'})
# One "low~high" string per day.
temperature_list = []
for day in temp_container.find_all(name='li'):
    low = day.find(name='b').text
    high = day.find(name='span').text
    temperature_list.append(low + '~' + high)
print(temperature_list)
输出
['11~20', '6~11', '5~13', '4~13', '7~17', '11~19', '13~20']
4.4 获取风向
这边的注意点是取第二个ul,原因是跟天气情况的ul的属性重合了;提取思路类似;
# Wind directions: two <ul class="txt"> elements exist on the page; the
# first matched the weather list above, so take the second ([1]).
wind_ul = soup.find_all(name='ul', attrs={'class': 'txt'})[1]
# One wind-direction string per day, in page order.
wind_list = [day.text for day in wind_ul.find_all(name='li')]
输出
['东风', '北风', '东北风', '东北风', '东北风', '东北风', '北风']
4.5 列表数据组装入库
循环遍历读取各个list,将每一天的数据组装成字典入库
# Assemble one record per day by walking the parallel lists in lockstep.
# zip stops at the shortest list, so a page that yields fewer than seven
# rows no longer raises IndexError (the original hard-coded range(7)).
for in_date, in_week, in_weather, in_temperture, in_wind in zip(
        date_list, week_list, weather_list, temperature_list, wind_list):
    # Keys match the named placeholders expected by insert().
    param = {'city': '福州市', 'date': in_date, 'week_day': in_week,
             'temperature': in_temperture, 'wind': in_wind, 'weather': in_weather}
    print(param)
    # insert(param)  # uncomment to persist the record to PostgreSQL
输出
{'city': '福州市', 'date': '2020-02-15', 'week_day': '星期六', 'temperature': '11~20', 'wind': '东风', 'weather': '小雨到中雨'}
{'city': '福州市', 'date': '2020-02-16', 'week_day': '星期日', 'temperature': '6~11', 'wind': '北风', 'weather': '小雨到中雨'}
{'city': '福州市', 'date': '2020-02-17', 'week_day': '星期一', 'temperature': '5~13', 'wind': '东北风', 'weather': '多云'}
{'city': '福州市', 'date': '2020-02-18', 'week_day': '星期二', 'temperature': '4~13', 'wind': '东北风', 'weather': '晴'}
{'city': '福州市', 'date': '2020-02-19', 'week_day': '星期三', 'temperature': '7~17', 'wind': '东北风', 'weather': '晴'}
{'city': '福州市', 'date': '2020-02-20', 'week_day': '星期四', 'temperature': '11~19', 'wind': '东北风', 'weather': '晴'}
{'city': '福州市', 'date': '2020-02-21', 'week_day': '星期五', 'temperature': '13~20', 'wind': '北风', 'weather': '阴'}
五 整体代码
整体代码如下,知识追寻者将所有的数据都提前提取到list然后通过,遍历list组装字典数据入库;
# -*- coding: utf-8 -*-
"""Scrape the 7-day Fuzhou forecast from tianqi.com and store it in PostgreSQL."""
import psycopg2
import requests
from bs4 import BeautifulSoup
import time


def insert(params):
    """Insert one day's weather record into the ``weather`` table.

    params: dict with keys ``city``, ``date``, ``week_day``, ``temperature``,
    ``wind``, ``weather`` — matching the named placeholders below.
    """
    # Connection released in ``finally`` even on error (the original
    # leaked it when execute/commit raised).
    conn = psycopg2.connect(database="zszxz", user="postgres", password="", host="ip", port="5432")
    try:
        # The ``with`` block closes the cursor automatically.
        with conn.cursor() as cursor:
            # Parameterized INSERT — psycopg2 escapes the values safely.
            sql = """INSERT INTO weather (city, date,week_day,temperature,wind,weather)
    VALUES (%(city)s, %(date)s,%(week_day)s,%(temperature)s,%(wind)s,%(weather)s)"""
            cursor.execute(sql, params)
        print("successfully:" + params['week_day'])
        # Commit so the row is persisted.
        conn.commit()
    finally:
        # Always release the connection, even when execute/commit failed.
        conn.close()


# Target city page; swap the path suffix to scrape another city.
url = 'https://www.tianqi.com/fuzhou/'
# Browser-like User-Agent so the site serves the normal desktop page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
request = requests.get(url, headers=headers)
request.encoding = 'UTF-8'  # force UTF-8 so Chinese text decodes correctly
html = request.text
soup = BeautifulSoup(html, 'html.parser')

# Dates and weekdays: <ul class="week"> -> seven <li>, <b>=date, <span>=weekday.
soup_week = soup.find(name='ul', attrs={'class': 'week'})
# The page shows only month/day; prefix the current year.
year = str(time.localtime().tm_year)
date_list = []
week_list = []
for li in soup_week.find_all(name='li'):
    raw_date = li.find(name='b').text
    # Normalise "MM月DD日" into "YYYY-MM-DD".
    date_list.append(year + '-' + raw_date.replace('月', '-', 1).replace('日', '', 1))
    week_list.append(li.find(name='span').text)

# Weather descriptions: <ul class="txt txt2">, one <li> per day.
soup_weather = soup.find(name='ul', attrs={'class': 'txt txt2'})
weather_list = [li.text for li in soup_weather.find_all(name='li')]

# Temperatures: <div class="zxt_shuju">; per <li>, <b> is the low, <span> the high.
soup_temperature = soup.find(name='div', attrs={'class': 'zxt_shuju'})
temperature_list = []
for li in soup_temperature.find_all(name='li'):
    temperature_list.append(li.find(name='b').text + '~' + li.find(name='span').text)

# Wind: two <ul class="txt"> exist; the first matched the weather list,
# so take the second ([1]).
soup_wind = soup.find_all(name='ul', attrs={'class': 'txt'})[1]
wind_list = [li.text for li in soup_wind.find_all(name='li')]

# Assemble one record per day; zip stops at the shortest list, so a page
# with fewer than seven rows no longer raises IndexError (was range(7)).
for in_date, in_week, in_weather, in_temperture, in_wind in zip(
        date_list, week_list, weather_list, temperature_list, wind_list):
    param = {'city': '福州市', 'date': in_date, 'week_day': in_week,
             'temperature': in_temperture, 'wind': in_wind, 'weather': in_weather}
    insert(param)