环境配置
安装 Python 3.7.3,以及 beautifulsoup4、requests 两个模块
请求打印html页面内容
# Fetch the East Money homepage and decode it with the detected charset.
import requests

url = 'http://www.eastmoney.com/'
# timeout prevents the request from hanging forever on a dead connection
req = requests.get(url, timeout=10)
# the page does not declare its charset reliably; use the detected one
req.encoding = req.apparent_encoding
html = req.text
执行print(html)就能打印网页内容
网页解析
解析网页源码使用 BeautifulSoup 模块。以提取东方财富网首页的消息为例:右键点击对应的元素,选择“检查”,就可以看到网页的源代码了。
我们发现对应的元素都被<div class="nlist">选定,相应的我们可以把相应的代码筛选出来。
from bs4 import BeautifulSoup

bf = BeautifulSoup(html, 'lxml')
# find_all returns a list of matches; take the first <div class="nlist">
# so the later `nlist.find_all('a')` call operates on a single tag.
# (The original snippet named this `nmlist` — a typo — and left it as a
# ResultSet, which has no find_all method.)
nlist = bf.find_all(class_='nlist')[0]
可以发现消息的标题和链接被<a>标签限定出来,用find_all方法获取:
# Each news item is an <a> tag: print its title text and its URL.
a = nlist.find_all('a')
for each in a:
    # original snippet forgot to indent this line — SyntaxError as written
    print(each.string, each.get('href'))
存储CSV
import csv

# newline='' is required by the csv module (otherwise blank rows appear on
# Windows); an explicit encoding avoids platform-dependent defaults, and
# `with` guarantees the file is closed even if an error occurs.
with open('test.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
完整代码如下
# -*- coding: utf-8 -*-
# @Time : 2019/4/8 17:40
# @Author : linjingtu
# @Email : [email protected]
# @File : test.py
# @Software: PyCharm
"""Scrape the news headlines (title + link) from the East Money homepage
and write them to a CSV file."""
import csv

import requests
from bs4 import BeautifulSoup

url = 'http://www.eastmoney.com/'
# timeout prevents the script from hanging forever on a dead connection
req = requests.get(url, timeout=10)
# decode with the detected charset instead of requests' guess
req.encoding = req.apparent_encoding
html = req.text

# 'lxml' is BeautifulSoup's parser backend; the lxml package must be
# installed, but `import lxml` itself is unnecessary.
bf = BeautifulSoup(html, 'lxml')
matches = bf.find_all(class_='nlist')
if not matches:
    # Guard instead of a bare [0] IndexError if the page layout changes.
    raise SystemExit('no <div class="nlist"> found - page layout may have changed')
nlist = matches[0]

# newline='' is required by the csv module (prevents blank rows on Windows);
# `with` guarantees the file is closed even if writing fails.
with open('F:\\test.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for each in nlist.find_all('a'):
        # each.string is None when the tag contains nested markup;
        # get_text() is robust for both cases.
        writer.writerow([each.get_text(strip=True), each.get('href')])
#print(nlist)
深圳程序员交流群550846167