Python Crawler: Scraping Autohome News

#coding:utf-8
#----------------------------------------------------------------------------------------------------------
#                                         Purpose: scrape news from Autohome (autohome.com.cn)
#----------------------------------------------------------------------------------------------------------
# pip3 install requests
# pip3 install beautifulsoup4

import requests
from bs4 import BeautifulSoup

# Fetch a single news headline
# response = requests.get('http://www.autohome.com.cn/news/')
# response.encoding = 'gbk'
# soup = BeautifulSoup(response.text, 'html.parser')  # parse into a soup object
# tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3')
# print(h3)

# Find all news items, including title, summary, url, and image

response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'       # mind the encoding: this page is served as GBK
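# a sketch, not in the original post: instead of hardcoding 'gbk', you can let
# requests guess the charset from the response body:
# response.encoding = response.apparent_encoding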
# print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')  # parse the HTML into a soup object
li_list = soup.find(id='auto-channel-lazyload-article').find_all(name='li')
i = 1
for li in li_list:
    title = li.find('h3')           # find the news title (by tag name)
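    # some <li> entries (likely ads or separators) carry no <h3>; skip them below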
    if not title:
        continue
    # print(title.text)

    summary = li.find('p').text     # find the news summary (by tag name)
    # print(summary)

    # li.find('a').attrs returns a dict of the tag's attributes
    # li.find('a').attrs['href'] is equivalent to the line below
    url = li.find('a').get('href')      # find the headline's link url (by attribute)
    # print(url)
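    # illustrative only (hypothetical values): attrs on such an <a> might look like
    #   {'href': '//www.autohome.com.cn/news/...html'}
    # attrs['href'] and .get('href') return the same string; .get() simply
    # returns None instead of raising KeyError when the attribute is absent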


    img = li.find('img').get('src')     # find the image; what we get is also a url
    # print(img)

    # ------------------------- to save the image, send another request and write it to disk -------------------------

    img = 'https:' + img    # complete the protocol-relative url
    # print(img)
    # print(title.text, url, summary, img)


    res = requests.get(img)             # send another request, this time for the image itself
    file_name = "%s.jpg" % (i,)         # name the images 1.jpg, 2.jpg, 3.jpg, ...
    i += 1
    # print(file_name)

    with open(file_name, 'wb') as f:    # save the image to the current directory
        f.write(res.content)            # note: write the binary body [res.content], not .text
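
The 'https:' + img concatenation works because the image src values on this page are protocol-relative (they start with //). A more general sketch, using the standard library's urllib.parse.urljoin, also handles absolute and site-relative paths; the sample src values below are made up for illustration:

from urllib.parse import urljoin

base = 'http://www.autohome.com.cn/news/'     # the page scraped above
samples = ('//www3.autoimg.cn/sample.jpg',    # protocol-relative (what this page uses)
           'http://img.example.com/a.jpg',    # already absolute: returned unchanged
           '/newsdfs/sample.jpg')             # site-relative: joined onto the host
for src in samples:
    print(urljoin(base, src))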
Reposted from www.cnblogs.com/heimu24/p/9198958.html