Crawling Sina News pages with Python

Crawling the site's three levels of navigation in detail: the key is simply getting the right DOM nodes.
I'm still new to Python, so the code is a bit rough. Code first, then the page analysis.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Lilu'

import os
import re
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from urllib import request
import sys
import itertools
import mysql.connector
from datetime import datetime
# Import my own JournalismText module: it parses the article body, downloads the images, and filters class, style, id and other unwanted attributes out of the body
sys.path.append(r'E:\Python\cocn\venv\Demo')
import JournalismText

url = 'http://news.sina.com.cn/world/'
header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,'
                      ' like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
target_req = request.Request(url=url, headers=header)

target_response = request.urlopen(target_req, timeout=5)
# Read out the page's HTML source
target_html = target_response.read().decode('utf-8', 'ignore')
# Parse the fetched target_html with BeautifulSoup
soups = BeautifulSoup(target_html, 'lxml')
# After parsing, nodes can be grabbed with selectors
data = soups.select('div[class="wrap"]', limit=1)
soup = BeautifulSoup(str(data), 'lxml')
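# (note: select() returns a list here, so str(data) is re-parsed into a fresh soup to make it navigable again)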
begin_flag = False
num = 0
# Now process the crawled content
for child in soup.div.children:
    # Skip bare newline nodes
    if child != '\n':
        begin_flag = True
        # Crawl the link and download its content
        if begin_flag and child is not None:
            if num == 0:
                num += 1
                continue
            # Get the first-level section name
            ch_name = child.string
            # Get the first-level section URL
            ch_url = child.get('href')
            print(ch_url, '````````````````````````````````````')
            dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            value = [str(ch_name), ch_url, str(dt)]
            # Follow the URL just obtained into the second-level section
            download_req2 = request.Request(url=ch_url, headers=header)

            download_response2 = request.urlopen(download_req2)
            # Read the second-level page content
            download_html2 = download_response2.read().decode('utf-8', 'ignore')
            # Parse it
            soups1 = BeautifulSoup(download_html2, 'lxml')
            # Grab the node with a select selector
            data1 = soups1.select('div[class="links"]', limit=1)
            print(data1)
            soup1 = BeautifulSoup(str(data1), 'lxml')
            begin_flag1 = False
            for child1 in soup1.div.children:
                # Skip bare newline nodes
                if child1 != '\n':
                    begin_flag1 = True
                    # Crawl the link and download its content
                    if begin_flag1 and child1 is not None:
                        # Get the second-level section name
                        ch_name1 = child1.string
                        # Get the second-level section URL
                        ch_url1 = child1.get('href')
                        dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        value = [str(ch_name1), ch_url1, str(ch_name), str(dt)]
                        for i in value:
                            print(type(i))
                        # Get the news detail list
                        header = {
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,'
                                          ' like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
                        # Request the detail-list URL that was just crawled
                        download_req3 = request.Request(url=ch_url1, headers=header)

                        download_response3 = request.urlopen(download_req3)
                        # Read the detail-list page HTML
                        download_html3 = download_response3.read().decode('gbk', 'ignore')
                        # Parse it into a selectable object
                        soups = BeautifulSoup(download_html3, 'lxml')
                        # Grab the block that holds the article URLs
                        da = soups.find_all('div', class_='listBlk')

                        soup = BeautifulSoup(str(da), 'lxml')

                        begin_flag2 = False
                        # Work through the list items and pull out each article URL
                        for child2 in soup.ul.children:
                            # Skip bare newline nodes
                            if child2 != '\n':
                                begin_flag2 = True
                                # Crawl the link and download its content
                                if begin_flag2 and child2.a is not None:
                                    child_name = child2.a.string
                                    child_url = child2.a.get('href')
                                    chid_time = child2.span.string
                                    print(child_name, child_url, chid_time)
                                    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                    value = [str(child_name), child_url, str(chid_time), str(ch_name1), str(dt)]
                                    # Get the article detail
                                    # Pass the URL to the JournalismText module to parse the body
                                    lis = JournalismText.getJournalismText(child_url, child_name)

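The script imports mysql.connector and keeps assembling value rows, but the insert itself never appears above. Below is a minimal sketch of how one of the deepest-level rows (name, URL, article time, section, crawl time) might be written to MySQL; the database, table, column names and credentials are placeholders made up for illustration, not part of the original code:

# Sketch only: persist one crawled value row with mysql.connector.
# Database/table/column names and credentials below are placeholders.
import mysql.connector

def save_news(value):
    conn = mysql.connector.connect(
        host='localhost', user='root', password='password', database='spider')
    cursor = conn.cursor()
    sql = ('INSERT INTO news (title, url, pub_time, section, created) '
           'VALUES (%s, %s, %s, %s, %s)')
    cursor.execute(sql, tuple(value))
    conn.commit()
    cursor.close()
    conn.close()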
The JournalismText module:

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

__author__ = 'Lilu'

import os
import re
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from urllib import request
import pandas
import mysql.connector
from datetime import datetime
import urllib.request

def getJournalismText(url, child_name):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,'
                      ' like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
    target_req = request.Request(url=url, headers=header)

    target_response = request.urlopen(target_req, timeout=5)
    target_html = target_response.read().decode('utf-8', 'ignore')

    # Parse the fetched target_html
    datas = BeautifulSoup(target_html, 'html.parser')
    # Get the div that wraps the article body
    data = datas.select('div[class="article"]', limit=1)
    # Get the tags that contain the images
    dataimg = datas.select('div[class="img_wrapper"]')

    # Use a regex to pull out the image URLs, then download each image locally
    reg = r'(http:[^\s]*?(jpg|png|gif))'
    imgre = re.compile(reg)
    imglist = imgre.findall(str(dataimg))
    l = []
    for img, t in imglist:
        # Use the last path segment as the local file name
        name = str(img).split('/').pop()
        path = 'D:/Workspaces/MyEclipseProfessional2014/imageStatic/img/%s' % name
        l.append(path)
        # Download the image and write it to disk
        buf = urllib.request.urlopen(img).read()
        with open(path, 'wb') as f:
            f.write(buf)

    # Point the image paths in the article at the local copies
    for i in range(len(dataimg)):
        img = dataimg[i].select('img')[0]
        if i < len(l):
            img.attrs['src'] = l[i]

    # Strip the class attributes from the body
    text = str(data)
    re_class = re.compile('class=(\"[^><]*\")')
    text = re_class.sub("", text)
    # Get the title
    title = datas.select('h1')[1].text.strip()
    # Get the publication time
    nt = datas.select('.date')[0].contents[0].strip()
    newsAr = getnewsArticle(datas.select('p')[:-4])
    newsArticle = newsAr[-1]
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    value = [title, "".join(text), nt, newsArticle, str(child_name), str(dt)]
    for i in value:
        print(i)
    return value


def getnewsArticle(news):
    newsArticle = []
    for p in news:
        newsArticle.append(p.text.strip())
    return newsArticle

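For reference, the module can also be exercised on its own; the URL below is only a placeholder for a Sina article page, and the second argument is the section name that ends up in the returned row:

# Standalone usage sketch; the URL is a placeholder, not a real article.
import JournalismText

row = JournalismText.getJournalismText(
    'http://news.sina.com.cn/w/2018-03-08/doc-xxxxxxx.shtml', 'World news')
# row = [title, filtered body html, publish time, last paragraph, section name, crawl time]
print(row[0], row[2])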
Plenty of solid, copy-ready material here.
Walkthrough (a condensed sketch of the whole traversal follows this list):
1. Step 1: grab the first-level div and iterate its a tags to get each label and URL.
[Screenshot: first-level navigation]
2. Step 2: follow each URL from step 1 to its second-level menu and extract the same fields.
[Screenshot: second-level navigation]
3. Step 3: follow each URL from step 2 to its article-list page and extract the same fields.
[Screenshot: third-level navigation]
4. Step 4: follow each URL from step 3 to the article body of every list item; this is the bottom level.
[Screenshot: fourth-level navigation]
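Condensed, the three loops are the same fetch-and-select pattern repeated once per level. A rough sketch, with the selectors simplified from the full script above (the page structure may well have changed since this was written):

# Rough outline of the three-level traversal:
# level 1: div.wrap section links, level 2: div.links sub-sections,
# level 3: div.listBlk article list, then JournalismText parses each body.
from urllib import request
from bs4 import BeautifulSoup

HEADER = {'User-Agent': 'Mozilla/5.0'}

def fetch(url, encoding='utf-8'):
    req = request.Request(url=url, headers=HEADER)
    html = request.urlopen(req, timeout=5).read().decode(encoding, 'ignore')
    return BeautifulSoup(html, 'lxml')

for a1 in fetch('http://news.sina.com.cn/world/').select('div.wrap a[href]'):
    for a2 in fetch(a1.get('href')).select('div.links a[href]'):
        # the article list pages are GBK encoded
        for li in fetch(a2.get('href'), 'gbk').select('div.listBlk ul li'):
            if li.a is not None and li.span is not None:
                print(li.a.string, li.a.get('href'), li.span.string)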

Takeaway: the whole job is locating the relevant nodes and then parsing them with the Python libraries; you really have to be familiar with the data types involved (a small example follows).
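For example, the children iterated over above are a mix of NavigableString nodes (the bare '\n' strings that get filtered out) and Tag nodes; checking the type explicitly makes that distinction clear:

# Small illustration of the node types BeautifulSoup hands back.
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

soup = BeautifulSoup('<div><a href="/a">one</a>\n<a href="/b">two</a></div>', 'lxml')
for child in soup.div.children:
    if isinstance(child, Tag):                 # the <a> elements
        print(child.string, child.get('href'))
    elif isinstance(child, NavigableString):   # the bare '\n' between them
        pass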

Reposted from blog.csdn.net/weixin_41558061/article/details/79482098