Scraping News Titles and Content from Sina, NetEase, Toutiao, and UC

First, a note: file names on Windows cannot contain characters such as : ? | " * < > \, so a headline has to be cleaned up before it can be used as a file name. All of the code below scrapes the society-news channel of the corresponding site.
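Since every downloader below repeats the same character stripping before saving a file, the cleanup could also be factored into a small helper. This is only a sketch; the name sanitize_title is mine, and the original scripts do the stripping inline inside each download() function:
def sanitize_title(title):
    # Drop the characters that Windows forbids in file names.
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    return title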

Sina:
Sina's news is fairly easy to scrape. The pages are not loaded asynchronously with JS, so the HTML can be fetched and parsed directly; here I parse it with BeautifulSoup.
from bs4 import BeautifulSoup
from urllib import request
def download(title, url,m):
    req = request.Request(url)
    response = request.urlopen(req)
    response = response.read().decode('utf-8')
    soup = BeautifulSoup(response,'lxml')
    tag = soup.find('div',class_='article')
    if tag is None:
        return 0
    #print(type(tag))
    #print(tag.get_text())
    # strip the characters that Windows forbids in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    #print(tag.get_text())
    filename = r'D:\code\python\spider_news\sina_news\sociaty\\' +title+'.txt'
    with open(filename,'w',encoding='utf8') as file_object:
        file_object.write('           ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Scraping news item', m, ':', title)
    return 0
if __name__ == '__main__':
    target_url = 'http://news.sina.com.cn/society/'
    req = request.Request(target_url)
    response = request.urlopen(req)
    response = response.read().decode('utf8')
    #print(response)
    soup = BeautifulSoup(response,'lxml')
    #print(soup.prettify())
    #file = open('d:\\test2.txt','w',encoding='utf8')
    #file.write(soup.prettify())
    y = 0
    for tag in soup.find_all('div',class_='news-item'):
        if tag.a is not None:
            if len(tag.a.string) > 8:
                 #print(tag.a.string,tag.a.get('href'))
                 temp = tag.a.string
                 y += 1
                 download(temp,tag.a.get('href'),y)


NetEase:

NetEase loads its headlines and article links asynchronously with JS, so simply downloading the page source yields neither titles nor content. The data we need can be found among the JS requests in the browser's Network panel. Here I use regular expressions to extract the titles and their links, and BeautifulSoup to fetch the content of each article.
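As a rough illustration of the idea: each cm_shehui*.js response is a data_callback(...) wrapper around a list of items, and since every item carries both a "title" and a "tlink" field, the two can also be captured together in one pass so they stay paired. This is only a sketch and assumes "title" appears before "tlink" inside each item, which is how the feed looked when I inspected it:
import re
from urllib import request

js_url = 'http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback'
res = request.urlopen(js_url).read().decode('gbk')
# One regex, two groups: headline and article link are captured together per item.
for title, link in re.findall(r'"title":"(.*?)".*?"tlink":"(.*?)"', res, re.S):
    print(title, link)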
import re
from urllib import request
from bs4 import BeautifulSoup

def download(title, url):
    req = request.urlopen(url)
    res = req.read()
    soup = BeautifulSoup(res,'lxml')
    #print(soup.prettify())
    tag = soup.find('div',class_='post_text')
    if tag is None:  # skip pages that have no post_text body
        return 0
    #print(tag.get_text())
    # strip the characters that Windows forbids in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    #print(title)
    file_name = r'D:\code\python\spider_news\NetEase_news\sociaty\\' + title + '.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(tag.get_text())
if __name__ == '__main__':
    urls = ['http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback']
    for url in urls:
    #url = 'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback'
        req = request.urlopen(url)
        res = req.read().decode('gbk')
        #print(res)
        pat1 = r'"title":"(.*?)",'
        pat2 = r'"tlink":"(.*?)",'
        # findall already returns a list of matches, so no copy loop is needed
        news_title = re.findall(pat1, res)
        news_url = re.findall(pat2, res)
        for i in range(0,len(news_url)):
            #print(news_title[i],news_body[i])
            download(news_title[i],news_url[i])
            print('Scraping news item ' + str(i) + ':', news_title[i])

Toutiao:

Toutiao is different again from the previous two sites. Its headlines and links are packaged in a JSON feed, but the feed URL's parameters are produced by a time-dependent JS routine, so we have to reproduce those parameters ourselves; otherwise we cannot work out the concrete URL of the JSON feed. I learned how to build the URL from http://www.jianshu.com/p/5a93673ce1c0, which also solved the problem of repeatedly downloading the same news items. The site has its own anti-scraping measures, so a cookie must be added to the request. For the article content, I used a regular expression to extract the Chinese text.
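To make the Chinese-extraction step concrete, here is the [\u4e00-\u9fa5]+ pattern applied to a made-up fragment (the string below is only an illustration, not real Toutiao markup):
import re

# Stand-in for the "content:" value scraped from an article page.
sample = '<p>今天上午,一名男子在地铁站</p><p>被民警带走调查</p>'
# \u4e00-\u9fa5 covers the common Chinese characters, so tags and punctuation
# are dropped and only runs of Chinese text remain.
print(re.findall(r'[\u4e00-\u9fa5]+', sample))
# -> ['今天上午', '一名男子在地铁站', '被民警带走调查']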
#coding:utf-8
from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup
def get_url(max_behot_time, AS, CP):
    url = 'https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1' \
          '&max_behot_time={0}' \
          '&max_behot_time_tmp={0}' \
          '&tadrequire=true' \
          '&as={1}' \
          '&cp={2}'.format(max_behot_time, AS, CP)
    return url

def get_ASCP():
    t = int(math.floor(time.time()))
    e = hex(t).upper()[2:]
    m = hashlib.md5()
    m.update(str(t).encode(encoding='utf-8'))
    i = m.hexdigest().upper()

    if len(e) != 8:
        AS = '479BB4B7254C150'
        CP = '7E0AC8874BB0985'
        return AS,CP
    n = i[0:5]
    a = i[-5:]
    s = ''
    r = ''
    for o in range(5):
        s += n[o] + e[o]
        r += e[o + 3] + a[o]

    AS = 'AL'+ s + e[-3:]
    CP = e[0:3] + r + 'E1'
   # print("AS:"+ AS,"CP:" + CP)
    return AS,CP

def download(title, news_url):
   # print('crawling')
    req = request.urlopen(news_url)
    if req.getcode() != 200:
        return 0

    res = req.read().decode('utf-8')
    #print(res)
    pat1 = r'content:(.*?),'
    pat2 = re.compile('[\u4e00-\u9fa5]+')
    result1 = re.findall(pat1,res)
    #print(len(result1))
    if len(result1) == 0:
        return 0
    print(result1)
    result2 = re.findall(pat2,str(result1))
    result3 = []
    for i in result2:
        if i not in result3:
            result3.append(i)
    #print(result2)
    # strip the characters that Windows forbids in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    with open(r'D:\code\python\spider_news\Toutiao_news\society\\' + title + '.txt','w',encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('Article URL: ')
        file_object.write(news_url)
        file_object.write('\n')
        for i in result3:
            #print(i)
            file_object.write(i)
            file_object.write('\n')
       # file_object.write(tag.get_text())
    #print('crawling')


def get_item(url):
    #time.sleep(5)
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url,cookies = cookies)
    wbdata2 = json.loads(wbdata.text)
    data = wbdata2['data']
    for news in data:
        title = news['title']
        news_url = news['source_url']
        news_url = 'https://www.toutiao.com' + news_url
        print(title, news_url)
        if 'ad_label' in news:
            print(news['ad_label'])
            continue
        download(title,news_url)
    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
   # print("next_max_behot_time:{0}".format(next_max_behot_time))
    return next_max_behot_time

if __name__ == '__main__':

    refresh = 50
    for x in range(0,refresh+1):

        print('Round {0}:'.format(x))
        if x == 0:
            max_behot_time = 0
        else:
            max_behot_time = next_max_behot_time
            #print(next_max_behot_time)
        AS,CP = get_ASCP()
        url = get_url(max_behot_time,AS,CP)
        next_max_behot_time = get_item(url)



UC:
UC is much like Sina: there is no elaborate anti-scraping to deal with, so the page can be fetched and parsed directly. (A small sketch of sending request headers, should they ever become necessary, follows the code below.)
#coding:utf-8
from bs4 import BeautifulSoup
from urllib import request

def download(title,url):
    req = request.Request(url)
    response = request.urlopen(req)
    response = response.read().decode('utf-8')
    soup = BeautifulSoup(response,'lxml')
    tag = soup.find('div',class_='sm-article-content')
    if tag is None:
        return 0
    # strip the characters that Windows forbids in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    with open(r'D:\code\python\spider_news\UC_news\society\\' + title + '.txt','w',encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('Article URL: ')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(tag.get_text())
    #print('crawling')

if __name__ == '__main__':
    for i in range(0,7):

        url = 'https://news.uc.cn/c_shehui/'
    #    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
    #               "cookie":"sn=3957284397500558579; _uc_pramas=%7B%22fr%22%3A%22pc%22%7D"}
    #    res = request.Request(url,headers = headers)
        res = request.urlopen(url)
        req = res.read().decode('utf-8')
        soup = BeautifulSoup(req,'lxml')
        #print(soup.prettify())
        tag = soup.find_all('div',class_ = 'txt-area-title')
        #print(tag.name)
        for x in tag:
            news_url = 'https://news.uc.cn' + x.a.get('href')
            print(x.a.string,news_url)
            download(x.a.string,news_url)
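
If UC ever does start rejecting bare requests, the commented-out headers above can be attached through urllib's Request object. A minimal sketch, assuming a User-Agent (and cookie, if needed) like the one in the comments:
from urllib import request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'}
req = request.Request('https://news.uc.cn/c_shehui/', headers=headers)
html = request.urlopen(req).read().decode('utf-8')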



