Scraping the latest Economist blogs, news, etc. with XPath attributes such as @class and @href

First, a screenshot of the result (image from the original post not reproduced here):
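
Before diving into the full spider, here is a minimal, self-contained sketch of the XPath pattern the title refers to: select nodes by their class attribute and pull out the href attribute values with lxml. The HTML fragment is illustrative; it only mirrors the teaser__link selector used below.

from lxml import etree

# A tiny HTML fragment shaped like the teaser list on the site (illustrative only)
html = '''
<div>
  <a class="teaser__link" href="/blogs/graphic-detail/2018/05/example-one">One</a>
  <a class="teaser__link" href="/news/finance/2018/05/example-two">Two</a>
</div>
'''
selector = etree.HTML(html)
# [@class="..."] filters the <a> nodes, the trailing /@href returns the attribute values
hrefs = selector.xpath('//a[@class="teaser__link"]/@href')
print(hrefs)  # ['/blogs/graphic-detail/2018/05/example-one', '/news/finance/2018/05/example-two']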

1. First, scrape the links to the latest articles from the latest-updates list page

import json
import re
import urllib.request
from lxml import etree
import random
import requests
import time
import os
paperRecords = {}

# Load the record of previously scraped articles; start fresh if the file is missing or unreadable
try:
    with open('spiRecords.json','r') as fel:
        paperRecords = json.load(fel)
except (FileNotFoundError, json.JSONDecodeError):
    pass

try:
    lastLst = paperRecords['lastLst']
except Exception as err:
    lastLst = []
dateStr = '2018-05-02'
#dateStr = time.strftime('%Y-%m-%d',time.localtime(time.time()))
toYear,toMonth,toDay = list(map(int,dateStr.split('-')))
strY = 'a' + str(toYear)
strM = 'a' + str(toMonth)
strD = 'a' + str(toDay)
# Make sure the nested year -> month -> day structure exists in the records
if not paperRecords.get(strY):
    paperRecords[strY] = {}
if not paperRecords[strY].get(strM):
    paperRecords[strY][strM] = {}
if not paperRecords[strY][strM].get(strD):
    paperRecords[strY][strM][strD] = []


headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

def get_ahrefs():
    url = 'https://economist.com/latest-updates'
    #url = 'https://economist.com'
    req = urllib.request.Request(url=url,headers=headers, method='GET')
    response = urllib.request.urlopen(req)
    html = response.read()
    selector = etree.HTML(html.decode('utf-8'))
    aa='//a[@class="teaser__link"]/@href'
    ahref = selector.xpath(aa)
    arr = []
    for a in ahref:
        arr.append(a)
        print(a)
    return arr
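
One thing to note: the hrefs returned above are site-relative paths, and the main block later prepends the domain with plain string concatenation. If absolute links ever appear in the list, urllib.parse.urljoin handles both cases; a small optional sketch, not part of the original script:

from urllib.parse import urljoin

base = 'https://economist.com'
for link in ['/blogs/example-post', 'https://economist.com/news/example-post']:
    # urljoin resolves relative paths against base and leaves absolute URLs unchanged
    print(urljoin(base, link))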

2. Then walk the link list and scrape each article's headline, description, and body text

def get_articles(url):
    req = urllib.request.Request(url=url,headers=headers, method='GET')
    try:
        response = urllib.request.urlopen(req)
        html = response.read()
        pout = []
        hout = []
        selector = etree.HTML(html.decode('utf-8'))
        headline = '//h1[@class="flytitle-and-title__body"]'
        head = selector.xpath(headline)
        for item in head:
            a = item.xpath('span[1]/text()')[0]
            b = item.xpath('span[2]/text()')[0]
            hout.append(a)
            hout.append(b)
        des = '//p[@class="blog-post__rubric"]/text()'
        desa = selector.xpath(des)
        for de in desa:
            hout.append(de)
        print(hout)
        ps = '//div[@class="blog-post__text"]'
        p=selector.xpath(ps)
        for item in p:
            pt = item.xpath('p/text()')
            for po in pt:
               pout.append(po)
        print(pout)
        result = {'head':hout,'ps':pout}
        return result
    except Exception as err:
        print(err,url)
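
Since get_articles returns None whenever the request or the parsing throws, callers should check the result before indexing into it. A minimal usage sketch (the URL here is hypothetical):

url = 'https://economist.com/blogs/graphic-detail/2018/05/some-article'  # hypothetical link
article = get_articles(url)
if article:
    # 'head' holds the flytitle, title and rubric; 'ps' holds the body paragraphs
    print(article['head'])
    print(len(article['ps']), 'paragraphs')
else:
    print('fetch or parse failed for', url)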

3. Run it:

if __name__ == '__main__':
    linkArr = get_ahrefs()
    time.sleep(20)
    tmpLast = []
    toDayDir = './mds/' + dateStr +'/papers/'
    if not os.path.exists(toDayDir):
        os.makedirs(toDayDir)
    for item in linkArr:
        print(item)
        if item not in lastLst:
            tmpLast.append(item)
            url = 'https://economist.com' + item
            article = get_articles(url)
            try:
                paperRecords[strY][strM][strD].append([item,article['head'][1]])
                paperName = '_'.join(article['head'][1].split(' '))
                saveMd = toDayDir + paperName+'.md'
                result = article['head']+article['ps']
                output = '\n\n'.join(result)
                with open(saveMd,'w') as fw:
                    fw.write(output)
                time.sleep(60)
            except Exception as err:
                print(err)

    paperRecords['lastLst'] = tmpLast
    with open('spiRecords.json','w') as fwp:
        json.dump(paperRecords,fwp)
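
For reference, this is the shape that spiRecords.json ends up with: one 'a'-prefixed key per year, month and day (matching strY/strM/strD above), each day holding [link, title] pairs, plus lastLst with the links fetched on the previous run. The values below are illustrative, not real data:

import json

# Illustrative only: what spiRecords.json might look like after one run on 2018-05-02
example = {
    'lastLst': ['/blogs/graphic-detail/2018/05/daily-chart'],  # links seen on the last run
    'a2018': {                                                 # strY = 'a' + year
        'a5': {                                                # strM = 'a' + month
            'a2': [                                            # strD = 'a' + day
                ['/blogs/graphic-detail/2018/05/daily-chart', 'A hypothetical title'],
            ],
        },
    },
}
print(json.dumps(example, indent=2))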

Reposted from blog.csdn.net/lockey23/article/details/80202143