Scraping the latest articles from 博客园 (cnblogs)

Problem encountered:

When looping over the target div blocks, other unrelated divs are mixed in. How can they be excluded?

Solution: the article entries are the odd-numbered child divs of #kb_list (div[1], div[3], ...), so stepping the loop index by 2 skips the stray divs in between:

    for i in range(1, 40, 2):
        infos = selector.xpath('//*[@id="kb_list"]/div[{}]'.format(str(i)))
        for info in infos:
            ###
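An alternative is to let the XPath expression itself do the filtering instead of stepping the index. The snippet below is only a sketch: it assumes the real entries are exactly the child divs that contain an a.kb-title link, which is consistent with the XPaths used in the source code further down but has not been verified against the live page:

    # Select only the child divs of #kb_list that contain a title link;
    # stray divs without h2/a[@class="kb-title"] are skipped automatically.
    infos = selector.xpath('//*[@id="kb_list"]/div[h2/a[@class="kb-title"]]')
    for info in infos:
        title = info.xpath('h2/a[@class="kb-title"]/text()')[0]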

Source code:

import csv
import os
from lxml import etree
import requests
import time

# Create the output CSV file (remove any previous copy first)
outPath = 'D://hotwords_data.csv'
if os.path.exists(outPath):
    os.remove(outPath)
fp = open(outPath, 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('date', 'title', 'link', 'summary'))

# Request headers so the crawler looks like a normal browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

def get_info(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # Article entries are the odd-numbered child divs of #kb_list,
    # so step the index by 2 to skip the stray divs in between
    for i in range(1, 40, 2):
        infos = selector.xpath('//*[@id="kb_list"]/div[{}]'.format(i))
        for info in infos:
            title = info.xpath('h2/a[@class="kb-title"]/text()')[0]
            print(title)
            link = info.xpath('h2/a[@class="kb-title"]/@href')[0]
            parselink = 'https:' + link  # the href is protocol-relative
            # print(parselink)
            summary = info.xpath('div[1]/text()')[0]
            # print(summary)
            date = info.xpath('div[2]/span[@class="green"]/text()')[0]
            # print(date)
            # Write one row to the CSV
            writer.writerow((date, title, parselink, summary.strip()))

# Program entry point
if __name__ == '__main__':
    urls = ['https://home.cnblogs.com/kb/page/{}/'.format(i) for i in range(1, 101)]
    for url in urls:
        print(url)
        get_info(url)
        time.sleep(2)  # pause between pages to avoid hammering the site
    fp.close()  # flush and close the CSV file
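When the script runs across all 100 pages, a slightly more defensive variant keeps one bad page or malformed entry from aborting the whole run. The sketch below is not part of the original post: the request timeout, the try/except blocks, the with-statement around the CSV file, and the class-predicate XPath are additions layered on top of the original URLs and XPaths.

import csv
import time

import requests
from lxml import etree

def crawl_all(out_path='hotwords_data.csv', pages=100):
    headers = {'User-Agent': 'Mozilla/5.0'}
    with open(out_path, 'wt', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(('date', 'title', 'link', 'summary'))
        for page in range(1, pages + 1):
            url = 'https://home.cnblogs.com/kb/page/{}/'.format(page)
            try:
                res = requests.get(url, headers=headers, timeout=10)
                res.raise_for_status()
            except requests.RequestException as exc:
                print('skip', url, exc)  # skip pages that fail to download
                continue
            selector = etree.HTML(res.text)
            for info in selector.xpath('//*[@id="kb_list"]/div[h2/a[@class="kb-title"]]'):
                try:
                    title = info.xpath('h2/a[@class="kb-title"]/text()')[0]
                    link = 'https:' + info.xpath('h2/a[@class="kb-title"]/@href')[0]
                    summary = info.xpath('div[1]/text()')[0].strip()
                    date = info.xpath('div[2]/span[@class="green"]/text()')[0]
                except IndexError:
                    continue  # entry is missing an expected field
                writer.writerow((date, title, link, summary))
            time.sleep(2)  # be polite between page requests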

Data screenshot: (image of the resulting CSV omitted)


Reposted from www.cnblogs.com/sengzhao666/p/12349286.html