Python3 爬虫：爬取 CSDN 博客内容

#!/usr/bin/python3
# _*_ coding:UTF-8 _*_


import os

import requests
from bs4 import BeautifulSoup


class Downloader(object):
    """Scrape article links from a blog index page and fetch article text.

    Typical use: call ``get_download_url()`` once to fill ``urls``, then call
    ``get_contents(url)`` for each collected URL.
    """

    # Seconds to wait on the server before giving up. ``requests`` applies no
    # timeout by default, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Blog index page whose article list is scanned for links.
        self.server = 'https://blog.csdn.net/zhangyun75'
        # Unique article URLs, in the order they were discovered.
        self.urls = []

    def _add_links(self, hrefs):
        """Append every new, non-empty href from *hrefs* to ``self.urls``.

        ``None`` (an ``<a>`` tag without ``href``) and empty strings are
        skipped because they cannot be fetched later. First-seen order is
        preserved.
        """
        # A set gives O(1) duplicate checks instead of rescanning the list.
        seen = set(self.urls)
        for link in hrefs:
            if not link or link in seen:
                continue
            seen.add(link)
            self.urls.append(link)

    def get_download_url(self):
        """Collect the unique links found in the index page's article list.

        Fetches ``self.server``, locates the first ``div.article-list`` and
        appends the ``href`` of each ``<a>`` inside it to ``self.urls``.

        Raises:
            IndexError: if the page contains no ``div.article-list`` (same
                exception type as before, now with a descriptive message).
        """
        req = requests.get(url=self.server, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(req.text, "lxml")
        container = page.find('div', class_='article-list')
        if container is None:
            raise IndexError(
                "no <div class='article-list'> found at %s" % self.server)
        # Search the container directly; re-parsing its HTML is unnecessary.
        self._add_links(a.get('href') for a in container.find_all('a'))

    def get_contents(self, target):
        """Download one article and return ``(title, text)``.

        Args:
            target: URL of the article page.

        Returns:
            A tuple of the text of the first ``h1.title-article`` and the text
            of the first ``div.htmledit_views`` with newlines removed.

        Raises:
            IndexError: if either element is missing from the page (same
                exception type as before, now with a descriptive message).
        """
        req = requests.get(url=target, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(req.text, "lxml")
        heading = page.find('h1', class_='title-article')
        if heading is None:
            raise IndexError(
                "no <h1 class='title-article'> found at %s" % target)
        body = page.find('div', class_='htmledit_views')
        if body is None:
            raise IndexError(
                "no <div class='htmledit_views'> found at %s" % target)
        return heading.text, body.text.replace('\n', '')


# Driver: collect the article URLs, then save each article's text to
# ./downfile/file<N>.txt while printing the completion percentage.
dl = Downloader()
dl.get_download_url()
print('start downloading:')
# open() does not create missing directories, so make sure the target exists.
os.makedirs("./downfile", exist_ok=True)
total = len(dl.urls)
for i, url in enumerate(dl.urls):
    # Only the article body is written to disk; the title is not used.
    _title, text = dl.get_contents(url)
    with open("./downfile/file"+str(i)+".txt", 'w', encoding='utf-8') as f:
        f.write(text+'\n')
    # i is zero-based, so i + 1 articles are finished at this point.
    print("已下载:%.3f%%" % (100 * float((i + 1)/total)))
print('finish downloading:')

猜你喜欢

转载自blog.csdn.net/zhangyun75/article/details/80818911