版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/csm201314/article/details/82935282
缘由
闲得无聊,恰巧最近接触了一下py的bs4爬虫,以及实验室一些工作需要用py写个简单的脚本,今晚写完博客之后,突发奇想顺手写个抓取CSDN网页实现刷阅读量的脚本,记录一下(不过最好慎用,不知道会不会被封IP或者封号n_n,反正我就写着玩的)。
python代码如下(python2)
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import json
import urllib2
import re
import requests
from lxml import html
import time
import sys
# Python 2 encoding hack: re-expose setdefaultencoding (hidden by site.py)
# so byte/unicode coercions default to GBK instead of ASCII. Fragile and
# py2-only; kept because the script targets Python 2.
reload(sys)
sys.setdefaultencoding( "gbk" )
# Browser User-Agent so CSDN serves the normal desktop page instead of
# blocking the default urllib2 agent. Alternatives kept for reference:
# headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'}
def getPage(url, author="csm201314"):
    """Fetch one CSDN article-list page and visit each of *author*'s articles.

    Every matching article link found in an <h4> tag is requested once
    (simulating a page view), printing the URL and pausing one second
    between requests to stay gentle on the server.

    Parameters:
        url:    URL of a CSDN article-list page to scrape.
        author: username an article link must contain to be visited
                (default preserves the original hard-coded behavior).
    """
    response = urllib2.urlopen(urllib2.Request(url, headers=headers))
    try:
        contents = response.read()
    finally:
        response.close()  # fix: the response socket was never closed
    soup = BeautifulSoup(contents, "html.parser")
    for h4 in soup("h4"):
        anchors = h4("a")
        if not anchors:
            # fix: an <h4> without an <a> child used to raise IndexError
            continue
        # fix: use a fresh name instead of clobbering the `url` parameter
        article_url = anchors[0].get("href", "")
        if article_url.find(author) == -1:
            continue  # not one of this author's articles
        print(article_url)
        hit = urllib2.urlopen(urllib2.Request(article_url, headers=headers))
        hit.close()  # fix: close each per-article response as well
        time.sleep(1)  # throttle: one view per second
if __name__ == '__main__':
    # Crawl the first three pages of the author's article list, in order.
    for page_no in (1, 2, 3):
        page_url = "https://blog.csdn.net/csm201314/article/list/%d" % page_no
        getPage(page_url)