#coding=utf-8
"""
author:susu
date:2018/6/25
function:get the links of blog
"""
import re
import urllib.request
def getlink(url):
    """Download *url* and return the HTTP(S) links found in the page.

    Args:
        url: Address of the page to fetch (any scheme urllib supports).

    Returns:
        tuple: ``(links, data)`` where ``links`` is a de-duplicated list of
        URL strings found in the page (order unspecified, as before) and
        ``data`` is the decoded page text.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # Pose as a mobile browser so the server serves the normal page.
    headers = ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Keep the original global installation so later urlopen() calls in this
    # process also carry the header.
    urllib.request.install_opener(opener)
    # Close the response deterministically, and decode the body instead of
    # str(bytes) — the original embedded the b'...' repr into the text.
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode("utf-8", errors="ignore")
    # Match whole http/https URLs. The original pattern '(http://+[a-zA-Z])'
    # stopped after a single character past the scheme, so it never returned
    # a usable link.
    pattern = r'https?://[\w\-./?%&=#]+'
    links = re.compile(pattern).findall(data)
    # Remove duplicates (set order is not significant to callers).
    links = list(set(links))
    return links, data
if __name__ == '__main__':
    # Crawl the CSDN blog front page and report everything we found.
    start_url = 'http://blog.csdn.net/'
    # getlink returns (list-of-links, raw page text).
    found_links, page_text = getlink(start_url)
    # Same output sequence as always: page text, the whole list, then
    # one link per line.
    print(page_text)
    print(found_links)
    for one_link in found_links:
        print(one_link)
# --- Non-code residue from the web page this script was copied from ---
# 链接爬虫实战(CSDN博客)  [Link-crawler tutorial (CSDN blog)]
# 猜你喜欢  [site navigation: "You may also like"]
# 转载自 blog.csdn.net/weixin_40411446/article/details/80799271  [repost attribution]
# 周排行  [site navigation: "Weekly ranking"]