Link Crawler in Practice (CSDN Blog)

#coding=utf-8
"""
author:susu
date:2018/6/25
function:get the links of blog
"""
import re
import urllib.request

def getlink(url):
    # Masquerade as a browser via a User-Agent header
    headers = ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally so urlopen uses it
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    # Decode the raw bytes; str() would keep the b'...' wrapper in the text
    data = file.read().decode('utf-8', errors='ignore')
    # Build the matching pattern for the links we want to extract
    pattern = r'https?://[a-zA-Z0-9./?=_&%-]+'
    link = re.compile(pattern).findall(data)
    # Remove duplicate links
    link = list(set(link))
    return link, data

if __name__ == '__main__':
    url = 'http://blog.csdn.net/'
    # Get the links found on the page
    linklist, data = getlink(url)
    print(data)
    print(linklist)
    for link in linklist:
        print(link)
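
If the custom header is only needed for this one request, installing a global opener is more than necessary. A minimal alternative sketch, assuming a single fetch (the function name fetch is my own): pass a headers dict to urllib.request.Request, which scopes the User-Agent to that request and leaves the rest of the process untouched.

import urllib.request

def fetch(url):
    # Per-request headers instead of a globally installed opener
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/67.0.3396.87 Mobile Safari/537.36'}
    )
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', errors='ignore')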
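The regex approach also misses relative links and can match URLs inside scripts or comments. A more robust sketch using the standard library's html.parser to collect href attributes from <a> tags; the class name LinkParser is my own, and the target URL is the same one the post crawls.

from html.parser import HTMLParser
import urllib.request

class LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Collect the href attribute of every <a> tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

req = urllib.request.Request('http://blog.csdn.net/',
                             headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read().decode('utf-8', errors='ignore')
parser = LinkParser()
parser.feed(html)
print(list(set(parser.links)))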

Reposted from blog.csdn.net/weixin_40411446/article/details/80799271