微信爬虫实战

所谓微信爬虫，即自动获取微信相关文章信息的一种爬虫。微信对我们的限制是很多的，所以，我们需要采取一些手段解决这些限制，主要包括伪装浏览器、使用代理IP等方式。

import re

import urllib.request

import time

import urllib.error

def use_proxy(IP,url):
    """Fetch ``url`` through the HTTP proxy ``IP`` and return the page text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        IP: Proxy address as ``"host:port"``; it is applied to ``http``
            URLs only.
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are dropped),
        or ``None`` when the download failed.  Failures are printed, not
        raised: the function sleeps 10 seconds after a ``URLError`` and
        1 second after any other exception before returning.
    """
    try:
        req = urllib.request.Request(url)
        # Header values must be latin-1 encodable, so the full Firefox 61
        # string is spelled out instead of an abbreviated one.
        req.add_header(
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) "
            "Gecko/20100101 Firefox/61.0",
        )
        proxy_handler = urllib.request.ProxyHandler({"http": IP})
        opener = urllib.request.build_opener(
            proxy_handler, urllib.request.HTTPHandler
        )
        # Install globally so that plain urlopen() calls elsewhere also go
        # through this proxy.
        urllib.request.install_opener(opener)
        # Open the Request object (not the bare URL); otherwise the
        # User-Agent header set above would never be sent.
        with opener.open(req) as response:
            raw = response.read()
        return raw.decode("utf-8", "ignore")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Back off for 10 seconds after a URLError.
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        # Back off for 1 second after any other error.
        time.sleep(1)
    # Reached only after a handled failure.
    return None

# Search keyword.
key = "Python"

# Percent-encode the keyword exactly once; quoting it again on every page
# would double-encode any non-ASCII keyword ("%" -> "%25").
quoted_key = urllib.request.quote(key)

# Proxy server to use. It may stop working and then has to be replaced with
# a currently valid one.
proxy = "139.129.99.9:3128"

# Matches the target of every <a href="..."> link. re.S lets "." match
# newlines as well. Compiled once, outside the page loop.
link_pattern = re.compile('<a href="(.*?)"', re.S)

# Number of result pages to crawl.
# NOTE(review): the search site's page parameter may be 1-based -- confirm
# whether page=0 returns a useful result.
for i in range(0, 10):
    thispageurl = (
        "http://weixin.sogou.com/weixin?type=2&query="
        + quoted_key + "&page=" + str(i)
    )
    thispagedata = use_proxy(proxy, thispageurl)
    # use_proxy returns None on failure; str() keeps the steps below safe.
    print(len(str(thispagedata)))
    rs1 = link_pattern.findall(str(thispagedata))
    if not rs1:
        print("此次（"+str(i)+"页）没成功！！！")
        continue
    for j, thisurl in enumerate(rs1):
        # Strip the HTML-escaped "amp;" fragments to obtain the real
        # article address.
        thisurl = thisurl.replace("amp;", "")
        file = r"C:\Users\Mr.Ma\Desktop\Wei\第"+str(i)+"页第"+str(j)+"篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        if thisdata is None:
            # Download failed (already reported inside use_proxy); skip
            # this article instead of crashing on len(None).
            print("第"+str(i)+"页第"+str(j)+"篇文章失败！")
            continue
        print(len(thisdata))
        try:
            # thisdata is decoded text, so write it in text mode; the
            # with-block closes the file even if the write fails.
            with open(file, "w", encoding="utf-8") as fh:
                fh.write(thisdata)
            print("第"+str(i)+"页第"+str(j)+"篇文章成功！")
        except (OSError, UnicodeError) as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败！")

猜你喜欢