Python爬虫段子网全代码

代码如下:

import itchat
import requests
from bs4 import BeautifulSoup

# Log in to WeChat by scanning a QR code printed in the terminal.
# enableCmdQR expects a bool/int, not a string: the original passed '-1',
# which is merely truthy; the int -1 is the documented value that narrows
# the QR rendering for terminals. hotReload caches the session so
# re-running the script skips the QR scan.
itchat.auto_login(enableCmdQR=-1, hotReload=True)

def send(url):
    """Scrape one page of jokes from *url* and send each post over WeChat.

    Each post's title and body text are concatenated and sent as one
    message to the contact named below.

    Returns the URL of the next page (or None if there is no next link),
    so a caller can implement "send the next page" — the original
    computed this href but overwrote the *url* parameter with it and
    never used it.
    """
    users = itchat.search_friends(name=u'发送对象微信名')
    userName = users[0]['UserName']
    start_html = requests.get(url, timeout=10)  # don't hang forever on a dead site
    soup = BeautifulSoup(start_html.text, 'lxml')
    posts = soup.find_all(attrs={'class': 'post'})  # renamed: don't shadow builtin list
    # Guard: the last page has no "next" element, and .get('href') on None
    # would raise AttributeError.
    next_link = soup.find(attrs={'class': 'next'})
    next_url = next_link.get('href') if next_link is not None else None
    for post in posts:
        content = str(post.find(attrs={'class': 'post-title'}).get_text()) + \
                  str(post.find(attrs={'class': 'post-content'}).get_text())
        itchat.send(content, toUserName=userName)
    return next_url
@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle an incoming text message.

    If the text is a page number, build the category URL for that page
    and scrape-and-send it; any non-numeric text is silently ignored.
    NOTE: the original mixed tabs and spaces in this body, which is a
    SyntaxError in Python 3 — indentation normalized to 4 spaces.
    """
    y = str(msg['Text'])
    if y.isdigit():
        # The site has ~40 pages; numbers outside that range will simply
        # fetch an empty page.
        url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
        send(url.format(y))


# Block and dispatch incoming messages to the registered handler.
itchat.run()

1、微信登录

 itchat.auto_login(enableCmdQR='-1',hotReload=True)

2、获取发送的对象

users=itchat.search_friends(name=u'发送对象微信名')
userName=users[0]['UserName']

3、获取要爬虫的网页对象


   start_html = requests.get(url)   
   soup = BeautifulSoup(start_html.text, 'lxml')

4、解析网页内容并发送

list=soup.find_all(attrs={'class': 'post'})
url=soup.find(attrs={'class': 'next'}).get('href')
for i in list:
     content=str(i.find(attrs={'class': 'post-title'}).get_text())+\
             str(i.find(attrs={'class': 'post-content'}).get_text())
     itchat.send(content, toUserName=userName)

其中

url=soup.find(attrs={'class': 'next'}).get('href')

是获取下一页的链接,本想实现微信输入下一页,自动发送下一页网页内容,但是没能实现。
5、获取对方微信发送的内容,根据内容拼接网页链接,调用发送程序。

@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    y=str(msg['Text']) 
    url='http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
    url=url.format(y)
    send(str(url))

6、由于这个网页总共40多页,所以微信输入的数字要在这之内,输入其他内容,不会爬取网页内容。另外,为了方便测试,可以将发送对象改为文件传输助手,代码如下:

import itchat
import requests
from bs4 import BeautifulSoup

# Log in via QR code. enableCmdQR takes a bool/int (the original passed the
# string '-1'); hotReload caches the session between runs.
itchat.auto_login(enableCmdQR=-1, hotReload=True)


def send(url):
    """Scrape one page of jokes from *url* and forward each post to the
    WeChat file-transfer helper ('filehelper') — handy for testing
    without messaging a real contact.
    """
    start_html = requests.get(url, timeout=10)  # avoid hanging on a dead site
    soup = BeautifulSoup(start_html.text, 'lxml')
    posts = soup.find_all(attrs={'class': 'post'})  # renamed: don't shadow builtin list
    # (The original also extracted the next-page href here but never used
    # it; the dead assignment has been removed.)
    for post in posts:
        content = str(post.find(attrs={'class': 'post-title'}).get_text()) + \
                  str(post.find(attrs={'class': 'post-content'}).get_text())
        itchat.send(content, toUserName='filehelper')


@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Scrape and send page *n* of the category when the incoming text is
    a number; ignore everything else.

    NOTE: the original mixed tabs and spaces here (SyntaxError in
    Python 3) — indentation normalized to 4 spaces.
    """
    y = str(msg['Text'])
    if y.isdigit():
        url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
        send(url.format(y))


itchat.run()

猜你喜欢

转载自blog.csdn.net/weixin_40874592/article/details/84313904