爬虫爬取QQ号

这个爬虫是拿来练手的，可以爬取网络中的QQ号，然后存储到本地。

import http.client
import os
import re
import ssl
import urllib.request
from collections import deque  # queue used for the breadth-first crawl

def writeFileBytes(htmlBytes, topath):
    """Write the raw bytes ``htmlBytes`` to ``topath``, replacing any existing file."""
    with open(topath, mode='wb') as sink:
        sink.write(htmlBytes)
def writeFileStr(htmlBytes, topath, encoding="utf-8"):
    """Save page content to ``topath`` as readable text.

    Bytes are decoded with ``encoding`` (undecodable bytes are replaced)
    instead of being passed through ``str()``, which would write the
    ``b'...'`` repr with ``\\x..`` escapes rather than the page text.
    Any other object is converted with ``str()``.

    Args:
        htmlBytes: Page content, normally the bytes returned by ``read()``.
        topath: Destination file; an existing file is overwritten.
        encoding: Codec used both to decode bytes and to write the file.
    """
    if isinstance(htmlBytes, (bytes, bytearray)):
        text = bytes(htmlBytes).decode(encoding, errors="replace")
    else:
        text = str(htmlBytes)
    # newline="" keeps the page's own line endings instead of translating them.
    with open(topath, 'w', encoding=encoding, newline="") as f:
        f.write(text)

# Matches absolute links ("scheme://...") and bare "www." links.
# Group 1 is the whole URL, group 2 the scheme / "www." prefix, and group 3
# the final character, which may not be trailing sentence punctuation.
# Quotes are excluded so that href="..." values are not over-matched.
_URL_RE = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>"\']*([^\s()<>"\'.,;:!?]))'
)

# A QQ number is taken to be 6-10 digits without a leading zero. The
# look-arounds stop the pattern from cutting a piece out of a longer digit run.
_QQ_RE = re.compile(r"(?<!\d)[1-9]\d{5,9}(?!\d)")


def QQCrawler(url, topath, timeout=10):
    """Fetch one page, append the QQ numbers found in it to a file, and return its links.

    Numbers are de-duplicated within the page only; numbers already written
    by earlier calls are not filtered out.

    Args:
        url: Page to download. A bare ``www.`` address gets ``http://`` prepended.
        topath: Text file the QQ numbers are appended to, one per line.
        timeout: Seconds to wait on the network before giving up on the page.

    Returns:
        A list of unique 3-tuples ``(url, prefix, last_char)`` in order of
        first appearance; element 0 is the link itself. The list is empty
        when the page could not be downloaded.
    """
    # urllib rejects URLs that have no scheme, so complete bare "www." links.
    if url.startswith("www."):
        url = "http://" + url

    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }  # present a browser User-Agent instead of urllib's default
    try:
        req = urllib.request.Request(url, headers=headers)
        # NOTE: this context disables TLS certificate verification so that any
        # https page can be fetched; do not reuse it for sensitive traffic.
        context = ssl._create_unverified_context()
        with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
    except (OSError, ValueError, http.client.HTTPException) as err:
        # One unreachable or malformed link must not abort the whole crawl.
        print("skipping {}: {}".format(url, err))
        return []

    # Decode the bytes; str(bytes) would yield the b'...' repr, in which line
    # breaks are the two characters "\n" and URLs would run across lines.
    try:
        htmlStr = raw.decode(charset, errors="replace")
    except LookupError:  # the server announced a codec Python does not know
        htmlStr = raw.decode("utf-8", errors="replace")

    qqList = sorted(set(_QQ_RE.findall(htmlStr)))
    with open(topath, 'a') as f:
        f.writelines(qq + "\n" for qq in qqList)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(_URL_RE.findall(htmlStr)))



def center(url, topath):
    """Crawl breadth-first from ``url``, collecting QQ numbers into ``topath``.

    Every page is handed to ``QQCrawler``; the links it returns are queued
    and crawled in turn. Each URL is queued at most once, and a page that
    fails to download is skipped instead of stopping the crawl.

    Args:
        url: Start page of the crawl.
        topath: File passed to ``QQCrawler`` for every page.
    """
    queue = deque([url])
    seen = {url}  # URLs already queued, so no page is fetched twice
    while queue:
        targetUrl = queue.popleft()
        try:
            # Use the topath argument, not the module-level ``path`` global.
            urlList = QQCrawler(targetUrl, topath)
        except (OSError, ValueError, http.client.HTTPException) as err:
            print("skipping {}: {}".format(targetUrl, err))
            continue

        for item in urlList:
            # QQCrawler returns regex group tuples with the link at index 0;
            # plain strings are accepted as well.
            tempUrl = item if isinstance(item, str) else item[0]
            if tempUrl not in seen:
                seen.add(tempUrl)
                queue.append(tempUrl)

# Script entry: these statements run as soon as the module is executed.
url = r"https://www.douban.com/group/topic/17359302/" # start page of the crawl
path = r"G:\python代码\爬虫\QQ号.txt" # output file the QQ numbers are appended to
# QQCrawler(url,path)  # disabled: processes only the start page, without following links
center(url,path)

目前还存在两个问题：一是 URL 的正则表达式不够完善，不能匹配所有的链接；二是运行不稳定，爬取到本地的数据只有 2 MB 多时，程序就停止运行了。

猜你喜欢