Day two of crawler boot camp, the final installment!!!

'''
Goal: crawl QQ numbers from web pages in a loop.
Result: a few rough edges remain, but the central-controller-driven loop crawling works.
'''
from urllib import request
import ssl
import re
import os
from collections import deque

def getUrlBytes(url):
    # Request headers that mimic a desktop browser
    headers = {
        'Accept': 'text/html, application/xhtml+xml, */*',
        # 'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2545.400',
        'DNT': '1',
        'Connection': 'Keep-Alive'
    }
    # Create an SSL context that skips certificate verification
    context = ssl._create_unverified_context()
    # Build the request
    req = request.Request(url, headers=headers)
    # Fetch and decode the response; return None on network or decoding errors
    # so the caller can simply skip this URL
    try:
        response = request.urlopen(req, timeout=1, context=context)
        return response.read().decode("utf-8")
    except Exception:
        return None

def writeFileBytes(htmlBytes, topath):
    # Write raw page bytes to a file (kept for debugging, not used below)
    with open(topath, 'wb') as fp:
        fp.write(htmlBytes)

def writeFileString(qqString, topath):
    # Append one extracted QQ number to the output file
    with open(topath, 'a+') as fp:
        fp.write(str(qqString))

def qqCrawker(url, toPath):
    count = 0
    qqData = getUrlBytes(url)
    if qqData is None:
        return None

    # Extract QQ numbers (5 to 10 digits, no leading zero) and deduplicate them
    re_qq = re.compile(r'[1-9]\d{4,9}')
    qqList = re_qq.findall(qqData)
    qqList = list(set(qqList))
    for qq in qqList:
        count += 1
        writeFileString(qq.ljust(11), toPath)

    # Match http/https/ftp URLs so the controller can keep crawling
    #re_url = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:[\w\d]+[\w\d]+|([^[:punct:]\s]|/)))')
    re_url = re.compile(r'(((http|ftp|https)://)(([a-zA-Z0-9\._]+\.[a-zA-Z]{2,6})|'
                        r'([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)')
    urlList = re_url.findall(qqData)
    print(count)
    return urlList

url = 'https://www.douban.com/group/topic/110094603/'
toPath = r'F:\编程语言学习\Python的学习\2019年3月份的学习\qqCrawker\QQ.txt'
#qqCrawker(url, toPath)

def centerControl(url, toPath):
    queue = deque()
    visited = set()   # remember crawled URLs so the loop does not revisit them
    queue.append(url)
    while len(queue) != 0:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        urlList = qqCrawker(targetUrl, toPath)
        if urlList is None:
            continue
        for item in urlList:
            # findall returns tuples; the first group is the full URL
            tempUrl = item[0]
            queue.append(tempUrl)

centerControl(url, toPath)

Summary: the crawler part is on pause for now; I will go back to studying Python fundamentals and return to add more crawler material once that is done. After two, really three, days I finally have a basic understanding of crawlers: a crawler is a program that imitates a browser to automatically fetch the data we want from a website. It looks simple, but many details are hidden inside: analyzing the structure of the page, defining rules for extracting the data (regular expressions for now), then processing the results and writing them to a file, plus the central scheduler used here to drive the crawl loop. You also find that it is very slow; once I have learned multithreading I will come back and improve it. I will keep moving forward, onward!
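As a rough illustration of the multithreading improvement mentioned above, here is a minimal sketch, assuming the qqCrawker function defined earlier is reused unchanged (along with the deque import at the top of the script): a concurrent.futures.ThreadPoolExecutor fetches a small batch of queued URLs in parallel instead of one at a time. The function name centerControlThreaded and the pool size MAX_WORKERS are placeholders of my own, not part of the original code.

from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 8  # assumed pool size, tune as needed

def centerControlThreaded(url, toPath):
    queue = deque([url])
    visited = set()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        while queue:
            # Pull up to MAX_WORKERS unvisited URLs off the queue
            batch = []
            while queue and len(batch) < MAX_WORKERS:
                u = queue.popleft()
                if u not in visited:
                    visited.add(u)
                    batch.append(u)
            # Crawl the batch in parallel; each call returns URL match tuples or None
            for urlList in pool.map(lambda u: qqCrawker(u, toPath), batch):
                if urlList is None:
                    continue
                for item in urlList:
                    queue.append(item[0])

One caveat: writeFileString would then append to the same file from several threads at once; each write is tiny so this usually works, but a lock or a single writer thread would be safer.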


Reposted from www.cnblogs.com/854594834-YT/p/10542649.html