Python crawler: wrapping my own commonly used methods

import urllib.request
import ssl
import re
from collections import deque

def writeFile2Strs(url, topath):
    """Fetch url and save the page to topath as UTF-8 text."""
    with open(topath, "w", encoding="utf-8") as f:
        f.write(getHtml_Str(url))

def writeFile2Bytes(url, topath):
    """Fetch url and save the raw response bytes to topath."""
    data = getHtmlBytes(url)
    if data:  # skip writing when the fetch failed
        with open(topath, "wb") as f:
            f.write(data)

def getHtml_Str(url, decode="utf-8"):
    """Fetch url and return the page as a decoded string ("" on failure)."""
    data = getHtmlBytes(url)
    return data.decode(decode) if data else ""

def getURL_list(strs):
    """Extract all http/https/ftp URLs from a string, de-duplicated."""
    # '-' is placed last in the path class so it is a literal, not a range
    parUrl = r"(((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9&%_\./~-]*)?)"
    re_URL = re.compile(parUrl)
    # findall returns one tuple per match because the pattern contains groups;
    # element [0] of each tuple is the full URL
    listURL = list(set(re_URL.findall(strs)))
    return [URLi[0] for URLi in listURL]

def getQQ_list(strs):
    """Extract candidate QQ numbers (5- to 11-digit strings) from a string."""
    pat = r"[1-9]\d{4,10}"
    re_pat = re.compile(pat)
    listQQ = list(set(re_pat.findall(strs)))  # de-duplicate
    return listQQ

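A quick way to see what the two extractors return. The sample string and the expected results below are illustrative only (not from the original post):

sample = "QQ: 123456789, links: https://example.com/page and http://10.0.0.1:8080/index"
print(getURL_list(sample))  # expect both links; order varies because a set is used for de-duplication
print(getQQ_list(sample))   # expect ['123456789']; '8080' is only 4 digits, too short to match
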
def proceedAllUrlList(url, urlProceed):
    """Breadth-first crawl: start from url, call urlProceed on every URL reached."""
    dq = deque()
    dq.append(url)
    seen = {url}  # remember visited URLs so pages are not crawled twice
    while len(dq) != 0:
        targeturl = dq.popleft()
        # extract links from the dequeued page and process that page's URL
        urlList = getURL_list(getHtml_Str(targeturl))
        urlProceed(targeturl)
        for oneURL in urlList:
            if oneURL not in seen:
                seen.add(oneURL)
                dq.append(oneURL)

def getHtmlBytes(url):
    """Fetch url and return the raw response bytes, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Build an unverified SSL context so HTTPS pages with bad certificates still load
    context = ssl._create_unverified_context()
    try:
        response = urllib.request.urlopen(req, timeout=5, context=context)
    except Exception:
        print("Fetch failed or timed out:", url)
        return None  # callers treat None as "no content"
    return response.read()
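
Putting it together, a minimal usage sketch. The start URL and the printURL callback are hypothetical, and note that proceedAllUrlList will keep following every URL it discovers, so in practice you would add a depth or domain limit before pointing it at a real site:

def printURL(u):  # hypothetical callback: just report each page as it is dequeued
    print("crawling:", u)

if __name__ == "__main__":
    writeFile2Strs("https://example.com", "example.html")  # save one page as UTF-8 text
    proceedAllUrlList("https://example.com", printURL)     # breadth-first walk from the start page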
Reposted from blog.csdn.net/weixin_40938748/article/details/85324861