Python crawler: wrapping my own commonly used methods

import urllib.request
import ssl
import re
from collections import deque

def writeFile2Strs(url, topath):
    """Fetch url and save the page to topath as UTF-8 text."""
    with open(topath, "w", encoding="utf-8") as f:
        f.write(getHtml_Str(url))

def writeFile2Bytes(url, topath):
    """Fetch url and save the raw response bytes to topath."""
    data = getHtmlBytes(url)
    if data:  # skip writing when the fetch failed
        with open(topath, "wb") as f:
            f.write(data)

def getHtml_Str(url, decode="utf-8"):
    """Fetch url and return the page as a decoded string ("" on failure)."""
    data = getHtmlBytes(url)
    return data.decode(decode) if data else ""

def getURL_list(strs):
    """Extract all http/https/ftp URLs from a string, de-duplicated."""
    # '-' is placed last in the path class so it is a literal, not a range
    parUrl = r"(((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9&%_\./~-]*)?)"
    re_URL = re.compile(parUrl)
    # findall returns one tuple per match because the pattern contains groups;
    # element [0] of each tuple is the full URL
    listURL = list(set(re_URL.findall(strs)))
    return [URLi[0] for URLi in listURL]

def getQQ_list(strs):
    """Extract candidate QQ numbers (5- to 11-digit strings) from a string."""
    pat = r"[1-9]\d{4,10}"
    re_pat = re.compile(pat)
    listQQ = list(set(re_pat.findall(strs)))  # de-duplicate
    return listQQ

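A quick way to see what the two extractors return. The sample string and the expected results below are illustrative only (not from the original post):

sample = "QQ: 123456789, links: https://example.com/page and http://10.0.0.1:8080/index"
print(getURL_list(sample))  # expect both links; order varies because a set is used for de-duplication
print(getQQ_list(sample))   # expect ['123456789']; '8080' is only 4 digits, too short to match
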
def proceedAllUrlList(url, urlProceed):
    """Breadth-first crawl: start from url, call urlProceed on every URL reached."""
    dq = deque()
    dq.append(url)
    seen = {url}  # remember visited URLs so pages are not crawled twice
    while len(dq) != 0:
        targeturl = dq.popleft()
        # extract links from the dequeued page and process that page's URL
        urlList = getURL_list(getHtml_Str(targeturl))
        urlProceed(targeturl)
        for oneURL in urlList:
            if oneURL not in seen:
                seen.add(oneURL)
                dq.append(oneURL)

def getHtmlBytes(url):
    """Fetch url and return the raw response bytes, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Build an unverified SSL context so HTTPS pages with bad certificates still load
    context = ssl._create_unverified_context()
    try:
        response = urllib.request.urlopen(req, timeout=5, context=context)
    except Exception:
        print("Fetch failed or timed out:", url)
        return None  # callers treat None as "no content"
    return response.read()
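
Putting it together, a minimal usage sketch. The start URL and the printURL callback are hypothetical, and note that proceedAllUrlList will keep following every URL it discovers, so in practice you would add a depth or domain limit before pointing it at a real site:

def printURL(u):  # hypothetical callback: just report each page as it is dequeued
    print("crawling:", u)

if __name__ == "__main__":
    writeFile2Strs("https://example.com", "example.html")  # save one page as UTF-8 text
    proceedAllUrlList("https://example.com", printURL)     # breadth-first walk from the start page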
Reposted from blog.csdn.net/weixin_40938748/article/details/85324861