python3 with urllib grab paste it and QQ mailbox examples

In this article we order in small series on the use of python3 urllib crawl paste it and QQ mailbox instance, a need friends can learn under
our first look at the example code:

import urllib
import urllib.request
import re
from urllib import parse
 
#抓取贴吧页面数量信息
def gettiebalistnumbers(name):  #计算搜索的关键词有多少页 输入名字 返回页数
  url="https://tieba.baidu.com/f?"
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
  word = {"kw": name} # 接口  贴吧的名字
  word = parse.urlencode(word) # 编码成字符串
  url = url + word # 拼接url
  request = urllib.request.Request(url, headers=headers) # 发送请求
  # 也可以通过调用Request.add_header() 添加/修改一个特定的 header
  request.add_header("Connection", "keep-alive") # 一直活着
  response = urllib.request.urlopen(request) # 打开请求
  data = response.read().decode("utf-8") # 读取数据
  print(response.code) # 可以查看相应状态码
 
  restr = "<span class=\"card_infoNum\">([\s\S]*?)</span>" # 正则这个贴吧有多少帖子
  regex = re.compile(restr, re.IGNORECASE)
  mylist = regex.findall(data) #寻找页面所有符合条件的
  tienumbers = mylist[0].replace(",","") #替换逗号
  tienumbers = eval(tienumbers)  #str转化为数字
  #print(tienumbers)
 
  restr = "<span class=\"card_menNum\">([\s\S]*?)</span>" # 正则关注贴吧的数
  regex = re.compile(restr, re.IGNORECASE)
  mylist = regex.findall(data) # 寻找页面所有符合条件的
  Peoplenumbers = mylist[0].replace(",", "") # 替换逗号
  Peoplenumbers = eval(Peoplenumbers) # str转化为数字
  #print(Peoplenumbers)
 
  return tienumbers,Peoplenumbers
 
def gettiebalist(name):  #抓取所有的符合name的页数 输入搜索关键词,返回所有的页数url
  numberstuple=gettiebalistnumbers(name)  #(元组)
  tienumbers=numberstuple[1]  #帖子的数量
  tiebalist = []
  if tienumbers%54==0:  #生成页面列表
    for i in range(tienumbers//54):
      tiebalist.append("https://tieba.baidu.com/f?kw="+name+"&pn="+str(i*50))
  else:
    for i in range(tienumbers//54+1):
      tiebalist.append("https://tieba.baidu.com/f?kw="+name+"&pn="+str(i*50))
  #print(tiebalist)
  return tiebalist
def geturllistformpage(url):   #抓取页面的每个帖子url 输入一页url 返回列表内的的所有url
  headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
  request = urllib.request.Request(url, headers=headers) # 发起请求,
  # 也可以通过调⽤Request.add_header() 添加/修改⼀个特定的 header
  response=urllib.request.urlopen(request)
  data=response.read().decode("utf-8","ignore")#打开请求,抓取数据
  #print(response.code) # 可以查看响应状态码
 
  restr = "<ul id=\"thread_list\" class=\"threadlist_bright j_threadlist_bright\">([\s\S]*?)<div class=\"thread_list_bottom clearfix\">" # 正则表达式,()只要括号内的数据
  regex = re.compile(restr, re.IGNORECASE)
  mylist = regex.findall(data)
  #print(mylist[0])#抓取整个表格
 
  restr = "href=\"/p/(\d+)\"" # 正则表达式,()只要括号内的数据
  regex = re.compile(restr, re.IGNORECASE)
  urltitlelist = regex.findall(data)
  #print(urltitlelist)   #抓取的url变化的数字
  urllist=[]
  for title in urltitlelist:
    urllist.append("http://tieba.baidu.com/p/"+title)  #拼接链接
  #print(urllist) #得到每个页面的帖子url列表
  return urllist
def getallurllist(url):     #获取每一页里面的分页  输入一个帖子url 输出所有分页url链接
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
  request = urllib.request.Request(url, headers=headers) # 发送请求
  # 也可以通过调用Request.add_header() 添加/修改一个特定的 header
  response = urllib.request.urlopen(request) # 打开请求
  tiebadata = response.read().decode("utf-8", "ignore") # 读取数据
  allurllist1=[]
 
  restr = "共<span class=\"red\">(\d+)</span>页</li>" # 正则表达式,()只要括号内的数据
  regex = re.compile(restr, re.IGNORECASE)
  numalllist = regex.findall(tiebadata)
  nums=eval(numalllist[0])
  for i in range(1,nums+1):
    allurllist1.append(url+"?pn="+str(i))
 
  return allurllist1
 
  # print(urltitlelist)   #抓取的url变化的数字
 
 
def getpagedata(url):
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
  request = urllib.request.Request(url, headers=headers) # 发送请求
  # 也可以通过调用Request.add_header() 添加/修改一个特定的 header
  response = urllib.request.urlopen(request) # 打开请求
  pagedata = response.read().decode("utf-8","ignore") #读取数据
  return pagedata
def getemaillistfrompage(pagedata): #在帖子内页面,把每一个邮箱抓取下来  输入一个帖子url 返回邮箱
  emaillist = []
  restr = "[A-Z0-9._%+-]+[@][A-Z0-9.-]+\.[A-Z]{2,4}" # 正则表达式,()只要括号内的数据
  regex = re.compile(restr, re.IGNORECASE)
  emaillist = regex.findall(pagedata)
  return emaillist   #返回提取的邮箱列表
 
def QQlistfrompage(url): #在帖子内页面,把每一个邮箱抓取下来  输入一个帖子url 返回QQ
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
  request = urllib.request.Request(url, headers=headers)
  response = urllib.request.urlopen(request)
  #data = response.read().decode("utf-8","ignore") #读取数据
  QQlist = []
  while True:
    line = response.readline()
    line = line.decode('utf-8')
    if not line:
      break
    if line.find("QQ") != -1 or line.find("Qq") != -1 or line.find("qq") != -1:
      restr = "[1-9]\\d{4,10}" # 正则表达式,()只要括号内的数据
      regex = re.compile(restr, re.IGNORECASE)
      templist = regex.findall(line)
      QQlist.extend(templist)
  return QQlist
 
#print(gettiebalistnumbers("python"))
 
#print(gettiebalist("python3"))
 
 
#mylist=gettiebalist("python3")
#for line in mylist:
#  print(line)
#geturllistformpage("https://tieba.baidu.com/f?kw=python3&ie=utf-8&pn=4000")
#print(getemaillistfrompage(getpagedata("http://tieba.baidu.com/p/6490450301")))
#print(QQlistfrompage("http://tieba.baidu.com/p/3950107421"))
""" 
name="qqmail"
emailalllist=[]
for numberurl in gettiebalist(name):  #取出这个关键词 所有页面的url
  tieziurllist=geturllistformpage(numberurl) #取出每个页面的 帖子url
  for fentieziurllist in tieziurllist:
    tieziurllist1=getallurllist(fentieziurllist)
     
    for pagetext in tieziurllist1:   
      pagedata=getpagedata(pagetext)  #取出每个页面的代码
      datas=getemaillistfrompage(pagedata) #正则提取邮箱
      if len(datas) !=0:    #如果提取的里面一个页面上的一个帖子 邮箱不是空的话
        emailalllist.append(datas[0])
print(emailalllist)       #测试可以提取一个 贴吧的所有邮箱
 """
"""
name="qqmail"
QQalllist=[]
for numberurl in gettiebalist(name):  #取出这个关键词 所有页面的url
  tieziurllist=geturllistformpage(numberurl) #取出每个页面的 帖子url
  for url in tieziurllist:
    QQnumberlist=QQlistfrompage(url)  #提取的里面一个页面上的一个帖子的QQ
    #print(QQnumberlist)
    if len(QQnumberlist) != 0:  #如果一个页面QQ不为空的话
      for qqdata in QQnumberlist:  #一个页面QQ列表遍历
        QQalllist.append(qqdata)  #添加到列表中
     #  qq=QQalllist.append(QQnumberlist[0])
#print(QQalllist)# #提取一个贴吧的所有QQ 测试成功
"""
 
name="qqmail"
savefilepath="qqmail_qq.txt"
savefile=open(savefilepath,"wb")
for numberurl in gettiebalist(name):  #取出这个关键词 所有页面的url
  tieziurllist=geturllistformpage(numberurl) #取出每个页面的 帖子url
  for fenurl in tieziurllist:
    tieziurllist1=getallurllist(fenurl)  #一个页面分页的所有链接
    for url in tieziurllist1:
      QQnumberlist=QQlistfrompage(url)  #提取的里面一个页面上的一个帖子的QQ
      #print(QQnumberlist)
      if len(QQnumberlist) != 0:  #如果一个页面QQ不为空的话
        print(QQnumberlist)
        qqstr=" ".join(QQnumberlist)
        savefile.write((qqstr+"\r\n").encode("utf-8"))
 
     #  qq=QQalllist.append(QQnumberlist[0])
#最后写入文件测试, 写入qq.txt 69K
# TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
#可优化为timeout=  或者导入 import time 进行time.sleep(3) 睡眠定时访问操作,
#为避免出错,还需再访问url时加入 try  except 出错避过

Knowledge expansion:

Proxy settings

urllib2 default environment variable http_proxy will be used to set the HTTP Proxy. If you want to explicitly control program Proxy without being affected by the environment variables, you can use the following way

import urllib2
enable_proxy = True
proxy_handler = urllib2.ProxyHandler({"http" : 'http://some-proxy.com:8080'})
null_proxy_handler = urllib2.ProxyHandler({})
 
if enable_proxy:
  opener = urllib2.build_opener(proxy_handler)
else:
  opener = urllib2.build_opener(null_proxy_handler)
 
urllib2.install_opener(opener)

Here we must note a detail, using urllib2.install_opener () sets the global opener urllib2. Such use of the latter will be very convenient, but can not do more fine-grained control, for example, want to use two different Proxy settings in the program. Good practice is to not use install_opener to change global settings, but only calls the open method opener instead of global urlopen methods directly.
I write to you, for everyone to recommend a very wide python learning resource gathering, click to enter , there is a senior programmer before learning to share experiences, study notes, there is a chance of business experience, and for everyone to carefully organize a python zero the basis of the actual project data, daily python to you on the latest technology, prospects, learning to leave a message of small details

These are the python3 with urllib grab paste it and QQ mailbox details examples

Published 43 original articles · won praise 30 · views 60000 +

Guess you like

Origin blog.csdn.net/haoxun09/article/details/104806925