【Python3爬虫(三)】【urllib.request模块】【cookie+requests】

上一篇:【Python3爬虫(二)】【urllib.request模块】【付费代理+auth认证】

++++++++++开始线++++++++++++++++++

一、 cookie

1.1 认识cookie

01-cookies.py

import urllib.request

# 1. Target URL: the member-centre page.
url = 'https://www.yaozh.com/member/'
# 2. Request headers: only a desktop browser User-Agent, no Cookie.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.67 Safari/537.36 '
}

# 3. Build the request object.
request = urllib.request.Request(url, headers=headers)

# 4./5. Send the request and read the body. The context manager closes the
# underlying connection even if read() raises.
with urllib.request.urlopen(request) as response:
    data = response.read()
print(type(data))

# Save the raw bytes to a file so the result can be checked in a browser.
with open('01cook.html', 'wb') as f:
    f.write(data)

01cook.html

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "//www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html class="ready-hide" xmlns="//www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="robots" content="nofollow" />
<title>消息提示</title>
<link type="text/css" rel="stylesheet" href="/public/css/core.css"/>
<link rel="stylesheet" href="//static.yaozh.com/css/app.css">
<link rel="stylesheet" href="/public/css/index.new2.css?t=20180803">
</head>
<script>
//设置domain
document.domain = location.host.replace(/(\w|-)+\./,'');

//全局变量 Global Variables
var GV = {
   JS_ROOT : '/public/js/',//js目录
   JS_VERSION : "0D0AEBFF77",
   TOKEN : "0D0AEBFF77",  //token $.ajaxSetup data
   FILE_TYPE : "*.gif;*.jpg;*.png;*.zip;*.rar;*.txt;*.doc;*.docx;*.pdf;*.xls;*.xlsx;",
   FILE_SIZE : "10 MB",
   UID : parseInt(""),       //用户空间(参数 : uid)
   URL : {
      IMAGE : '/images/',                               //登录地址
      QUICK_LOGIN : '/ajax/login/?backurl=/login'
   }
};
</script>
<script src="/public/js/yaozh.js?v=20130531"></script>
<script>
Wind.use("jquery",function(){
  if(!!window.frameElement){
    $('html').addClass('iframe');
  }    
});

</script>  <script>
    var config = {
      baseUrl : '//static.yaozh.com/js',
      deps : ['/public/js/newindex2.js']
    }
  </script>
  <script src="//static.yaozh.com/js/app.js?t=20150610"></script>
<body>
<!-- header -->
<div class="header">
   <div class="wrapper">
     <div class="header-left">
                            
                            <a target="_blank" class="item" href="https://news.yaozh.com">新闻资讯</a>
              
                            <a target="_blank" class="item" href="https://db.yaozh.com">药智数据</a>
              
                            <a target="_blank" class="item" href="https://patent.yaozh.com">专利通</a>
              
                            <a target="_blank" class="item" href="https://zx.yaozh.com/?yaozh">药智咨询</a>
              
                            <a target="_blank" class="item" href="https://www.yaozh.com/zhihui/?yaozh">药智汇</a>
              
                            <a target="_blank" class="item" href="https://s.yaozh.com">药智通</a>
              
                            <a target="_blank" class="item" href="https://edu.yaozh.com/">药智学院</a>
              
                            <a target="_blank" class="item" href="https://bbs.yaozh.com">论坛交流</a>
              
                            <a target="_blank" class="item" href="https://club.yaozh.com/">俱乐部</a>
              
                            <a target="_blank" class="item" href="https://www.yaohaiwai.com/?ga_source=www&ga_name=top_navigationbar">海外智通</a>
              
                            <a target="_blank" class="item" href="https://gu.yaozh.com/ ">药智谷</a>
              
                            <a target="_blank" class="item" href="https://nav.yaozh.com/">药智搜</a>
              
                            <a target="_blank" class="item" href="https://yaozh.com/zt/index">专题中心</a>
              
                            <a target="_blank" class="item" href="https://job.yaozh.com/">药智人才</a>
              
            </div>
     <div class="header-right">
                              <a class="item" target="_blank" href="//www.yaozh.com/login/">登录</a>
               <a target="_blank" class="item" href="//www.yaozh.com/register/">注册</a>
                             <span class="item app">
  <i class="fa"></i>
  <span><a target="_blank" href="https://db.yaozh.com/app?ga_source=www&ga_name=sumlink_wwwty_top">药智数据APP</a></span>
  <div class="hover-layer">
    <div class="box">
      <a target="_blank" href="https://itunes.apple.com/cn/app/id1025304074" onclick='ga("send", "event", "button", "click","appdownload_ios_wwwty_top")' target='_blank' class="btn btn-sm ios btn-gray">
        <i class="fa"></i>
        <span>iOS下载</span>
      </a>
      <a target="_blank" href="//static.yaozh.com/yaozh_latest.apk" onclick='ga("send", "event", "button", "click","appdownload_android_wwwty_top")' class="btn btn-sm android mt10 btn-green">
        <i class="fa"></i>
        <span>Android下载</span>
      </a>
      <div class="tc mt10">
        <img src="/public/images/sumscan_wwwty_top.png?_v=1.5.26" alt="" style="width: 150px;">
      </div>
    </div>

  </div>
</span>                

     </div>
   </div>
</div>
  <!-- header-bar -->
  <div class="header-layer" data-widget="sticky" data-wrapperclass="header-bar">
    <form class="wrapper" action="//db.yaozh.com/Search" target='_blank'>
      <a target="_blank" href="//www.yaozh.com" class="logo" title="药智网" style="padding: 0;"></a>        
      <div class="search-box">
          <select class="search-type-select ignore-focus" name="btn_jiansuo" data-widget="dropdownSelect">
            <option value="1" data-action="//db.yaozh.com/Search" data-key="content">数据库</option>
            <option value="2" data-action="//s.yaozh.com/Index/search" data-key="search">药智通</option>
            <option value="3" data-action="//www.yaozh.com/list/" data-key="keytitle">药智汇</option>
            <!-- <option value="4" data-action="//bbs.yaozh.com/search.php?searchsubmit=yes" data-key="srchtxt">论坛</option> -->
          </select>
          <input class="search ignore-focus" name="content" type="text">
          <a href="javascript:;" class="search-btn">搜索</a>
      </div>
      <a target="_blank" class="app_link" href="//db.yaozh.com/app">
        <img src="/public/images/app_link.gif" alt="">
      </a>
       
    </form>
    <div class="m-show action-bar">
      <a href="javascript:;" class="show-search"><i class="fa"></i></a>
      <a href="javascript:;" class="show-menu"><i class="fa"></i></a>
    </div>
  </div>
<div class="showmsg_warp clearfix">
    <div class="showmsg_box warning">
        <dl class="showmsgs">
            <dt><p class="showmsg_left_bg"></p></dt>
            <dd>
                <div class="upheight">
                    <p class="tit">您还未登录!</p>
                    <p class="con">你可以给<a href="#">小智</a>吐个槽<br>紧急状况请联系药智客服:400-678-0778</p>
                </div>
                <p class="link"><span id="J_timer" class="Y_red">3</span>秒之后返回,<a id="J_url" data-url="/login" href="/login">如果没有跳转,请点击</a></p>
            </dd>
        </dl>
    </div>
</div>
<div class="footer">
  <div class="footer_1200">
    <div class="footer_fr">
      <div class=""><img width="111" height="111" src="/public/images/scan_wwwty_bottom_1.png?_v=1.5.26" alt=""></div>
      <div>
        药智数据APP
      </div>
    </div>
<p>
   <a href="http://about.yaozh.com/about.html">关于我们</a><span class="line">|</span>
    <a href="http://about.yaozh.com/contact.html">联系我们</a><span class="line">|</span>
    <a href="http://about.yaozh.com/qualification.html">企业资质</a><span class="line">|</span>
    <a href="http://about.yaozh.com/join.html">诚聘英才</a><span class="line">|</span>
    <a href="http://about.yaozh.com/link.html">友情链接</a><span class="line">|</span>
     <a href="//help.yaozh.com/" target="_blank">帮助中心</a><span class="line">|</span>
    <a href="//bbs.yaozh.com/forum-123-1.html">媒体报道</a><span class="line">|</span>
    <a href="//bbs.yaozh.com">药智论坛</a><span class="line">|</span>
    <a href="//about.yaozh.com/about/sitemap">全站导航</a><span class="line">|</span>
    <a href="//www.yaozh.com/ued?source=www&name=zhihui_footer">用户体验提升计划</a>
    <a href="//www.yaozh.com/xuan">业务介绍</a>
</p>
<p>互联网增值电信业务许可证编号:渝B2-20120028 | 渝ICP备10200070号  互联网药品信息服务资格证:(渝)-经营性-2016-0011 渝公网备:50010801500236</p>
<p>康洲大数据   版权所有   Copyright © 2009-2020 药智网YAOZH.COM All Rights Reserved.    法律顾问:上海锦天城(重庆)律师事务所 程建律师
<script>
/*GoogleAnaly*/
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
    ga('create', 'UA-73321472-5', 'auto');
    ga('send', 'pageview');

</script>
<span style="display:none"><script type="text/javascript">
var _bdhmProtocol = (("https:" == document.location.protocol) ? " https://" : " //");
document.write(unescape("%3Cscript src='" + _bdhmProtocol + "hm.baidu.com/h.js%3F65968db3ac154c3089d7f9a4cbb98c94' type='text/javascript'%3E%3C/script%3E"));
</script>
</span>
</p>
<p>客户服务热线:400-678-0778    E-mail:[email protected]  商务合作QQ:845146016</p>
<a id="netsafe" target="_blank" href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=50010802001068"><img src="//www.yaozh.com/public/images/netsafe.png"/><p>渝公网安备 50010802001068号</p></a>
<!-- <a id="outer-anquan"  key ="58fd5825efbfb064f4599465"  logo_size="83x30"  logo_type="realname"  href="http://www.anquan.org" ><script src="//static.anquan.org/static/outer/js/aq_auth.js"></script></a> -->
<a id="outer-anquan" target="cyxyv" href="https://v.yunaq.com/certificate?domain=www.yaozh.com&from=label&code=90020" rel="nofollow"> <img height="30" src="https://aqyzmedia.yunaq.com/labels/label_sm_90020.png"></a>
<div class="tousu tousu1">
  <img src="/public/images/tousu.png" alt="">
  <div class="tousu-msg">
    <p>投诉热线: 02362308742</p>
    <p>邮箱: [email protected] </p>
    <p> QQ: 914894005</p>
  </div>
</div>




  </div>
</div>
<script>
  (function(){
  var bp = document.createElement('script');
  var curProtocol = window.location.protocol.split(':')[0];
  if (curProtocol === 'https'){
  bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
    }
  else{
  bp.src = 'http://push.zhanzhang.baidu.com/push.js';
    }
  var s = document.getElementsByTagName("script")[0];
  s.parentNode.insertBefore(bp, s);
  })();

  // 网站底部投诉信息
  $(function () {
    var tousu=$('.tousu')[0];
    var tousumsg=$('.tousu-msg')[0];
    tousu.onmouseover=function(){
        tousumsg.style.display='block';
    }
    tousumsg.onmouseout=function(){
        tousumsg.style.display='none';
    }
    function stop(e){
        e.stopPropagation();
    }
    $(tousu).on('click',stop);
    $(tousumsg).on('click',stop);
    $(document).on('click',function(){
        tousumsg.style.display='none';
    })
  })
</script>

<script>
Wind.use("jquery", "global", function(){
   window.history.forward(1);
   var timer = parseInt($("#J_timer").html()),
      url = $("#J_url").data("url");
   function jump(){
    if(window.frameElement){
      window.frameElement.trigger("refresh");
    }else if(url){
      window.location.href = url;
      }else{
         history.go(-1);
      }
   }
   if(!timer){
      timer = 3;
   }
   var m_timer = setInterval(function(){
      timer = timer - 1;
      if(timer <=0){
         jump();
         clearInterval(m_timer);
      }else{
         $("#J_timer").html(timer);
      }
   },1000); //指定1秒刷新一次
});
</script>
</body>
</html>

在本地浏览器打开01cook.html
在这里插入图片描述

1.2 破解cookie

登录后进入个人中心,按F12打开开发者工具,在请求头中找到Cookie,复制后添加到headers中
在这里插入图片描述

01-cookies2.py

"""
    直接获取 个人中心的页面
    手动粘贴 复制 PC 抓包的 cookies
    放在  request对象的请求头里面 

"""

import urllib.request

# 1. Target URL: the member-centre page.
url = 'https://www.yaozh.com/member/'
# 2. Request headers: browser User-Agent plus a Cookie header pasted from a
# logged-in browser session.
# NOTE(review): this is a captured session cookie committed in source; it will
# stop working once the session expires and should not be shared -- confirm
# whether it should be loaded from an environment variable or file instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.67 Safari/537.36 ',
    'Cookie': 'acw_tc=2f624a2e15971256643447545e7f81ef33b63b4856e8b873b4cc16cfd536b5; '
              'PHPSESSID=1cpe128vekurm2cph66psd64l3; _ga=GA1.2.2061322105.1597125665; '
              '_gid=GA1.2.2134737262.1597125665; _gat=1; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1597125665; '
              'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1597125669; yaozh_logintime=1597125674; '
              'yaozh_user=966627%09linguoqing; yaozh_userId=966627; '
              'yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyXnoaabZtrl5uHnKZxanJT1qeSoMZYoNdzaJFyVM'
              '%2FO0MjZ09Kg05yHn9ibbHFXpJLUrZCnyqPKhnSqm2linYe42DC0C2f7CB153A9DB6B8F574399B6C9Tlp2bkmmaaJ6Vh5ymcWlyU9WinpiDcdieamqbWmOYnpmSlpmXbpprlpyHnLA%3D6b69834e71bb94c162ef96c9de07166a; db_w_auth=806631%09linguoqing; UtzD_f52b_saltkey=dWE3MyHj; UtzD_f52b_lastvisit=1597122076; UtzD_f52b_lastact=1597125676%09uc.php%09; UtzD_f52b_auth=1cd7AgQ6z%2BljwCPcjsOKGsNXC7G%2B9Sd0L375kOtE1RuiRxgDK%2BOOhkzRfqT9Fpf0V7Uol9YdGivvbUC1NLd%2BVcQ1mxI; yaozh_uidhas=1; yaozh_mylogin=1597125678; acw_tc=2f624a2e15971256643447545e7f81ef33b63b4856e8b873b4cc16cfd536b5 '
}

# 3. Build the request object.
request = urllib.request.Request(url, headers=headers)

# 4./5. Send the request and read the body. The context manager closes the
# underlying connection even if read() raises.
with urllib.request.urlopen(request) as response:
    data = response.read()
print(type(data))

# Save the raw bytes to a file so the result can be checked in a browser.
with open('01cook.html', 'wb') as f:
    f.write(data)

在本地浏览器打开01cook.html
在这里插入图片描述
登陆成功!

1.3 再破解cookie

01-cookies3.py

"""
    获取 个人中心的页面
    1. 代码登录  登录成功 cookie(有效)
    2. 自动带着cookie 去请求个人中心

    cookiejar 自动保存这个cookie
"""

import urllib.request
from http import cookiejar
from urllib import parse

# Step 0: work out which parameters the login form submits.

# The backend decides what to do from the HTTP method: a GET returns the login
# page, a POST is treated as a login attempt and returns the login result.

# 1. Log in from code
# 1.1 Login URL (the same URL serves the form on GET)
login_url = 'https://www.yaozh.com/login'
# 1.2 Login form fields
# NOTE(review): the credentials are hard-coded; confirm whether they should be
# read from the environment instead.
# NOTE(review): "backurl" is already percent-encoded here and urlencode() below
# will encode the '%' characters again -- verify against what the browser
# actually posts.
login_form_data = {
    "username": "[email protected]",  # account
    "pwd": "874475806lin",  # password
    "formhash": "D7C67699B1",  # copied from the login page
    "backurl": "https%3A%2F%2Fwww.yaozh.com%2Fmember%2F"  # copied from the login page
}
# 1.3 Send the login request as a POST (what the browser does on submit)
cook_jar = cookiejar.CookieJar()

# Handler that stores cookies from responses in cook_jar and sends them back
# on later requests.
cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
# Opener built around that handler; every request made through it shares the jar.
opener = urllib.request.build_opener(cook_hanlder)

# Request headers used for both the login POST and the member-centre GET.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

# 1. The form fields must be URL-encoded;
# 2. the POST body passed as ``data`` must be bytes.
login_str = parse.urlencode(login_form_data).encode('utf-8')

# Supplying ``data`` makes urllib send a POST instead of a GET.
login_request = urllib.request.Request(login_url, headers=headers, data=login_str)

# If the login succeeds, cook_jar now holds the session cookie. The body is
# not needed, so the response is closed straight away.
with opener.open(login_request):
    pass

# 2. Visit the member centre with the cookie collected above
center_url = 'https://www.yaozh.com/member/'
center_request = urllib.request.Request(center_url, headers=headers)
# Open the Request object (not the bare URL) so the User-Agent header is sent.
with opener.open(center_request) as response:
    # bytes --> str
    data = response.read().decode()

# Open the file once, with an explicit encoding, so the write does not depend
# on the platform default and no second handle is left unclosed.
with open('02cook.html', 'w', encoding='utf-8') as f:
    f.write(data)

在本地浏览器打开02cook.html

在这里插入图片描述

1.4 URLError

# Errors raised by urllib.request: HTTPError and URLError
"""
url = 'https://affdsfsfsdfd.cn'
     raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>
    
url = 'https://blog.csdn.net/zjsxxzh/article/details/110'
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

"""

import urllib.request

import urllib.error

# Example target. The traceback quoted above shows this host failing to
# resolve, which raises URLError; swap in the second URL quoted above to see
# an HTTPError (404) instead.
url = 'https://affdsfsfsdfd.cn'

try:
    response = urllib.request.urlopen(url)

# HTTPError is a subclass of URLError, so it has to be caught first;
# otherwise the URLError clause would handle it as well.
except urllib.error.HTTPError as error:
    # The server answered, but with an error status code.
    print(error.code)

except urllib.error.URLError as error:
    # No HTTP response at all (e.g. the host name could not be resolved).
    print(error)

二、 有关requests库

2.1 基本属性

03-requests_use1.py

# 1.记得安装 第三方 模块 requests
# pip install requests

import requests

url = 'http://www.baidu.com'
response = requests.get(url)

# ``content`` is the raw body as bytes, so it has to be decoded by hand.
raw_body = response.content
data = raw_body.decode('utf-8')

# ``text`` is the body already decoded to str; it replaces the value above.
data = response.text

# Show the type first, then the page itself.
for item in (type(data), data):
    print(item)

2.2 添加请求头

03-requests_use2.py

# 1.记得安装 第三方 模块 requests
# pip install requests

import requests


class RequestSpider:
    """Fetch the Baidu home page once and inspect the resulting response."""

    def __init__(self):
        """Send the GET request with a browser User-Agent and keep the response."""
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/70.0.3538.67 Safari/537.36 '
        }
        self.response = requests.get('https://www.baidu.com', headers=browser_headers)

    def run(self):
        """Read the main response attributes and print both cookie collections."""
        reply = self.response
        sent = reply.request

        # Raw response body (bytes).
        body = reply.content

        # 1. Headers that were sent with the request.
        sent_headers = sent.headers

        # 2. Headers that came back with the response.
        received_headers = reply.headers

        # 3. HTTP status code of the response.
        status = reply.status_code

        # 4. Cookies attached to the outgoing request (private attribute).
        print(sent._cookies)

        # 5. Cookies attached to the response.
        print(reply.cookies)


# Build the spider (its constructor sends the GET request) and print the
# request and response cookies.
RequestSpider().run()

2.3 自动转译

03-requests_use3.py

# https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3&rsv_spt=1&rsv_iqid=0xefb8b43600013949&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&oq=%25E5%25A4%25B4%25E6%259D%25A1&rsv_t=6e3aSjYtw0WgEg7MAIuUlOc3D5lwFBJUVw3KsdkhkWYhZWcNMn9kLBO12GflHlOeUHxx&inputT=506&rsv_pq=81d8f9470001b348&rsv_sug3=19&rsv_sug1=16&rsv_sug7=100&bs=%E5%A4%B4%E6%9D%A1

import requests

# Query parameters passed via ``params`` are encoded into the URL by requests.
# The same query written by hand would be:
# url = 'https://www.baidu.com/s?wd=美女'

url = 'https://www.baidu.com/s'

params = {
    'wd': "美女"
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.67 Safari/537.36 '
}

response = requests.get(url, headers=headers, params=params)

# bytes --> str (decode() defaults to UTF-8)
data = response.content.decode()

# Write with an explicit UTF-8 encoding: the platform default (e.g. GBK on a
# Chinese-locale Windows) may be unable to encode every character in the page.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(data)

# Sending a POST and attaching parameters works the same way:
# requests.post(url, data={...}, json={...})

2.4 json

03-requests_use4.py

# https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3&rsv_spt=1&rsv_iqid=0xefb8b43600013949&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&oq=%25E5%25A4%25B4%25E6%259D%25A1&rsv_t=6e3aSjYtw0WgEg7MAIuUlOc3D5lwFBJUVw3KsdkhkWYhZWcNMn9kLBO12GflHlOeUHxx&inputT=506&rsv_pq=81d8f9470001b348&rsv_sug3=19&rsv_sug1=16&rsv_sug7=100&bs=%E5%A4%B4%E6%9D%A1

import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.67 Safari/537.36 '
}
url = 'https://api.github.com/user'

# This endpoint answers with JSON rather than HTML.
response = requests.get(url, headers=headers)

# Manual route, kept for reference: decode the bytes to str, then parse the
# str into a dict.
# data = response.content.decode()
# data_dict = json.loads(data)
# print(data_dict)

# response.json() parses the JSON body into Python dict/list objects directly.
data = response.json()
print(type(data))
message = data['message']
print(message)

++++++++++结束线++++++++++++++++++

猜你喜欢

转载自blog.csdn.net/qq_42893334/article/details/107933691
今日推荐