Day01 爬取视频+POST请求自动登录GitHub

首先安装requests请求库:pip3 install requests

导入包:

import requests
import time

爬虫三部曲:

1、发送请求

def get_page(url, timeout=10):
    """Send a GET request to ``url`` and return the response.

    Args:
        url: Address of the page or media file to fetch.
        timeout: Seconds to wait for the server to connect or send data
            before ``requests`` raises a timeout error. Defaults to 10.

    Returns:
        The ``requests.Response`` object for the request.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall the whole crawl.
    response = requests.get(url, timeout=timeout)
    return response

2、解析数据

import re
def parse_index(html):
    """Extract every detail-page URL from the HTML of a list page.

    Args:
        html: Text of a list (index) page.

    Returns:
        A list of the captured ``href`` values in document order; an empty
        list when nothing matches.
    """
    # re.S makes "." match newlines as well, so the search covers the
    # whole text rather than stopping at line breaks.
    link_pattern = re.compile(
        'div class="items"><a class="imglink" href="(.*?)"', re.S
    )
    return [found.group(1) for found in link_pattern.finditer(html)]
# Parse a detail page.
def parse_detail(html):
    """Return the first video URL found in a detail page.

    Args:
        html: Text of a detail page.

    Returns:
        The ``src`` value of the first ``<source src="...">`` occurrence,
        or ``None`` when the page contains no such tag.
    """
    # re.S makes "." match newlines, so the captured value may span lines.
    first_source = re.search('<source src="(.*?)"', html, re.S)
    if first_source is None:
        return None
    return first_source.group(1)

3、保存数据

import uuid    # uuid.uuid4() returns a randomly generated UUID (not timestamp-based); collisions are practically impossible
def save_video(content):
    """Write ``content`` to a new ``.mp4`` file named after a fresh UUID.

    Args:
        content: Binary data to store (the file is opened in ``'wb'`` mode).

    The file name is a relative path, so the file is created in the
    current working directory.
    """
    # A new uuid4 per call gives every saved video its own file name.
    filename = f'{uuid.uuid4()}.mp4'
    with open(filename, 'wb') as video_file:
        video_file.write(content)

实战案例:对校花网进行爬取

# Shortcut hint: type "main" and press Enter.
if __name__ == '__main__':
    # Walk list pages 0 through 5.
    for page_no in range(6):
        list_url = f'http://www.xiaohuar.com/list-3-{page_no}.html'

        # Fetch the list page and pull the detail-page links out of it.
        list_res = get_page(list_url)
        for detail_link in parse_index(list_res.text):
            # Request each detail page and look for its video URL.
            video_url = parse_detail(get_page(detail_link).text)
            if not video_url:
                # This detail page has no video URL; move on to the next.
                continue

            print(video_url)

            # Download the video's binary content and save it locally.
            save_video(get_page(video_url).content)

二、POST请求自动登录GitHub

1. 获取token字符串

'''
1.访问登录页面获取token字符串
请求URL:
https://github.com/login
请求方式:
GET
请求头:
Cookies
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
2.解析并提取token字符串
# 正则
<input type="hidden" name="authenticity_token" value="(.*?)" />
'''
import requests
import re
login_url = 'https://github.com/login'
# Request headers for the login page.
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# The timeout keeps the script from hanging forever if the server does not
# answer.
login_res = requests.get(url=login_url, headers=login_headers, timeout=10)

# Parse the authenticity_token out of the hidden form field.
# Indexing with [0] raises IndexError if the field is not in the page.
authenticity_token = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    login_res.text,
    re.S,
)[0]
print(authenticity_token)

# Cookies returned with the login page, as a plain dict; the login POST
# sends them back together with the token.
login_cookies = login_res.cookies.get_dict()
2、开始登录github
'''
POST请求自动登录GitHub:
请求url:
https://github.com/session/
请求方式:
POST
请求头:
cookie
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36

请求体:
commit:Sign in
utf8:✓
authenticity_token:28M+Bm0xt10QgEidyrICyo/53xxYWV0deet0sGQKPdoQG9FXPOqFHZjMQPHHc+RBlOfJTMplbpyJI7yoBZH0zw==
login:*****
password:********
webauthn-support:unsupported
'''
# Submit the login form and save the resulting page.
import os

# URL the login form is POSTed to.
session_url = 'https://github.com/session/'

# Request headers.
session_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'

}
# Request body (form fields).
form_data = {
    "commit": "Sign in",
    # The request captured in the notes above sends a check mark here.
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    # Credentials are read from the environment so that no real account
    # password is stored in the source; a KeyError is raised if either
    # GITHUB_LOGIN or GITHUB_PASSWORD is not set.
    "login": os.environ["GITHUB_LOGIN"],
    "password": os.environ["GITHUB_PASSWORD"],
    "webauthn-support": "supported"
}
# The timeout keeps the script from hanging forever if the server does not
# answer.
session_res = requests.post(
    url=session_url,
    headers=session_headers,
    cookies=login_cookies,
    data=form_data,
    timeout=10,
)
# Save the returned page so it can be opened in a browser to check the result.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(session_res.text)

用浏览器打开生成的github.html文件,可以看到页面已处于登录状态,说明无需手动输入账号密码即可自动登录GitHub。

刚开始尝试时用的是QQ浏览器,登录时提示账号或密码错误;后来换成谷歌浏览器,并相应修改User-Agent等请求数据后,就可以正常登录了。

猜你喜欢

转载自www.cnblogs.com/tanknb/p/11115839.html