路飞学城-爬虫集训营-第一章

学习心得:
1.课程讲的十分详细,对初学者来说是个不错的选择
2.wusir直播时讲拉勾网登录讲了很多注意事项,让自己也发现了自己的错误点
3.alex鸡汤太少了!!alex鸡汤太少了!!alex鸡汤太少了!!alex鸡汤太少了!!

第三方库---requests
  1.安装
    pip install requests
    注意使用python3时在linux系统下可能是 pip3

  2.requests部分常用参数
    1.url 必填 需要访问的网址
    2.params 选填 发送请求的参数,有的URL已经拼接好不需要重复发送(GET请求)
    3.data 选填 发送请求的参数,有的URL已经拼接好不需要重复发送(POST请求)
    4.headers 选填 请求头,有些网站必须验证请求头比如USER-AGENT
    5.cookies 选填 用户数据,有的网站会验证是否登录,此时可以携带cookie
    补充:
      1.post请求里的data即HTTP请求报文中的请求体(body),位于请求头之后,与请求头之间以 \r\n\r\n 分隔



  3.发送请求
    1.get请求
      response = requests.get(url='https://www.baidu.com',params={'例子':'例子'},headers={'host':'www.baidu.com'})#发送请求
      response.encoding = 'utf-8'#设定编码
      response.text #返回文本

    2.post请求
      response = requests.post(url='https://www.baidu.com',data={'例子':'例子'},headers={'host':'www.baidu.com'})
      ....

第三方库---beautifulsoup
  1.安装
    pip install bs4

  2.分析网页
    soup = BeautifulSoup(html,'lxml')#html为请求后的text内容(html源码),lxml为解析库(需要另外安装:pip install lxml)
    soup.find(name='标签名',attrs={'id':'values'})#name标签名,attrs标签属性


实例:模拟登录GitHub
#-*- coding: utf-8 -*-
# Author:w k

'''
1.访问登录页面并从中解析出authenticity_token
2.携带上一步返回的cookie,把authenticity_token和账号密码作为参数发送POST请求
3.请求成功后获取已登录的用户名
4.访问该用户的个人主页并获取资料
'''


import requests
from bs4 import BeautifulSoup
from config import *


def login(username, password, token, cookies):
    """Submit the login form to ``LOGIN_URL`` and return the signed-in user.

    :param username: value sent as the ``login`` form field.
    :param password: value sent as the ``password`` form field.
    :param token: HTML source of the login page (not the token itself); the
        hidden ``authenticity_token`` input is read from it.
    :param cookies: cookies sent along with the login POST.
    :return: ``(user, cookies_dict)`` on success; ``None`` when the token
        field is missing from ``token``, the response status is neither 200
        nor 302, or the response page has no signed-in user name element.
    """
    login_page = BeautifulSoup(token, 'lxml')
    token_input = login_page.find(name='input', attrs={'name': 'authenticity_token'})
    if token_input is None:
        # No hidden form token in the page, so there is nothing to submit.
        return None

    form = {
        "commit": "Sign+in",
        "utf8": "✓",
        "authenticity_token": token_input.get('value'),
        "login": username,
        "password": password,
    }
    result = requests.post(url=LOGIN_URL, data=form, headers=HEADERS, cookies=cookies)

    # The previous test `status_code == 200 or 302` was always true because
    # the bare `302` is truthy; compare against both codes explicitly.
    if result.status_code not in (200, 302):
        return None

    # The user name is read from the page returned after the POST.
    landing_page = BeautifulSoup(result.text, 'lxml')
    user_tag = landing_page.find(name='strong', attrs={"class": "css-truncate-target"})
    if user_tag is None:
        # Presumably the login was rejected and the form was shown again;
        # report failure instead of raising AttributeError.
        return None
    return user_tag.text, result.cookies.get_dict()


def getPage(url, cookie=None, need_cookie=False):
    """Fetch ``url`` with a GET request and return the page text.

    Some pages require a logged-in session, so cookies may be supplied.

    :param url: address to request.
    :param cookie: optional cookies; passed to the request only when truthy.
    :param need_cookie: when true, also return the response cookies.
    :return: the response text, or ``(text, cookies_dict)`` when
        ``need_cookie`` is true; ``None`` when the status code is not 200 or
        any exception is raised while requesting.
    """
    try:
        request_kwargs = {'headers': HEADERS}
        if cookie:
            request_kwargs['cookies'] = cookie
        response = requests.get(url, **request_kwargs)

        if response.status_code != 200:
            return None

        body = response.text
        return (body, response.cookies.get_dict()) if need_cookie else body
    except Exception:
        print('请求失败~')
        return None

def _tag_string(doc, tag, attrs, inner_div=False):
    """Return ``.string`` of the first tag matching ``tag``/``attrs``, or ``None``.

    With ``inner_div`` the string is read from the first ``<div>`` nested in
    the matched tag. ``None`` is returned when either element is missing.
    """
    node = doc.find(name=tag, attrs=attrs)
    if node is not None and inner_div:
        node = node.div
    return None if node is None else node.string


def parseInformation(html):
    """Extract profile fields from a user page and format them as a report.

    Every field is optional: a field whose element is not present in the page
    is reported as ``None``. This now includes the nickname and user name,
    which previously raised ``AttributeError`` when their element was absent.

    :param html: HTML source of the profile page.
    :return: multi-line text with one line per profile field.
    """
    doc = BeautifulSoup(html, 'lxml')

    fields = {
        'nickname': _tag_string(doc, 'span', {'itemprop': 'name'}),
        'username': _tag_string(doc, 'span', {'itemprop': 'additionalName'}),
        'bio': _tag_string(
            doc,
            'div',
            {'class': 'd-inline-block mb-3 js-user-profile-bio-contents'},
            inner_div=True,
        ),
        'company': _tag_string(doc, 'span', {'class': 'p-org'}, inner_div=True),
        'location': _tag_string(doc, 'span', {'class': 'p-label'}),
        'email': _tag_string(doc, 'a', {'class': 'u-email'}),
        'url': _tag_string(doc, 'a', {'class': 'u-url'}),
    }

    # Same text as before, spelled line by line so the literal does not
    # depend on the indentation of the source file.
    report = (
        '=====用户个人信息====\n'
        '昵称:{nickname}\n'
        '用户名:{username}\n'
        'bio:{bio}\n'
        '公司:{company}\n'
        '位置:{location}\n'
        '邮箱:{email}\n'
        '网址:{url}\n'
    )
    return report.format(**fields)

def main(user, pwd):
    """Program entry point: log in and print the signed-in user's profile.

    Requests ``TOKEN_URL`` for the login page and its cookies, passes both to
    ``login`` together with the credentials, then requests
    ``https://github.com/<user name>`` and prints the parsed profile. On any
    failure a message is printed and the process ends via ``sys.exit(-1)``.

    :param user: account name passed to ``login``.
    :param pwd: account password passed to ``login``.
    :return: None
    """
    # `sys.exit` replaces the `exit` helper, which is installed by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    import sys

    # getPage returns None on failure, so check before unpacking instead of
    # relying on a broad `except Exception` around the tuple assignment.
    page = getPage(url=TOKEN_URL, need_cookie=True)
    if page is None:
        print('无法获得Token或者Cookie')
        sys.exit(-1)
    login_html, cookie = page

    result = login(username=user, password=pwd, token=login_html, cookies=cookie)
    if not result:
        print('登陆出错,请检查用户名或者密码或者网络再重新尝试~')
        sys.exit(-1)

    # login returns (user name, cookies of the logged-in session).
    username, session_cookies = result
    html = getPage(url='https://github.com/' + username, cookie=session_cookies)
    if not html:
        print('无法解析个人空间页面请重试')
        sys.exit(-1)

    print(parseInformation(html))

# Run the login flow only when this file is executed as a script.
# NOTE(review): `user` and `pwd` are not defined in this file; presumably they
# are provided by `from config import *` -- confirm against config.py.
if __name__ == '__main__':
    main(user,pwd)

猜你喜欢

转载自www.cnblogs.com/flower-poison/p/9298781.html