Web Crawler Notes 3: Related Libraries and Login Issues

Requests

Sending requests, passing URL parameters, reading response content (text / binary / JSON), customizing request headers, POST requests, response status codes, redirects and history, timeouts...

import json
import requests
from PIL import Image
from io import BytesIO


# print(dir(requests))
url = "http://www.baidu.com"
r = requests.get(url)
# print(r.status_code)
# print(r.text)
# print(r.encoding)
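
The feature list above also mentions custom headers and timeouts, which the snippets below don't cover; a minimal sketch (httpbin.org simply echoes the request back, and the User-Agent string here is made up):

#Custom headers and timeouts (sketch; the User-Agent value is a placeholder)
'''
headers = {'User-Agent': 'my-crawler/0.1'}
try:
    r = requests.get('http://httpbin.org/get', headers=headers, timeout=5)
    print(r.status_code)  # e.g. 200
except requests.exceptions.Timeout:
    print('request timed out')
'''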

#Passing URL parameters, e.g. http://aaa.com?pageId=1&type=content
'''
params = {'k1': 'v1', 'k2': 'v2'}
r = requests.get('http://httpbin.org/get', params=params)
print(r.url)
#out[]: http://httpbin.org/get?k1=v1&k2=v2
params = {'k1': 'v1', 'k2': None}
r = requests.get('http://httpbin.org/get', params=params)
print(r.url)
#out[]: http://httpbin.org/get?k1=v1  (keys with value None are dropped)
params = {'k1': 'v1', 'k2': [1, 2, 3]}
r = requests.get('http://httpbin.org/get', params=params)
print(r.url)
#out[]: http://httpbin.org/get?k1=v1&k2=1&k2=2&k2=3  (list values are repeated)
'''

#Binary data (e.g. downloading an image)
'''
r = requests.get('https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1532488268&di=1d2c2f507c149bfa09eefa8f217e9c24&src=http://img3.duitang.com/uploads/item/201504/27/20150427220929_ZXRCk.jpeg')
image = Image.open(BytesIO(r.content))
image.save('meinv.jpg')
'''

#JSON handling
'''
r = requests.get('https://github.com/timeline.json')
print(type(r.json()))  # r.json() parses the response body into Python objects
print(r.text)
'''
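
Note that GitHub retired the timeline.json endpoint long ago, so any JSON endpoint will do for this demo; a sketch against httpbin instead:

#Same JSON demo against httpbin (timeline.json is retired)
'''
r = requests.get('http://httpbin.org/get')
data = r.json()  # parsed into a dict
print(type(data), data.get('url'))
'''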

#Raw / streamed data handling
'''
r = requests.get('https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1532488268&di=1d2c2f507c149bfa09eefa8f217e9c24&src=http://img3.duitang.com/uploads/item/201504/27/20150427220929_ZXRCk.jpeg', stream=True)  # stream=True avoids loading the whole body at once
with open('meinv2.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
'''

#Submitting a form
'''
form = {'username': 'user', 'password': 'pass'}
r = requests.post('http://httpbin.org/post', data=form)
print(r.text)  # data is a dict, so it ends up under "form"
r = requests.post('http://httpbin.org/post', data=json.dumps(form))
print(r.text)  # data is a plain string, so it ends up under "data"

'''
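
requests can also upload files as multipart/form-data via the files parameter; a small sketch (the filename and content here are made up):

#File upload via multipart/form-data (sketch; filename/content are made up)
'''
files = {'file': ('upload.txt', b'hello', 'text/plain')}
r = requests.post('http://httpbin.org/post', files=files)
print(r.json()['files'])  # httpbin echoes uploads under "files"
'''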

#cookie
'''
url = 'http://www.baidu.com'
r = requests.get(url)
cookies = r.cookies  # a RequestsCookieJar, which behaves like a dict
for k, v in cookies.get_dict().items():  # iterate over it as a dict
    print(k, v)
'''
'''
cookies = {'c1': 'v1', 'c2': 'v2'}
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text)
'''
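
A requests.Session keeps cookies across requests automatically, which is exactly what the Douban login demo below relies on; a minimal sketch:

#Sessions persist cookies across requests automatically
'''
s = requests.Session()
s.get('http://httpbin.org/cookies/set?c1=v1')  # the server sets a cookie
r = s.get('http://httpbin.org/cookies')        # the session sends it back
print(r.text)  # {"cookies": {"c1": "v1"}}
'''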

#重定向和重定向历史
'''
r = requests.head('http://github.com', allow_redirects=True)  # requested URL
print(r.url)  # final URL after redirects
print(r.status_code)  # status code of the final response
print(r.history)  # list of the intermediate responses
'''
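
Redirects can also be switched off, which keeps the redirect response itself:

#Disabling redirects keeps the 301 itself
'''
r = requests.get('http://github.com', allow_redirects=False)
print(r.status_code)          # 301
print(r.headers['Location'])  # where it would have redirected to
'''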

#Proxies
'''
proxies = {'http': '...', 'https': '...'}  # fill in real proxy URLs
r = requests.get('...', proxies=proxies)
'''
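
A concrete version with a hypothetical local proxy (127.0.0.1:8888 is a placeholder; substitute a real proxy address):

#Concrete proxy sketch (127.0.0.1:8888 is a hypothetical placeholder)
'''
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.text)  # should show the proxy's IP
'''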

BeautifulSoup

Beautiful Soup is a Python library for extracting data from HTML and XML files. It works with your parser of choice to provide idiomatic ways of navigating, searching, and modifying the parse tree, and it can save you hours or even days of work.
Pretty-printing the document, accessing tags, accessing attributes, getting text, handling comments, searching, CSS selectors...

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'html.parser')  # pass a parser explicitly to avoid a warning
# print(soup.prettify())  # re-indent / pretty-print the document

print(type(soup.title))
print(soup.title)
print(soup.title.name)
#out[]:
# <class 'bs4.element.Tag'>
# <title>The Dormouse's story</title>
# title
print(soup.title.text)
#out[]:
# The Dormouse's story

#String
print(type(soup.title.string))
print(soup.title.string)
#out[]:
# <class 'bs4.element.NavigableString'>
#
# The Dormouse's story


#Comment
print(type(soup.a.string))
print(soup.a.string)  # the comment markers are stripped; this prints the comment text directly
#out[]:
# <class 'bs4.element.Comment'>
#  Elsie

for item in soup.body.contents:
    # print(item)
    print(type(item))
#out[]:
# <class 'bs4.element.NavigableString'>
# <class 'bs4.element.Tag'>
# <class 'bs4.element.NavigableString'>
# <class 'bs4.element.Tag'>
# <class 'bs4.element.NavigableString'>
# <class 'bs4.element.Tag'>

#CSS selector queries
#search by class
print(soup.select('.sister'))
#search by id
print(soup.select('#link1'))
#search by parent-child relationship
print(soup.select('head > title'))

a_s = soup.a  # returns the first <a> tag
a_s = soup.select('a')  # select by element name, returns a list
for a in a_s:
    print(a)
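
The feature list also mentions accessing attributes and getting text, which the snippets above skip; a short sketch against the same test.html:

#Attribute access and text extraction (same test.html)
first_a = soup.a
print(first_a['href'])    # http://example.com/elsie
print(first_a.get('id'))  # link1
for a in soup.find_all('a'):
    print(a.get('href'))
print(soup.get_text())    # all the text in the document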

The 'test.html' file used above:

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>

HTMLParser

Running pip install HTMLParser may then fail with ModuleNotFoundError: No module named 'markupbase'.
Fix: download _markupbase.py, extract it into the Python directory under /Lib/site-packages, and rename it to markupbase.py. (On Python 3 the parser also ships in the standard library as html.parser, which avoids all of this; see the sketch after the demo.)

from HTMLParser import HTMLParser

class MyParser(HTMLParser):
    def handle_decl(self, decl):
        HTMLParser.handle_decl(self, decl)
        print('decl: %s' % decl)

    def handle_starttag(self, tag, attrs):
        HTMLParser.handle_starttag(self, tag, attrs)
        print('<' + tag + '>')

    def handle_endtag(self, tag):
        HTMLParser.handle_endtag(self, tag)
        print('</' + tag + '>')

    def handle_data(self, data):
        HTMLParser.handle_data(self, data)
        print('data: %s'% data)

    # self-closing tags such as <br/>
    def handle_startendtag(self, tag, attrs):
        HTMLParser.handle_startendtag(self, tag, attrs)


    def handle_comment(self, data):
        HTMLParser.handle_comment(self, data)
        print('comment: %s' % data)


    def close(self):
        HTMLParser.close(self)
        print('Close')

demo = MyParser()
demo.feed(open('test.html').read())
demo.close()
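
On Python 3 the same parser ships in the standard library, so the pip package and the markupbase workaround can be skipped; a minimal sketch with only the import changed:

#Python 3 standard-library version: only the import differs
from html.parser import HTMLParser as Py3HTMLParser

class MyParser3(Py3HTMLParser):
    def handle_starttag(self, tag, attrs):
        print('<%s> attrs=%r' % (tag, attrs))

MyParser3().feed("<p class='title'>hello</p>")
#out[]: <p> attrs=[('class', 'title')]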

sqlite3

import sqlite3

conn = sqlite3.connect('test.db')  # open the database; created if it does not exist
create_sql = 'create table company(id int primary key not null, emp_name text not null);'
conn.execute(create_sql)
insert_sql = 'insert into company values(?, ?)'
conn.execute(insert_sql, (100, 'LY'))
conn.execute(insert_sql, (200, 'July'))
conn.commit()  # persist the inserts; without this they are rolled back on close
cursors = conn.execute('SELECT id, emp_name FROM company')
for row in cursors:
    print(row[0], row[1])
conn.close()

#out[]:
# 100 LY
# 200 July

Differences from MySQL:

  • Connecting to MySQL requires host (ip, port), username, and password; see the pymysql sketch below.
  • With MySQL, conn.commit() is needed after execute() before the data in the database actually changes (Python's sqlite3 requires this too, as added above).
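
For contrast, a hypothetical MySQL version of the same insert using the pymysql driver (host, user, and password values are placeholders):

#Hypothetical MySQL equivalent via pymysql (credentials are placeholders)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306,
                       user='root', password='***', database='test')
cur = conn.cursor()
cur.execute('insert into company values(%s, %s)', (100, 'LY'))
conn.commit()  # required before the change is persisted
conn.close()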

Douban Login

Logging in by building the form

  1. Log in to Douban.
  2. Press F12 and use the browser's developer tools to inspect the login request's form data and headers (Chrome was used here; screenshot omitted).

demo

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

s = requests.Session()
#In the developer tools (F12), go to Network -> All and inspect the login request's form data, headers, etc.
url_login = 'https://accounts.douban.com/login'

#Build the form
formdata = {
    'redir': 'https://www.douban.com',
    'form_email': '*******',
    'form_password': '******',
    'user_login': u'登录'
}
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

r = s.post(url_login, data=formdata, headers=headers)
content = r.text
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id='captcha_image')  # look for a captcha image
if captcha:
    captcha_url = captcha['src']
    browser = webdriver.Chrome()
    browser.set_page_load_timeout(30)
    browser.get(captcha_url)  # open the captcha URL in a browser

    captcha_text = input('Please input the captcha:')
    browser.close()
    formdata['captcha-solution'] = captcha_text

    r = s.post(url_login, data=formdata, headers=headers)  # resubmit the form with the captcha filled in
with open('contacts.txt', 'w+', encoding='utf-8') as f:
    f.write(r.text)  # write the post-login page content to contacts.txt
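
A quick way to sanity-check the result before trusting contacts.txt (a heuristic sketch; the marker string is an assumption about what Douban's login page contains):

#Heuristic login check (the marker string is an assumption)
if 'form_email' not in r.text:  # the login form should be gone on success
    print('login appears successful')
else:
    print('login may have failed; check credentials/captcha')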

Logging in with cookies

  1. Log in to Douban first.
  2. Copy the cookie and header information from the developer tools, then log out of Douban (screenshot omitted).

demo

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
cookies = {'cookie': 'll="118220"; bid=kMODi3vlz60; _vwo_uuid_v2=D2A6CEB9BA1F28CB6C72479CD072C2988|98a680477d118b228e02f28a166f6936; ps=y; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18185; ap=1; _ga=GA1.2.979972519.1531812222; _gid=GA1.2.356926711.1532706699; __utmc=30149280; __utmz=30149280.1532740641.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=30149280.979972519.1531812222.1532740641.1532746238.4; as="https://www.douban.com/"'}
url = 'http://www.douban.com'
r = requests.get(url, cookies = cookies, headers = headers)
with open('douban_2.txt', 'wb+') as f:
    f.write(r.content)
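
If you copied the raw Cookie header as one string, you can either send it directly as a header or split it into a dict first; a small helper sketch:

#Turn a raw "k1=v1; k2=v2" Cookie header into a dict for requests
raw = 'll="118220"; bid=kMODi3vlz60'
cookie_dict = dict(item.split('=', 1) for item in raw.split('; '))
print(cookie_dict)  # {'ll': '"118220"', 'bid': 'kMODi3vlz60'}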

Scraping the Douban Top 250 movie ratings

import requests
from lxml import etree
import pandas as pd

s = requests.Session()
#Ranking page: https://movie.douban.com/top250
#Observe how the URL changes from page to page:
#page 1: https://movie.douban.com/top250?start=0
#page 2: https://movie.douban.com/top250?start=25
#ten pages in total
names = []
ratings = []
for start in range(0, 250, 25):  # renamed from id to avoid shadowing the builtin
    url = 'https://movie.douban.com/top250/?start=' + str(start)
    r = s.get(url)
    r.encoding = 'utf-8'
    root = etree.HTML(r.content)
    name = root.xpath('//*[@id="content"]/div/div[1]/ol/li/div[1]/div[2]/div[1]/a/span[1]/text()')
    rating = root.xpath('//*[@id="content"]/div/div[1]/ol/li/div[1]/div[2]/div[2]/div[1]/span[2]/text()')
    print(name)
    names.extend(name)
    ratings.extend(rating)
data = {'name': names, 'rating': ratings}
df = pd.DataFrame(data)
#Sort the films by rating in descending order and write them to rating.csv
df['rating'] = df['rating'].astype(float)  # convert so the sort is numeric, not lexicographic
df = df.sort_values(by='rating', axis=0, ascending=False)
df.to_csv('rating.csv', index=False, encoding='gbk')
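
To verify the result, the CSV can be read straight back (assumes rating.csv was just written):

#Read the saved file back to verify
df2 = pd.read_csv('rating.csv', encoding='gbk')
print(df2.head())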

The resulting rating.csv lists the 250 film names with their ratings in descending order (output screenshot omitted).

Reposted from blog.csdn.net/qq_38195197/article/details/81203045