Python notes (3) - requests, crawler, socket, multithreading

Table of contents

1. Use requests to send http requests

1-1) Send a get request

1-2) Send a post request

1-3) Send get request to download network pictures

1-4) Use post to upload files

1-5) Ways to automatically maintain sessions

2. Use os.popen to execute cmd commands

3. A crawler based on beautifulSoup

3-1) Initialize the parser

3-2) Find page elements

3-2-1) find, findAll

3-2-2) Use css selectors to search

4. Socket communication

5. Multithreading

Using multithreading to improve the socket server (one-to-many communication):


1. Use requests to send http requests

> Run `pip install requests` to install the module.

import requests

1-1) Send a get request

# Issue a GET request; `params` becomes the URL query string and `headers`
# the extra request headers (both are empty here).
params = {}
headers = {}
resq = requests.get('https://www.baidu.com', params=params, headers=headers)
resq.encoding = 'utf-8'         # encoding used when decoding resq.text
# print(resq.text)              # response body decoded as text (the HTML)
print(resq.headers)             # response headers
# print(resq.content)             # response body as raw bytes
# print(resq.status_code)         # HTTP status code, e.g. 200

1-2) Send a post request

# Issue a POST request; `data` is sent as the form-encoded request body.
data = {'name':'张三','age':23,'gender':'男'}
headers = {}
req = requests.post('http://www.test.cn', data=data,headers=headers)
print(req.text)                 # response body decoded as text
print(req.headers)              # response headers

1-3) Send get request to download network pictures

# Download an image: fetch it with GET and write the raw bytes to disk.
resq = requests.get('https://p.qqan.com/up/2018-5/2018050911304322378.jpg')
resq.raise_for_status()             # do not save an HTTP error page as an image
# The source is a JPEG, so the saved file keeps the matching extension.
with open('./img_download2.jpg', 'wb') as f:
    # resq.content is the raw byte stream; resq.text is the decoded text
    f.write(resq.content)

1-4) Use post to upload files

# File upload: log in first, then upload using the cookies of that login.
# NOTE(review): `resq` has to be the response of the login request; the login
# call itself is not part of this snippet.
cookie = resq.cookies             # cookies obtained after logging in
data = {'name':'testFile'}
# Open the file in a `with` block so the handle is closed once the upload is done.
with open('G:/test_img.xls', 'rb') as upload:
    file = {'file': upload}
    res = requests.post(url='http://target.cn', data=data, files=file, cookies=cookie)
print(res.status_code)
print(res.text)

1-5) Ways to automatically maintain sessions

Call requests.session() to get a session object and send the requests through it. The session keeps the cookies (and therefore the login state) between requests automatically, so you do not have to pass them along yourself.

# A session keeps cookies between requests, so the upload below is sent
# with the cookies set by the login response.
session = requests.session()
# Both keys must be string literals; a bare `password` is an undefined name.
data = {'user':'admin', 'password':'123112233'}
req = session.post('http://denglu.cn',data=data)    # log in

data = {'name':'testFile'}
with open('G:/test_img.xls', 'rb') as upload:       # closed automatically after the upload
    file = {'file': upload}
    res = session.post(url='http://target.cn',data=data,files=file)    # upload the file
print(res.status_code)
print(res.text)

2. Use os.popen to execute cmd commands

import os

# os.popen runs the command in a shell and returns a file-like pipe;
# read() collects the command's output instead of printing it directly.
ipconfig = os.popen('ipconfig').read()      # capture the output in a variable
print(ipconfig)

# os.popen('notepad.exe')         # open Notepad
os.popen('explorer.exe E:\\')       # open File Explorer at drive E:

3. A crawler based on beautifulSoup

> pip install bs4

> Run `pip install lxml` to install the lxml parser.

Available parsers: html.parser, lxml, xml, html5lib

from bs4 import BeautifulSoup
import requests

# Fetch the Taobao home page
res = requests.get('https://www.taobao.com/')
res.encoding = 'utf8'           # decode res.text as UTF-8

3-1) Initialize the parser

# `html` is a bs4.BeautifulSoup instance: the parsed HTML document
html = BeautifulSoup(res.text, 'lxml')

3-2) Find page elements

print(html.head.title)          # the <title> tag, e.g. <title>淘宝</title>
print(html.head.title.string)   # the text inside it, e.g. 淘宝
print(html.div)         # the first div in the document, including all its children
print(html.div.div)     # the first div nested inside that first div

General methods for finding page elements: find, findAll, select
1. find, findAll: search by tag name, attributes, etc. (find returns the first match; findAll returns all matches as a list)
2. select: search with a CSS selector, e.g. div, #id, .class

3-2-1) find, findAll

Find by tag name

# Find every link by tag name.  href=True keeps only the <a> tags that carry
# an href attribute, so link['href'] cannot raise KeyError for bare anchors.
# find_all is the current spelling of the legacy findAll alias.
links = html.find_all('a', href=True)
for link in links:
    print(link['href'])

Find by class name

# Search by CSS class name; the keyword is class_ (with a trailing
# underscore) because `class` is a reserved word in Python
hots = html.find(class_='search-hots-fline')
print(hots)

Find by id

# Find the first tag with id='J_Search', then search further down inside it.
j_search = html.find(id='J_Search')         # look up by id
# find() returns None when nothing matches, so guard before drilling down.
res = j_search.find_all('input') if j_search is not None else []   # look up by tag name
print(res)                  # a list of the matching input elements
if res:
    print(res[0])           # the first input element

Find by text content (innerText)

# Search by text content (innerText)
content = html.find(string='新款连衣裙')
print(content)      # 新款连衣裙 when found, None otherwise
if content is not None:
    # .parent only exists on a real match; None has no such attribute
    print(content.parent)   # the tag whose text is 新款连衣裙

Find by tag name and attribute value

# Return the first form whose data-sg-type attribute has the value 'form'
form = html.find('form',{'data-sg-type':'form'})   # tag name plus an attribute filter
print(form)         # None when nothing matches

3-2-2) Use css selectors to search
# Search with a CSS selector (a real comment instead of a stray string literal)
# Every .search-hots-fline element nested inside .search-hots-lines
flines = html.select('.search-hots-lines .search-hots-fline')
print(flines)           # a list of the matching tags
print(len(flines))



# The spaces are descendant combinators: every <a> anywhere inside a <div>
# that is itself anywhere inside .search-hots-lines (not only direct children).
lis = html.select('.search-hots-lines div a')
print(lis)                      # every matching a tag
if lis:                         # select() returns an empty list when nothing matches
    print(lis[0].string)        # e.g. 新款连衣裙

4. Socket communication

Server:

import socket

s = socket.socket()

# Bind to every local interface and open port 6660 for this socket server
s.bind(('0.0.0.0', 6660))

# Start listening
s.listen()

# Accept exactly one client. The first value is the connected socket used to
# talk to that client; the second is the client's address (ip, port).
chanel, client = s.accept()

# Accepting inside the loop would block on accept() after every message and
# wait for a new connection, so the connection is accepted once, above.
with s, chanel:                                 # close both sockets when the loop ends
    while True:
        data = chanel.recv(1024)                # read at most 1024 bytes
        if not data:
            # recv() returns b'' once the client has closed the connection;
            # without this check the loop would spin forever on empty reads.
            break
        print('连接方信息:',client, end=' ;')      # client ip and port
        receive = data.decode()                 # decode the client's message
        print(f'接受到消息:{receive}')               # show the message
        reply = receive.replace('吗','!')
        chanel.send(reply.encode())             # send the reply back

Client:

import socket

s = socket.socket()

s.connect(('192.168.1.11', 6660))       # server ip address and port

with s:                                 # close the socket when the loop ends
    while True:
        msg = input()                   # text to send, typed on the console
        if not msg:
            # An empty line would send zero bytes, the server would never
            # answer, and recv() below would block forever.
            continue
        s.send(msg.encode())            # send it
        receive = s.recv(1024)          # wait for the reply; at most 1024 bytes are read
        if not receive:
            # recv() returns b'' once the server has closed the connection
            break
        print(f'对方回复:{receive.decode()}')

5. Multithreading

import random,threading,time

def test(num):
    time.sleep(random.choice([0.1, 0.3, 0.6, 1]))
    print(f'当前线程为:{threading.currentThread().getName()},{num ** 2}')

if __name__ == '__main__':
    print(f'当前执行的线程为:{threading.currentThread().getName()}')
    for i in range(9):
        # 创建一个线程,该线程需要执行的函数是 test,传递的参数为 i
        thread = threading.Thread(target=test, args=(i + 1,))
        thread.start()              # 开始执行该线程

Using multithreading to improve the socket server (one-to-many communication):

Server:

import threading,socket

s = socket.socket()
s.bind(('0.0.0.0',6660))
s.listen()

def socket_server():
    """Serve clients one after another on the shared listening socket.

    Each pass blocks in accept() until a client connects, answers that
    client's messages until it disconnects, and then waits for the next
    client.  A loop is used instead of having the function call itself, so a
    long-running server cannot hit the recursion limit.
    """
    while True:
        chanel, client = s.accept()
        with chanel:        # close this connection once the client is gone
            while True:
                req = chanel.recv(1024).decode()
                if req == '':
                    # recv() returns b'' when the client closes the connection
                    print(f'{client} 断开了连接')
                    break
                print(f'接收到连接:{client};收到消息:{req}')
                # named `reply`, not `str`, so the builtin is not shadowed
                reply = req.replace('?','.')
                chanel.send(reply.encode())

# Two worker threads, so up to two clients are served at the same time.
for i in range(2):
    thread = threading.Thread(target=socket_server)
    thread.start()

Several clients reuse the following code:

import socket

s = socket.socket()

s.connect(('192.168.1.2',6660))         # server ip address and port

with s:                                 # close the socket when the loop ends
    while True:
        msg = input()                   # text to send, typed on the console
        if not msg:
            # An empty line would send zero bytes, the server would never
            # answer, and recv() below would block forever.
            continue
        s.send(msg.encode())
        res = s.recv(1024).decode()     # at most 1024 bytes of reply
        if res == '':
            # recv() returns b'' once the server has closed the connection
            break
        print(f'收到对方回复:{res}.')

Guess you like

Origin blog.csdn.net/hao_13/article/details/132608653