Requests is a Python HTTP client library.

Requests supports HTTP keep-alive and connection pooling (persistent connections are maintained automatically), using cookies to keep sessions, file uploads, automatic decoding of response content, and automatic encoding of internationalized URLs and POST data.

It is a high-level wrapper over Python's built-in modules that makes issuing network requests much more human-friendly. With Requests you can easily perform any operation a browser can. Modern, international, and friendly.
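For instance, the file-upload support mentioned above is a one-liner. A minimal sketch, assuming httpbin.org as the target and a local file named report.pdf (both illustrative):

import requests

# Upload a local file as multipart/form-data; httpbin echoes it back
with open("report.pdf", "rb") as f:
    response = requests.post("http://httpbin.org/post", files={"file": f})
print(response.status_code)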
Table of Contents

1. Requests basics
2. Send request and receive response (basic GET request)
3. Send request and receive response (basic POST request)
4. Response attributes
5. Proxies
6. Cookie and session
7. Cases
1. Requests basics
1. Install the Requests library
pip install requests
2. Use the Requests library
import requests
2. Send request and receive response (basic GET request)
response = requests.get(url)
1. Passing the params parameter

- The parameters can be embedded directly in the URL
response = requests.get("http://httpbin.org/get?name=zhangsan&age=22") print(response.text)
- Pass parameters through the params argument of the get method
data = { "name": "zhangsan", "age": 30 } response = requests.get("http://httpbin.org/get", params=data) print(response.text)
2. Simulating browser request headers (passing the headers parameter)
headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" } response = requests.get("http://httpbin.org/get", headers=headers) print(response.text)
3. Send request and receive response (basic POST request)
response = requests.post(url, data=data, headers=headers)
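A runnable sketch against httpbin.org (the form fields are illustrative):

import requests

data = {"name": "zhangsan", "age": 30}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# httpbin echoes submitted form fields back under the "form" key
response = requests.post("http://httpbin.org/post", data=data, headers=headers)
print(response.json()["form"])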
4. Response attributes

- response.text: the response body as str (decoded text)
- response.content: the response body as bytes
- response.status_code: the response status code
- response.headers: the response headers
- response.request: the request object that produced this response
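These attributes can be inspected on any response. A quick sketch using httpbin.org:

import requests

response = requests.get("http://httpbin.org/get")
print(response.status_code)                      # e.g. 200
print(response.headers["Content-Type"])          # response header lookup
print(response.request.headers)                  # the request that was actually sent
print(type(response.text), type(response.content))  # <class 'str'> <class 'bytes'>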
5. Proxies
proxies = { "http": "https://175.44.148.176:9000", "https": "https://183.129.207.86:14002" } response = requests.get("https://www.baidu.com/", proxies=proxies)
6. Cookie and session
- Benefit of using cookies and sessions: many websites require you to log in (or hold certain permissions) before the relevant data can be requested.
- Drawback of using cookies and sessions: a set of cookies and a session usually correspond to one user; requesting too fast or too often is easily recognized by the server as crawler behavior, which puts the account at risk.
1. Do not use cookies when they are not needed.
2. To fetch pages behind a login we must send requests with cookies; in that case, to keep the account safe, slow the data acquisition rate down as much as possible (see the sketch below).
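A common way to do this is simply to pause between requests. A minimal sketch, assuming a one-second delay and an illustrative list of httpbin.org URLs:

import time

import requests

session = requests.session()
urls = ["http://httpbin.org/get?page={}".format(i) for i in range(3)]  # illustrative URLs
for url in urls:
    response = session.get(url)
    print(response.status_code)
    time.sleep(1)  # slow down to avoid being flagged as a crawler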
1. Cookie
(1) Obtain cookie information
response.cookies
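The returned object is a cookie jar; requests ships a helper to turn it into a plain dict. A small sketch, assuming httpbin.org:

import requests

# httpbin sets the cookie from the query string and then redirects;
# skip the redirect so the Set-Cookie response is the one we inspect
response = requests.get("http://httpbin.org/cookies/set?name=zhangsan", allow_redirects=False)
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)  # {'name': 'zhangsan'}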
2. Session
(1) Construct the session object
session = requests.session()
Example:
def login_renren():
    login_url = 'http://www.renren.com/SysHome.do'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    session = requests.session()
    login_data = {
        "email": "account",      # replace with your account email
        "password": "password"   # replace with your password
    }
    # Log in once; the session keeps the resulting cookies for later requests
    response = session.post(login_url, data=login_data, headers=headers)
    response = session.get("http://www.renren.com/971909762/newsfeed/photo")
    print(response.text)

login_renren()
7. Cases
Case 1: Baidu Tieba page crawling (GET request)
import requests
import sys


class BaiduTieBa:
    def __init__(self, name, pn):
        self.name = name
        self.url = "http://tieba.baidu.com/f?kw={}&ie=utf-8&pn=".format(name)
        self.headers = {
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
            # Use an old request header so the server returns a page that does not rely on js
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"
        }
        # One URL per page; Tieba paginates in steps of 50 posts
        self.url_list = [self.url + str(i * 50) for i in range(pn)]
        print(self.url_list)

    def get_data(self, url):
        """Request a page and return its raw bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def save_data(self, data, num):
        """Save a downloaded page under ./pages/."""
        file_name = "./pages/" + self.name + "_" + str(num) + ".html"
        with open(file_name, "wb") as f:
            f.write(data)

    def run(self):
        for num, url in enumerate(self.url_list):
            data = self.get_data(url)
            self.save_data(data, num)


if __name__ == "__main__":
    name = sys.argv[1]
    pn = int(sys.argv[2])
    baidu = BaiduTieBa(name, pn)
    baidu.run()
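Assuming the script is saved as baidu_tieba.py (the filename is not given above, so it is hypothetical), running python baidu_tieba.py 校花 10 downloads the first 10 pages of that forum; the ./pages/ directory must exist beforehand.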
Case 2: Kingsoft PowerWord Translation (POST request)
import requests
import sys
import json


class JinshanCiBa:
    def __init__(self, words):
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.post_data = {
            "f": "auto",
            "t": "auto",
            "w": words
        }

    def get_data(self):
        """Send the POST request and return the response body."""
        response = requests.post(self.url, data=self.post_data, headers=self.headers)
        return response.text

    def show_translation(self):
        """Show the translation result."""
        response = self.get_data()
        json_data = json.loads(response)
        if json_data['status'] == 0:
            translation = json_data['content']['word_mean']
        elif json_data['status'] == 1:
            translation = json_data['content']['out']
        else:
            translation = None
        print(translation)

    def run(self):
        self.show_translation()


if __name__ == "__main__":
    words = sys.argv[1]
    ciba = JinshanCiBa(words)
    ciba.run()
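Invoked the same way, e.g. python jinshan_ciba.py hello (the filename is again hypothetical) prints the translation returned by the API.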
Case 3: Baidu Tieba image crawling
(1) Normal version
Extract the image URLs from the downloaded pages and fetch each image (see Case 1 for how the pages are downloaded).
from lxml import etree

import requests


class DownloadPhoto:
    def __init__(self):
        pass

    def download_img(self, url):
        response = requests.get(url)
        # Name the image after the last segment of its URL
        index = url.rfind('/')
        file_name = url[index + 1:]
        print("Downloading image: " + file_name)
        save_name = "./photo/" + file_name
        with open(save_name, "wb") as f:
            f.write(response.content)

    def parse_photo_url(self, page):
        html = etree.parse(page, etree.HTMLParser())
        nodes = html.xpath("//a[contains(@class, 'thumbnail')]/img/@bpic")
        print(nodes)
        print(len(nodes))
        for node in nodes:
            self.download_img(node)


if __name__ == "__main__":
    download = DownloadPhoto()
    for i in range(6000):
        download.parse_photo_url("./pages/校花_{}.html".format(i))
(2) Multi-threaded version
main.py
import requests
from lxml import etree

from file_download import DownLoadExecutioner, file_download


class XiaoHua:
    def __init__(self, init_url):
        self.init_url = init_url
        self.download_executioner = DownLoadExecutioner()

    def start(self):
        self.download_executioner.start()
        self.download_img(self.init_url)

    def download_img(self, url):
        html_text = file_download(url, type='text')
        html = etree.HTML(html_text)
        img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
        self.download_executioner.put_task(img_urls)
        # Get the link to the next page
        next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
        if not next_page:  # last page reached
            return
        self.download_img("http:" + next_page[0])


if __name__ == '__main__':
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()
file_download.py
import requests
import threading
from queue import Queue


def file_download(url, type='content'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    r = requests.get(url, headers=headers)
    if type == 'text':
        return r.text
    return r.content


class DownLoadExecutioner(threading.Thread):
    def __init__(self):
        super().__init__()
        self.q = Queue(maxsize=50)
        # Image save directory
        self.save_dir = './img/'
        # Image counter
        self.index = 0

    def put_task(self, urls):
        if isinstance(urls, list):
            for url in urls:
                self.q.put(url)
        else:
            self.q.put(urls)

    def run(self):
        while True:
            url = self.q.get()
            content = file_download(url)
            # Name the image after the last segment of its URL
            index = url.rfind('/')
            file_name = url[index + 1:]
            save_name = self.save_dir + file_name
            with open(save_name, 'wb+') as f:
                f.write(content)
            self.index += 1
            print(save_name + " downloaded successfully! Total images downloaded: " + str(self.index))
(3) Thread pool version
main.py
import requests
from lxml import etree

from file_download_pool import DownLoadExecutionerPool, file_download


class XiaoHua:
    def __init__(self, init_url):
        self.init_url = init_url
        self.download_executioner = DownLoadExecutionerPool()

    def start(self):
        self.download_img(self.init_url)

    def download_img(self, url):
        html_text = file_download(url, type='text')
        html = etree.HTML(html_text)
        img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
        self.download_executioner.put_task(img_urls)
        # Get the link to the next page
        next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
        if not next_page:  # last page reached
            return
        self.download_img("http:" + next_page[0])


if __name__ == '__main__':
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()
file_download_pool.py
import requests
import concurrent.futures as futures


def file_download(url, type='content'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    r = requests.get(url, headers=headers)
    if type == 'text':
        return r.text
    return r.content


class DownLoadExecutionerPool:
    def __init__(self):
        # Image save directory
        self.save_dir = './img_pool/'
        # Image counter
        self.index = 0
        # Thread pool
        self.ex = futures.ThreadPoolExecutor(max_workers=30)

    def put_task(self, urls):
        if isinstance(urls, list):
            for url in urls:
                self.ex.submit(self.save_img, url)
        else:
            self.ex.submit(self.save_img, urls)

    def save_img(self, url):
        content = file_download(url)
        # Name the image after the last segment of its URL
        index = url.rfind('/')
        file_name = url[index + 1:]
        save_name = self.save_dir + file_name
        with open(save_name, 'wb+') as f:
            f.write(content)
        self.index += 1
        print(save_name + " downloaded successfully! Total images downloaded: " + str(self.index))
Author: Recalcitrant
Link: https://www.jianshu.com/p/140...