Python crawler notes, by shigen
Notes organized on May 27, 2023, now shared.
requests
pip install requests
BeautifulSoup
pip install beautifulsoup4
Reference article: "BeautifulSoup of Python Crawler" on Develop Paper
Case: crawling the Douban Top 250 movie titles
User-Agent lookup tool: get the browser's UA (userAgent) string, e.g. by evaluating navigator.userAgent in the browser console.
import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
url = 'https://movie.douban.com/top250'
response = requests.get(url, headers=header)
content = response.text

soup = BeautifulSoup(content, 'html.parser')
# each movie title sits in a <span class="title"> element
all_titles = soup.find_all('span', attrs={'class': 'title'})
for title in all_titles:
    print(title.get_text())
xpath
# lxml handles HTML/XML parsing
pip install lxml

from lxml import etree

tree = etree.HTML(content)  # reuse the page content fetched in the requests example
titles = tree.xpath('//*[@id="content"]/div/div[1]/ol//li/div/div[2]/div[1]/a/span[1]/text()')
print(titles)
Handling of cookies
session = requests.Session()
# the session carries the cookies of its context across requests
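A minimal sketch of what this buys you, using a hypothetical login endpoint (example.com and the form fields are placeholders, not a real site):
import requests

session = requests.Session()
# the login response sets cookies; the session stores them automatically
session.post('https://example.com/login', data={'user': 'shigen', 'password': '***'})
# subsequent requests through the same session send those cookies back
profile = session.get('https://example.com/profile')
print(profile.status_code)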
Anti-leech (hotlink protection)
The server traces the request back via the Referer header, i.e. the page one level above the current request.
Case: use an anti-leech-protected link to download a Pear Video (pearvideo.com) video.
# -*- encoding: utf-8 -*-
__date__ = '2023/05/28 10:33:03'

# this python script is used to download a video from pearvideo
import os
import random

import requests

url = 'https://www.pearvideo.com/video_1413858'
content_id = url.split('/')[-1].split('_')[-1]

session = requests.Session()
session.get(url)

# mrd is a random cache-busting parameter
request_json_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={content_id}&mrd={random.random()}'
# the anti-leech check requires a Referer pointing back at the video page
headers = {'Referer': url}
res_json = session.get(url=request_json_url, headers=headers).json()
print(res_json)

systemTime = res_json['systemTime']
srcUrl = res_json['videoInfo']['videos']['srcUrl']
videoImg = res_json['videoInfo']['video_image']
# the returned srcUrl embeds a timestamp; replacing it with 'cont-<id>' yields the real address
videoUrl = srcUrl.replace(systemTime, 'cont-' + content_id)
print(videoUrl, videoImg)

basePath = './files/' + content_id
if not os.path.exists(basePath):
    os.makedirs(basePath)

# download the video cover image
img_save_path = os.path.join(basePath, videoImg.split('/')[-1])
with open(img_save_path, 'wb') as file:
    file.write(session.get(videoImg).content)

# download the video
video_save_path = os.path.join(basePath, videoUrl.split('/')[-1])
with open(video_save_path, 'wb') as file:
    file.write(session.get(videoUrl).content)
Proxy
Access the target website through a proxy IP.
Domestic free HTTP proxies, reference: "The latest free HTTP proxy IPs in China, 7:00 on May 28, 2023"
import requests

url = 'http://www.baidu.com'
# free proxies usually speak plain http, even for https traffic
proxy_ip = 'http://114.251.193.153:3128'
proxies = {
    'http': proxy_ip,
    'https': proxy_ip,
}
response = requests.get(url=url, proxies=proxies)
# decode the raw bytes explicitly; response.text may guess the wrong encoding and produce mojibake
text = response.content.decode('utf-8')
print(text)
Multithreading
The relationship between threads and processes:
- thread: the smallest unit of CPU scheduling; threads belong to a process and share its memory
- process: a running instance of a program with its own independent memory space; the unit of resource allocation
Creating and using multiple threads
from threading import Thread
def func(name):
for i in range(100):
print('func() called', i)
class MyThread(Thread):
def run(self):
for i in range(100):
print('mythread func() called', i)
if __name__ == '__main__':
# t = Thread(target=func, args=('shigen',))
# t.start()
t = MyThread()
t.start()
for i in range(100):
print(i)
Multiprocessing
The API is similar to the multithreading one:
from multiprocessing import Process
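A minimal sketch mirroring the Thread example above; on macOS and Windows the __main__ guard is required because child processes re-import the module:
from multiprocessing import Process

def func(name):
    for i in range(10):
        print(name, i)

if __name__ == '__main__':
    # same call shape as Thread: target + args
    p = Process(target=func, args=('shigen',))
    p.start()
    p.join()  # wait for the child process to finish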
Thread pool and process pool
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def fn(name):
    for i in range(100):
        print(name, i)

if __name__ == '__main__':
    # a pool of 50 worker threads
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(fn, name=f'thread-{i}')
    print('all done')
Obtain the results of thread execution:
import time
from concurrent.futures import ThreadPoolExecutor

# sample task; the original note did not define action(), this is a stand-in
def action(x):
    time.sleep(1)
    return x * x

# create a thread pool with 5 worker threads
executor = ThreadPoolExecutor(max_workers=5)
lists = [1, 2, 3, 4, 5, 7, 8, 9, 10]
start_time = time.time()
# map() returns the results in input order
result = [data for data in executor.map(action, lists)]
print(result)
executor.shutdown()
print(time.time() - start_time)
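The import above also brings in ProcessPoolExecutor, which the note never uses. A minimal sketch of the same pattern for CPU-bound work, with fib() as an illustrative stand-in task:
import time
from concurrent.futures import ProcessPoolExecutor

# CPU-bound stand-in task
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

if __name__ == '__main__':
    start = time.time()
    # same executor API, but work runs in separate processes,
    # bypassing the GIL for CPU-bound tasks
    with ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(fib, [30, 31, 32, 33]))
    print(results, time.time() - start)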
Coroutines
When the program hits an IO operation, it can selectively switch to other tasks, all within a single thread:
- Microscopically: tasks are switched one at a time, usually at an IO operation
- Macroscopically: multiple tasks appear to execute asynchronously together
import asyncio

async def download(url):
    print(f'start downloading {url}')
    # simulate an IO-bound operation; the event loop switches to other tasks here
    await asyncio.sleep(2)
    print('downloading finished')

async def main():
    urls = [
        'http://www.google.com',
        'http://www.baidu.com',
        'http://www.xiaomi.com'
    ]
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
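Not in the original note, but asyncio.gather is an equivalent way to await a batch of coroutines that also collects their return values in order. A minimal sketch:
import asyncio

async def download(url):
    print(f'start downloading {url}')
    await asyncio.sleep(2)
    return url

async def main():
    urls = ['http://www.google.com', 'http://www.baidu.com']
    # gather schedules the coroutines and returns their results in order
    results = await asyncio.gather(*(download(u) for u in urls))
    print(results)

if __name__ == '__main__':
    asyncio.run(main())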
HTTP coroutines (aiohttp)
import asyncio

import aiohttp

basepath = './files/'
headers = {'Referer': 'https://xxxx.cn/'}

async def download(url):
    print(f'start downloading {url}')
    filename = url.split('/')[-2]
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            with open(basepath + filename, mode='wb') as f:
                f.write(await resp.content.read())
    print(f'downloading {url} finished')

async def main():
    urls = []  # the URL list was elided in the original note
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    if tasks:  # asyncio.wait raises on an empty task list
        await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
selenium
An automated-testing tool: it opens and operates the browser the way a human would.
pip install selenium
Install the driver:
- Download address: chromedriver.storage.googleapis.com
- Reference article: Python crawler with Selenium - Juejin (Nuggets)
Unzip it into the directory of the Python interpreter and rename it:
tar -zxvf chromedriver_mac_arm64.zip
ll
which python3
mv chromedriver ~/opt/anaconda3/bin
test
from selenium.webdriver import Chrome
# create a browser object
web = Chrome()
web.get('http://www.baidu.com')
print(web.get_window_size())
print(web.title)
Crawl JD.com books
Crawl information about Python books on JD.com and save it as a CSV file.
TODO: some result pages have different page elements, which breaks element lookup.
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.jd.com/')
# locate the search box
search = driver.find_element(By.XPATH, '//*[@id="key"]')
# locate the search button
button = driver.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
# type "Python" into the search box
search.send_keys('Python')
# click the search button
button.click()
time.sleep(2)

# collect the product list
goods_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
datas = []
for good in goods_list[:-1]:
    # CSS selectors are more reliable here
    img = good.find_element(By.CSS_SELECTOR, 'a > img').get_attribute('src')
    name = good.find_element(By.CSS_SELECTOR, 'div > div.p-name.p-name-type-2 > a > em').text
    price = good.find_element(By.TAG_NAME, 'i').text
    url_element = good.find_element(By.CSS_SELECTOR, 'div > a')
    url = url_element.get_attribute('href')
    url_element.click()
    # switch to the newly opened window
    driver.switch_to.window(driver.window_handles[-1])
    # on the detail page, grab the book description, author bio and table of contents
    book_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-3"]/div[2]/div').text
    author_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-4"]/div[2]/div').text
    book_menu = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-6"]/div[2]/div').text
    print(img, name, price, url)
    datas.append({
        'img': img,
        'name': name,
        'price': price,
        'url': url,
        'book_detail': book_detail,
        'author_detail': author_detail,
        'book_menu': book_menu,
    })
    # close the detail window and switch back to the original one
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

df = pd.DataFrame(datas)
df.to_csv('book_detail.csv', index=True, header=True)
Headless browser
Operate the browser without any visible interface.
# create an options object
options = webdriver.ChromeOptions()
# enable headless mode
options.add_argument('--headless')
# disable the GPU
options.add_argument('--disable-gpu')
# instantiate the driver with the options object
driver = webdriver.Chrome(options=options)
Proxy IP
# configure a proxy ip
options.add_argument('--proxy-server=http://150.138.253.70:808')
Replace the User-Agent
# change the User-Agent
options.add_argument('--user-agent=Opera/9.23 (X11; Linux x86_64; U; en)')
Cracking captchas
Use OCR or the Chaojiying (超级鹰) coding platform to recognize verification codes.
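The note shows no code here; as one possibility, a minimal OCR sketch with pytesseract (an assumption, not necessarily the tool the tutorial used; it requires the tesseract binary, and captcha.png is a placeholder for a downloaded captcha image):
# a minimal OCR attempt with pytesseract (assumed tool, not from the original note)
from PIL import Image
import pytesseract

# captcha.png is a placeholder for a captcha image saved beforehand
img = Image.open('captcha.png')
# convert to grayscale to reduce noise before recognition
img = img.convert('L')
code = pytesseract.image_to_string(img).strip()
print(code)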
Recognizing image-click captchas
The tutorial demonstrates using selenium to log in automatically to 12306, the China Railway ticketing website.