Python crawler notes, by shigen
Notes organized on May 27, 2023, now shared.
requests
pip install requests
BeautifulSoup
pip install beautifulsoup4
Reference article: "BeautifulSoup of Python Crawler" on Develop Paper
Case: crawling the Douban Top 250 movie titles
User-Agent lookup tool: get the browser's UA (userAgent) string, e.g. by evaluating navigator.userAgent in the browser console.
import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
url = 'https://movie.douban.com/top250'
response = requests.get(url, headers=header)
content = response.text

soup = BeautifulSoup(content, 'html.parser')
# each movie title sits in a <span class="title"> element
all_titles = soup.find_all('span', attrs={'class': 'title'})
for title in all_titles:
    print(title.get_text())
xpath
# lxml handles HTML/XML parsing
pip install lxml

from lxml import etree

tree = etree.HTML(content)  # reuse the page content fetched in the requests example
titles = tree.xpath('//*[@id="content"]/div/div[1]/ol//li/div/div[2]/div[1]/a/span[1]/text()')
print(titles)
Handling of cookies
session = requests.Session()
# the session carries the cookies of its context across requests
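A minimal sketch of what this buys you, using a hypothetical login endpoint (example.com and the form fields are placeholders, not a real site):
import requests

session = requests.Session()
# the login response sets cookies; the session stores them automatically
session.post('https://example.com/login', data={'user': 'shigen', 'password': '***'})
# subsequent requests through the same session send those cookies back
profile = session.get('https://example.com/profile')
print(profile.status_code)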
Anti-leech (hotlink protection)
The server traces the request back via the Referer header, i.e. the page one level above the current request.
Case: use an anti-leech-protected link to download a Pear Video (pearvideo.com) video.
# -*- encoding: utf-8 -*-
__date__ = '2023/05/28 10:33:03'

# this python script is used to download a video from pearvideo
import os
import random

import requests

url = 'https://www.pearvideo.com/video_1413858'
content_id = url.split('/')[-1].split('_')[-1]

session = requests.Session()
session.get(url)

# mrd is a random cache-busting parameter
request_json_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={content_id}&mrd={random.random()}'
# the anti-leech check requires a Referer pointing back at the video page
headers = {'Referer': url}
res_json = session.get(url=request_json_url, headers=headers).json()
print(res_json)

systemTime = res_json['systemTime']
srcUrl = res_json['videoInfo']['videos']['srcUrl']
videoImg = res_json['videoInfo']['video_image']
# the returned srcUrl embeds a timestamp; replacing it with 'cont-<id>' yields the real address
videoUrl = srcUrl.replace(systemTime, 'cont-' + content_id)
print(videoUrl, videoImg)

basePath = './files/' + content_id
if not os.path.exists(basePath):
    os.makedirs(basePath)

# download the video cover image
img_save_path = os.path.join(basePath, videoImg.split('/')[-1])
with open(img_save_path, 'wb') as file:
    file.write(session.get(videoImg).content)

# download the video
video_save_path = os.path.join(basePath, videoUrl.split('/')[-1])
with open(video_save_path, 'wb') as file:
    file.write(session.get(videoUrl).content)
Proxy
Access the target website through a proxy IP.
Domestic free HTTP proxies, reference: "The latest free HTTP proxy IPs in China, 7:00 on May 28, 2023"
import requests

url = 'http://www.baidu.com'
# free proxies usually speak plain http, even for https traffic
proxy_ip = 'http://114.251.193.153:3128'
proxies = {
    'http': proxy_ip,
    'https': proxy_ip,
}
response = requests.get(url=url, proxies=proxies)
# decode the raw bytes explicitly; response.text may guess the wrong encoding and produce mojibake
text = response.content.decode('utf-8')
print(text)
Multithreading
The relationship between threads and processes:
- thread: the smallest unit of CPU scheduling; threads belong to a process and share its memory
- process: a running instance of a program with its own independent memory space; the unit of resource allocation
Creating and using multiple threads
from threading import Thread
def func(name):
for i in range(100):
print('func() called', i)
class MyThread(Thread):
def run(self):
for i in range(100):
print('mythread func() called', i)
if __name__ == '__main__':
# t = Thread(target=func, args=('shigen',))
# t.start()
t = MyThread()
t.start()
for i in range(100):
print(i)
Multiprocessing
The API is similar to the multithreading one:
from multiprocessing import Process
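A minimal sketch mirroring the Thread example above; on macOS and Windows the __main__ guard is required because child processes re-import the module:
from multiprocessing import Process

def func(name):
    for i in range(10):
        print(name, i)

if __name__ == '__main__':
    # same call shape as Thread: target + args
    p = Process(target=func, args=('shigen',))
    p.start()
    p.join()  # wait for the child process to finish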
Thread pool and process pool
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def fn(name):
    for i in range(100):
        print(name, i)

if __name__ == '__main__':
    # a pool of 50 worker threads
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(fn, name=f'thread-{i}')
    print('all done')
Obtain the results of thread execution:
import time
from concurrent.futures import ThreadPoolExecutor

# sample task; the original note did not define action(), this is a stand-in
def action(x):
    time.sleep(1)
    return x * x

# create a thread pool with 5 worker threads
executor = ThreadPoolExecutor(max_workers=5)
lists = [1, 2, 3, 4, 5, 7, 8, 9, 10]
start_time = time.time()
# map() returns the results in input order
result = [data for data in executor.map(action, lists)]
print(result)
executor.shutdown()
print(time.time() - start_time)
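The import above also brings in ProcessPoolExecutor, which the note never uses. A minimal sketch of the same pattern for CPU-bound work, with fib() as an illustrative stand-in task:
import time
from concurrent.futures import ProcessPoolExecutor

# CPU-bound stand-in task
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

if __name__ == '__main__':
    start = time.time()
    # same executor API, but work runs in separate processes,
    # bypassing the GIL for CPU-bound tasks
    with ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(fib, [30, 31, 32, 33]))
    print(results, time.time() - start)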
Coroutines
When the program hits an IO operation, it can selectively switch to other tasks, all within a single thread:
- Microscopically: tasks are switched one at a time, usually at an IO operation
- Macroscopically: multiple tasks appear to execute asynchronously together
import asyncio

async def download(url):
    print(f'start downloading {url}')
    # simulate an IO-bound operation; the event loop switches to other tasks here
    await asyncio.sleep(2)
    print('downloading finished')

async def main():
    urls = [
        'http://www.google.com',
        'http://www.baidu.com',
        'http://www.xiaomi.com'
    ]
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
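Not in the original note, but asyncio.gather is an equivalent way to await a batch of coroutines that also collects their return values in order. A minimal sketch:
import asyncio

async def download(url):
    print(f'start downloading {url}')
    await asyncio.sleep(2)
    return url

async def main():
    urls = ['http://www.google.com', 'http://www.baidu.com']
    # gather schedules the coroutines and returns their results in order
    results = await asyncio.gather(*(download(u) for u in urls))
    print(results)

if __name__ == '__main__':
    asyncio.run(main())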
HTTP coroutines (aiohttp)
import asyncio

import aiohttp

basepath = './files/'
headers = {'Referer': 'https://xxxx.cn/'}

async def download(url):
    print(f'start downloading {url}')
    filename = url.split('/')[-2]
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            with open(basepath + filename, mode='wb') as f:
                f.write(await resp.content.read())
    print(f'downloading {url} finished')

async def main():
    urls = []  # the URL list was elided in the original note
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    if tasks:  # asyncio.wait raises on an empty task list
        await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
selenium
An automated-testing tool: it opens and operates the browser the way a human would.
pip install selenium
Install the driver:
- Download address: chromedriver.storage.googleapis.com
- Reference article: Python crawler with Selenium - Juejin (Nuggets)
Unzip it into the directory of the Python interpreter and rename it:
tar -zxvf chromedriver_mac_arm64.zip
ll
which python3
mv chromedriver ~/opt/anaconda3/bin
test
from selenium.webdriver import Chrome
# create a browser object
web = Chrome()
web.get('http://www.baidu.com')
print(web.get_window_size())
print(web.title)
Crawl JD.com books
Crawl information about Python books on JD.com and save it as a CSV file.
TODO: some result pages have different page elements, which breaks element lookup.
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.jd.com/')
# locate the search box
search = driver.find_element(By.XPATH, '//*[@id="key"]')
# locate the search button
button = driver.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
# type "Python" into the search box
search.send_keys('Python')
# click the search button
button.click()
time.sleep(2)

# collect the product list
goods_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
datas = []
for good in goods_list[:-1]:
    # CSS selectors are more reliable here
    img = good.find_element(By.CSS_SELECTOR, 'a > img').get_attribute('src')
    name = good.find_element(By.CSS_SELECTOR, 'div > div.p-name.p-name-type-2 > a > em').text
    price = good.find_element(By.TAG_NAME, 'i').text
    url_element = good.find_element(By.CSS_SELECTOR, 'div > a')
    url = url_element.get_attribute('href')
    url_element.click()
    # switch to the newly opened window
    driver.switch_to.window(driver.window_handles[-1])
    # on the detail page, grab the book description, author bio and table of contents
    book_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-3"]/div[2]/div').text
    author_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-4"]/div[2]/div').text
    book_menu = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-6"]/div[2]/div').text
    print(img, name, price, url)
    datas.append({
        'img': img,
        'name': name,
        'price': price,
        'url': url,
        'book_detail': book_detail,
        'author_detail': author_detail,
        'book_menu': book_menu,
    })
    # close the detail window and switch back to the original one
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

df = pd.DataFrame(datas)
df.to_csv('book_detail.csv', index=True, header=True)
Headless browser
Operate the browser without any visible interface.
# create an options object
options = webdriver.ChromeOptions()
# enable headless mode
options.add_argument('--headless')
# disable the GPU
options.add_argument('--disable-gpu')
# instantiate the driver with the options object
driver = webdriver.Chrome(options=options)
Proxy IP
# configure a proxy ip
options.add_argument('--proxy-server=http://150.138.253.70:808')
Replace the User-Agent
# change the User-Agent
options.add_argument('--user-agent=Opera/9.23 (X11; Linux x86_64; U; en)')
Cracking captchas
Use OCR or the Chaojiying (超级鹰) coding platform to recognize verification codes.
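The note shows no code here; as one possibility, a minimal OCR sketch with pytesseract (an assumption, not necessarily the tool the tutorial used; it requires the tesseract binary, and captcha.png is a placeholder for a downloaded captcha image):
# a minimal OCR attempt with pytesseract (assumed tool, not from the original note)
from PIL import Image
import pytesseract

# captcha.png is a placeholder for a captcha image saved beforehand
img = Image.open('captcha.png')
# convert to grayscale to reduce noise before recognition
img = img.convert('L')
code = pytesseract.image_to_string(img).strip()
print(code)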
Recognizing image-click captchas
The tutorial demonstrates using selenium to log in automatically to 12306, the China Railway ticketing website.