17 Advanced Crawler Techniques

1. Multithreaded crawlers

  • Function-based implementation
import threading
import time


def coding():
    for x in range(3):
        print('writing code %s' % x)
        time.sleep(1)


def drawing():
    for x in range(3):
        print('drawing %s' % x)
        time.sleep(1)


if __name__ == '__main__':
    t1 = threading.Thread(target=coding)
    t2 = threading.Thread(target=drawing)
    t1.start()
    t2.start()
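
If the main thread should wait for both workers to finish before exiting (for example, to print a summary), Thread.join can be appended to the example; a minimal sketch:
    t1.join()
    t2.join()
    print('all threads finished')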

  • Class-based implementation
import threading
import time


class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('writing code in %s' % threading.current_thread())
            time.sleep(1)


class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('drawing in %s' % threading.current_thread())
            time.sleep(1)


if __name__ == '__main__':
    t1 = CodingThread()
    t2 = DrawingThread()
    t1.start()
    t2.start()

  • Producer-consumer pattern: Lock version
import random
import threading
import time

gMoney = 1000
gLock = threading.Lock()
gTotalTimes = 10
gTimes = 0


class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes

        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            if gTimes >= gTotalTimes:
                gLock.release()
                break
            gMoney += money
            print('%s produced %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            gLock.release()
            time.sleep(0.5)


class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            if gMoney >= money:
                gMoney -= money
                print('%s consumed %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            else:
                if gTimes >= gTotalTimes:
                    gLock.release()
                    break
                print('%s wanted to consume %d yuan, but only %d yuan left -- not enough!' % (threading.current_thread(), money, gMoney))

            gLock.release()
            time.sleep(0.5)

if __name__ == '__main__':
    for x in range(5):
        t = Producer()
        t.start()

    for x in range(3):
        t = Consumer()
        t.start()

  • Producer-consumer pattern: Condition version (avoids the Lock version's busy retry loop -- consumers wait() until a producer notifies them)
import random
import threading
import time

gMoney = 1000
gCondition = threading.Condition()
gTotalTimes = 10
gTimes = 0


class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes

        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            if gTimes >= gTotalTimes:
                gCondition.release()
                break
            gMoney += money
            print('%s produced %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            # Notify waiting consumers after each round of production
            gCondition.notify_all()
            gCondition.release()
            time.sleep(0.5)


class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            # This must be a while loop, not an if: by the time wait() returns,
            # another consumer may already have spent the money, so the condition
            # has to be re-checked
            while gMoney < money:
                if gTimes >= gTotalTimes:
                    gCondition.release()
                    return
                print('%s wants to consume %d yuan, but only %d yuan left -- not enough!' % (threading.current_thread(), money, gMoney))
                gCondition.wait()
            gMoney -= money
            print('%s consumed %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gCondition.release()
            time.sleep(0.5)


if __name__ == '__main__':
    for x in range(2):
        t = Producer()
        t.start()

    for x in range(3):
        t = Consumer()
        t.start()

2. Doutula meme-image crawler

  • Single-threaded implementation
import requests
from lxml import etree
import re
import os
from urllib import request


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}

url = "https://www.doutula.com/photo/list/?page=1"

response = requests.get(url, headers=headers)
htmlE = etree.HTML(response.text)
# Skip the gif placeholders; the real image URL sits in the data-original attribute
imgs = htmlE.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
os.makedirs("images", exist_ok=True)  # urlretrieve fails if the directory is missing
for img in imgs:
    img_url = img.get("data-original")
    alt = img.get("alt")
    alt = re.sub(r"[\??,,\.!!]", "", alt)  # strip characters that are awkward in filenames
    suffix = os.path.splitext(img_url)[1]
    filename = alt + suffix
    request.urlretrieve(img_url, "images/" + filename)

  • Multithreaded (producer-consumer) implementation
import os
import re
import threading
from queue import Queue
from urllib import request

import requests
from lxml import etree
import time

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}


class Producer(threading.Thread):
    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            print("---")
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url, headers=HEADERS)
        htmlE = etree.HTML(response.text)
        imgs = htmlE.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get("data-original")
            alt = img.get("alt")
            alt = re.sub(r"[\??,,\.!!*]", "", alt)
            suffix = os.path.splitext(img_url)[1]
            filename = alt + suffix
            self.image_queue.put((img_url, filename))
            print(self.image_queue.qsize())
        time.sleep(0.5)


class Consumer(threading.Thread):
    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        while True:
            # Note: this exit check is racy -- a producer may still be parsing a page
            # while both queues happen to be empty for a moment
            if self.image_queue.empty() and self.page_queue.empty():
                break
            image_url, filename = self.image_queue.get()
            request.urlretrieve(image_url, "images/" + filename)
            print(filename + " downloaded")
            time.sleep(0.5)


if __name__ == '__main__':

    url = "https://www.doutula.com/photo/list/?page={}"

    page_queue = Queue(10)
    image_queue = Queue(1000)
    os.makedirs("images", exist_ok=True)  # make sure the download directory exists

    for i in range(1, 3):
        page_queue.put(url.format(i))

    for x in range(5):
        t = Producer(page_queue, image_queue)
        t.start()
    time.sleep(1)
    for x in range(2):
        t = Consumer(page_queue, image_queue)
        t.start()


3. Scraping data from dynamic web pages

  • Analyze the Ajax endpoint the page calls, then request that endpoint directly from code (see the sketch after this list)
  • Use Selenium + Chromedriver to drive a real browser and harvest the rendered data
  • Common Selenium operations: https://selenium-python.readthedocs.io/installation.html
  • Selenium-Python documentation (Chinese translation): http://selenium-python-zh.readthedocs.io/en/latest/waits.html
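
A minimal sketch of the first approach (the endpoint URL, parameters, and JSON keys below are hypothetical; in practice they come from watching the XHR requests in the browser's Network panel):

import requests

# Hypothetical Ajax endpoint observed in the Network panel
url = "https://example.com/api/items?page=1"
headers = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",  # many sites mark Ajax calls with this header
}

response = requests.get(url, headers=headers)
data = response.json()  # the endpoint returns JSON, so no HTML parsing is needed
for item in data.get("items", []):
    print(item)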

Installing Selenium

  • pip3 install selenium

Installing chromedriver

  • Download link: https://sites.google.com/a/chromium.org/chromedriver/downloads
  • After downloading, put it in a plain-English directory path that needs no special permissions
  • Note: the chromedriver version must match the browser version; the 32-bit build also works on 64-bit systems

A first small example

from selenium import webdriver
import time

driver_path = r"D:\chromedriver\chromedriver.exe"

driver = webdriver.Chrome(executable_path=driver_path)

driver.get("http://www.baidu.com")
# page_source returns the page's HTML source
print(driver.page_source)

time.sleep(3)
driver.close()

Locating elements

  • If you only need to extract data from the page, it is better to hand the page source to lxml: lxml is implemented in C underneath, so parsing is faster
  • If you need to interact with an element -- type into a text box, click a button -- you must use the element-finding methods that Selenium provides
from selenium import webdriver
import time
from lxml import etree

driver_path = r"D:\chromedriver\chromedriver.exe"

driver = webdriver.Chrome(executable_path=driver_path)

driver.get("http://www.baidu.com")
# page_source returns the page's HTML source
print(driver.page_source)

# inputTag = driver.find_element_by_id("kw")
# inputTag = driver.find_element_by_name("wd")
# inputTag = driver.find_element_by_class_name("s_ipt")
inputTag = driver.find_element_by_xpath("//input[@class='s_ipt']")
inputTag.send_keys("迪丽热巴")
htmlE = etree.HTML(driver.page_source)

print(htmlE)
time.sleep(3)
driver.close()


Operating form elements with Selenium

  • Text box operations
inputTag = driver.find_element_by_xpath("//input[@class='s_ipt']")
inputTag.send_keys("迪丽热巴")

time.sleep(3)

inputTag.clear()
  • Checkbox operations
inputTag = driver.find_element_by_name("remember")
inputTag.click()
  • Select (drop-down) operations (see the sketch below)
  • Button operations (see the sketch below)
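
Neither bullet has code in the original; below are minimal sketches. For <select> elements, Selenium ships a Select wrapper class (the element name here is illustrative):

from selenium.webdriver.support.ui import Select

# Wrap the raw WebElement; option selection then goes through the wrapper
selectTag = Select(driver.find_element_by_name("jumpMenu"))
selectTag.select_by_index(1)                  # by position
selectTag.select_by_value("option-value")     # by the value attribute
selectTag.select_by_visible_text("Option 1")  # by the visible label
selectTag.deselect_all()                      # only valid for multi-select elements

For buttons, locate the element and call click(); the id 'su' is Baidu's search button, as used in the action-chain example below:

submitBtn = driver.find_element_by_id("su")
submitBtn.click()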

Action chains

from selenium import webdriver
import time
from selenium.webdriver.common.action_chains import ActionChains

driver_path = r"D:\chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.baidu.com")

inputTag = driver.find_element_by_xpath("//input[@class='s_ipt']")
submitBtn = driver.find_element_by_id('su')

actions = ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag, '黄渤')
actions.move_to_element(submitBtn)
actions.click(submitBtn)
actions.perform()

time.sleep(6)

inputTag.clear()


driver.close()


Cookie operations
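
The original leaves this heading empty; below is a minimal sketch of Selenium's standard cookie API (the cookie names and values are made up):

from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"D:\chromedriver\chromedriver.exe")
driver.get("http://www.baidu.com")

# Read all cookies for the current domain
for cookie in driver.get_cookies():
    print(cookie)

# Read, add, and delete individual cookies
print(driver.get_cookie("BAIDUID"))  # name is illustrative
driver.add_cookie({"name": "test", "value": "1"})
driver.delete_cookie("test")
driver.delete_all_cookies()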

Page waits

  • Implicit wait: sets a global timeout that every element lookup will honor
driver.implicitly_wait(10)
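In context, a minimal sketch (the element id here is illustrative, not from the original post):
driver.get("http://www.douban.com")
driver.implicitly_wait(10)  # every find_element_* call now waits up to 10 seconds
element = driver.find_element_by_id("anony-nav")  # illustrative id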
  • Explicit wait
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path = r"D:\chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.douban.com")
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'app-title'))
)
print(element)



Switching between pages (tabs)

from selenium import webdriver

driver_path = r"D:\chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.jd.com")

driver.execute_script("window.open('https://www.douban.com/')")
print(driver.window_handles)
driver.switch_to.window(driver.window_handles[1])

print(driver.current_url)
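
To switch back to the original tab, point the driver at the first handle again:
driver.switch_to.window(driver.window_handles[0])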




Using a proxy with Selenium

from selenium import webdriver

driver_path = r"D:\chromedriver\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://60.17.239.207:31032")
driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)

driver.get("http://www.jd.com")

WebElement objects
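
The original leaves this heading empty as well; a short sketch of commonly used WebElement members, reusing Baidu's search button from the examples above:

submitBtn = driver.find_element_by_id("su")

print(submitBtn.get_attribute("value"))  # an attribute value, e.g. the button caption
print(submitBtn.tag_name)                # 'input'
print(submitBtn.location)                # {'x': ..., 'y': ...}
print(submitBtn.size)                    # {'height': ..., 'width': ...}
driver.save_screenshot("baidu.png")      # save a screenshot of the current page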

4. Lagou.com crawler with Selenium

  • Traditional approach (requests + the Ajax endpoint)
import requests
from lxml import etree
import time
import re

# Request headers
HEADERS = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=$fromSearch=true&suginput=",
    "Host": "www.lagou.com",
}

def request_list_page():
    url1 = 'https://www.lagou.com/jobs/list_python?labelWords=$fromSearch=true&suginput='

    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # Pagination is controlled through the POST data

    for page in range(1, 2):
        data = {
            'first': 'false',
            'pn': page,
            'kd': 'python'
        }
        s = requests.Session()  # create a session
        response = s.get(url=url1, headers=HEADERS, timeout=3)
        cookie = s.cookies  # reuse the cookies set by the listing page
        respon = s.post(url=url, headers=HEADERS, data=data, cookies=cookie, timeout=3)
        time.sleep(7)
        result = respon.json()
        positions = result['content']['positionResult']['result']
        for position in positions:
            positionId = position['positionId']
            position_url = "https://www.lagou.com/jobs/{}.html".format(positionId)
            parse_position_detail(position_url, s)
            break

def parse_position_detail(url, s):
    response = s.get(url, headers=HEADERS)
    text = response.text
    htmlE = etree.HTML(text)
    position_name = htmlE.xpath("//div[@class='job-name']/@title")[0]
    job_request_spans = htmlE.xpath("//dd[@class='job_request']//span")
    salary = job_request_spans[0].xpath("./text()")[0].strip()
    education = job_request_spans[3].xpath("./text()")[0]
    education = re.sub(r"[/ \s]", "", education)
    print(education)
    job_detail = htmlE.xpath("//div[@class='job-detail']//text()")
    job_detail = "".join(job_detail).strip()
    print(job_detail)


if __name__ == '__main__':
    request_list_page()

  • Selenium + Chromedriver approach
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LagouSpider(object):
    """
    Lagou crawler built on Selenium + ChromeDriver
    """
    driver_path = r"D:\chromedriver\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # This URL is not the real job-data endpoint; the listings are rendered by the browser
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=$fromSearch=true&suginput='
        # Scraped positions accumulate here
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            WebDriverWait(self.driver, 10).until(
                # presence_of_element_located can only wait for an element, not for a specific attribute of it
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )

            source = self.driver.page_source
            self.parse_list_page(source)
            next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()

    def parse_list_page(self, source):
        htmlE = etree.HTML(source)
        links = htmlE.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # self.driver.get(url)
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('{}')".format(url))
        self.driver.switch_to.window(self.driver.window_handles[1])

        WebDriverWait(self.driver, 10).until(
            # EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/@title"))
            # Again, we can wait only for the element itself, not for one of its attributes
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
        )

        page_source = self.driver.page_source
        self.parse_detail_page(page_source)
        # Close the detail tab
        self.driver.close()
        # Switch back to the listing tab
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        htmlE = etree.HTML(source)
        position_name = htmlE.xpath("//div[@class='job-name']/h2/text()")[0]
        company = htmlE.xpath("//div[@class='job-name']/h4/text()")[0]
        job_request_spans = htmlE.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath("./text()")[0].strip()
        salary = re.sub(r"[/ \s]", "", salary)
        city = job_request_spans[1].xpath("./text()")[0].strip()
        city = re.sub(r"[/ \s]", "", city)
        experience = job_request_spans[2].xpath("./text()")[0].strip()
        experience = re.sub(r"[/ \s]", "", experience)
        education = job_request_spans[3].xpath("./text()")[0]
        education = re.sub(r"[/ \s]", "", education)
        job_type = job_request_spans[4].xpath("./text()")[0]  # renamed from `type` to avoid shadowing the builtin
        job_type = re.sub(r"[/ \s]", "", job_type)
        job_detail = htmlE.xpath("//div[@class='job-detail']//text()")
        job_detail = "".join(job_detail).strip()
        print("Position: %s" % position_name)
        print("Company: %s" % company)
        print("")
        print(salary + "/" + city + "/" + experience + "/" + education + "/" + job_type)
        print("")
        print(job_detail)

        position = {
            'name': position_name,
            'company': company,
            'salary': salary,
            'city': city,
            'experience': experience,
            'education': education,
            'desc': job_detail
        }
        self.positions.append(position)
        # print(position)
        print("=" * 100)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()

5. The Tesseract library (plan: later train a model with deep learning)

  • Download link: http://github.com/tesseract-ocr
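
A minimal sketch via the pytesseract wrapper (this assumes Tesseract itself is installed, plus pip3 install pytesseract pillow; the image path is illustrative):

import pytesseract
from PIL import Image

# If tesseract is not on PATH, point the wrapper at the executable explicitly:
# pytesseract.pytesseract.tesseract_cmd = r"D:\Tesseract-OCR\tesseract.exe"

image = Image.open("captcha.png")  # illustrative image file
text = pytesseract.image_to_string(image, lang="eng")
print(text)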