14. Multithreaded crawlers, and notes on scraping Douyin videos

1. Understanding the relationship between join() and task_done() on a Queue
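A minimal sketch of how the two methods cooperate (the worker thread here is purely illustrative): every put() adds one unfinished task to the queue's internal counter, every task_done() removes one, and join() blocks until that counter drops back to zero.

# coding=utf-8
from queue import Queue
import threading
import time

q = Queue()


def worker():
    while True:
        item = q.get()   # take one task; blocks while the queue is empty
        time.sleep(0.1)  # pretend to do some work
        print("processed", item)
        q.task_done()    # tell the queue this task is finished


# daemon thread: it is killed automatically once the main thread exits
t = threading.Thread(target=worker, daemon=True)
t.start()

for i in range(5):
    q.put(i)             # each put() counts as one unfinished task

q.join()                 # blocks until task_done() has been called once for every put()
print("all queue tasks done, main thread can exit")

The spider in the next section uses exactly this pattern: daemon worker threads loop over get()/task_done(), and the main thread simply calls join() on each queue.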

2. Multithreaded crawler (Qiushibaike)

2.1. Implementation

# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue  # thread-safe queue used to pass work between threads


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/63.0.3239.84 Safari/537.36"}
        self.url_queue = Queue()      # queue of URLs waiting to be crawled
        self.html_queue = Queue()     # queue of raw HTML responses
        self.content_queue = Queue()  # queue of extracted result lists

    def get_url_list(self):
        # return [self.url_temp.format(i) for i in range(1,14)]
        for i in range(1, 4):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        while True:
            url = self.url_queue.get()

            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):  # extract data from the HTML
        while True:
            html_str = self.html_queue.get()

            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # one div per post
            content_list = []
            for div in div_list:
                item = {}
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = [i.replace("\n", "") for i in item["content"]]
                item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(
                    item["author_gender"]) > 0 else None
                item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"]) > 0 else None
                item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None
                item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
                item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()  # mark this HTML page as processed

    def save_content_list(self):  # save (here: just print)
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                print(i)
                pass
            self.content_queue.task_done()

    def run(self):  # main logic
        thread_list = []
        # 1.url_list
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. iterate: send requests, fetch responses
        for i in range(20):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. extract data
        for i in range(2):
            t_html = threading.Thread(target=self.get_content_list)
            thread_list.append(t_html)
        # 4. save
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # daemon threads are killed automatically once the main thread exits
            t.start()

        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block the main thread until every task in this queue has been marked done

        print("主线程结束")


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()

2.2. Notes

  • The point here is mastering how multiple threads cooperate through a Queue; the scraped content itself has little value, and the Qiushibaike page structure has since changed

3. Using a Python crawler to batch-download Douyin app videos (requests + Fiddler + Appium)

00. Douyin crawler tutorial: https://www.cnblogs.com/stevenshushu/p/9635097.html

01. Setting up a virtual environment on Windows: https://blog.csdn.net/qq_33404767/article/details/86479820

02. Installing and using Fiddler: https://blog.csdn.net/ychgyyn/article/details/82154433

  • Fixing garbled text in Fiddler: https://blog.csdn.net/quiet_girl/article/details/50577828

03. Appium Desktop 1.6.5 + Python 3.6

  • Appium 1.6.3 also works; don't install a version that is too new, or adb may not be found: https://blog.csdn.net/qq_33236708/article/details/78061787
  • Appium-Desktop download: https://testerhome.com/topics/680

04. Locating elements:

  • https://www.jianshu.com/p/6d71624cb5bb

  • https://www.cnblogs.com/bendouyao/p/9346379.html

  • A small example: automating the Android Calculator app (code below)

# coding=utf-8
from appium import webdriver
import time

desired_caps = {}
desired_caps['platformName'] = 'Android'
desired_caps['platformVersion'] = '5.0'
desired_caps['deviceName'] = 'T7G0215A14000138'
desired_caps['appPackage'] = 'com.android.calculator2'
desired_caps['appActivity'] = '.Calculator'
desired_caps["unicodeKeyboard"] = "True"  # appium提供的一种输入法,可以传中文。测试时直接用这个输入法
desired_caps["resetKeyboard"] = "True"  # 程序结束时重置原来的输入法
desired_caps["noReset"] = "False"  # 不初始化手机app信息(类似不清楚缓存)

driver = webdriver.Remote('http://localhost:4723/wd/hub', desired_caps)

driver.find_element_by_id("com.android.calculator2:id/digit1").click()
driver.find_element_by_id("com.android.calculator2:id/digit5").click()
driver.find_element_by_id("com.android.calculator2:id/digit9").click()
driver.find_element_by_id("com.android.calculator2:id/plus").click()
driver.find_element_by_id("com.android.calculator2:id/digit2").click()
driver.find_element_by_id("com.android.calculator2:id/digit3").click()
driver.find_element_by_id("com.android.calculator2:id/equal").click()

time.sleep(5)
driver.quit()

  • A small example: locating elements and logging in to the QQ app (code below)
# coding=utf-8
from appium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


desired_caps = {}
desired_caps['platformName'] = 'Android'
desired_caps['platformVersion'] = '5.0'
desired_caps['deviceName'] = 'T7G0215A14000138'
desired_caps['appPackage'] = 'com.tencent.mobileqq'
desired_caps['appActivity'] = '.activity.SplashActivity'
desired_caps["unicodeKeyboard"] = "True"  # appium提供的一种输入法,可以传中文。测试时直接用这个输入法
desired_caps["resetKeyboard"] = "True"  # 程序结束时重置原来的输入法
desired_caps["noReset"] = "False"  # 不初始化手机app信息(类似不清楚缓存)

driver = webdriver.Remote('http://localhost:4723/wd/hub', desired_caps)

WebDriverWait(driver, 60).until(
    # this can only wait for the element itself, not for a specific attribute of the element
    EC.presence_of_element_located((By.ID, "com.tencent.mobileqq:id/dialogRightBtn"))
)

driver.find_element_by_id("com.tencent.mobileqq:id/dialogRightBtn").click()
time.sleep(1)
driver.find_element_by_id("com.tencent.mobileqq:id/btn_login").click()
time.sleep(1)
element = driver.find_element_by_accessibility_id('请输入QQ号码或手机或邮箱')
element.clear()
element.send_keys("594042358")
# driver.find_element_by_xpath('//android.widget.EditText[@content-desc="请输入QQ号码或手机或邮箱"]').send_keys("594042358")
driver.find_element_by_xpath('//android.widget.EditText[@content-desc="密码 安全"]').send_keys("fanjianhai")
time.sleep(1)
driver.find_element_by_xpath('//android.widget.ImageView[@content-desc="登 录"]').click()

time.sleep(8)
driver.quit()


"""
{
    "platformName": "Android",
    "platformVersion": "5.0",
    "deviceName": "T7G0215A14000138",
    "appPackage": "com.tencent.mobileqq",
    "appActivity": ".activity.SplashActivity"
}
"""

05. A complete example: scraping Douyin short videos

  • Fiddler script (JScript.NET; typically pasted into the OnBeforeResponse handler of CustomRules.js via Rules → Customize Rules); it appends every captured ixigua.com video URL to a timestamped text file under D:\douyin\url
 // begin: save-URLs-to-local-file addition
        try{
        
            // Douyin video hosts: ||"v1-dy.ixigua.com"||"v3-dy.ixigua.com"||"v6-dy.ixigua.com"||"v9-dy.ixigua.com"||
            if (oSession.fullUrl.Contains("ixigua.com")){
                var d = new Date();
                
                
                var fso;
                var file;
                var hours = d.getHours();
                var hoursValue = parseInt(hours)
                if(hoursValue>= 0 && hoursValue <= 9)
                {
                    hoursValue = "0" + hoursValue
                }
                
                var minutes = d.getMinutes();
                var minutesValue = parseInt(minutes)
                if(minutesValue>= 0 && minutesValue <= 9)
                {
                    minutesValue = "0" + minutesValue
                }
                
                fso = new ActiveXObject("Scripting.FileSystemObject");
                // file save path, can be customized
                file = fso.OpenTextFile("D:\\douyin\\url\\video_url_"+d.getDay()+"_"+hoursValue + "_" + minutesValue + ".txt" ,8 ,true);
                //file.writeLine("Request-url:" + oSession.url);
                file.writeLine("http://"+oSession.url)
                //file.writeLine("Request-host:" + oSession.host);
                //file.writeLine("Request-header:" + "\n" + oSession.oRequest.headers);
                //file.writeLine("Request-body:" + oSession.GetRequestBodyAsString());
                //file.writeLine("\n");
                file.close();
            }   
        }catch(e){
            var fsoErr = new ActiveXObject("Scripting.FileSystemObject");
            var file = fsoErr.OpenTextFile("D:\\douyin\\error_log.txt" ,8 ,true);
            file.writeLine(e.description);
        }
        // end: save-URLs-to-local-file addition
  • douyin_appium.py: uses Appium to keep swiping through the Douyin feed so that new video requests keep flowing through Fiddler
from appium import webdriver
from time import sleep

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class Action:
    def __init__(self):
        # initial configuration: Desired Capabilities parameters
        self.desired_caps = {
            "platformName": "Android",
            "platformVersion": "5.0",
            "deviceName": "T7G0215A14000138",
            "appPackage": "com.ss.android.ugc.aweme",
            "appActivity": ".splash.SplashActivity"
        }
        # Appium server address
        self.server = 'http://localhost:4723/wd/hub'
        # start a new session
        self.driver = webdriver.Remote(self.server, self.desired_caps)
        # swipe start coordinates and swipe distance
        self.start_x = 500
        self.start_y = 1500
        self.distance = 1300

    def allow_permission(self):
        try:
            WebDriverWait(self.driver, 10).until(
                # this can only wait for the element itself, not for a specific attribute of the element
                EC.presence_of_element_located((By.ID, "com.ss.android.ugc.aweme:id/y0"))
            )
            self.driver.find_element_by_id("com.ss.android.ugc.aweme:id/y0").click()
        except Exception as e:
            print(e)

    def comments(self):
        sleep(3)
        # tap the screen once after the app starts to make sure the feed is shown
        self.driver.tap([(500, 1200)], 500)

    def scroll(self):
        # swipe indefinitely
        while True:
            try:

                # perform one upward swipe
                self.driver.swipe(self.start_x, self.start_y, self.start_x,
                                  self.start_y - self.distance)
                # wait between swipes
                sleep(3)
            except Exception as e:
                print(e)

    def run(self):
        self.allow_permission()
        self.comments()
        self.scroll()


if __name__ == '__main__':
    action = Action()
    action.run()


"""
{
    "platformName": "Android",
    "platformVersion": "5.0",
    "deviceName": "T7G0215A14000138",
    "appPackage": "com.ss.android.ugc.aweme",
    "appActivity": ".splash.SplashActivity"
}
"""
  • douyin_spider.py: deduplicates the URL files written by the Fiddler script and downloads the videos
# _*_ coding: utf-8 _*_
import time
from urllib.request import urlretrieve
from urllib.error import ContentTooShortError
import os
import re

filter_pool = []


def distinct_data(pathtxt):
    """
    Deduplicate the captured URLs
    :return:
    """
    global filter_pool
    datalist_blank = []
    with open(pathtxt) as f:
        f_data_list = f.readlines()  # readlines() returns a list of lines
        for a in f_data_list:
            datalist_blank.append(a.strip())  # strip() removes leading/trailing whitespace and the trailing \n
    data_dict = {}
    for data in datalist_blank:
        ret = re.match(r".+/m/(.+)/\?rc=", data)  # capture the video id as the dedup key
        if not ret:
            continue
        result = ret.group(1)
        data_dict[result] = data
    data_new = []
    for x, y in data_dict.items():
        if y not in filter_pool:
            data_new.append(y)
            filter_pool.append(y)
        else:
            continue
    return data_new


def download(data_list, url_dir):
    """下载视频"""
    video_dir = "D:/douyin/video/{}".format(url_dir[:-4].replace("video_url", "video"))
    if not os.path.exists(video_dir):
        os.mkdir(video_dir)

    for index, data in enumerate(data_list):
        video_path = video_dir + "/{}.mp4".format(index)
        try:
            urlretrieve(data, video_path)
            time.sleep(3)
            # the URL file itself is deleted after the whole list has been downloaded (below, after the loop)
            print("{} -- download complete".format(video_path))
        except ContentTooShortError:
            pass

    os.remove(URL_DIRECTOR_ROOT_PATH + "\\" + url_dir)  # delete the processed URL file


if __name__ == '__main__':

    URL_DIRECTOR_ROOT_PATH = r"D:\douyin\url"
    # start deduplicating and downloading once at least two URL files exist
    # filter_pool holds already-seen links; once it grows past 1000 entries, the oldest 100 are dropped

    while True:

        url_dirs = os.listdir(URL_DIRECTOR_ROOT_PATH)
        try:
            if len(url_dirs) >= 2:
                print(url_dirs)
                url_dir = url_dirs.pop(0)
                print(url_dir)
                # start deduplicating...
                data_list = distinct_data(URL_DIRECTOR_ROOT_PATH + "\\" + url_dir)

                download(data_list, url_dir)

            else:

                print("请稍等,还没来得及抓包...")
                # 只有一个文件时,等待1分钟
                time.sleep(20)

            if len(filter_pool) > 1000:
                filter_pool = filter_pool[100:]
        except Exception as e:
            print(e)

        time.sleep(1)


JScript language reference: http://doc.51windows.net/jscript5/?url=/jscript5/dir.htm

Reposted from blog.csdn.net/fanjianhai/article/details/103671245