12.2. Web Crawlers

Related packages

builtwith      1.3.3    # identifies the technologies a website is built with
python-whois   0.7.0    # looks up a website's WHOIS (ownership) information
httpie         0.9.9    # a command-line HTTP client
beautifulsoup4 4.6.0    # parses HTML and XML; compared with the built-in parser, lxml is faster
bs4            0.0.1    # placeholder package that simply installs beautifulsoup4
lxml           4.2.1    # fast HTML/XML parser; supports XPath syntax for locating elements
pymongo        3.6.1    # Python driver for MongoDB
PyMySQL        0.8.1    # Python driver for MySQL
redis          2.10.6   # Python driver for Redis
requests       2.18.4   # third-party module for making HTTP requests
robobrowser    0.5.3    # can open pages, click links and buttons, and submit forms
selenium       3.12.0   # web application testing tool that drives a real browser like an actual user
Pillow         5.1.0    # image processing library, used here to handle form CAPTCHA images
pytesseract    0.2.2    # Python wrapper for the Tesseract OCR engine
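
A quick way to see what the first two packages do is shown in the sketch below. It is a minimal example and not part of the original text: the domain names are placeholders, and the exact keys in the returned dictionaries vary from site to site.

import builtwith
import whois  # provided by the python-whois package


def main():
    # builtwith.parse returns a dict mapping technology categories to products,
    # e.g. {'web-servers': ['Nginx'], ...}; the contents depend on the site
    print(builtwith.parse('https://www.python.org/'))
    # whois.whois returns registration details (registrar, creation date, name servers, ...)
    print(whois.whois('python.org'))


if __name__ == '__main__':
    main()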

Common User-Agent strings

1. Android

Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19
Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
2. Firefox

Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0
3. Google Chrome

Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36
Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19
4. iOS

Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3
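
To actually use one of these strings, pass it as the User-Agent request header. The sketch below is an illustration rather than part of the original text: it sends one of the Chrome strings to httpbin.org (an arbitrary public echo service chosen here) just to confirm which User-Agent the server sees.

import requests


def main():
    # pick any User-Agent string from the list above
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36')
    }
    # httpbin.org/get echoes the request headers back as JSON
    resp = requests.get('http://httpbin.org/get', headers=headers)
    print(resp.json()['headers']['User-Agent'])


if __name__ == '__main__':
    main()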

Selecting tags with BeautifulSoup

Key points: bs4, re

from bs4 import BeautifulSoup

import re


def main():
    html = "……"
    soup = BeautifulSoup(html, 'lxml')
    # Equivalent to document.title in JavaScript
    print(soup.title)
    # Equivalent to document.body.h1 in JavaScript
    print(soup.body.h1)
    # Get the first p tag in the document
    print(soup.p)
    # Get the text of the first p tag under body
    print(soup.body.p.text)
    # Output the children of the p tag as a list
    print(soup.body.p.contents)
    # Iterate over the direct children of the p tag (a generator)
    for p_child in soup.body.p.children:
        print(p_child)
    # Count the direct children of body
    print(len([elem for elem in soup.body.children]))
    # Count all descendants of body (children, grandchildren, and so on)
    print(len([elem for elem in soup.body.descendants]))
    # Find all h1-h6 tags in the page
    print(soup.findAll(re.compile(r'^h[1-6]')))
    # Find all tags under body whose name starts with h
    # (a compiled regex is required; a plain string would match a literal tag name)
    print(soup.body.find_all(re.compile(r'^h')))
    # Find all h tags inside the div under body
    print(soup.body.div.find_all(re.compile(r'^h')))
    # Find all tags whose name ends with r
    print(soup.find_all(re.compile(r'r$')))
    # Find all img tags whose src starts with ./img/ and ends with .png
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+\.png')}))
    # Find all tags that have exactly two attributes
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    # Find all tags matched by the foo() function defined below
    print(soup.find_all(foo))
    # Find p tags whose class attribute is foo
    print(soup.find_all('p', {'class': 'foo'}))
    # Find all a tags that have an href attribute and print their href values
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


def foo(elem):
    """Filter function used by find_all above: matches tags with exactly two attributes."""
    return len(elem.attrs) == 2


if __name__ == '__main__':
    main()

Simple crawler examples

Key points: re, urllib, pymysql, ssl

from urllib.error import URLError
from urllib.request import urlopen

import re
import pymysql
import ssl

from pymysql import Error


# Decode the page with the given charsets (not every site uses utf-8)
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
            # logging.error('Decode:', error)
    return page_html


# Fetch the page HTML (retry a given number of times via recursion)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError:
        # logging.error('URL:', error)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


# Extract the needed parts from the page (usually links, specified by a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []


# Run the crawler and persist the collected data
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            url_list = [seed_url]
            visited_url_list = {seed_url: 0}
            while url_list:
                current_url = url_list.pop(0)
                depth = visited_url_list[current_url]
                if depth != max_depth:
                    page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
                    links_list = get_matched_parts(page_html, match_pattern)
                    param_list = []
                    for link in links_list:
                        if link not in visited_url_list:
                            visited_url_list[link] = depth + 1
                            page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
                            if headings:
                                param_list.append((headings[0], link))
                    # Insert multiple rows at once to optimize database writes
                    cursor.executemany('insert into tb_result values (default, %s, %s)',
                                       param_list)
                    conn.commit()
    except Error:
        pass
        # logging.error('SQL:', error)
    finally:
        conn.close()


def main():
    # Globally disable SSL certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()

Key points: BeautifulSoup, requests

from bs4 import BeautifulSoup

import requests

import re


def main():
    # Fetch the page with the get method of the requests library
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response bytes (some Sohu pages use GBK encoding)
    html = resp.content.decode('gbk')
    # Create a BeautifulSoup object to parse the page (comparable to the DOM in JavaScript)
    bs = BeautifulSoup(html, 'lxml')
    # Find elements with CSS selector syntax and process them in a loop
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # Get the element's attribute value via the attrs dictionary
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Use a regular expression to clean up the extracted text
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()

Key points: urljoin, headers, proxies

from urllib.parse import urljoin

import re
import requests

from bs4 import BeautifulSoup


def main():
    # Set the request headers
    headers = {'user-agent': 'Baiduspider'}
    # Set the proxy
    proxies = {
        'http': 'http://122.114.31.177:808'
    }
    base_url = 'https://www.zhihu.com/'
    # Build the seed URL
    seed_url = urljoin(base_url, 'explore')
    resp = requests.get(seed_url,
                        headers=headers,
                        proxies=proxies)
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    link_set = set()
    for a_tag in soup.find_all('a', {'href': href_regex}):
        if 'href' in a_tag.attrs:
            href = a_tag.attrs['href']
            # Build the full URL
            full_url = urljoin(base_url, href)
            link_set.add(full_url)
    print('Total %d question pages found.' % len(link_set))


if __name__ == '__main__':
    main()

Key points: re, urllib, md5, pickle, zlib, redis, ssl

from urllib.error import URLError
from urllib.request import urlopen

import re
import redis
import ssl
import hashlib
import logging
import pickle
import zlib

# Redis has two persistence mechanisms:
# 1. RDB (point-in-time snapshots)
# 2. AOF (append-only file)


# Decode the page with the given charsets (not every site uses utf-8)
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
            # logging.error('[Decode]', err)
    return page_html


# Fetch the page HTML (retry a given number of times via recursion)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        if seed_url.startswith('http://') or \
                seed_url.startswith('https://'):
            page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError as err:
        logging.error('[URL] %s', err)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


# Extract the needed parts from the page (usually links, specified by a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []


# Run the crawler
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    charsets = ('utf-8', 'gbk', 'gb2312')
    # Log whether the Redis server responds to ping
    logging.info('[Redis ping] %s', client.ping())
    url_list = [seed_url]
    visited_url_list = {seed_url: 0}
    while url_list:
        current_url = url_list.pop(0)
        depth = visited_url_list[current_url]
        if depth != max_depth:
            page_html = get_page_html(current_url, charsets=charsets)
            links_list = get_matched_parts(page_html, match_pattern)
            for link in links_list:
                if link not in visited_url_list:
                    visited_url_list[link] = depth + 1
                    page_html = get_page_html(link, charsets=charsets)
                    if page_html:
                        # Use an MD5 digest of the URL as the key
                        hasher = hashlib.md5()
                        hasher.update(link.encode('utf-8'))
                        # Serialize and compress the page
                        zipped_page = zlib.compress(pickle.dumps(page_html))
                        client.set(hasher.hexdigest(), zipped_page)


def main():
    # Installing requests also installs certifi, which bundles a large set of CA certificates
    # Globally disable SSL certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()

Key points: re, requests, urljoin, bs4, sha1, pickle, zlib, redis

from hashlib import sha1
from urllib.parse import urljoin

import pickle
import re
import requests
import zlib

from bs4 import BeautifulSoup
from redis import Redis


def main():
    # Seed page
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # Create the Redis client
    client = Redis(host='localhost', port=6379, password='111111')
    # Set the user agent (otherwise the request is rejected)
    headers = {'user-agent': 'Baiduspider'}
    # Send a GET request with the requests module, using the user agent above
    resp = requests.get(seed_url, headers=headers)
    # Create a BeautifulSoup object with lxml as the parser
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # Turn URLs into SHA-1 digests (fixed length, more compact)
    hasher_proto = sha1()
    # Find all a tags whose href attribute starts with /question
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # Get the href attribute of the a tag and build the full URL
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # Generate the SHA-1 digest of the URL
        hasher = hasher_proto.copy()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # If the Redis hash 'zhihu' does not contain the digest, fetch and cache the page
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Serialize and compress the page
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # Store the URL digest and its page in the Redis hash
            client.hset('zhihu', field_key, zipped_page)
    # Show how many pages were cached in total
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()

A comprehensive crawler example

Key points: requests, bs4, sha1, redis, mongodb, threading, urlparse

import logging
import pickle
import zlib
from hashlib import sha1

import pymongo
from enum import Enum, unique
# from queue import Queue  # queue.Queue is a thread-safe FIFO (First In, First Out) structure with its own lock; it is not used here because the task queue lives in Redis, so the crawl can run on multiple machines and be resumed after an interruption
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary  # no need to install separately; it is installed together with pymongo


class Constants(object):
    """Constants used by the crawler."""
    user_agent = 'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0'
    proxies = {
        "http": "http://111.183.231.117:61234/",
    }


@unique  # guarantees that the enumeration values are unique
class SpiderStatus(Enum):
    """Enumeration of spider working states."""
    IDLE = 0
    WORKING = 1


def any_thread_alive(spider_threads):
    """Check whether any spider thread is still in the working state."""
    # any() returns True if at least one element is truthy; all() returns True only if every element is
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


def decode_html_page(page, charsets):
    """Decode a response page with the given charsets."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page.content.decode(charset)
            break  # stop as soon as the page decodes successfully
        except Exception as e:
            logging.error(e)  # log an error if decoding with this charset fails
    return page_html


class Retry(object):
    """A class-based decorator that retries a call a given number of times, waiting between attempts."""
    def __init__(self, *, retry_times=3, wait_secs=5, errors=(Exception,)):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, func):
        """Magic method: a class-based decorator does its work here."""
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return func(*args, **kwargs)
                except self.errors as e:
                    logging.error(e)  # log the error when an attempt fails
                    sleep(int(self.wait_secs) * (random() + 1))
            return None
        return wrapper


class Spider(object):
    """Spider behaviour: fetching, parsing, extracting and storing pages."""
    def __init__(self):
        """Start in the idle state."""
        self.status = SpiderStatus.IDLE

    @Retry()  # the decorator class has to be instantiated, hence the parentheses
    def fetch(self, current_url, *, user_agent=None, proxies=None, charsets=('gb2312', 'utf-8', 'gbk')):
        """Fetch a page."""
        thread_name = current_thread().name
        print(f'{thread_name}:{current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        page = requests.get(current_url, headers=headers, proxies=proxies)
        return decode_html_page(page, charsets) if page.status_code == 200 else None

    def parse(self, html_page, domain='www.geyanw.com'):
        """Parse the URLs found in a page and queue the new ones."""
        if html_page:
            soup = BeautifulSoup(html_page, 'lxml')  # the lxml parser is faster than the built-in one
            for a_tag in soup.select_one('div[id="p_left"]').select('a[href]'):
                # complete partial URLs
                parser = urlparse(a_tag.attrs['href'])
                scheme = parser.scheme or 'https'                    # scheme
                netloc = parser.netloc or domain                     # domain
                if netloc == domain and scheme != 'javascript':
                    path = parser.path                               # relative path
                    query = '?' + parser.query if parser.query else ''  # query string
                    full_url = f'{scheme}://{netloc}{path}{query}'   # f-string formatting (Python 3.6+)
                    redis_client = thread_local.redis_client
                    if not redis_client.sismember('visited_url', full_url):  # skip URLs already visited
                        redis_client.rpush('task_list', full_url)
                        print('full_url:' + full_url)

    def extract(self, html_page):
        """Extract the title and content from a page."""
        if html_page:
            soup = BeautifulSoup(html_page, 'lxml')
            title = content = ''
            try:
                title = soup.select_one('div[id="p_left"]').select_one('div[class="title"]').find('h2').text
            except Exception as e:
                pass

            try:
                content_ps = soup.select_one('div[id="p_left"]').select_one('div[class="content"]').find_all('p')
                for content_p in content_ps:
                    content += content_p.text
            except Exception as e:
                pass

            return title, content

    def store(self, my_dict):
        """Persist a crawled page in MongoDB."""
        mongo_db = thread_local.mongo_db
        hasher = hash_proto.copy()  # copy the shared hasher prototype instead of creating a new one each time
        hasher.update(my_dict['content'].encode('utf-8'))
        doc_id = hasher.hexdigest()
        mongo_data_coll = mongo_db[my_dict['current_path']]  # the MongoDB collection is created on demand
        # use the content digest as _id so identical content reached via different URLs is stored only once
        if not mongo_data_coll.find_one({'_id': doc_id}):
            mongo_data_coll.insert_one(
                dict(_id=doc_id, path=my_dict['current_path'], url=my_dict['current_url'], title=my_dict['title'],
                     content=Binary(zlib.compress(pickle.dumps(my_dict['content'])))))
            print('Stored in MongoDB')


class SpiderThread(Thread):
    """Worker thread that drives a spider."""
    def __init__(self, name, spider):
        """Set the thread name and the spider it drives."""
        super().__init__(name=name, daemon=True)  # daemon thread: daemon=True
        self.spider = spider

    def run(self):
        """The thread's work has to be written in the run() method."""
        redis_client = redis.Redis(host='localhost', port=6379)  # connect to Redis
        mongo_client = pymongo.MongoClient(host='localhost', port=27017)  # connect to MongoDB; do not open these connections in __init__, each worker creates its own clients in run()
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.geyanwang
        while True:
            current_url = redis_client.lpop('task_list')
            while not current_url:
                self.spider.status = SpiderStatus.IDLE
                current_url = redis_client.lpop('task_list')
            if current_url:
                self.spider.status = SpiderStatus.WORKING  # mark the spider as working
                current_url = current_url.decode('utf-8')  # Redis returns bytes, so decode the URL to str
            if not redis_client.sismember('visited_url', current_url):
                redis_client.sadd('visited_url', current_url)

                html_page = self.spider.fetch(current_url, user_agent=Constants.user_agent, proxies=Constants.proxies)
                if html_page:
                    title, content = self.spider.extract(html_page)
                    current_path = ''  # category segment of the URL path
                    try:
                        current_path = urlparse(current_url).path.split('/')[1]
                    except Exception as e:
                        pass
                    if current_path and title and content:
                        my_dict = dict(current_url=current_url, current_path=current_path, title=title, content=content)
                        self.spider.store(my_dict)
                    self.spider.parse(html_page)


thread_local = local()  # thread-local storage: each thread holds its own redis_client and mongo_db
hash_proto = sha1()  # shared SHA-1 hasher prototype; copy() it when needed (hasher = hash_proto.copy()) instead of constructing a new one each time


def main():
    redis_client = redis.Redis(host='localhost', port=6379)  # connect to Redis
    if not redis_client.exists('task_list'):
        redis_client.rpush('task_list', 'https://www.geyanw.com/')  # seed URL
    spider_threads = [SpiderThread('th-%d' % i, Spider()) for i in range(10)]
    for spider_thread in spider_threads:  # create and start 10 worker threads
        spider_thread.start()
    # Spin here until the task list is empty and no thread is still working
    # (Redis deletes a list key automatically once the list becomes empty)
    while redis_client.exists('task_list') or any_thread_alive(spider_threads):
        pass
    print('Over!')


if __name__ == '__main__':
    main()

Working with forms

Key points: requests, bs4

import requests
from bs4 import BeautifulSoup


def main():
    resp = requests.get('https://github.com/login/')
    if resp.status_code != 200:
        return
    cookies = resp.cookies.get_dict()  # get the cookies
    soup = BeautifulSoup(resp.text, 'lxml')
    utf8_value = soup.select_one('form input[name="utf8"]').attrs['value']  # value of the hidden utf8 field
    authenticity_token_value = soup.select_one('form input[name="authenticity_token"]').attrs['value']  # CSRF token
    data = {
        'utf8': utf8_value,
        'authenticity_token': authenticity_token_value,
        'login': '[email protected]',
        'password': 'xxxxx'
    }
    resp = requests.post('https://github.com/session/', data=data, cookies=cookies)  # submit the form via POST
    print(resp.text)


if __name__ == '__main__':
    main()

Key points: robobrowser

# a scraping tool built on top of requests and BeautifulSoup
import robobrowser


def main():
    b = robobrowser.RoboBrowser(parser='lxml')  # specify the parser
    b.open('https://github.com/login/')
    f = b.get_form(action='/session')
    f['login'].value = '[email protected]'
    f['password'].value = 'your password'
    b.submit_form(f)
    for a_tag in b.select('a[href]'):
        print(a_tag.attrs['href'])


if __name__ == '__main__':
    main()

Scraping dynamic content

Key points: Selenium (webdriver), BeautifulSoup

# Selenium is a tool for testing web applications.
# Selenium drives a real browser, just like an actual user; the matching browser driver has to be installed (and added to the PATH).
from selenium import webdriver
from bs4 import BeautifulSoup


def main():
    driver = webdriver.Chrome()
    driver.get('https://v.taobao.com/v/content/live?catetype=704')
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for img_tag in soup.select('img[src]'):
        print(img_tag.attrs['src'])


if __name__ == '__main__':
    main()
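
The example above assumes that Chrome and a matching chromedriver are already on the PATH. If they are not, or if the crawl should run without opening a browser window, Selenium 3.x (the version listed in the package table) can be configured roughly as in the sketch below; the driver path is a placeholder, not a value from the original text.

from selenium import webdriver


def main():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run Chrome without a visible window
    # executable_path is only needed when chromedriver is not on the PATH;
    # the path below is a placeholder
    driver = webdriver.Chrome(executable_path='/path/to/chromedriver',
                              options=options)
    driver.get('https://v.taobao.com/v/content/live?catetype=704')
    print(driver.title)
    driver.quit()


if __name__ == '__main__':
    main()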

Key points: Selenium (webdriver, Keys), BeautifulSoup

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys


def main():
    driver = webdriver.Chrome()
    driver.get('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang')
    elem = driver.find_element_by_css_selector('input[placeholder="输入关键词搜索"]')
    elem.send_keys('美女')  # type the search keyword automatically
    elem.send_keys(Keys.ENTER)  # press Enter automatically
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for img_tag in soup.select('img[src]'):
        print(img_tag.attrs['src'])


if __name__ == '__main__':
    main()

Scrolling with Selenium

from time import sleep

from selenium import webdriver


def main():
    driver = webdriver.Chrome()
    driver.get('http://www.jd.com/')
    driver.execute_script('document.documentElement.scrollTop = 10000')  # scroll down via JavaScript
    while True:
        sleep(1)  # keep the browser window open


if __name__ == '__main__':
    main()

Recognizing image CAPTCHAs

Key points: PIL, pytesseract

from io import BytesIO

import requests
from PIL import Image
# pytesseract is a Python wrapper for the Tesseract OCR engine;
# Tesseract itself must be installed and on the PATH before use
from pytesseract import image_to_string


def main():
    resp = requests.get('http://www.yundama.com/index/captcha?r=0.018109785648503074')
    img1 = Image.open(BytesIO(resp.content))
    img1.save('yanzhengma1.jpg')
    img2 = Image.open(open('yanzhengma1.jpg', 'rb'))
    img3 = img2.point(lambda x: 0 if x < 128 else 255)  # binarize: pixels below 128 become black, the rest white
    img3.save(open('yanzhengma100.jpg', 'wb'))
    print(image_to_string(img3))


if __name__ == '__main__':
    main()
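
If Tesseract is installed but not on the PATH, pytesseract can also be pointed at the executable directly. The sketch below is illustrative only: the executable path is a placeholder and the --psm option is just one common choice for single-line CAPTCHA text.

import pytesseract
from PIL import Image

# point pytesseract at the Tesseract executable explicitly (placeholder path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def main():
    img = Image.open('yanzhengma100.jpg')  # the binarized image saved above
    # '--psm 7' tells Tesseract to treat the image as a single line of text
    print(pytesseract.image_to_string(img, config='--psm 7'))


if __name__ == '__main__':
    main()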

Reprinted from blog.csdn.net/gold_time_/article/details/80959922