四大解析器（BeautifulSoup、PyQuery、lxml、正则）性能比较

用标题中的四种方式解析网页，比较其解析速度。当然，比较结果的数值与电脑配置、Python 版本都有关系，但总体差别不会很大。

下面是我的测试结果：lxml（xpath）最快，bs4 最慢。

==== Python version: 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 17:00:18) [MSC v.1900 64 bit (AMD64)] =====

==== Total trials: 10000 =====
bs4 total time: 5.5
pq total time: 0.9
lxml (cssselect) total time: 0.8
lxml (xpath) total time: 0.5
regex total time: 1.1 (doesn't find all p)

　以下是测试代码

# -*- coding: utf-8 -*-

"""
@Datetime: 2019/3/13
@Author: Zhang Yafei
"""
import re
import sys
import time
import requests
from lxml.html import fromstring
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup as bs


# Browser-like User-Agent header used for the benchmark's HTTP request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    ),
}


def Timer():
    """Generator that yields the seconds elapsed since the previous ``next()``.

    The reference point is taken when the generator body first runs, i.e. on
    the first ``next()`` call, so the first yielded value is close to zero.
    Every later ``next()`` yields the time that passed since the preceding
    yield.

    Yields:
        float: elapsed seconds between two consecutive ``next()`` calls.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can jump and is too coarse on some platforms for benchmarks.
    last = time.perf_counter()
    while True:
        # Read the clock once so the yielded interval and the next reference
        # point are based on exactly the same instant.
        now = time.perf_counter()
        yield now - last
        last = now

# ################# start request #################
timer = Timer()
url = "https://www.python.org/"
# requests waits indefinitely unless a timeout is given; set one so a stalled
# connection cannot hang the benchmark, and fail fast on an HTTP error status
# instead of silently benchmarking an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
num = 10000  # number of times each parser repeats the same query
print('\n==== Python version: %s =====' % sys.version)
print('\n==== Total trials: %s =====' % num)
# Advance the timer once so the download time is not charged to the first
# benchmark section.
next(timer)

# ################# bs4 #########################
# Parse once with the lxml backend, then repeat the same tag lookup.
soup = bs(html, 'lxml')
for _ in range(num):
    # find_all is the current Beautiful Soup 4 name; findAll is only kept as
    # a deprecated backwards-compatibility alias.
    paragraphs = soup.find_all('p')
t = next(timer)
print('bs4 total time: %.1f' % t)
# ################ pyquery #######################
# Build the PyQuery document once, then run the 'p' selector `num` times.
# The elapsed time reported below covers both the parse and the queries.
d = pq(html)
for _ in range(num):
    paragraphs = d('p')
print('pq total time: %.1f' % next(timer))
# ############### lxml css #########################
# Parse with lxml.html, then repeat the CSS selector lookup `num` times.
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.cssselect('p')
print('lxml (cssselect) total time: %.1f' % next(timer))
# ############## lxml xpath #######################
# The document is parsed again here so that this section's time includes the
# parse, just like the sections before it.
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.xpath('.//p')
print('lxml (xpath) total time: %.1f' % next(timer))
# ############### re ##########################
# NOTE: '[p ]' is a character class (one 'p' or one space), so only a bare
# '<p>' (or '< >') opening tag matches, and '.' does not cross newlines.
# Paragraphs with attributes or multi-line bodies are therefore skipped,
# which is the caveat stated in the printed message.
for _ in range(num):
    paragraphs = re.findall('<[p ]>.*?</p>', html)
print('regex total time: %.1f (doesn\'t find all p)\n' % next(timer))

测试代码二

# -*- coding: utf-8 -*-

"""
@Datetime: 2019/3/13
@Author: Zhang Yafei
"""
import functools
import re
import sys
import time

import requests
from bs4 import BeautifulSoup as bs
from lxml.html import fromstring
from pyquery import PyQuery as pq


def timeit(fun):
    """Decorator that prints how long each call to ``fun`` takes.

    Args:
        fun: the callable to time.

    Returns:
        A wrapper with the same signature and metadata as ``fun`` that prints
        the elapsed seconds (six decimal places) after the call and returns
        whatever ``fun`` returned. If ``fun`` raises, nothing is printed and
        the exception propagates.
    """
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, unlike the
        # wall-clock time.time(), so it is the right clock for benchmarks.
        start_time = time.perf_counter()
        res = fun(*args, **kwargs)
        # The message reads "run time was <seconds>".
        print('运行时间为%.6f' % (time.perf_counter() - start_time))
        return res

    return wrapper


@timeit  # equivalent to: time1 = timeit(time1)
def time1(n):
    """Return the doubled values of ``range(n)``, i.e. ``[0, 2, 4, ...]``."""
    return list(range(0, 2 * n, 2))


# ################# start request #################
url = "https://www.taobao.com/"
# requests waits indefinitely unless a timeout is given; set one so a stalled
# connection cannot hang the benchmark, and fail fast on an HTTP error status
# instead of silently benchmarking an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
num = 10000  # number of times each parser repeats the same query
print('\n==== Python version: %s =====' % sys.version)
print('\n==== Total trials: %s =====' % num)


@timeit
def bs4_test():
    """Benchmark Beautiful Soup: parse ``html`` once, then look up every
    ``p`` tag ``num`` times. The label is printed before the decorator
    reports the elapsed time.
    """
    soup = bs(html, 'lxml')
    for _ in range(num):
        # find_all is the current Beautiful Soup 4 name; findAll is only kept
        # as a deprecated backwards-compatibility alias.
        soup.find_all('p')
    print('bs4 total time:')


@timeit
def pq_test():
    """Benchmark PyQuery: parse ``html`` once, then run the ``p`` selector
    ``num`` times and print the section label.
    """
    document = pq(html)
    for _ in range(num):
        document('p')
    print('pq total time:')


@timeit
def lxml_css():
    """Benchmark lxml with CSS selectors: parse ``html`` once, then call
    ``cssselect('p')`` ``num`` times and print the section label.
    """
    root = fromstring(html)
    for _ in range(num):
        root.cssselect('p')
    print('lxml (cssselect) total time:')


@timeit
def lxml_xpath():
    """Benchmark lxml with XPath: parse ``html`` once, then evaluate
    ``.//p`` ``num`` times and print the section label.
    """
    root = fromstring(html)
    for _ in range(num):
        root.xpath('.//p')
    print('lxml (xpath) total time:')


@timeit
def re_test():
    """Benchmark a regular expression: run ``re.findall`` over ``html``
    ``num`` times and print the section label.

    NOTE: ``[p ]`` is a character class (one ``p`` or one space), so only a
    bare ``<p>`` (or ``< >``) opening tag matches, and ``.`` does not cross
    newlines; paragraphs with attributes or multi-line bodies are not found.
    """
    for _ in range(num):
        re.findall('<[p ]>.*?</p>', html)
    print('regex total time:')


if __name__ == '__main__':
    # Run every benchmark once, in the same order as the reported results.
    for benchmark in (bs4_test, pq_test, lxml_css, lxml_xpath, re_test):
        benchmark()

　　测试结果

==== Python version: 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 17:00:18) [MSC v.1900 64 bit (AMD64)] =====

==== Total trials: 10000 =====
bs4 total time:
运行时间为9.049424
pq total time:
运行时间为0.899639
lxml (cssselect) total time:
运行时间为0.841596
lxml (xpath) total time:
运行时间为0.619440
regex total time:
运行时间为1.207861

四大解析器（BeautifulSoup、PyQuery、lxml、正则）性能比较

猜你喜欢