Several data parsing and extraction methods for Python crawlers

After crawling a web page's source code with a Python crawler — whether via the requests library or the urllib library — it is usually difficult to obtain the desired information directly. Instead, third-party libraries are used to parse the page and extract the information. Currently this is most commonly done with the bs4 library (simple and fast), regular expressions, or XPath. The following example explains the specific usage of each:

import requests
from bs4 import BeautifulSoup
import json
import re
from lxml import etree
from urllib.parse import quote
# Request setup: spoof a real browser session (cookie + UA) so Baidu Baike
# serves the full page instead of an anti-bot placeholder.
header = {
    "Cookie": "BIDUPSID=1D3E686AE65F1365106D5F2B6DEDA5C9; PSTM=1584013332; BAIDUID=1D3E686AE65F13650E993E0898A5D1CA:SL=0:NR=10:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2IxN0pKRmVtR1lLV3JqY2ZUSXpFbmF5Q3htbU5wNTh3TWlTQU83a0J2Zy0xVkZmRVFBQUFBJCQAAAAAAAAAAAEAAACm4ladsfnWrsHox-UAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD5IKl8-SCpfQ; BDUSS_BFESS=2IxN0pKRmVtR1lLV3JqY2ZUSXpFbmF5Q3htbU5wNTh3TWlTQU83a0J2Zy0xVkZmRVFBQUFBJCQAAAAAAAAAAAEAAACm4ladsfnWrsHox-UAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD5IKl8-SCpfQ; Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1596201563,1596236841,1596616615,1596697867; H_PS_PSSID=32292_1428_32301_32361_32327_31254_32349_32045_32394_32405_32429_32117_26350_32482; Hm_lpvt_55b574651fcae74b0a9f1cf9c8d7c93a=1596698546",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
}

# Target page: the Baidu Baike entry for "小明" (URL-encoded in the path).
url = "https://baike.baidu.com/item/%E5%B0%8F%E6%98%8E/33553"
print(url)

# Fetch once; all three extraction functions below reuse this response.
r = requests.get(url, headers=header)
# Let requests guess the real charset from the body so r.text decodes correctly.
r.encoding = r.apparent_encoding
# Method 1: extract the information with the bs4 library.
def get_bs4(r):
    """Extract the first <div class="para"> block from the page and save it.

    r: a requests.Response whose body is the crawled Baidu Baike page.
    Prints the extracted text and writes it to D://download//demo_bs4.text.
    """
    soup = BeautifulSoup(r.content, "html.parser")
    str_one = soup.find("div", {"class": "para"})
    # find() returns None when no such div exists; guard against AttributeError.
    if str_one is None:
        print('no <div class="para"> found on the page')
        return
    text = str_one.text
    print(text)
    # Explicit UTF-8: the default locale codec (e.g. gbk/cp1252 on Windows)
    # can raise UnicodeEncodeError on this Chinese text.
    with open("D://download//demo_bs4.text", "w", encoding="utf-8") as file:
        file.write(text)

# Method 2: extract the information with the re library.
def get_re(r):
    """Pull the page summary from the <meta name="description"> tag via regex.

    r: a requests.Response whose body is the crawled Baidu Baike page.
    Writes the first match to D://download//demo_re.text; prints a notice
    and returns early when nothing matched.
    """
    # Raw string: the original non-raw '\.' escapes emit invalid-escape
    # warnings on modern Python. re.S lets '.' span newlines in the HTML.
    str_two = re.findall(r'<meta name="description" content="(.*?)\.\.\."', r.text, re.S)
    # Guard: str_two[0] would raise IndexError when the pattern is absent.
    if not str_two:
        print("no description meta tag matched")
        return
    # Explicit UTF-8 avoids UnicodeEncodeError with the locale default codec.
    with open("D://download//demo_re.text", "w", encoding="utf-8") as file:
        file.write(str(str_two[0]))
# get_re(r)

# Method 3: extract the information with xpath (lxml).
def get_xpath(r):
    """Collect the lemma-summary paragraph text via an XPath query and print it.

    r: a requests.Response whose body is the crawled Baidu Baike page.
    """
    tree = etree.HTML(r.text)
    # All text nodes under class="para" inside the lemma-summary div.
    str_three = tree.xpath('//div[@class="lemma-summary"]//*[@class="para"]/text()')
    # join() replaces the original quadratic `str_one = str_one + str(i)` loop.
    str_one = "".join(str(i) for i in str_three)
    print(str_one)
#get_xpath(r)


    



Guess you like

Origin blog.csdn.net/xinzhilinger/article/details/107903997