Using XPath and regular expressions (re) to scrape the detailed information of Chinese medicinal herbs from the Chinese Medicine Collection website

full code

from lxml import etree
import requests
import re

def spider(name,img_path):
    """Scrape one herb page and append its title and description to a text file.

    Downloads ``http://zhongyibaodian.com/zhongcaoyaotupian/<name>.html``,
    removes the inline link markup that would otherwise hide part of the
    text from XPath, extracts the page title and the description paragraphs,
    and appends them to ``img_path``.

    Args:
        name: Page name inserted into the URL (e.g. ``'mahuang'``).
        img_path: Path of the text file the record is appended to. Despite
            the parameter name, this is a text file, not an image.

    Raises:
        IndexError: If the page has no title or no description paragraph.
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = "http://zhongyibaodian.com/zhongcaoyaotupian/" + name + ".html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }

    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Without an explicit encoding the page text comes out garbled. The codec
    # name must be valid: an unknown name makes requests fall back to its
    # default decoding, which garbles the text again.
    response.encoding = "gb2312"
    page_text = response.text

    # XPath text() only returns the direct text children of <p>, so text
    # wrapped in <a> would be lost. Strip the link tags (and <br/>) so the
    # paragraph becomes plain text.
    ex = '<a target="_blank" href="http://zhongyaocai360.com/zhuzhi/.*?">'
    page_text = re.sub(ex, "", page_text, flags=re.S)
    page_text = page_text.replace("<br/>", "").replace("</a>", "")

    tree = etree.HTML(page_text)

    # Herb name.
    titles = tree.xpath('//*[@id="divMain"]/div/h1/text()')
    if not titles:
        raise IndexError("no title found on page: " + url)
    title = titles[0]

    # Herb description. Pages are not laid out uniformly: when a fourth
    # paragraph has text, take the text of every paragraph; otherwise use
    # the third paragraph, falling back to the second.
    if tree.xpath('//div[@class="spider"]/p[4]/text()'):
        result = "".join(tree.xpath('//div[@class="spider"]/p/text()'))
    else:
        content = (tree.xpath('//div[@class="spider"]/p[3]/text()')
                   or tree.xpath('//div[@class="spider"]/p[2]/text()'))
        if not content:
            raise IndexError("no description found on page: " + url)
        result = content[0]

    # Start each bracketed section heading on its own line.
    result = result.replace("【", "\n【")

    # Write as UTF-8 explicitly so the Chinese text does not depend on the
    # platform's default encoding.
    with open(img_path, 'a', encoding='utf-8') as fp:
        fp.write("名称:" + title + '\n' + result + '\n\n\n')



# Exterior-releasing herbs
## Pungent-warm exterior-releasing herbs
xinwenjiebiaoyao = [
    'mahuang', 'guizhi', 'xixin', 'jingjie', 'fangfeng', 'baizhi',
    'qianghuo', 'gaoben', 'xinyi', 'sugeng', 'chengliu',
    'congbai', 'husui', 'shengjiang',
]

## 2. Pungent-cool exterior-releasing herbs
xinliangjiebiaoyao = [
    'baohe', 'niubangzi', 'chantui', 'sangye', 'juhua', 'yejuhua',
    'shengma', 'chaihu', 'gegen', 'manjingzi', 'fuping',
    'muzei', 'gehua', 'dandouchi', 'dadouhuangjuan', 'heizha',
]

# Crawl each category in turn; every herb in a category is appended to that
# category's output file, and a success message is printed per herb.
for out_path, names in (
    ('../中药信息/解表药/辛温解表药.txt', xinwenjiebiaoyao),
    ('../中药信息/解表药/辛凉解表药.txt', xinliangjiebiaoyao),
):
    for name in names:
        spider(name, out_path)
        print(name+'爬取成功!')

code explanation

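First, we use XPath to locate the paragraph that contains the herb's description.
insert image description here
However, when we parse it this way, the text inside the `<a>` tags is missing from the result, because `text()` only returns the text that sits directly inside the `<p>` element, not the text nested in its child tags.
insert image description here
Therefore, we use a regular expression to remove the matching `<a>` opening tags (and then the `</a>` closing tags), leaving only the text they contained.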

# Opening <a> tags that point at zhongyaocai360.com/zhuzhi/ pages; re.S lets
# ".*?" span line breaks inside a tag.
link_open_tag = '<a target="_blank" href="http://zhongyaocai360.com/zhuzhi/.*?">'
for matched_tag in re.findall(link_open_tag, page_text, re.S):
    page_text = page_text.replace(matched_tag, "")
# Drop the line breaks and the closing tags left behind, so that only the
# plain text remains.
for leftover in ("<br/>", "</a>"):
    page_text = page_text.replace(leftover, "")

Now, when we parse the page again,
insert image description here
what we get is a list with only one element.
insert image description here
Because the page layout of this website is not uniform, we need to handle several different cases so that no exceptions are raised while crawling.

# The site's pages are not laid out uniformly, so probe the paragraphs in turn.
content=tree.xpath('//div[@class="spider"]/p[4]/text()')
if content!=[]:
    # A fourth paragraph has text: concatenate the text of every paragraph.
    p_list = tree.xpath('//div[@class="spider"]/p/text()')
    for p in p_list:
        result += p

else:
    # Otherwise use the third paragraph, falling back to the second.
    content=tree.xpath('//div[@class="spider"]/p[3]/text()')
    if content==[]:
        content=tree.xpath('//div[@class="spider"]/p[2]/text()')
    result=content[0]

final result

insert image description here
insert image description here

Guess you like

Origin blog.csdn.net/m0_50127633/article/details/119303595