lxml和xpath结合使用

lxml和xpath结合使用主要有以下5个方面内容:

# 1.获取所有的tr标签
# 2.获取第2个tr标签
# 3.获取所有class 等于event的tr标签
# 4.获取所有a标签下的href属性
# 5.获取所有的职位信息(纯文本)

实例代码如下:

# -- coding:utf-8 --
from lxml import etree

# Module-level setup: the document is parsed once at import time and the
# resulting tree (`html`) is shared by every function below.
# HTMLParser is used so the input is treated as HTML, decoded as UTF-8.
parser = etree.HTMLParser(encoding="utf-8")
# NOTE(review): "baudu.html" is a relative path and looks like a typo for
# "baidu.html" -- confirm against the file that actually ships with the script.
html = etree.parse("baudu.html", parser=parser)
# 1.获取所有的tr标签
# //tr
# xpath返回的是一个列表
def return_trs():
    """Print every <tr> element found anywhere in the parsed document.

    ``xpath`` returns a list of matching elements; each one is printed on
    its own line using the element's default representation.
    """
    for row in html.xpath("//tr"):
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3, whereas the bare print statement is a SyntaxError
        # on Python 3.
        print(row)
# 2.获取第2个tr标签
def return_tr():
    """Print the type, repr and serialized markup of a second <tr>.

    ``//tr[2]`` matches every <tr> that is the second <tr> child of its
    parent; the first such match in document order is used.

    Raises:
        IndexError: if the document contains no such <tr>.
    """
    second_row = html.xpath("//tr[2]")[0]
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(type(second_row))
    print(second_row)
    # tostring() yields UTF-8 bytes; decode them so text is printed rather
    # than a bytes representation.
    markup = etree.tostring(second_row, encoding="utf-8")
    print(markup.decode("utf-8"))
# 3.获取所有class 等于event的tr标签
def class_tr():
    """Print the list of <tr> elements whose class attribute is exactly 'event'.

    The predicate ``[@class='event']`` compares the whole attribute value, so
    a row with several classes (e.g. ``class="event odd"``) is not matched.
    """
    event_rows = html.xpath("//tr[@class='event']")
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(event_rows)
# 4.获取所有a标签下的href属性
def a_href():
    """Print the href of every <a> element, raw and with a fixed path prefix.

    ``//a/@href`` selects the attribute values themselves, so the query
    returns a list of strings rather than elements. For each value two lines
    are printed: the value as-is, then the value appended to the hard-coded
    ``D:\\Python2.7\\`` prefix.
    """
    hrefs = html.xpath("//a/@href")
    for href in hrefs:
        # Parenthesised single-argument print works on both Python 2 and 3.
        print(href)
        print("D:\\Python2.7\\" + href)
# 5.获取所有的职位信息(纯文本)
# Accumulates the records produced by position_text(); read by main().
positions = []


def position_text():
    """Append one record per data row of the document to ``positions``.

    Rows are selected with ``//tr[position()>1]``, i.e. every <tr> except the
    first <tr> child of its parent (the header row). Each record is a dict
    with the keys ``url``, ``title``, ``category``, ``nums``, ``address`` and
    ``pubtime``.

    ``url`` is ``"http://"`` followed by the first href found inside the row,
    or ``None`` when the row contains no link. Every other field is the list
    of text nodes found under the corresponding <td> (columns 1-5); the lists
    are stored as returned by ``xpath`` and are not joined into one string.

    Records are appended to the module-level list, so calling this function
    more than once adds the rows again.
    """
    rows = html.xpath("//tr[position()>1]")
    for row in rows:
        # A leading "." makes the query relative to this row (".//a");
        # a bare "//a" would search the whole document again.
        hrefs = row.xpath(".//a/@href")
        # Rows without any link (e.g. a footer or spacer row) previously
        # raised IndexError and aborted the whole extraction.
        fullurl = ("http://" + hrefs[0]) if hrefs else None

        positions.append({
            "url": fullurl,
            "title": row.xpath("./td[1]//text()"),
            "category": row.xpath("./td[2]//text()"),
            "nums": row.xpath("./td[3]//text()"),
            "address": row.xpath("./td[4]//text()"),
            "pubtime": row.xpath("./td[5]//text()"),
        })
def main():
    """Run the position-extraction demo and print the collected records."""
    position_text()
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(positions)
    # The remaining demos are disabled; uncomment a line to run it.
    # a_href()
    # class_tr()
    # return_trs()
    # return_tr()


# Run the demo only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq_42281053/article/details/80665242
今日推荐