Python爬虫bs4解析实战

1.常用方法

from bs4 import BeautifulSoup

# Sample HTML used by the parsing examples below: a job-listing table with one
# header row (class "h") followed by ten data rows (alternating "even"/"odd").
# Each data row has five cells: linked job title, category, headcount, city,
# and publish date.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tr class="h">
        <td class="l" width="374">职位名称</td>
        <td>职位类别</td>
        <td>人数</td>
        <td>地点</td>
        <td>发布时间</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45021&keywords=python&tid=0&lid=0">22989-腾讯云计费PHP高级开发工程师</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45005&keywords=python&tid=0&lid=0">25663-腾讯云高级后台开发(互联网业务)(北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45007&keywords=python&tid=0&lid=0">TEG06-云计算架构师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44980&keywords=python&tid=0&lid=0">PCG04-PCG研发部数据科学家(深圳/北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44981&keywords=python&tid=0&lid=0">PCG04-PCG研发部业务运维工程师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44971&keywords=python&tid=0&lid=0">23674-腾讯新闻大数据分析工程师(北京)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44964&keywords=python&tid=0&lid=0">TEG05-高级数据挖掘工程师(深圳)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44968&keywords=python&tid=0&lid=0">PCG01-QQ后台推荐算法工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44969&keywords=python&tid=0&lid=0">PCG01-QQ后台大数据开发工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44952&keywords=python&tid=0&lid=0">22989-腾讯云AI产品高级咨询顾问(深圳北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
</table>    
"""

# Parse the sample table. The "lxml" parser name requires the lxml package.
soup = BeautifulSoup(html, "lxml")

# Common find_all usages, kept here as reference examples:
# 1. Find every <tr> tag.
#    trs = soup.find_all("tr")
# 2. Get the second <tr>: limit caps the number of matches, and the specific
#    tag is then picked out of the returned list by index.
#    tr = soup.find_all("tr", limit=2)[1]
# 3. Find every <tr> whose class is "even". "class" is a Python keyword, so
#    the keyword argument is spelled with a trailing underscore.
#    trs = soup.find_all("tr", class_="even")
# 4. attrs takes a dict, so several attributes can be given as key/value pairs.
#    trs = soup.find_all("tr", attrs={"class": "even"})
# 5. Find every <a> tag with target="_blank"; more keyword filters can be added.
#    aList = soup.find_all("a", target="_blank")
# 6. Read the href attribute of every <a> tag.
#    aList = soup.find_all("a")
#    for a in aList:
#        # a) by subscript
#        href = a["href"]
#        # b) through the attrs dict
#        href = a.attrs["href"]

# Keys of each job record, in the same order as the table columns.
FIELDS = ("title", "category", "nums", "city", "pubtime")

# Collect every job posting, skipping the first row (the table header).
trs = soup.find_all("tr")[1:]
jobs = []
for tr in trs:
    # Read one text value per cell. Flattening the whole row with
    # tr.stripped_strings drops empty cells, which would shift every later
    # field into the wrong key (and then raise IndexError).
    cells = [td.get_text(strip=True) for td in tr.find_all("td")]
    if len(cells) < len(FIELDS):
        # Not a job row (e.g. a spacer or pager row); nothing to record.
        continue
    jobs.append(dict(zip(FIELDS, cells)))
print(jobs)
View Code

参考博客:https://www.cnblogs.com/zhangxinqi/p/9218395.html#_label5

猜你喜欢

转载自www.cnblogs.com/Guishuzhe/p/9835859.html