百度搜索结果爬虫

代码如下:

import requests
from lxml import etree

# Fetch the whole search-results page for the terms typed by the user.
words = input("输入搜索内容:")
# Send a desktop-browser User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# Query-string parameters: `wd` carries the search terms. Per the original
# author's note, a `pn` parameter selects which page of results to fetch
# (not used here; exact semantics not verified in this file).
date = {'wd': words}
# Bound the wait so a stalled connection cannot hang the script forever.
response = requests.get('https://www.baidu.com/s', headers=headers, params=date, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page as results.
response.raise_for_status()
result = response.text


# Extract the data from the downloaded page.
html = etree.HTML(result, parser=etree.HTMLParser())
# Result titles; the class filter is intended to skip ads (per the original author's note).
titles = html.xpath("//h3[@class='t' or @class='t c-title-en' or @class='t c-gap-bottom-small']")
# Result summaries.
abstracts = html.xpath("//div[@class='c-abstract' or @class='c-abstract c-abstract-en']")
# Full URL of each result.
links = html.xpath("//div[@class='f13']/a[@class='c-showurl']/@href")

# The three lists are queried independently and paired purely by position.
# zip() stops at the shortest one, so a page with fewer than 10 matches no
# longer raises IndexError; the [:10] slices keep the original cap of 10.
# NOTE(review): if a result lacks an abstract or link, later entries can be
# paired with the wrong title — positional pairing cannot detect that.
for title_node, abstract_node, link in zip(titles[:10], abstracts[:10], links[:10]):
    title = title_node.xpath("string()")
    # string() concatenates all descendant text, so text inside child elements
    # is kept and a plain string is printed rather than a list of fragments
    # (which is what "text()" returned).
    abstract = abstract_node.xpath("string()")
    print('='*3)
    print(title)
    print(abstract)
    print(link)
    print('='*3)

结果显示如下: 

猜你喜欢

转载自blog.csdn.net/qq_40727267/article/details/85260198