代码如下
import requests
from lxml import etree
# Fetch the whole Baidu result page for a user-supplied query, then print the
# title, abstract and link of up to ten organic results.

# --- Fetch the page ---
words = input("输入搜索内容:")
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# Query-string parameters: 'wd' is the search text; 'pn' (not set here) would
# select which page of results to fetch.
date = {'wd': words}
# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(
    'https://www.baidu.com/s',
    headers=headers,
    params=date,
    timeout=10,
)
result = response.text

# --- Extract the data ---
html = etree.HTML(result, parser=etree.HTMLParser())
# Result titles; matching on these class values filters out the ads.
titles = html.xpath("//h3[@class='t' or @class='t c-title-en' or @class='t c-gap-bottom-small']")
# Result abstracts (short descriptions).
abstracts = html.xpath("//div[@class='c-abstract' or @class='c-abstract c-abstract-en']")
# Full URL of each result.
links = html.xpath("//div[@class='f13']/a[@class='c-showurl']/@href")

# The three lists are matched independently and may differ in length (or be
# shorter than ten), so walk them in lockstep and stop at the shortest instead
# of indexing a fixed range(10), which raised IndexError on short pages.
for title_node, abstract_node, link in zip(titles[:10], abstracts[:10], links[:10]):
    title = title_node.xpath("string()")
    # "text()" yields the list of the element's direct text nodes, so the
    # abstract is printed as a list of fragments.
    abstract = abstract_node.xpath("text()")
    print('='*3)
    print(title)
    print(abstract)
    print(link)
    print('='*3)
结果显示如下: