实例网站:
- 猿人云
- 糗事百科
IP的获取
提取api地址
下面是代码的详解
请求模块
import requests
解析模块
from lxml import etree
日志模块
from loguru import logger
存储模块
from pymongo import MongoClient
获取代理
def proxiy():
    """Fetch one proxy address from the tunnel API and start the crawl with it.

    Requests a single plain-text entry from the proxy provider, builds a
    ``requests``-style proxies mapping from it and passes that mapping,
    together with the module-level ``base_url``, to ``content``.

    Raises:
        requests.RequestException: If the proxy API cannot be reached, times
            out, or answers with an HTTP error status.
    """
    logger.info("正在设置代理")
    # NOTE(review): the API id/secret are embedded in this URL; consider
    # moving them to configuration instead of keeping them in source.
    response = requests.get(
        "http://tunnel-api.apeyun.com/h?id=2020120800179741613&secret=t3XRovhQSQxmdhm1&limit=1&format=txt&auth_mode=hand",
        timeout=10,  # without a timeout requests may block indefinitely
    )
    response.raise_for_status()
    # The plain-text body may carry a trailing line break; left in place it
    # would end up inside the proxy URL and make it invalid.
    address = response.text.strip()
    proxy_url = 'http://' + address
    # Map both schemes: the crawled page is served over https, and requests
    # only applies a proxy whose key matches the scheme of the target URL.
    proxys = {
        'http': proxy_url,
        'https': proxy_url,
    }
    content(base_url, proxys)
获取内容
def content(url, proxy):
    """Download a listing page and store the text of every post found on it.

    Args:
        url: Address of the page to fetch.
        proxy: ``requests``-style proxies mapping used for the request.

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    logger.info("开始执行抓取程序")
    response = requests.get(url=url, headers=headers, proxies=proxy, timeout=10)
    # Fail loudly on an error page instead of silently parsing it and
    # finding nothing to save.
    response.raise_for_status()
    Html = etree.HTML(response.text)
    div_list = Html.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        fragments = div.xpath('./a/div[@class="content"]/span/text()')
        # xpath() returns a list of text nodes. str(list) would store the
        # list repr (brackets, quotes, escaped newlines), so join the
        # cleaned fragments into the actual post text instead.
        lines = [piece.strip() for piece in fragments]
        page_content = "\n".join(line for line in lines if line)
        if not page_content:
            # Child div without post text: nothing worth saving.
            continue
        print(page_content)
        saver(page_content)
def saver(page_conyent):
    """Insert one scraped post into the ``test`` collection of ``qiushi``.

    Args:
        page_conyent: Value stored under the ``"content"`` key of the new
            document.
    """
    # Open the connection as a context manager so it is closed after the
    # insert; this function runs once per post and would otherwise leave a
    # client open on every call.
    with MongoClient('localhost', 27017) as client:
        # "qiushi" is the database; "test" is the collection (the
        # equivalent of a table name).
        collection = client.qiushi.test
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        collection.insert_one({"content": page_conyent})
主函数
def main():
    """Run the scraper: log start-up, perform the crawl, log completion."""
    logger.info("程序已启动")
    # proxiy() obtains a proxy and then drives the fetch-and-save step.
    proxiy()
    logger.info("程序结束保存完成")
# Defined at module level (not inside the __main__ guard) so that proxiy()
# and content(), which read these names as globals, also work when this
# module is imported rather than run as a script.

# Listing page to crawl; read by proxiy().
base_url = 'https://www.qiushibaike.com/text/'
# Request headers sent with the page request; read by content().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36'
}


if __name__ == '__main__':
    main()