Setting up a proxy IP with the requests library to build a crawler

Example sites:

  • 猿人云 (Apeyun, the proxy provider)
  • 糗事百科 (Qiushibaike, the site being scraped)

Getting the proxy IP (screenshot of the 猿人云 console omitted)
Extracting the API address (screenshot omitted)
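The extraction API returns the proxy as plain text. A minimal sketch of calling it, assuming format=txt with limit=1 yields a single ip:port line (the id and secret below are placeholders, use the values shown in your own console):

import requests

# Placeholder extraction URL; substitute the id/secret from your 猿人云 console
api_url = "http://tunnel-api.apeyun.com/h?id=YOUR_ID&secret=YOUR_SECRET&limit=1&format=txt&auth_mode=hand"

ip_port = requests.get(api_url).text.strip()   # e.g. "117.69.12.34:4215"
proxies = {
    "http": "http://" + ip_port,
    "https": "http://" + ip_port,
}
print(proxies)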
Below is a detailed walkthrough of the code.

# Request module
import requests
# Parsing module (lxml for XPath)
from lxml import etree
# Logging module
from loguru import logger
# Storage module (MongoDB client)
from pymongo import MongoClient

Getting the proxy

def get_proxy():
    logger.info("Setting up the proxy")
    # Ask the 猿人云 tunnel API for one proxy in plain-text format
    response = requests.get("http://tunnel-api.apeyun.com/h?id=2020120800179741613&secret=t3XRovhQSQxmdhm1&limit=1&format=txt&auth_mode=hand")
    res = response.text.strip()
    # Route both http and https requests through the proxy
    proxys = {
        'http': 'http://' + res,
        'https': 'http://' + res
    }
    content(base_url, proxys)
    
Getting the content

def content(url, proxy):
    logger.info("Starting the scraping job")
    response = requests.get(url=url, headers=headers, proxies=proxy)
    html = etree.HTML(response.text)
    # Each post sits in its own div under the listing container
    div_list = html.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # The post text is split across span nodes, so join them into one string
        page_content = div.xpath('./a/div[@class="content"]/span/text()')
        page_content = ''.join(page_content).strip()
        print(page_content)
        saver(page_content)
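Tunnel proxies expire or drop connections from time to time, so a request through one can raise requests.exceptions.ProxyError. The script above does not handle this; a small sketch of a retry wrapper (the fetch_with_retry helper is illustrative, not part of the original code):

def fetch_with_retry(url, headers, proxy, attempts=3):
    # Retry a few times because a tunnel proxy may expire or refuse the connection
    for _ in range(attempts):
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except (requests.exceptions.ProxyError, requests.exceptions.ConnectTimeout):
            continue
    raise RuntimeError("all attempts through the proxy failed")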
def saver(page_content):
    # Open a connection to the local MongoDB server
    client = MongoClient('localhost', 27017)
    # qiushi is the database
    db = client.qiushi
    # test is the collection (the equivalent of a table)
    collection = db.test
    # Insert one document per post
    collection.insert_one({"content": page_content})
    
Main function

def main():
    logger.info("Program started")
    get_proxy()
    logger.info("Program finished; data has been saved")

if __name__ == '__main__':
    base_url = 'https://www.qiushibaike.com/text/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36'
    }
    main()

Reposted from blog.csdn.net/h1751541643/article/details/114376636