Python爬虫:使用代理IP抓取页面,结合BeautifulSoup解析并存入MongoDB

安装包准备:

requests、pymongo、beautifulsoup4(即bs4中的BeautifulSoup)、lxml(代码中BeautifulSoup使用的解析器)

实现代码

#coding=utf-8
import requests
import pymongo

from  bs4 import BeautifulSoup
from  conf.config import *
from  multiprocessing import Pool

# MongoDB handles shared by the whole module. MONGO_URL and MONGO_DB are not
# defined in this file; presumably they come from the `conf.config` star import.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Proxy address currently in use; get_html() prefixes it with 'http://' and
# replaces it whenever a request fails. None means "no proxy fetched yet".
proxy = None

# Retry bookkeeping for get_html(): max_count is the cut-off for the retry
# counter, count is the initial counter value that main() passes in.
max_count = 5
count = 0

# Request headers sent with every page download.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/24.0',
}

def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        The body of the ``/random`` endpoint when the service answers with
        HTTP 200, otherwise ``None`` (non-200 status, timeout, or any other
        request failure).
    """
    print('正在获取代理...')
    proxy_pool_url = 'http://127.0.0.1:5555/random'
    try:
        # A timeout keeps a hung proxy-pool service from blocking the worker.
        response = requests.get(proxy_pool_url, timeout=10)
    except (requests.exceptions.RequestException, ConnectionError):
        # requests raises its own exception hierarchy, which does not derive
        # from the builtin ConnectionError, so the builtin alone would never
        # match a failed request. The builtin is kept for backward
        # compatibility with the previous handler.
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index(htmlText):
    """Parse one list page and store every record found in it.

    Each element matching ``.table-plate3`` is turned into a dict and handed
    to ``save_to_mongo``.

    Args:
        htmlText: HTML of the list page. ``None`` or an empty string (what
            the downloader yields when a page could not be fetched) is
            ignored instead of being passed to the parser.

    Returns:
        None.
    """
    if not htmlText:
        # Nothing was downloaded; BeautifulSoup(None) would raise TypeError.
        return

    soup = BeautifulSoup(htmlText, 'lxml')
    for item in soup.select(".table-plate3"):
        # CSS selectors via select(): '#' selects by id, '.' by class.
        # The positional `contents[...]` lookups depend on the exact child
        # layout of each row (presumably cells separated by whitespace text
        # nodes - verify against the page markup if the site changes).
        data = {
            'projectUrl': item.select('.tp1 a')[0].attrs['href'],
            'titleImg': item.select('.tp1 a img')[0].attrs['src'],
            'projectName': item.select('.tp2 .tp2_tit a')[0].get_text(),
            'enterpriseName': item.select('.tp2 .tp2_com')[0].get_text(),
            'amt': item.select('.tp-mean .money')[0].get_text(),
            'turn': item.contents[7].get_text(),
            'Investors': item.select('.tp3')[0].attrs['title'],
            'Industry': item.contents[11].get_text(),
            'Date': item.contents[13].get_text(),
            'detailUrl': item.contents[15].select('a')[0].attrs['href'],
        }
        save_to_mongo(data)

def get_html(page, count=1):
    """Download one list page through the current proxy, retrying on failure.

    The module-level ``proxy`` is used for the request. When there is no
    proxy yet, the request raises, or the server answers with anything other
    than HTTP 200, a new proxy is fetched with ``get_proxy`` and the download
    is attempted again with ``count`` increased by one.

    Args:
        page: Page number inserted into the list URL.
        count: Retry counter; once it reaches ``max_count`` the function
            gives up.

    Returns:
        The page HTML as text, or ``None`` when the retry limit was reached.
    """
    global proxy
    url = 'http://www.cyzone.cn/event/list-764-0-%s-0-0-0-0/' % (page)
    print('42', url)
    # `>=` rather than `==` so a counter that starts above the limit also stops.
    if count >= max_count:
        print('重试次数大于5次了。')
        return None

    if not proxy:
        print('代理为NoneType,正在重新获取')
        proxy = get_proxy()
        print('proxy', proxy)
        return get_html(page, count + 1)

    print('代理存在')
    proxies = {
        'http': 'http://' + proxy
    }
    print('proxies', proxies)
    try:
        # Redirects are not followed, so a redirect answer surfaces as a
        # non-200 status below. The timeout prevents a dead proxy from
        # hanging the worker forever.
        response = requests.get(url, allow_redirects=False, headers=headers,
                                proxies=proxies, timeout=10)
    except Exception:
        # Deliberately broad: a bad proxy can fail in many different ways and
        # every one of them is handled the same way (rotate and retry).
        print('代理获取异常,正在重新获取')
        response = None

    if response is not None and response.status_code == 200:
        print('当前状态正常。')
        print("-->>当前页码%s" % page)
        return response.text

    if response is not None:
        # Previously any non-200 answer fell through and returned None
        # without rotating the proxy; treat it like a failed attempt instead.
        print('Response status:%s' % response.status_code)

    proxy = get_proxy()
    print('proxy', proxy)
    return get_html(page, count + 1)

def main(page):
    """Download and store a run of list pages; worker entry point for the pool.

    Args:
        page: Upper marker of the chunk. Pages ``page - 99`` up to, but not
            including, ``page - 1`` are processed.

    Pages that could not be downloaded (``get_html`` returned ``None``) are
    skipped so one failed page does not abort the rest of the chunk.
    """
    begin = page - 100 + 1
    end = page - 1
    # NOTE(review): range(begin, end) leaves out pages `page - 1` and `page`;
    # this looks like an off-by-one, but the intended chunk bounds are not
    # clear from this file, so the original range is kept unchanged.
    # NOTE(review): the __main__ block only passes multiples of 100, so this
    # branch is never taken there - confirm which chunk was meant to run up
    # to page 1226.
    if page == 13:
        end = 1226
    for i in range(begin, end):
        html = get_html(i, count)
        if html is None:
            # Retries exhausted for this page; parsing None would raise.
            print('第%s页获取失败,已跳过' % i)
            continue
        parse_index(html)

def save_to_mongo(result):
    """Store one parsed record in the MongoDB collection ``MONGO_TABLE``.

    Args:
        result: Document (dict) to insert. Empty or ``None`` input is not
            written.

    Returns:
        True when the document was inserted, False otherwise (including
        empty input).
    """
    if not result:
        return False
    # Collection.insert() is deprecated since PyMongo 3.0 and removed in
    # PyMongo 4.0; insert_one() is the supported single-document call.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('插入成功')
        return True
    return False
if __name__ == "__main__":
    # Fan the chunks out over a process pool. The context manager shuts the
    # worker processes down once map() has returned, instead of leaving the
    # pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, [i * 100 for i in range(2, 13)])
发布了35 篇原创文章 · 获赞 6 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/colby_chenlun/article/details/100123484