[Crawler learning 5] Python large-scale crawler case: crawling product data from an e-commerce website (2): code and efficiency optimization

Crawler learning 4: code and efficiency optimization


1. Code optimization:

Use def to split the script into custom functions, which improves the code's readability:

import requests
import time
import pymongo

def connect_mongo():  # connect to MongoDB
    client = pymongo.MongoClient('localhost', 27017)  # open the connection
    book_qunar = client['qunar']  # use (and lazily create) the database "qunar"
    return book_qunar['sheet_qunar']  # the collection "sheet_qunar" in that database

sheet_qunar = connect_mongo()

def get_json(url):  # fetch url and return the response parsed as JSON
    response = requests.get(url)
    time.sleep(1)
    return response.json()

def get_list(dep, item):  # fetch the product list: dep is the departure city, item is the destination
    url = 'https://touch.dujia.qunar.com/' \
          'list?modules=list%2CbookingInfo%2' \
          'CactivityDetail&dep={}&query={}&' \
          'dappDealTrace=true&mobFunction=%E' \
          '6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1' \
          '%E8%A1%8C&cfrom=zyx&it=dujia_hy_dest' \
          'ination&date=&needNoResult=true&origina' \
          'lquery={}&width=480&height' \
          '=320&quality=90&limit=0,' \
          '20&includeAD=true&qsact=search&' \
          'filterTagPlatform=mobile_touch'.format(dep, item, item)
    time.sleep(1)
    json_data = get_json(url)
    routeCount = int(json_data["data"]["limit"]["routeCount"])  # total product count (this first request is only used to read it)
    for limit in range(0, routeCount, 20):  # fetch product info in pages of 20
        url = 'https://touch.dujia.qunar.com/' \
              'list?modules=list%2CbookingInfo%2' \
              'CactivityDetail&dep={}&query={}&' \
              'dappDealTrace=true&mobFunction=%E' \
              '6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1' \
              '%E8%A1%8C&cfrom=zyx&it=dujia_hy_dest' \
              'ination&date=&needNoResult=true&origina' \
              'lquery={}&width=480&height' \
              '=320&quality=90&limit={},' \
              '20&includeAD=true&qsact=search&' \
              'filterTagPlatform=mobile_touch'.format(dep, item, item, limit)
        time.sleep(1)
        json_data = get_json(url)
        # document structure for one page of products
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': json_data
        }
        sheet_qunar.insert_one(result)  # insert the record into the collection


if __name__ == "__main__":  # True when this file is run directly, False when it is imported
    url = 'https://touch.dujia.qunar.com/depCities.qunar'  # URL of the departure-city list
    dep_dic = get_json(url)
    for dep_item in dep_dic["data"]:
        for dep in dep_dic["data"][dep_item]:
            url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(dep)
            arrive_dic = get_json(url)
            arrive_city = []  # all destinations reachable from the current departure city
            for arr_item in arrive_dic["data"]:
                for arr_item_1 in arr_item["subModules"]:
                    for query in arr_item_1["items"]:
                        if query["query"] not in arrive_city:  # keep destinations for this departure city unique
                            arrive_city.append(query["query"])

            for item in arrive_city:
                get_list(dep, item)
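Before running it, one further cleanup worth considering (my own suggestion, not from the original post): the long percent-encoded URL in get_list is assembled by string concatenation, which is error-prone. A cleaner alternative is to pass a params dict and let requests encode the query string. build_list_url_params below is a hypothetical helper; its fields are read off the URL above, and the sketch assumes the endpoint accepts standard percent-encoding:

import requests

def build_list_url_params(dep, query, limit=0):
    # hypothetical helper: field names and values copied from the URL in get_list
    return {
        'modules': 'list,bookingInfo,activityDetail',
        'dep': dep,
        'query': query,
        'dappDealTrace': 'true',
        'mobFunction': '扩展自由行',  # the %E6%89%A9... sequence in the URL, decoded
        'cfrom': 'zyx',
        'it': 'dujia_hy_destination',
        'date': '',
        'needNoResult': 'true',
        'originalquery': query,
        'width': 480,
        'height': 320,
        'quality': 90,
        'limit': '{},20'.format(limit),  # page offset, page size
        'includeAD': 'true',
        'qsact': 'search',
        'filterTagPlatform': 'mobile_touch',
    }

# requests builds and percent-encodes the query string itself
response = requests.get('https://touch.dujia.qunar.com/list',
                        params=build_list_url_params('北京', '三亚'))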

Run result:

[screenshot: documents inserted into the sheet_qunar collection]
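To confirm the records were written, the collection can also be queried directly; a minimal check, assuming MongoDB is still running on localhost:27017:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
sheet = client['qunar']['sheet_qunar']
print(sheet.count_documents({}))  # number of product pages stored so far
print(sheet.find_one({}, {'dep': 1, 'arrive': 1, 'limit': 1}))  # one sample record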


2. Crawler efficiency optimization:

The crawl can be parallelized across departure cities. First, the per-departure-city work is wrapped in a function, get_all_data, so that the multiprocessing module can later map it over the list of departure cities:

import requests
import time
import pymongo

def connect_mongo():  # connect to MongoDB
    client = pymongo.MongoClient('localhost', 27017)  # open the connection
    book_qunar = client['qunar']  # use (and lazily create) the database "qunar"
    return book_qunar['sheet_qunar']  # the collection "sheet_qunar" in that database

sheet_qunar = connect_mongo()

def get_json(url):  # fetch url and return the response parsed as JSON
    response = requests.get(url)
    time.sleep(1)
    return response.json()

def get_list(dep, item):  # fetch the product list: dep is the departure city, item is the destination
    url = 'https://touch.dujia.qunar.com/' \
          'list?modules=list%2CbookingInfo%2' \
          'CactivityDetail&dep={}&query={}&' \
          'dappDealTrace=true&mobFunction=%E' \
          '6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1' \
          '%E8%A1%8C&cfrom=zyx&it=dujia_hy_dest' \
          'ination&date=&needNoResult=true&origina' \
          'lquery={}&width=480&height' \
          '=320&quality=90&limit=0,' \
          '20&includeAD=true&qsact=search&' \
          'filterTagPlatform=mobile_touch'.format(dep, item, item)
    time.sleep(1)
    json_data = get_json(url)
    routeCount = int(json_data["data"]["limit"]["routeCount"])  # total product count (this first request is only used to read it)
    for limit in range(0, routeCount, 20):  # fetch product info in pages of 20
        url = 'https://touch.dujia.qunar.com/' \
              'list?modules=list%2CbookingInfo%2' \
              'CactivityDetail&dep={}&query={}&' \
              'dappDealTrace=true&mobFunction=%E' \
              '6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1' \
              '%E8%A1%8C&cfrom=zyx&it=dujia_hy_dest' \
              'ination&date=&needNoResult=true&origina' \
              'lquery={}&width=480&height' \
              '=320&quality=90&limit={},' \
              '20&includeAD=true&qsact=search&' \
              'filterTagPlatform=mobile_touch'.format(dep, item, item, limit)
        time.sleep(1)
        json_data = get_json(url)
        # document structure for one page of products
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': json_data
        }
        sheet_qunar.insert_one(result)  # insert the record into the collection

def get_all_data(dep):  # crawl all product data for one departure city
    url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(dep)
    arrive_dic = get_json(url)
    arrive_city = []  # all destinations reachable from the current departure city
    for arr_item in arrive_dic["data"]:
        for arr_item_1 in arr_item["subModules"]:
            for query in arr_item_1["items"]:
                if query["query"] not in arrive_city:  # keep destinations for this departure city unique
                    arrive_city.append(query["query"])
    for item in arrive_city:
        get_list(dep, item)


dep_list = []  # list of all departure cities, used by the multi-process crawler
url = 'https://touch.dujia.qunar.com/depCities.qunar'  # URL of the departure-city list
dep_dic = get_json(url)
for dep_item in dep_dic["data"]:
    for dep in dep_dic["data"][dep_item]:
        dep_list.append(dep)

Save the code above as test.py (the imports below assume that filename), then create a new Python file named main.py that implements the multi-process crawler.

Pool() accepts the number of worker processes as an argument; if none is given, it defaults to the number of CPU cores on the machine.

Note: on Windows, code that starts processes must be placed under the if __name__ == '__main__': guard for the multiprocessing module to work correctly; this is not required on Unix/Linux.

from test import get_all_data, dep_list
from multiprocessing import Pool

if __name__ == "__main__":  # guard required on Windows, see the note above
    pool = Pool()
    pool.map(get_all_data, dep_list)

[screenshot: run result of the multi-process crawler]
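Since every request in get_json also sleeps for one second, one worker per CPU core is not necessarily the best choice. A variant of main.py (my own sketch; the pool size 4 is an arbitrary example value) caps the worker count and closes the pool cleanly when the work is done:

from multiprocessing import Pool
from test import get_all_data, dep_list

if __name__ == "__main__":
    with Pool(processes=4) as pool:  # 4 worker processes; an illustrative value
        pool.map(get_all_data, dep_list)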

Origin: blog.csdn.net/weixin_45260385/article/details/108931555