Estudio de rastreadores web (parte 4): optimización del código y de la eficiencia.
1. Optimización del código:
Utilice funciones propias (definidas con def) para estructurar el código y mejorar su legibilidad.
import requests
import time
import pymongo
def connect_mongo():  # open the database connection
    """Return the ``sheet_qunar`` collection of the ``qunar`` database.

    A client is opened against the MongoDB server at ``localhost:27017``.

    Returns:
        The ``sheet_qunar`` collection object of the ``qunar`` database.
    """
    mongo_client = pymongo.MongoClient('localhost', 27017)  # connect to the server
    # Look up the "qunar" database, then its "sheet_qunar" collection.
    return mongo_client['qunar']['sheet_qunar']
# Module-level collection handle; get_list() inserts its documents into it.
sheet_qunar = connect_mongo()
def get_json(url, timeout=30):  # fetch `url` and return its decoded JSON body
    """Download ``url`` with an HTTP GET and return the parsed JSON payload.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the crawler indefinitely.

    Returns:
        Whatever ``Response.json()`` decodes from the response body.
    """
    response = requests.get(url, timeout=timeout)
    # Fixed one-second pause after every request (presumably to throttle the crawl).
    time.sleep(1)
    return response.json()
# Number of products requested per page; also the step between page offsets.
_PAGE_SIZE = 20

# Product-list endpoint. `dep` is the departure city, `query` the destination,
# and `limit` is sent as "<offset>,<page size>".
_LIST_URL_TEMPLATE = (
    'https://touch.dujia.qunar.com/list'
    '?modules=list%2CbookingInfo%2CactivityDetail'
    '&dep={dep}&query={query}'
    '&dappDealTrace=true'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
    '&originalquery={query}'
    '&width=480&height=320&quality=90'
    '&limit={offset},{page_size}'
    '&includeAD=true&qsact=search&filterTagPlatform=mobile_touch'
)


def _list_url(dep, item, offset):
    """Build the product-list URL for one page starting at ``offset``."""
    return _LIST_URL_TEMPLATE.format(
        dep=dep, query=item, offset=offset, page_size=_PAGE_SIZE)


def get_list(dep, item):  # fetch the product list: dep = departure, item = destination
    """Download every product page for one route and store it in MongoDB.

    The first page is requested to read the total product count; that same
    response is then stored as the page at offset 0 instead of being
    downloaded a second time. The remaining pages are fetched in steps of
    ``_PAGE_SIZE``.

    Args:
        dep: Departure city, inserted into the ``dep`` query parameter.
        item: Destination, inserted into the ``query`` and ``originalquery``
            query parameters.

    Raises:
        KeyError: If the first response lacks ``data.limit.routeCount``.
    """
    time.sleep(1)
    first_page = get_json(_list_url(dep, item, 0))
    route_count = int(first_page["data"]["limit"]["routeCount"])  # total number of products

    for limit in range(0, route_count, _PAGE_SIZE):
        if limit == 0:
            page = first_page  # already downloaded above
        else:
            time.sleep(1)
            page = get_json(_list_url(dep, item, limit))

        # One stored document per downloaded page.
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        sheet_qunar.insert_one(result)  # insert the page into the collection
if __name__ == "__main__":  # true when this file is run directly, false when it is imported
    # Download the table of departure cities.
    url = 'https://touch.dujia.qunar.com/depCities.qunar'
    dep_dic = get_json(url)

    for dep_group in dep_dic["data"].values():
        for dep in dep_group:
            # Ask which destinations are recommended for this departure city.
            url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(dep)
            arrive_dic = get_json(url)

            arrive_city = []  # distinct destinations for the current departure, in first-seen order
            for module in arrive_dic["data"]:
                for sub_module in module["subModules"]:
                    for entry in sub_module["items"]:
                        destination = entry["query"]
                        if destination in arrive_city:
                            continue  # already recorded for this departure
                        arrive_city.append(destination)

            # Crawl the product list of every departure -> destination pair.
            for item in arrive_city:
                get_list(dep, item)
resultado de la operación
2. Optimización de la eficiencia del rastreador:
import requests
import time
import pymongo
def connect_mongo():  # open the database connection
    """Return the ``sheet_qunar`` collection of the ``qunar`` database.

    A client is opened against the MongoDB server at ``localhost:27017``.

    Returns:
        The ``sheet_qunar`` collection object of the ``qunar`` database.
    """
    mongo_client = pymongo.MongoClient('localhost', 27017)  # connect to the server
    # Look up the "qunar" database, then its "sheet_qunar" collection.
    return mongo_client['qunar']['sheet_qunar']
# Module-level collection handle; get_list() inserts its documents into it.
sheet_qunar = connect_mongo()
def get_json(url, timeout=30):  # fetch `url` and return its decoded JSON body
    """Download ``url`` with an HTTP GET and return the parsed JSON payload.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the crawler indefinitely.

    Returns:
        Whatever ``Response.json()`` decodes from the response body.
    """
    response = requests.get(url, timeout=timeout)
    # Fixed one-second pause after every request (presumably to throttle the crawl).
    time.sleep(1)
    return response.json()
# Number of products requested per page; also the step between page offsets.
_PAGE_SIZE = 20

# Product-list endpoint. `dep` is the departure city, `query` the destination,
# and `limit` is sent as "<offset>,<page size>".
_LIST_URL_TEMPLATE = (
    'https://touch.dujia.qunar.com/list'
    '?modules=list%2CbookingInfo%2CactivityDetail'
    '&dep={dep}&query={query}'
    '&dappDealTrace=true'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
    '&originalquery={query}'
    '&width=480&height=320&quality=90'
    '&limit={offset},{page_size}'
    '&includeAD=true&qsact=search&filterTagPlatform=mobile_touch'
)


def _list_url(dep, item, offset):
    """Build the product-list URL for one page starting at ``offset``."""
    return _LIST_URL_TEMPLATE.format(
        dep=dep, query=item, offset=offset, page_size=_PAGE_SIZE)


def get_list(dep, item):  # fetch the product list: dep = departure, item = destination
    """Download every product page for one route and store it in MongoDB.

    The first page is requested to read the total product count; that same
    response is then stored as the page at offset 0 instead of being
    downloaded a second time. The remaining pages are fetched in steps of
    ``_PAGE_SIZE``.

    Args:
        dep: Departure city, inserted into the ``dep`` query parameter.
        item: Destination, inserted into the ``query`` and ``originalquery``
            query parameters.

    Raises:
        KeyError: If the first response lacks ``data.limit.routeCount``.
    """
    time.sleep(1)
    first_page = get_json(_list_url(dep, item, 0))
    route_count = int(first_page["data"]["limit"]["routeCount"])  # total number of products

    for limit in range(0, route_count, _PAGE_SIZE):
        if limit == 0:
            page = first_page  # already downloaded above
        else:
            time.sleep(1)
            page = get_json(_list_url(dep, item, limit))

        # One stored document per downloaded page.
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        sheet_qunar.insert_one(result)  # insert the page into the collection
def get_all_data(dep):  # crawl all product data for one departure city
    """Crawl the product lists of every destination recommended for ``dep``.

    The recommendation endpoint is queried for ``dep``, the distinct
    ``query`` values found under ``data[*].subModules[*].items[*]`` are
    collected in first-seen order, and ``get_list`` is called once per
    destination.

    Args:
        dep: Departure city, inserted into the ``dep`` query parameter.
    """
    url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(dep)
    arrive_dic = get_json(url)

    arrive_city = []  # distinct destinations reachable from this departure
    for module in arrive_dic["data"]:
        for sub_module in module["subModules"]:
            for entry in sub_module["items"]:
                destination = entry["query"]
                if destination in arrive_city:
                    continue  # skip destinations that were already recorded
                arrive_city.append(destination)

    for item in arrive_city:
        get_list(dep, item)
dep_list = []  # every departure city; consumed by the multiprocess crawler
# Download the table of departure cities and flatten its groups into dep_list.
url = 'https://touch.dujia.qunar.com/depCities.qunar'
dep_dic = get_json(url)
for dep_group in dep_dic["data"].values():
    dep_list.extend(dep_group)
Guarde el código anterior como test.py (main.py lo importa con ese nombre) y, a continuación, cree un nuevo archivo Python llamado main.py que implemente el rastreador multiproceso:
Pool() acepta como argumento el número de procesos de trabajo que se desea crear; si no se indica, se crean tantos procesos como CPU tenga la máquina.
Nota: en Windows, el código que crea los procesos debe escribirse dentro de la instrucción if __name__ == '__main__': para que el módulo multiprocessing funcione correctamente. En Unix/Linux no es necesario.
from test import get_all_data
from test import dep_list
from multiprocessing import Pool
if __name__ == "__main__":  # only run when this file is executed directly
    # Pool() with no argument lets multiprocessing choose the worker count.
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open for the rest of the program.
    with Pool() as pool:
        pool.map(get_all_data, dep_list)  # one get_all_data(dep) call per departure city