Crawler study, part 4: code and efficiency optimization
Part one — code optimization:
Use custom functions (def) to structure the code and improve readability.
import requests
import time
import pymongo
def connect_mongo():
    """Connect to the local MongoDB server and return the 'sheet_qunar'
    collection inside the 'qunar' database (both created lazily on first write)."""
    mongo_client = pymongo.MongoClient('localhost', 27017)  # open the connection
    qunar_db = mongo_client['qunar']                        # database named "qunar"
    return qunar_db['sheet_qunar']                          # collection "sheet_qunar"
# Module-level handle to the MongoDB collection; written to by get_list() below.
sheet_qunar = connect_mongo()
def get_json(url):
    """Fetch *url* over HTTP and return the parsed JSON body.

    Every call pauses for one second afterwards, which throttles the whole
    crawl to roughly one request per second.

    Fixes: the response variable no longer shadows the builtin ``str``, and a
    timeout is supplied so an unresponsive server cannot hang the crawler forever.
    """
    response = requests.get(url, timeout=10)
    time.sleep(1)  # crude global rate limit between consecutive requests
    return response.json()
def _build_list_url(dep, item, limit):
    """Return the product-list API URL for departure *dep*, destination *item*,
    and page offset *limit* (the API serves 20 products per page)."""
    return ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail'
            '&dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1'
            '%E8%A1%8C&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
            '&originalquery={}&width=480&height=320&quality=90&limit={},20'
            '&includeAD=true&qsact=search&filterTagPlatform=mobile_touch'
            ).format(dep, item, item, limit)


def get_list(dep, item):
    """Crawl every product page for the route *dep* -> *item* and insert one
    document per page into the ``sheet_qunar`` MongoDB collection.

    Fixes versus the original: the near-identical 11-line URL literal that was
    pasted twice (once with a hard-coded ``limit=0``) is built by a single
    helper; the builtin ``str`` is no longer shadowed; the redundant extra
    ``time.sleep(1)`` is dropped because ``get_json`` already sleeps once per
    request.
    """
    # First request only to learn how many products this route has.
    first_page = get_json(_build_list_url(dep, item, 0))
    route_count = int(first_page["data"]["limit"]["routeCount"])
    # Page through all products, 20 per request.
    for limit in range(0, route_count, 20):
        page = get_json(_build_list_url(dep, item, limit))
        # One stored document per page of results.
        record = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        sheet_qunar.insert_one(record)
if __name__ == "__main__":  # True when this file is run directly, False when imported
    # Endpoint listing every departure city, grouped into sections.
    dep_url = 'https://touch.dujia.qunar.com/depCities.qunar'
    dep_dic = get_json(dep_url)
    for dep_group in dep_dic["data"]:
        for dep in dep_dic["data"][dep_group]:
            arrive_url = ('https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
                          '?dep={}&exclude=&extensionImg=255,175').format(dep)
            arrive_dic = get_json(arrive_url)
            arrive_city = []  # destinations reachable from this departure city, de-duplicated
            for module in arrive_dic["data"]:
                for sub_module in module["subModules"]:
                    for entry in sub_module["items"]:
                        if entry["query"] not in arrive_city:
                            arrive_city.append(entry["query"])
            for item in arrive_city:
                get_list(dep, item)
Execution result:
Part two — crawler efficiency optimization:
import requests
import time
import pymongo
def connect_mongo():
    """Open a connection to local MongoDB and return the 'sheet_qunar'
    collection of the 'qunar' database."""
    # Chained subscripts: client -> "qunar" database -> "sheet_qunar" collection.
    return pymongo.MongoClient('localhost', 27017)['qunar']['sheet_qunar']
# Module-level handle to the MongoDB collection; written to by get_list() below.
sheet_qunar = connect_mongo()
def get_json(url):
    """Fetch *url* over HTTP and return the parsed JSON body.

    Every call pauses for one second afterwards, which throttles the whole
    crawl to roughly one request per second.

    Fixes: the response variable no longer shadows the builtin ``str``, and a
    timeout is supplied so an unresponsive server cannot hang the crawler forever.
    """
    response = requests.get(url, timeout=10)
    time.sleep(1)  # crude global rate limit between consecutive requests
    return response.json()
def _build_list_url(dep, item, limit):
    """Return the product-list API URL for departure *dep*, destination *item*,
    and page offset *limit* (the API serves 20 products per page)."""
    return ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail'
            '&dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1'
            '%E8%A1%8C&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
            '&originalquery={}&width=480&height=320&quality=90&limit={},20'
            '&includeAD=true&qsact=search&filterTagPlatform=mobile_touch'
            ).format(dep, item, item, limit)


def get_list(dep, item):
    """Crawl every product page for the route *dep* -> *item* and insert one
    document per page into the ``sheet_qunar`` MongoDB collection.

    Fixes versus the original: the near-identical 11-line URL literal that was
    pasted twice (once with a hard-coded ``limit=0``) is built by a single
    helper; the builtin ``str`` is no longer shadowed; the redundant extra
    ``time.sleep(1)`` is dropped because ``get_json`` already sleeps once per
    request.
    """
    # First request only to learn how many products this route has.
    first_page = get_json(_build_list_url(dep, item, 0))
    route_count = int(first_page["data"]["limit"]["routeCount"])
    # Page through all products, 20 per request.
    for limit in range(0, route_count, 20):
        page = get_json(_build_list_url(dep, item, limit))
        # One stored document per page of results.
        record = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        sheet_qunar.insert_one(record)
def get_all_data(dep):
    """Crawl every product for every destination reachable from departure
    city *dep*.

    This is the worker function handed to the multiprocessing Pool in
    main.py, so it takes a single argument and returns nothing.

    Fix: destination de-duplication used ``query not in list`` — an O(n) scan
    per item, O(n^2) overall. A companion set gives O(1) membership tests
    while the list still preserves first-seen order.
    """
    url = ('https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
           '?dep={}&exclude=&extensionImg=255,175').format(dep)
    arrive_dic = get_json(url)
    seen = set()       # fast membership checks
    arrive_city = []   # destinations in first-seen order, no duplicates
    for module in arrive_dic["data"]:
        for sub_module in module["subModules"]:
            for entry in sub_module["items"]:
                query = entry["query"]
                if query not in seen:
                    seen.add(query)
                    arrive_city.append(query)
    for item in arrive_city:
        get_list(dep, item)
# List of every departure city, consumed by the multiprocess crawler in main.py.
# Built at import time so that `from test import dep_list` hands the Pool its work items.
url = 'https://touch.dujia.qunar.com/depCities.qunar'  # endpoint listing all departure cities
dep_dic = get_json(url)
# Flatten the grouped response: every city in every group, in API order.
dep_list = [dep for dep_item in dep_dic["data"] for dep in dep_dic["data"][dep_item]]
Create a new Python file named main.py (it implements the multi-process crawler).
Pool() accepts a custom number of worker processes; when not set, it defaults to the number of CPU cores.
Note: on Windows, any code that starts processes with the multiprocessing module must be placed under the `if __name__ == '__main__':` guard so that child processes can safely re-import the module. This is not required on Unix/Linux.
from test import get_all_data
from test import dep_list
from multiprocessing import Pool
if __name__=="__main__":  # guard required on Windows so multiprocessing can re-import this module safely
    # Pool() with no argument defaults to one worker process per CPU core.
    pool = Pool()
    # Fan the departure cities out across the workers: each worker process
    # runs get_all_data(dep) for its share of dep_list.
    pool.map(get_all_data,dep_list)