# Chengdu restaurant information crawler (成都美食信息爬虫)

from bs4 import BeautifulSoup 
from urllib import request
import pandas as pd
import numpy as np
import urllib.parse as urp
import time
import json
# Two opposite corners of the area to crawl, each as [latitude, longitude];
# loc_1 holds the smaller value on both axes.
loc_1 = [30.389814, 103.801536]
loc_2 = [30.836448, 104.299382]
# Edge length of one grid cell in degrees; chosen empirically as a
# reasonably suitable value.
step = 0.02
# Number of cells along each axis. The "+ 1" rounds up, so the grid extends
# slightly past loc_2 rather than leaving a strip uncovered.
lat_cells = int((loc_2[0] - loc_1[0]) / step + 1)
lon_cells = int((loc_2[1] - loc_1[1]) / step + 1)
# Each entry is one `step`-sized rectangle encoded as
# "low_lat,low_lon,high_lat,high_lon".
loc_fin = []
for lat_idx in range(1, lat_cells + 1):
    print(lat_idx)
    for lon_idx in range(1, lon_cells + 1):
        print(lon_idx)
        # Upper corner of the cell first, then step back for the lower one;
        # rounding to 6 decimals keeps float noise out of the strings.
        high_lat = round(loc_1[0] + step * lat_idx, 6)
        high_lon = round(loc_1[1] + step * lon_idx, 6)
        low_lat = round(high_lat - step, 6)
        low_lon = round(high_lon - step, 6)
        corners = (low_lat, low_lon, high_lat, high_lon)
        loc_fin.append(','.join(str(value) for value in corners))
# One row per grid cell: the bounds string plus a sequential id.
df = pd.DataFrame({'loc': loc_fin, 'id': np.arange(len(loc_fin))})
class RestrantInfo:
    """Collect restaurant POIs from the Baidu Place v2 search endpoint.

    The instance is given a DataFrame with a ``'loc'`` column holding
    ``"lat,lng,lat,lng"`` bounds strings (one rectangle per row).  For each
    requested row, :meth:`getinfo` queries the API for the keyword '美食'
    inside those bounds, walks every result page and accumulates one output
    row per POI in ``self._df_final``.
    """

    # Output columns, in the order they appear in the result frame.
    _COLUMNS = ['name', 'area', 'address', 'lng', 'lat', 'price', 'com_num',
                'key_word', 'tag', 'type', 'children', 'rating']
    # Page size sent to the API; also used to work out the page count.
    _PAGE_SIZE = 20

    def __init__(self, data, ak=None):
        """Store the grid frame and create the empty accumulator.

        Args:
            data: DataFrame whose ``'loc'`` column holds bounds strings.
            ak: Optional Baidu Maps API key used by :meth:`getinfo` when no
                key is passed to that call.
        """
        self._df = data
        self._ak = ak
        # 'rating' is included so the empty frame has the same columns as a
        # populated one.
        self._df_final = pd.DataFrame([], columns=self._COLUMNS)

    def _getjson(self, url):
        """Fetch ``url`` and return the decoded JSON payload."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as resp:
            return json.loads(resp.read().decode())

    @staticmethod
    def _field(mapping, key):
        """Return ``mapping[key]``, or NaN if it is absent or not indexable."""
        try:
            return mapping[key]
        except (KeyError, TypeError, IndexError):
            return np.nan

    def _getrating(self, detail):
        """Return the 'overall_rating' entry of ``detail`` or NaN."""
        return self._field(detail, 'overall_rating')

    def _getchild(self, detail):
        """Return the 'children' entry of ``detail`` or NaN."""
        # The original looked up 'childeren', which appears to be a typo and
        # made this column always NaN.
        # NOTE(review): confirm the field name against the API documentation.
        return self._field(detail, 'children')

    def _getcom_num(self, detail):
        """Return the 'comment_num' entry of ``detail`` or NaN."""
        return self._field(detail, 'comment_num')

    def _getword(self, detail):
        """Return the 'di_review_keyword' entry of ``detail`` or NaN."""
        return self._field(detail, 'di_review_keyword')

    def _gettag(self, detail):
        """Return the 'tag' entry of ``detail`` or NaN."""
        return self._field(detail, 'tag')

    def _gettype(self, detail):
        """Return the 'type' entry of ``detail`` or NaN."""
        return self._field(detail, 'type')

    def _getprice(self, detail):
        """Return the 'price' entry of ``detail`` or NaN."""
        return self._field(detail, 'price')

    def _build_url(self, bounds, page_num, ak):
        """Build the search URL for one page of one bounding rectangle."""
        return ('http://api.map.baidu.com/place/v2/search?query='
                + urp.quote('美食')
                + '&bounds=' + bounds
                + '&output=json&scope=2&page_size=' + str(self._PAGE_SIZE)
                + '&page_num=' + str(page_num)
                + '&ak=' + ak)

    def _page_to_frame(self, results):
        """Convert one page of API ``results`` into a DataFrame.

        Every POI becomes one row; any field missing from a POI is stored as
        NaN instead of discarding the whole page.
        """
        rows = []
        for poi in results:
            location = self._field(poi, 'location')
            detail = self._field(poi, 'detail_info')
            rows.append({
                'name': self._field(poi, 'name'),
                'area': self._field(poi, 'area'),
                'address': self._field(poi, 'address'),
                'lng': self._field(location, 'lng'),
                'lat': self._field(location, 'lat'),
                'price': self._getprice(detail),
                'com_num': self._getcom_num(detail),
                'key_word': self._getword(detail),
                'tag': self._gettag(detail),
                'type': self._gettype(detail),
                'children': self._getchild(detail),
                'rating': self._getrating(detail),
            })
        return pd.DataFrame(rows, columns=self._COLUMNS)

    def getinfo(self, start, end, ak=None):
        """Crawl grid rows ``start:end`` and return the accumulated frame.

        Args:
            start: Positional start of the slice taken from the grid index.
            end: Positional end (exclusive) of that slice.
            ak: Baidu Maps API key; falls back to the key given to the
                constructor.

        Returns:
            ``self._df_final``: every POI collected so far by this instance,
            including rows from earlier calls.

        Raises:
            ValueError: if no API key is available.
        """
        my_ak = ak if ak is not None else self._ak
        if not my_ak:
            raise ValueError('a Baidu Maps AK is required: pass ak= to '
                             'RestrantInfo(...) or getinfo(...)')
        for i in self._df.index[start:end]:
            print(i)
            bounds = self._df.loc[i, 'loc']
            # The first page also carries the total hit count, so it is
            # fetched once and reused as page 0 below.
            first_page = self._getjson(self._build_url(bounds, 0, my_ak))
            if 'total' not in first_page:
                # Error payloads carry no 'total'; report and move on.
                print('no result count for cell', i, '- status:',
                      first_page.get('status'), first_page.get('message'))
                continue
            total = int(first_page['total'])
            if total <= 0:
                continue
            # Ceiling division: no trailing empty page when total is an
            # exact multiple of the page size.
            n_pages = (total + self._PAGE_SIZE - 1) // self._PAGE_SIZE
            frames = []
            for j in range(n_pages):
                try:
                    if j == 0:
                        page = first_page
                    else:
                        page = self._getjson(
                            self._build_url(bounds, j, my_ak))
                    # A fresh frame per page: reusing one frame aligned short
                    # pages against stale rows and appended all-NaN rows.
                    frame = self._page_to_frame(page['results'])
                except Exception as exc:
                    # Best-effort crawl: skip the bad page but say so.
                    print('skipped cell', i, 'page', j, repr(exc))
                    continue
                if not frame.empty:
                    frames.append(frame)
            if frames:
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported replacement.
                if self._df_final.empty:
                    parts = frames
                else:
                    parts = [self._df_final] + frames
                self._df_final = pd.concat(parts, ignore_index=True)
        return self._df_final

猜你喜欢

转载自blog.csdn.net/weixin_41968760/article/details/80927248
今日推荐