Python crawler: scrape every merchant in all of Dianping's administrative districts and save the results to Excel

import xlwt
import requests
from bs4 import BeautifulSoup

'''
Fetching the pages directly returns 403 (access denied), hence the
User-Agent and Cookie headers set in __main__ below.
'''

#entry page
start_url = 'https://www.dianping.com/search/category/344/10'

def get_content(url, headers=None):
    response = requests.get(url, headers=headers)  # issue a single request
    html = response.content
    return html
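
# The 403s mentioned above can also start appearing mid-crawl. A common
# mitigation is routing requests through a proxy; a minimal sketch (not
# wired into the crawl below), where the proxy address is a placeholder
# you would replace with a real one:
def get_content_via_proxy(url, headers=None, proxy='http://127.0.0.1:8888'):
    proxies = {'http': proxy, 'https': proxy}
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    return response.content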

'''
    Collect the URL of every administrative district's listing page
'''
def region_url(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # Sample markup:
    # <div id="region-nav" class="nc-items ">
    #   <a href="/search/category/344/10/r299"><span>芙蓉区</span></a>
    # list comprehension over every district link
    base_url = 'https://www.dianping.com'
    region_url_list = [base_url + i['href'] for i in soup.find('div', id="region-nav").find_all('a')]
    return region_url_list

#collect the detail-page URL of every shop on a listing page
#find: first match (returns a single element, or None if nothing matches)    find_all: every match (returns a list, or [] if nothing matches); a tiny demo follows the function
def get_shop_url(html):
    base_url = 'https://www.dianping.com'
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    shop_url_list = [base_url + i.find('a')['href'] for i in soup.find_all('div', class_='tit')]
    return shop_url_list
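
# A tiny self-contained demo of the find/find_all contract noted above
# (hypothetical markup, not Dianping's; never called by the crawl):
def _demo_find_vs_find_all():
    demo = BeautifulSoup('<div class="tit"><a href="/shop/1">A</a></div>', 'lxml')
    print(demo.find('div', class_='tit'))          # first match: a Tag
    print(demo.find('div', class_='missing'))      # no match: None
    print(demo.find_all('div', class_='missing'))  # no match: []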

#parse a detail page and extract the data we need (shop name, price, scores)
def get_detail(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # Shop name; the page also renders it as
    # <h1 class="shop-name">1911牛肉烤串</h1>, but here it is read from the
    # first <span> inside the breadcrumb
    title = soup.find('div', class_='breadcrumb').find('span').text
    # <span id="avgPriceTitle" class="item">人均:-</span>
    price = soup.find('span', id="avgPriceTitle").text
    # <span id="comment_score"><span class="item">口味:7.6</span><span class="item">环境:7.4</span><span class="item">服务:7.5</span></span>
    evaluation = soup.find('span', id="comment_score").find_all('span', class_="item")  # list of score spans (taste/environment/service)
    # <span id="reviewCount" class="item">3条评论</span>
    comments = soup.find('span', id="reviewCount").text  # number of reviews
    # <div class="expand-info address" itemprop="street-address">
    #     <span class="item" itemprop="street-address" title="麓松路南丰港安置小区12栋">
    #                  麓松路南丰港安置小区12栋
    #     </span>
    # </div>
    address = soup.find('span', class_="item", itemprop="street-address").text.strip()

    # Uncomment for debugging:
    # print('name: ' + title)
    # for ev in evaluation:
    #     print(ev.text)
    # print('price: ' + price)
    # print('reviews: ' + comments)
    # print('address: ' + address)
    return (title, evaluation[0].text, evaluation[1].text, evaluation[2].text, price, comments, address)
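
# Optional helper (a sketch, not wired into get_detail): the score spans come
# back as label:value strings such as "口味:7.6"; this strips the label,
# tolerating both ASCII and full-width colons:
def parse_score(text):
    return text.replace('：', ':').split(':')[-1].strip()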
  



#runs only when the file is executed as a script, not when it is imported by another script
if __name__ =='__main__':
    items = []
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie':'_hc.v=dd67ff67-20d0-6e83-7f61-ce93e4d46539.1503387665; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_cuid=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; _lxsdk=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; __utma=205923334.211352043.1503391484.1503391484.1503391484.1; __utmz=205923334.1503391484.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=29bc50ef1530ab64cbaa69b29cad64f39a_51868%3A1; s_ViewType=10; JSESSIONID=A49EED22A236962EA3506BA888799402; aburl=1; cy=344; cye=changsha; PHOENIX_ID=0a010918-15e0a223263-d4c1a92; __mta=146625163.1503391361571.1503401588676.1503408592089.10; _lxsdk_s=15e0a219034-38-9d5-acb%7C%7C37'
        }
    html = get_content(start_url)
    region_url_list = region_url(html)
    #walk every shop in every administrative district
    for url in region_url_list:  # each administrative district
        #crude error handling: skip the whole district on any error
        try:
            for n in range(1, 51):  # listing pages 1 through 50
                html = get_content(url + 'p' + str(n))
                #detail pages of every shop on this listing page
                shop_url_list = get_shop_url(html)
                for shop_url in shop_url_list:
                    # print(shop_url)
                    #fetch the detail page and extract its data
                    detail_html = get_content(shop_url, headers)
                    '''
                    403 Forbidden (access denied) shows up two ways:
                    (1) immediately, on the first request;
                    (2) after crawling for a while, which proxy IPs can work
                        around (see the sketch near get_content above).
                    Headers that matter: Referer (hotlink protection), Host, Cookie
                    '''
                    items.append(get_detail(detail_html))
        except Exception:
            continue
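    # A gentler mitigation for mid-crawl 403s (a sketch, not wired in): pause
    # between listing-page requests inside the loop above, e.g. add
    # "import time" at the top and time.sleep(1) after each get_content call.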
    new_table = r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('test1')
    # column headers: shop name, taste, environment, service, avg price, review count, address
    headData = ['商户名字','口味评分','环境评分','服务评分','人均价格','评论数量','地址']
    # bold header row
    for colnum in range(0, 7):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font:bold on'))
    # one row per shop, starting on the row below the header
    for index, item in enumerate(items, start=1):
        for i in range(0, 7):
            ws.write(index, i, item[i])

    wb.save(new_table)
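
If the crawl outgrows the .xls format (an xlwt sheet caps out at 65,536 rows), a CSV sink is a drop-in alternative; a minimal sketch using the standard csv module, with a placeholder output path:

import csv

with open('dzdp.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(headData)
    writer.writerows(items)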

