大众点评 —— 爬虫 小样例

版权声明:记事、留心、分享;如有侵权、违规情况等,请及时联系原创作者。 https://blog.csdn.net/qq_27297393/article/details/82924456

这是一个爬取大众点评数据的小样例,爬取的信息包含:是否含有广告推广、商店名称、电话号码;并将爬取的数据保存到 csv 文件以及 mysql 数据库。

数据采集视频:https://www.bilibili.com/video/av32892172/

github源码参考:https://github.com/hilqiqi0/crawler/tree/master/simple/dianping

采用技术:模拟浏览器(selenium + Chrome),BeautifulSoup数据提取,mysql等

爬取难点:网页中的店铺地址和电话号码有一部分无法选取、复制,因为网站采用图片位移、局部显示的技术来显示中文和数字。店铺地址有时候有,有时候没有(我在写代码前分析的时候是有的,现在写博文时找不着了;这就是爬虫,爬取的网页时刻在变化着);电话号码的数字图片参考地址:http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/shoptextcss/num.D9R9SROk2K.svg

源码:

import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

import myPymysql

# Start a Chrome session through Selenium. This is a module-level statement
# (not a function); the scraping script below drives this `browser` object.
browser = webdriver.Chrome()

def telparse(xx):
    """Convert the HTML of a shop's phone-number paragraph into plain text.

    The page does not print every digit as text: some digits are empty
    ``<span>`` elements whose CSS class selects a slice of a sprite image.
    This function strips the surrounding markup and substitutes each known
    sprite class with the digit it displays.

    Args:
        xx: The ``str()`` of the ``<p class="expand-info tel">`` element.
            Any other string is passed through the same replacements
            (e.g. ``"None"`` comes back unchanged).

    Returns:
        The text with markup removed, sprite classes replaced by digits, and
        runs of three spaces / leftover ``</span>`` tags turned into commas.
    """
    # Markup fragments to delete. Order matters: the longer, more specific
    # fragments must go before the shorter ones that are contained in them.
    markup_fragments = (
        '<p class="expand-info tel"> <span class="info-name">',
        '</span> <span class="',
        '"></span><span class="',
        '"></span>',
        '<span class="',
        ' </p>',
    )

    # Sprite class -> digit, with the sprite's background x-offset noted.
    # The table has no entry for the digit 1.
    # NOTE(review): these class names come from one snapshot of the site's
    # stylesheet and presumably change when it is regenerated -- confirm.
    sprite_digits = (
        ('fn-mcun', "2"),  # background: -8.0px -7.0px
        ('fn-PV9m', "6"),  # background: -22.0px -7.0px
        ('fn-YSfV', "8"),  # background: -36.0px -7.0px
        ('fn-kiQs', "5"),  # background: -50.0px -7.0px
        ('fn-SbXU', "3"),  # background: -64.0px -7.0px
        ('fn-xBtZ', "4"),  # background: -78.0px -7.0px
        ('fn-BARz', "0"),  # background: -92.0px -7.0px
        ('fn-QQA6', "7"),  # background: -106.0px -7.0px
        ('fn-Gypm', "9"),  # background: -120.0px -7.0px
    )

    # Whatever separates several phone numbers becomes a comma.
    separators = ('   ', '</span>')

    text = xx
    for fragment in markup_fragments:
        text = text.replace(fragment, "")
    for css_class, digit in sprite_digits:
        text = text.replace(css_class, digit)
    for separator in separators:
        text = text.replace(separator, ",")
    return text
    
# Main scraping script.
#
# Opens the Shenzhen hot-pot listing, and for each of the first 15 entries on
# every listing page: detects whether the entry is an advertisement, opens the
# shop's detail page in a new window, extracts the shop name and phone number,
# appends them to ./huoguo.csv and inserts them into the dinping.huoguo table.
# Stops after `number` listing pages.
url = "http://www.dianping.com/shenzhen/ch10/g110"
browser.get(url)
# Remember the listing window so detail windows can be told apart from it.
handle = browser.current_window_handle
print(handle)

time.sleep(5)
print("开始……")

counte = 1  # number of the listing page currently being scraped
next_index = 11  # position of the pager <a> element that is clicked to advance
number = 2  # total number of listing pages to scrape

try:
    while True:
        # Walk the shop entries of the current listing page.
        for i in range(1, 16):
            title_path = '//*[@id="shop-all-list"]/ul/li[' + str(i) + ']/div[2]/div[1]'
            shop_path = title_path + '/a[1]/h4'
            shop_path_a = title_path + '/a/h4'

            title = browser.find_element(By.XPATH, title_path).text
            print(title)
            ad = ""
            if "广告" in title:
                ad = "+"
                print("有广告")

            # Clicking the shop title opens the detail page in a new window.
            try:
                browser.find_element(By.XPATH, shop_path).click()
            except WebDriverException:
                # Fall back to the looser XPath when the first one cannot be
                # located or clicked.
                browser.find_element(By.XPATH, shop_path_a).click()
                print(shop_path_a)

            handles = browser.window_handles
            for newhandle in handles:
                # Only the newly opened detail window(s) are of interest.
                if newhandle == handle:
                    continue

                browser.switch_to.window(newhandle)
                content = browser.page_source.encode('utf-8')
                bsobj = BeautifulSoup(content, 'html5lib')

                name_list = bsobj.find_all("div", {"class": "basic-info default nug_shop_ab_pv-a"})
                for shop_name in name_list:
                    raw_name = shop_name.find("h1", {"class": "shop-name"}).get_text()
                    print(raw_name)
                    tel = telparse(str(shop_name.find("p", {"class": "expand-info tel"})))
                    print(tel)

                    # Keep only the first space-separated token of the heading
                    # and drop the label in front of the number.
                    name = raw_name.strip().split(" ")[0].strip()
                    tel = tel.replace('电话:', "")

                    # Append to the CSV file ("|" separated). newline="" is
                    # what the csv module requires; an explicit encoding keeps
                    # the output independent of the platform default.
                    with open("./huoguo.csv", "a", newline="", encoding="utf-8") as f:
                        writer = csv.writer(f, delimiter="|")
                        try:
                            writer.writerow([name, tel.replace('   ', ","), ad])
                        except (csv.Error, UnicodeError, OSError) as exc:
                            # Best effort: a failed CSV row must not stop the
                            # crawl, but it should not vanish silently either.
                            print("写入csv失败: " + str(exc))

                    # Insert into the database.
                    dbhelper = myPymysql.DBHelper()
                    sql = "INSERT INTO dinping.huoguo(name, tel, ad)VALUES(%s,%s,%s);"
                    params = (name, tel, ad)
                    result = dbhelper.execute(sql, params)
                    # NOTE(review): the return type of DBHelper.execute is not
                    # visible here, so the original comparison is kept as is.
                    if result == True:
                        print("插入成功")
                    else:
                        print("插入失败")

                time.sleep(5)
                # Close the detail window and go back to the listing window.
                browser.close()
                browser.switch_to.window(handle)

        if counte > (number - 1):
            print("总计爬取" + str(counte) + "页")
            print("结束")
            break

        print("下一页……")
        next_page = '/html/body/div[2]/div[3]/div[1]/div[2]/a[' + str(next_index) + ']'
        again_page = '/html/body/div[2]/div[3]/div[1]/div[2]/a[' + str(next_index - 1) + ']'
        try:
            browser.find_element(By.XPATH, next_page).click()
            next_index += 1
        except WebDriverException:
            # The pager link at next_index could not be located or clicked;
            # retry with the link just before it.
            browser.find_element(By.XPATH, again_page).click()
        time.sleep(10)
        counte += 1
finally:
    # quit() ends the whole driver session (all windows), also when the crawl
    # aborts with an exception; close() alone would only close one window.
    browser.quit()


猜你喜欢

转载自blog.csdn.net/qq_27297393/article/details/82924456
今日推荐