Scraping dynamic web pages with Python + Selenium

        This process is somewhat involved and breaks down into several steps, introduced one by one below:

1. Install the selenium library, which can be done directly with the command 'pip install selenium'.

2. Download ChromeDriver and add it to your PATH environment variable, or simply drop the .exe into the Scripts folder of your Python installation directory. Be sure to pick the version that matches your browser: check the browser version via the top-right menu -> Help -> About Google Chrome, then grab the matching driver from the ChromeDriver download page, which documents the mapping between driver and Chrome versions. To verify the installation, run the snippet below; if it raises no errors, the setup succeeded.

import selenium.webdriver as driver

index = driver.Chrome()  # launches Chrome via the chromedriver found on PATH
index.get('https://www.wdzj.com/dangan')
print(index)  # prints the WebDriver instance if everything worked
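
If you would rather not edit PATH, the Selenium 3-era API used throughout this post also accepts the driver location directly. A minimal sketch, where the chromedriver path is a placeholder you must adjust for your own machine:

import selenium.webdriver as driver

# Assumption: Selenium 3-style API; replace the placeholder path with
# wherever you unpacked chromedriver.exe
browser = driver.Chrome(executable_path='D:\\tools\\chromedriver.exe')
browser.get('https://www.wdzj.com/dangan')
print(browser.title)  # a non-empty title confirms the page actually loaded
browser.quit()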

3. Selenium offers several ways to locate elements for scraping; a brief introduction:

        The official documentation is at: http://selenium-python.readthedocs.io/locating-elements.html

        Methods for finding a single element:

  • find_element_by_id
  • find_element_by_name
  • find_element_by_xpath
  • find_element_by_link_text
  • find_element_by_partial_link_text
  • find_element_by_tag_name
  • find_element_by_class_name
  • find_element_by_css_selector

        Methods for finding multiple elements (these return a list; see the sketch after this list):

  • find_elements_by_name
  • find_elements_by_xpath
  • find_elements_by_link_text
  • find_elements_by_partial_link_text
  • find_elements_by_tag_name
  • find_elements_by_class_name
  • find_elements_by_css_selector
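
As a quick illustration of the two families, here is a minimal sketch run against the filing pages scraped in step 4. The XPath is borrowed from that code and reflects the page structure at the time of writing, so it may have gone stale:

import selenium.webdriver as driver

browser = driver.Chrome()
browser.get('https://www.wdzj.com/dangan/')

# find_element_by_* returns the first match and raises
# NoSuchElementException when nothing matches
first_link = browser.find_element_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
print(first_link.text)

# find_elements_by_* returns a list, empty when nothing matches
all_links = browser.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
print(len(all_links))
for a in all_links:
    print(a.text, a.get_attribute('href'))

browser.quit()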

4. With the basics in place, the crawling itself can begin (this post is still a work in progress, so here is partial code for now; more to follow)!

import selenium.webdriver as driver
import xlwt  # writes .xls spreadsheets

URL = 'https://www.wdzj.com/dangan/'
# KEYWORD = '银行存管'
def key_word():
    # index = driver.Chrome()
    # index.get(URL)
    # select_data = index.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
    # print(index.current_url)
    # keyword_index = index.find_element_by_link_text()
    # keyword_index.click()
    names = []
    banks = []
    tel_nums = []
    urls = []
    for i in range(0, 76):  # 76 listing pages at the time of writing
        page_url = URL + 'search?filter=e1&currentPage=' + str(i + 1)
        index_page = driver.Chrome()  # fresh browser window per listing page
        index_page.get(page_url)
        select_data = index_page.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')  # all platform links on the page
        print(index_page.current_url)
        for data in select_data:
            names.append(data.text)
            print(names)  # platform names
            sec_url = data.get_attribute("href")
            index_sec = driver.Chrome()  # second window for the detail page
            index_sec.get(sec_url)
            # print(index_sec.current_url)  # detail-page link
            yhcg = index_sec.find_element_by_xpath('/html/body/div[10]/div/div[1]/div[1]/dl[1]/dd[2]/div[2]')  # bank depository field
            banks.append(yhcg.text)
            # print(banks)  # bank depository
            tel_num = index_sec.find_element_by_link_text('联系方式')  # the "contact info" tab
            tel_num.click()
            number = index_sec.find_element_by_xpath('//*[@class="da-lxfs zzfwbox"]/dl[1]/dd[1]/div[2]')
            tel_nums.append(number.text)
            # print(tel_nums)  # customer-service phone
            yuming = index_sec.find_element_by_link_text('工商/备案')  # the "registration/filing" tab
            yuming.click()
            yu_beian = index_sec.find_element_by_xpath('//*[@class="lcen"]/table/tbody/tr[7]/td[2]')
            urls.append(yu_beian.text)
            print(urls)  # domain names
            index_sec.close()

        # print(page_url)
        # next_page = index.find_element_by_link_text('下一页')
        # next_page.click()
    return names, banks, tel_nums, urls

def xls():
    # collect everything first, then write the .xls in one pass at the end
    wb = xlwt.Workbook()
    ws = wb.add_sheet('numbers')
    ws.write(0, 0, '序号')      # header: serial number
    ws.write(0, 1, '公司名称')  # header: company name
    ws.write(0, 2, '银行存管')  # header: bank depository
    ws.write(0, 3, '客服电话')  # header: customer-service phone
    ws.write(0, 4, '公司域名')  # header: company domain
    names, banks, tel_nums, urls = key_word()
    print(len(names))
    for i in range(0, len(names)):
        ws.write(i + 1, 0, i+1)
        ws.write(i + 1, 1, names[i])
        ws.write(i + 1, 2, banks[i])
        ws.write(i + 1, 3, tel_nums[i])
        ws.write(i + 1, 4, urls[i])
    wb.save('D:\\number.xls')

def run():
    xls()
run()

Revised: the first draft only saves once at the very end, so a crash partway through loses everything already scraped, and it never closes the listing-page windows. The version below appends each record to the .xls as soon as it is collected (xlwt cannot modify a saved file, so the sheet is re-opened with xlrd and copied via xlutils.copy) and closes each listing-page window when done with it:

import selenium.webdriver as driver
import xlwt  # creates the .xls file
from xlutils.copy import copy  # produces a writable copy of an existing .xls
import xlrd  # reads the existing .xls back in

URL = 'https://www.wdzj.com/dangan/'
# KEYWORD = '银行存管'
def key_word():
    names = []
    banks = []
    tel_nums = []
    urls = []
    count = 0
    # create the spreadsheet with its header row up front,
    # so records can be appended one at a time below
    wb = xlwt.Workbook()
    ws = wb.add_sheet('numbers')
    ws.write(0, 0, '序号')      # header: serial number
    ws.write(0, 1, '公司名称')  # header: company name
    ws.write(0, 2, '银行存管')  # header: bank depository
    ws.write(0, 3, '客服电话')  # header: customer-service phone
    ws.write(0, 4, '公司域名')  # header: company domain
    wb.save('D:\\number.xls')
    for i in range(0, 76):  # 76 listing pages at the time of writing
        page_url = URL + 'search?filter=e1&currentPage=' + str(i + 1)
        index_page = driver.Chrome()  # fresh browser window per listing page
        index_page.get(page_url)
        select_data = index_page.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')  # all platform links on the page
        print(index_page.current_url)
        for data in select_data:
            names.append(data.text)
            print(names)  # platform names
            sec_url = data.get_attribute("href")
            index_sec = driver.Chrome()  # second window for the detail page
            index_sec.get(sec_url)
            # print(index_sec.current_url)  # detail-page link
            # revised XPath: locate the depository field by its text
            # instead of the brittle absolute /html/body/... path
            yhcg = index_sec.find_element_by_xpath('//*[@class="bgbox-bt zzfwbox"]/dl/dd/div[@class="r" and contains(text(),"存管")]')
            banks.append(yhcg.text)
            print(banks)  # bank depository
            tel_num = index_sec.find_element_by_link_text('联系方式')  # the "contact info" tab
            tel_num.click()
            number = index_sec.find_element_by_xpath('//*[@class="da-lxfs zzfwbox"]/dl[1]/dd[1]/div[2]')
            tel_nums.append(number.text)
            # print(tel_nums)  # customer-service phone
            yuming = index_sec.find_element_by_link_text('工商/备案')  # the "registration/filing" tab
            yuming.click()
            yu_beian = index_sec.find_element_by_xpath('//*[@class="lcen"]/table/tbody/tr[7]/td[2]')
            urls.append(yu_beian.text)
            print(urls)  # domain names
            # append this record to the existing file: xlwt cannot modify a
            # saved .xls, so re-open it with xlrd, copy it into a writable
            # workbook, write one row, and save again
            oldWb = xlrd.open_workbook('D:\\number.xls', formatting_info=True)
            newWb = copy(oldWb)
            news = newWb.get_sheet(0)
            news.write(count + 1, 0, count + 1)
            news.write(count + 1, 1, names[count])
            news.write(count + 1, 2, banks[count])
            news.write(count + 1, 3, tel_nums[count])
            news.write(count + 1, 4, urls[count])
            newWb.save('D:\\number.xls')
            print(count)
            count += 1
            index_sec.close()
        index_page.close()  # close the listing-page window before moving on
    return names, banks, tel_nums, urls

def run():
    key_word()
run()
