python-- crawling learning and thinking official website


```python
import re
import time
import pandas  as pds
import numpy
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

browser = webdriver.Chrome()  #驱动谷歌浏览器

#进入网站
def enter(url,element):
     wait = WebDriverWait(browser, 2)
     try:
         browser.get(url)
         wait.until(
             EC.presence_of_element_located((By.XPATH,element)),
         )
     except TimeoutException:
          result = "在"+url+'\n'+'未定位到'+element
          print(result)

#获取节点的文本信息
def get_detail(element):    
    try:
        elements = browser.find_element_by_xpath(element)
        detail = elements.text
    except :
        detail = "无"
    return detail

#获取节点的属性信息
def get_element_attribute(element, attribute):
    elements = browser.find_element_by_xpath(element)
    return elements.get_attribute(attribute)

#点击节点
def click_element(element):
    elements = browser.find_element_by_xpath(element).click()
    
#输入内容并回车
def send_word(element,text):
    elements = browser.find_element_by_xpath(element)
    elements.send_keys(text)
    elements.send_keys(Keys.ENTER)

def clear_word(element):
    elements = browser.find_element_by_xpath(element).clear()
     
def get_ele_cnt(element):
     lis = browser.find_elements_by_xpath(element)
     return len(lis)
    
#获取A年级有多少条,多少页数据
def get_each_class(element1,element2):
    m = get_detail(element1)
    lis = browser.find_elements_by_xpath(element2)
    n = lis[-1].text
    return m,n

#获取每个班级的详细信息
def get_class_detail(element):
     classname         = get_detail(element+'//div[@class="item_header"]/div[1]')
     teaching_mode = get_detail(element+'//div[@class="item_header"]/div[2]')
     dtbegindate      = get_detail(element+'//div[@class="item_info"]/span[1]')
     dtdate               = get_detail(element+'//div[@class="item_info"]/span[2]')
     address             = get_detail(element+'//div[@class="item_info"]/span[3]')
     teacher_main    = get_detail(element+'//div[@class="teacher"]/div[@class="teacher_main"]')
     teacher_vice     = get_detail(element+'//div[@class="teacher"]/div[@class="teacher_vice"]')
     if teaching_mode=="在线":
          teacher_tag      = get_detail(element+'//div[@class="teacher"]/div[@class="remain_tag"]')
     else:
          teacher_tag      = get_detail(element+'//div[@class="teacher"]/div[@class="teacher_tag"]')
     class_price        = get_detail(element+'//div[@class="item_footer"]/div[@class="left"]')
     return classname,teaching_mode,dtbegindate,dtdate,address,teacher_main,teacher_vice,teacher_tag,class_price

#添加部门,年级
def sdept_grade(i,j):
     if i == 1:
         sdept = "幼儿部"
         if j == 1:
             classtype = "托班"
         elif j == 2:
             classtype = "小班"
         elif j==3:
             classtype = "中班"
         elif j==4:
             classtype = "大班"
     elif i ==2:
         sdept = "小学部"
         if j == 1:
             classtype = "一年级"
         elif j == 2:
             classtype = "二年级"
         elif j==3:
             classtype = "三年级"
         elif j==4:
             classtype = "四年级"
         elif j==5:
             classtype = "五年级"
         elif j==6:
             classtype = "六年级"
         elif j==7:
             classtype = "小学组"
     elif i==3:
         sdept = "初中部"
         if j == 1:
             classtype = "初一"
         elif j == 2:
             classtype = "初二"
         elif j==3:
             classtype = "中考"
         elif j==4:
             classtype = "初中组"
     else:
         sdept = "高中部"
         if j == 1:
             classtype = "高一"
         elif j == 2:
             classtype = "高二"
         elif j==3:
             classtype = "高考"
         elif j==4:
             classtype = "高中组"
     return (sdept,classtype)

#写入csv
#获取url中的表并写入文件
def write_csv(i , school):
        writeschool=pds.DataFrame([[i,school]])
        writeschool.to_csv('C:/Users/Administrator/Desktop/一批文分数线.csv', sep=',', mode='a',index = False,header = False)  

#主函数
def main():
    url = 'https://www.speiyou.com/shanxi_xian/list'
    enter(url, '//*[@id="test"]/div/ul/li[1]/a')            #进入网站并获取节点
    click_element('//div[@class="modal_btn"]')       #点击"我知道了"

    #多个年级,班级个数
    for  i in range(1,5):
          if i == 2:
               jj = 8
          else:
               jj = 5
          for j in range(1,jj):
            (sdept,grade)=sdept_grade(i,j)
            #点击年级下拉键,点选A年级
            click_element('//*[@id="__layout"]/div/header/div[3]/div/span/div[2]/span')  #点击年级下拉
            click_element('//div[@class="grade_container"]//li['+str(i)+']/div/span['+str(j)+']')
            time.sleep(3)
            #获取A年级的总条数m,总页数n
            (m,n)=get_each_class('//span[@class="el-pagination__total"]','//ul[@class="el-pager"]//li')
            print(m, n)
            
            #进入第1到n页
            for page in range(1,int(n)+1):
                print(page)
                #点击下一页
                click_element('//*[@id="__layout"]/div/div/section/div[3]/div/button[2]/i')
                #获取每页的班级数量
                classcnt = get_ele_cnt('//*[@id="__layout"]/div/div/section/div[2]/div[@class="card_list"]/div')
                #获取每个班级课程信息,部门,年级,班级名称,授课类型,上课日期,上课时间,上课地点,主讲教师,辅导教师,班级状态,价格
                for k in range(1,classcnt+1):
                     (classname,teaching_mode,dtbegindate,dtdate,address,teacher_main,teacher_vice,teacher_tag,class_price)=get_class_detail('//*[@id="__layout"]/div/div/section/div[2]/div[@class="card_list"]/div['+str(k)+']')
                     s_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
                     #写入Excel或数据库中
                     writeschool=pds.DataFrame([[s_time,sdept,grade,m,classname,teaching_mode,dtbegindate,dtdate,address,teacher_main,teacher_vice,teacher_tag,class_price]])
                     writeschool.to_csv('C:/Users/Administrator/Desktop/学而思finnal.csv', sep=',', mode='a',index = False,header = False,encoding='utf_8_sig')
    browser.close()    #关闭浏览器


    
#调用主函数
if __name__ ==  "__main__":
     main()


Published 55 original articles · won praise 17 · views 10000 +

Guess you like

Origin blog.csdn.net/weixin_43213658/article/details/104536591
Recommended