小程序1:通过selenium获取博客园首页最新发布的文章标题(按页抓取)

思路:用selenium打开页面,通过xpath获取当前页的文章标题,然后点击下一页,如此循环,直到抓取完指定的页数

 1 from selenium import webdriver
 2 from time import sleep
 3 from copy import copy
 4 
 5 
 6 
 7 def get_pro_titles(page):
 8     """获取博客园最新发布的文章标题"""
 9     all_title = dict()
10     option = webdriver.ChromeOptions()
11     option.add_argument('--headless')
12     option.add_argument("--disable-gpu")
13     option.add_argument("--window-size=1280,800")
14     d = webdriver.Chrome(options=option)
15     d.get('https://www.cnblogs.com')
16     for i in range(1, int(page)+1):
17         one_page_title = []
18         for j in range(1, 21):
19             sleep(2)
20             title = d.find_element_by_xpath('//*[@id="post_list"]/div[{}]/div[2]/h3/a'.format(j)).text
21             # print(title)
22             one_page_title.append(title)
23         p = copy(one_page_title)
24         all_title['第{}页'.format(i)] = p
25         sleep(2)
26         js = 'document.documentElement.scrollTop=10000;'
27         d.execute_script(js)
28         sleep(2)
29         d.find_element_by_xpath('//div[@id="pager_bottom"]/div/div/a[text()="Next >"]').click()
30         one_page_title.clear()
31     sleep(2)
32     d.quit()
33     return all_title
34 
35 
# Demo run: read the first two listing pages and print the resulting dict.
first_two_pages = get_pro_titles(2)
print(first_two_pages)

猜你喜欢

转载自www.cnblogs.com/qingy/p/11800800.html
今日推荐