# -*- coding: utf-8 -*-
"""Scrape article titles and URLs from a Zhihu column.

Requires the ``selenium`` package and a matching Chrome WebDriver on PATH.
Ported to Python 3 and the Selenium 4 locator API (the legacy
``find_element_by_*`` helpers were removed in Selenium 4).
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

COLUMN_URL = 'https://zhuanlan.zhihu.com/buzhisuowei'  # column address
SCROLL_ROUNDS = 10   # increase when the column has many articles
SCROLL_STEP = 5000   # pixels scrolled per round
SCROLL_PAUSE = 2     # seconds to let lazy-loaded content appear

# XPath of the anchor inside the i-th article list item.
XPATH_TEMPLATE = ('//*[@id="react-root"]/div/div/div[3]/div[2]/div[2]'
                  '/ul/li[{}]/div/div/a')


def main():
    """Print the number of articles, then one ``url + title`` line each."""
    driver = webdriver.Chrome()
    try:
        driver.get(COLUMN_URL)

        # Scroll to the bottom repeatedly so the page lazy-loads every article.
        for _ in range(SCROLL_ROUNDS):
            driver.execute_script("window.scrollBy(0,{})".format(SCROLL_STEP))
            time.sleep(SCROLL_PAUSE)

        # Every article title carries this class name.
        titles = driver.find_elements(By.CLASS_NAME, 'PostListItem-title')
        print(len(titles))

        # Walk the list items in step with the titles actually found instead
        # of a hard-coded article count, so a shorter column cannot raise
        # IndexError into `titles`.
        for index, title in enumerate(titles, start=1):
            link = driver.find_element(By.XPATH, XPATH_TEMPLATE.format(index))
            url = link.get_property('href')
            print(url + title.text)
    finally:
        driver.quit()  # always release the browser process, even on errors


if __name__ == '__main__':
    main()
Python爬取知乎专栏文章标题及URL
猜你喜欢
转载自blog.csdn.net/qq_32862515/article/details/78867786
今日推荐
周排行