# Scrape Zhihu column article titles and URLs with Python + Selenium

# -*- coding:utf-8 -*-
from selenium import webdriver
import time

# Requires the `selenium` package and the Chrome browser driver
# (chromedriver) available on PATH.
driver = webdriver.Chrome()
driver.get('https://zhuanlan.zhihu.com/buzhisuowei')  # column URL

# Scroll to the bottom repeatedly so the page lazy-loads every article.
SCROLL_ROUNDS = 10  # increase this when the column has many articles
for _ in range(SCROLL_ROUNDS):
    driver.execute_script("window.scrollBy(0,5000)")
    time.sleep(2)  # give the newly revealed content time to load

# Find every article title element by its CSS class name.
titles = driver.find_elements_by_class_name('PostListItem-title')
print(len(titles))


# XPath template for the anchor of the j-th article list item
# (1-based, per XPath convention).
xpath_1 = '//*[@id="react-root"]/div/div/div[3]/div[2]/div[2]/ul/li['
xpath_2 = ']/div/div/a'
# Iterate over however many articles were actually loaded instead of a
# hard-coded count, so this loop can never desync from `titles`.
for j in range(1, len(titles) + 1):

    # Locate the j-th article's link and read its href property.
    xpath = xpath_1 + str(j) + xpath_2
    url = driver.find_element_by_xpath(xpath).get_property('href')

    # print() as a function — the original Python 2 `print url + ...`
    # statement is a SyntaxError on Python 3 and inconsistent with the
    # parenthesized print above.
    print(url + titles[j - 1].text)

# (blog boilerplate: 猜你喜欢 / "you may also like")

# Adapted from blog.csdn.net/qq_32862515/article/details/78867786