Python爬虫 - 爬取公交线路

import requests
from lxml import etree

# Accumulates one dict per bus route; appended to by parse_details and
# written to disk by main.
items = []
# NOTE(review): `time` is not referenced anywhere in the visible code.
import time
# Request headers (a desktop Firefox User-Agent) passed to every requests.get call below.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
def main():
    """Crawl the site and dump every collected route to '天津公交.txt'.

    Fetches the navigation links from the home page, walks them through
    parse_second (which fills the module-level ``items`` list), and then
    writes one ``str(item)`` per line to the output file.
    """
    # Collect all navigation links from the first page
    nav_links = parse_first()
    # Crawl the second-level pages (and, transitively, each route page)
    parse_second(nav_links)
    # Use a context manager so the file is closed even if a write fails
    with open('天津公交.txt', 'w', encoding='utf8') as fp:
        for item in items:
            fp.write(str(item) + '\n')


def parse_first():
    """Fetch the home page and return its navigation link hrefs.

    Returns:
        A list of href strings: the links found under the ``bus_kt_r1``
        div followed by those under the ``bus_kt_r2`` div.
    """
    url = 'https://tianjin.8684.cn/'
    # A timeout keeps a stalled connection from hanging the script forever
    r = requests.get(url, headers=header, timeout=10)
    # Parse the page and pull out all navigation links
    tree = etree.HTML(r.text)
    # Links for routes that start with a digit
    number_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    # Links for routes that start with a letter
    char_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')

    return number_list + char_list

def parse_second(list):
    """Request each navigation page and hand its HTML to parse_third.

    Args:
        list: iterable of href strings that are appended to the site root
            to form each URL. (The name shadows the builtin; it is kept
            unchanged so existing callers are unaffected.)
    """
    # Walk the links one by one, fetching each listing page
    for href in list:
        url = 'https://tianjin.8684.cn' + href
        # A timeout keeps a stalled connection from hanging the crawl forever
        r = requests.get(url, headers=header, timeout=10)
        # Parse the page to reach the URL of every individual route
        parse_third(r.text)

def parse_third(r_text):
    """Extract every route link from a listing page and crawl each one.

    Args:
        r_text: HTML text of a listing page; links are read from the
            anchors under the ``stie_list`` div.
    """
    tree = etree.HTML(r_text)
    route = tree.xpath('//div[@class="stie_list"]/a/@href')
    # Visit each route page in turn
    for href in route:
        url = 'https://tianjin.8684.cn' + href
        # A timeout keeps a stalled connection from hanging the crawl forever
        r2 = requests.get(url, headers=header, timeout=10)
        # Parse the page to collect the details of this route
        parse_details(r2.text)

def _first_text(tree, path, default=''):
    """Return the first result of XPath *path* on *tree*, or *default* if none."""
    found = tree.xpath(path)
    return found[0] if found else default


def parse_details(r_text):
    """Parse one route page and append its details to the module-level ``items``.

    Missing fields are stored as empty strings instead of raising
    IndexError, so a single incomplete page does not abort the whole crawl.

    Args:
        r_text: HTML text of a single route's detail page.
    """
    tree = etree.HTML(r_text)
    # NOTE(review): etree.HTML appears to return None for an empty document;
    # skip such pages rather than fail on tree.xpath — confirm against lxml docs.
    if tree is None:
        return
    bus_name = _first_text(tree, '//div[@class="bus_i_t1"]/h1/text()')
    work_time = _first_text(tree, '//div[@class="bus_i_content"]/p[1]/text()')
    Fare_information = _first_text(tree, '//div[@class="bus_i_content"]/p[2]/text()')
    update_time = _first_text(tree, '//div[@class="bus_i_content"]/p[4]/text()')
    # The trailing space in "bus_line_site " is part of the class value being matched
    bus_stop = tree.xpath('//div[@class="bus_line_site "][1]/div[@class="bus_site_layer"]/div/a/text()')
    # Print the number of stops found for this route
    print(len(bus_stop))
    item = {
        '公交车名': bus_name,
        '工作时间': work_time,
        '售票信息': Fare_information,
        '更新时间': update_time,
        '公交站点': bus_stop,
    }
    items.append(item)

# Run the crawl only when this file is executed as a script, not when imported.
if __name__ =='__main__':
    main()
    
发布了51 篇原创文章 · 获赞 29 · 访问量 2377

猜你喜欢

转载自blog.csdn.net/fangweijiex/article/details/103788270
今日推荐