# 爬虫10-百度贴吧 (Crawler 10: Baidu Tieba)

"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/8/21'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神兽保佑    ┣┓
                ┃　永无BUG！   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""


"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/8/21'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神兽保佑    ┣┓
                ┃　永无BUG！   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
import codecs
import csv
import os
from urllib import parse
from urllib import request

from lxml import etree


def _first(nodes, default=''):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def _parse_thread(item, base_url):
    """Build the CSV row for one thread-list ``<li>`` element.

    Args:
        item: lxml element for a single thread entry.
        base_url: site root that the thread's href is resolved against.

    Returns:
        ``[title, author, reply_count, thread_url]``, or ``None`` when the
        entry has no title link (there is nothing useful to record).
        A missing author or reply count is stored as an empty string.
    """
    title_link = './div/div[2]/div[1]/div[1]/a'
    title = _first(item.xpath(title_link + '/text()'), None)
    href = _first(item.xpath(title_link + '/@href'), None)
    if title is None or href is None:
        return None

    # Reply count.
    pv = _first(item.xpath(
        './div/div[1]/span[@class="threadlist_rep_num center_text"]/text()'))

    # The author link sits in either the first or the second inner span;
    # try the first and fall back to the second.
    author_box = './div/div[2]/div[1]/div[2]/span[1]'
    author = _first(
        item.xpath(author_box + '/span[1]/a/text()')
        or item.xpath(author_box + '/span[2]/a/text()')
    )

    # urljoin copes with absolute hrefs and hrefs lacking a leading slash,
    # where plain concatenation would produce a broken URL.
    teizi_url = parse.urljoin(base_url, href)
    return [title, author, pv, teizi_url]


def ba_spider():
    """Scrape thread listings of a Baidu Tieba forum into a CSV file.

    Prompts on stdin for the forum name and an inclusive page range, fetches
    every listing page in that range, and appends one row per thread --
    ``[title, author, reply count, thread URL]`` -- to
    ``data/tieba_<forum name>.csv`` (UTF-8). The ``data`` directory is
    created if it does not exist. The number of entries found on each page
    and each author name are printed as progress output.

    Raises:
        ValueError: if a page number entered is not an integer.
        urllib.error.URLError: if a listing page cannot be fetched.
    """
    url = 'https://tieba.baidu.com/f?'
    base_url = "https://tieba.baidu.com"
    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    }
    ba_name = input('请输入贴吧的名字：')
    begin_page = int(input('起始页码:'))
    end_page = int(input('终止页码:'))

    csv_path = 'data/tieba_' + ba_name + '.csv'
    # Make sure the output directory exists before the first write.
    os.makedirs('data', exist_ok=True)

    for page in range(begin_page, end_page + 1):
        # NOTE(review): 'pn' looks like a thread offset at 50 threads per
        # page -- confirm against the site's paging behaviour.
        query = parse.urlencode({'kw': ba_name, 'pn': (page - 1) * 50})
        req = request.Request(url + query, headers=headers)
        # Close the connection as soon as the body has been read.
        with request.urlopen(req) as response:
            # A stray undecodable byte should not abort the whole crawl.
            html = response.read().decode('utf-8', errors='replace')

        links = etree.HTML(html).xpath(
            "//li[contains(@class,'j_thread_list clearfix')]")
        print(len(links))

        rows = []
        for link in links:
            row = _parse_thread(link, base_url)
            if row is None:
                # Entry without a title link; skip it instead of crashing.
                continue
            print('author:', row[1])
            rows.append(row)

        # Write the page's rows in one go rather than reopening the file
        # for every single thread. newline='' is what the csv module
        # expects so that it controls line endings itself.
        if rows:
            with open(csv_path, 'a', encoding='utf-8', newline='') as file:
                csv.writer(file).writerows(rows)

    print('Success')


# Script entry point: start the interactive crawl as soon as this file runs.
# NOTE(review): this also fires on import; consider guarding it with
# `if __name__ == "__main__":` if the module is ever imported elsewhere.
ba_spider()
# 猜你喜欢 (web-page residue: "You may also like")