"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/8/21'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/8/21'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from urllib import parse
from urllib import request
from lxml import etree
import csv
import codecs
def ba_spider(ba_name=None, begin_page=None, end_page=None):
    """Scrape Baidu Tieba thread listings and append rows to a CSV file.

    For each results page of the given forum (50 threads per page), extracts
    each thread's title, author, reply count and URL, and appends them to
    ``data/tieba_<ba_name>.csv``.

    Args:
        ba_name: Forum name. Prompts interactively when None (original behavior).
        begin_page: First page number (1-based). Prompts when None.
        end_page: Last page number, inclusive. Prompts when None.
    """
    url = 'https://tieba.baidu.com/f?'
    headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    if ba_name is None:
        ba_name = input('请输入贴吧的名字:')
    if begin_page is None:
        begin_page = int(input('起始页码:'))
    if end_page is None:
        end_page = int(input('终止页码:'))
    word = {'kw': ba_name}
    base_url = "https://tieba.baidu.com"

    # Ensure the output directory exists instead of failing on open().
    import os
    os.makedirs('data', exist_ok=True)

    # Open the CSV once for the whole run (the original re-opened it for every
    # row).  newline='' is required by the csv module to avoid mangled rows.
    with open('data/tieba_' + ba_name + '.csv', 'a', encoding='utf-8', newline='') as file:
        wr = csv.writer(file)
        for page in range(begin_page, end_page + 1):
            word['pn'] = (page - 1) * 50  # Tieba paginates in steps of 50 threads
            req = request.Request(url + parse.urlencode(word), headers=headers)
            with request.urlopen(req) as response:  # close the connection deterministically
                html = response.read().decode()
            temp = etree.HTML(html)
            links = temp.xpath("//li[contains(@class,'j_thread_list clearfix')]")
            print(len(links))
            for link in links:
                # Reply count, title and relative thread URL.  Guard the [0]
                # indexing: ad/malformed <li> entries yield empty xpath results
                # and would otherwise raise IndexError.
                pv = link.xpath('./div/div[1]/span[@class="threadlist_rep_num center_text"]/text()')
                title = link.xpath('./div/div[2]/div[1]/div[1]/a/text()')
                href = link.xpath('./div/div[2]/div[1]/div[1]/a/@href')
                if not (pv and title and href):
                    continue  # skip entries that don't match the thread layout
                tiezi_url = base_url + href[0]
                # Author lives in span[1] normally, span[2] for some layouts;
                # fall back to empty string if neither matches.
                author = link.xpath('./div/div[2]/div[1]/div[2]/span[1]/span[1]/a/text()')
                if not author:
                    author = link.xpath('./div/div[2]/div[1]/div[2]/span[1]/span[2]/a/text()')
                author = author[0] if author else ''
                print('author:', author)
                wr.writerow([title[0], author, pv[0], tiezi_url])
                print('Success')
ba_spider()
# Non-code residue from the CSDN blog page this script was copied from
# (left as comments so the file remains valid Python):
# 爬虫10-百度贴吧
# 猜你喜欢
# 转载自blog.csdn.net/qwerLoL123456/article/details/83515109
# 今日推荐
# 周排行