# coding:utf-8
from selenium import webdriver
from sqlalchemy import create_engine
import pandas as pd
import lxml.html, queue, logging
import time, json, datetime
import requests
import copy
# browser = webdriver.Firefox()
# headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0"}
# Selenium-driven Firefox instance used for every page fetch below.
driver = webdriver.Firefox()
# Accumulator for all scraped post/reply records; flushed to MySQL at the end.
tieba = []
# Current timestamp (string), captured once at import time.
TIME = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Cut-off timestamp: only replies newer than this are kept.
# Originally "one week ago" (see commented line); currently pinned far in the
# past, so effectively every reply passes the filter.
# LAST_WEEK = datetime.datetime.strptime((datetime.datetime.now()-datetime.timedelta(hours=168)).strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
LAST_WEEK = datetime.datetime.strptime('1990-01-01 00:00', '%Y-%m-%d %H:%M')
# Baidu AI open-platform credentials (AK/SK) for the sentiment API —
# intentionally blank here; fill in before running.
App_Key = ''
Secret_Key = ''
logfile = "result.log"
# Append-mode debug logging to result.log.
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s line:%(lineno)d %(levelname)s : %(message)s',
datefmt=' %Y-%m-%d %H:%M:%S',
filename='result.log',
filemode='a+')
# Wait for the page to load, then step-scroll so lazy content renders.
def scroll_page(driver):
    """Wait for the page, then scroll down in 400px increments.

    Jumps back to the top first, then performs 40 scroll steps with a
    half-second pause between each so dynamically loaded elements appear.

    :param driver: a Selenium WebDriver with a page already loaded.
    """
    time.sleep(10)  # generous initial load wait
    driver.execute_script('var q=document.documentElement.scrollTop=0')
    steps_done = 0
    while steps_done < 40:
        driver.execute_script('window.scrollBy(0,400)')
        time.sleep(0.5)
        steps_done += 1
# Fetch an OAuth access token for the Baidu NLP (sentiment) API.
def getToken():
    """Request an access token from the Baidu AI open platform.

    Uses the module-level App_Key (client_id/AK) and Secret_Key
    (client_secret/SK).

    :return: the access token string on success, or '' on any failure
             (non-200 response, network error, or malformed payload).
    """
    host = 'https://aip.baidubce.com/oauth/2.0/token'
    params = {
        'grant_type': 'client_credentials',
        'client_id': App_Key,
        'client_secret': Secret_Key,
    }
    try:
        # timeout so a network stall cannot hang the whole crawler
        response = requests.get(host, params=params, timeout=10)
        if response.status_code == 200:  # success
            return response.json().get('access_token', '')
    except (requests.RequestException, ValueError) as e:
        # network failure or non-JSON body: log and fall through to ''
        logging.info(e)
    return ''
# Token fetched once at import time and reused for all sentiment calls.
AccessToken = getToken()
# Sentiment recognition
def getEmotion(inputText, access_token):
    """Classify *inputText* with Baidu's sentiment_classify endpoint.

    :param inputText: the text to classify.
    :param access_token: OAuth token obtained via getToken().
    :return: '正面' / '中性' / '负面' (positive / neutral / negative), or
             None when the request fails or the response has no items
             (matching the original implicit-None behaviour).
    """
    url = 'https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify?access_token=' + access_token
    # url = 'https://aip.baidubce.com/rpc/2.0/nlp/v2/comment_tag?access_token=' + access_token
    # fixed: header key previously had a trailing space ('Content-Type ')
    header = {
        'Content-Type': 'application/json'}
    body = {
        'text': inputText}
    requests.packages.urllib3.disable_warnings()
    # timeout so a stalled API call cannot hang the crawl loop
    res = requests.post(url=url, data=json.dumps(body), headers=header,
                        verify=False, timeout=10)
    if res.status_code == 200:
        info = json.loads(res.text)
        print(info)  # echo the raw API response to aid debugging
        if 'items' in info and len(info['items']) > 0:
            # API encodes sentiment as 0=negative, 1=neutral, 2=positive.
            sentiment = info['items'][0]['sentiment']
            labels = {2: '正面', 1: '中性'}
            return labels.get(sentiment, '负面')
    return None
# Harvest every top-level reply on one thread page.
def get_all_reply(page_info):
    """Parse one thread page and append qualifying reply records to `tieba`.

    :param page_info: lxml HTML tree of a fully rendered thread page.

    Relies on module-level globals set by the main loop: tieba_name,
    subject, LAST_WEEK (time cut-off), AccessToken, and the `tieba`
    result list.
    """
    parser = page_info
    # Posts appear under one of two class spellings depending on page variant.
    reply_lt = parser.xpath('//div[starts-with(@class,"l_post j_l_post l_post_bright ")]')
    if reply_lt:
        reply_list = reply_lt
    else:
        reply_list = parser.xpath('//div[starts-with(@class,"l_post l_post_bright j_l_post clearfix ")]')
    for element in reply_list:
        data = {
        }
        data['tieba_name'] = tieba_name
        data['title'] = subject
        # Author name may be split across several text nodes; join them.
        name = element.xpath('.//li[@class="d_name"]/a[starts-with(@class,"p_author_name")]/text()')
        if len(name) > 1:
            lt_name = []
            for i in range(len(name)):
                lt_name.append(name[i])
            str_name = ''.join(lt_name)
        else:
            str_name = name[0]
        # Reply body text: try the common class first, then two fallbacks.
        try:
            content = ''
            content1 = element.xpath('.//div[@class="d_post_content j_d_post_content clearfix"]/text()')
            if len(content1) > 1:
                for i in range(len(content1)):
                    content += content1[i]
            else:
                content = content1[0]
        except:
            try:
                content = element.xpath('.//div[starts-with(class, "d_post_content j_d_post_content")]/text()')[
                    0].strip()
            except:
                content = element.xpath(
                    './/div[@class="d_post_content j_d_post_content d_post_content_bold clearfix"]/text()')[
                    0].strip()
        # Content sentiment (deferred until we know if sub-replies exist)
        # data['content_type'] = getEmotion(content, AccessToken)
        # Post time: prefer the JSON data-field attribute, else the tail span.
        try:
            reply_time = json.loads(element.xpath('@data-field')[0])['content']['date']
        except:
            reply_time = element.xpath('.//div[@class="post-tail-wrap"]/span[4]/text()')[0]
        data['author_name'] = str_name
        data['content'] = content
        data['public_time'] = reply_time
        # Second-level (floor) replies, if any.
        lt_reply2 = element.xpath(
            './div[@class="d_post_content_main"]//div[@class="j_lzl_c_b_a core_reply_content"]/ul/li')
        if lt_reply2:
            data['content_type'] = getEmotion(content, AccessToken)
            time.sleep(3)  # throttle sentiment-API calls
            reply2(data, lt_reply2)
        else:
            # No sub-replies: keep this post only if newer than the cut-off.
            pb_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d %H:%M')
            if pb_time > LAST_WEEK:
                # Content sentiment
                data['content_type'] = getEmotion(content, AccessToken)
                tieba.append(data)
            else:
                continue
# Scrape the second-level (floor) replies of one post.
def reply2(data, lt_reply2):
    """For each sub-reply, emit a copy of *data* extended with reply fields.

    :param data: dict describing the parent post (deep-copied per sub-reply).
    :param lt_reply2: list of lxml <li> elements, one per sub-reply.

    Appends qualifying records to the module-level `tieba` list; records
    older than LAST_WEEK (by both timestamps) are skipped.
    """
    for reply_detail2 in lt_reply2:
        # Deep copy so each sub-reply record keeps its own parent fields.
        cp_data = copy.deepcopy(data)
        reply_content = reply_detail2.xpath('.//span[@class="lzl_content_main"]//text()')
        if len(reply_content) > 1:
            # Text split across nodes: concatenate in order.
            reply_msg = reply_content[0]
            for i in range(1, len(reply_content)):
                reply_msg += reply_content[i]
            cp_data['reply_content'] = reply_msg
            # cp_data['reply_type'] = getEmotion(reply_msg, AccessToken)
            time.sleep(3)
        else:
            try:
                cp_data['reply_content'] = reply_content[0]
                # cp_data['reply_type'] = getEmotion(reply_content[0], AccessToken)
                time.sleep(3)
            except:
                pass
        # Sub-reply author
        reply_name = reply_detail2.xpath('./div/a[@class="at j_user_card "]/text()')
        if reply_name:
            cp_data['reply_name'] = reply_name[0]
        # else:
        #     cp_data['reply_name'] = ''
        # Sub-reply timestamp (whitespace-normalised)
        reply2_time = reply_detail2.xpath('.//span[@class="lzl_time"]/text()')
        if reply2_time:
            reply_time1 = ' '.join(reply2_time[0].split())
            cp_data['reply_time'] = reply_time1
        # else:
        #     cp_data['reply_time'] = ''
        # Keep only records newer than the cut-off by either timestamp.
        # NOTE(review): a missing 'reply_time'/'reply_content' key raises a
        # KeyError here, which the broad except below logs and swallows.
        try:
            r_time = datetime.datetime.strptime(cp_data['reply_time'], '%Y-%m-%d %H:%M')
            p_time = datetime.datetime.strptime(cp_data['public_time'], '%Y-%m-%d %H:%M')
            if r_time > LAST_WEEK or p_time > LAST_WEEK:
                # content = cp_data['content']
                reply2_content = cp_data['reply_content']
                # cp_data['content_type'] = getEmotion(content, AccessToken)
                cp_data['reply_type'] = getEmotion(reply2_content, AccessToken)
                tieba.append(cp_data)
            else:
                continue
        except Exception as e:
            logging.info(e)
class DownloadItem(object):
    """
    A queued download task: a URL plus a flag saying how to parse it.
    """
    def __init__(self, url_str, type):
        """
        :param url_str: URL to download
        :param type: 0 = search-result (list) page, 1 = thread detail page
        """
        self.url = url_str
        self.type = type

    def __repr__(self):
        # Debug-friendly representation (e.g. when printing queue items).
        return 'DownloadItem(url={!r}, type={!r})'.format(self.url, self.type)
# Build the crawl queue and seed it with the first search-results page.
download_queue = queue.Queue()
download_item = DownloadItem(
    "百度贴吧根据关键词全爬取全贴吧相关信息(此处放置搜索后第一页的结果,下面元素获取是原搜索信息不方便透露)例如:https://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=lol&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn=1",
    0)
download_queue.put(download_item)
# Drain the queue: list pages (type 0) enqueue detail pages (type 1),
# detail pages are scraped into `tieba`.
while not download_queue.empty():
    item = download_queue.get()
    driver.get(item.url)
    time.sleep(3)
    parser = lxml.html.fromstring(driver.page_source)
    if item.type == 0:
        # Search-results page: read the pager to learn the total page count.
        pages = parser.xpath('.//div[@class="pager pager-search"]/a[@class="last"]/@href')
        page_num = pages[0].split('=')[-1]
        print(page_num)
        # for i in range(76, int(page_num) + 1):
        for i in range(1, 2):
            url = 'https://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=lol&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn={}'.format(
                1 + i)
            result = requests.get(url)
            # time.sleep(2)
            parser = lxml.html.fromstring(result.text)
            # Collect thread-detail links and enqueue them as type-1 items.
            detail_url = parser.xpath('//span[@class="p_title"]/a/@href')
            for url in detail_url:
                real_url = 'https://tieba.baidu.com/' + url
                detail_item = DownloadItem(real_url, 1)
                download_queue.put(detail_item)
            # NOTE(review): indentation was lost in this paste; this break is
            # assumed to stop paging after the first iteration — confirm.
            break
        # Parse next-page address (disabled)
        # next_item = DownloadItem('https:' + parser.xpath('//a[@class="next pagination-item "]/@href')[0], 0)
        # print(next_item)
        # download_queue.put(next_item)
    else:
        # Thread detail page: scroll to force lazy content, then scrape.
        scroll_page(driver)
        parser = lxml.html.fromstring(driver.page_source)
        tieba_name = driver.find_element_by_xpath('//div[@class="card_title "]/a').text
        # Thread title: usually in <h1 title=...>, fall back to <h3 title=...>.
        try:
            subject = parser.xpath('//h1/@title')[0]
        except:
            # subject = driver.find_element_by_xpath('//h3[@class="core_title_txt pull-left text-overflow ")').text
            subject = parser.xpath('//h3/@title')[0]
        # Multi-page threads: build one URL per reply page.
        get_page = parser.xpath('//li[@class="l_reply_num"]/span[2]/text()')[0]
        reply_urls = []
        if int(get_page) > 1:
            reply_url = \
                parser.xpath('//ul[@class="l_posts_num"]/li[@class="l_pager pager_theme_4 pb_list_pager"]/a[1]/@href')[
                    0].split(
                    "=")[0]
            for i in range(1, int(get_page) + 1):
                reply_urls.append('https://tieba.baidu.com' + reply_url + '=' + str(i))
        if reply_urls:
            # Visit each reply page and harvest its posts.
            for item in reply_urls:
                driver.get(item)
                scroll_page(driver)
                page_info = lxml.html.fromstring(driver.page_source)
                get_all_reply(page_info)
        else:
            # Single-page thread: parse the page we already have.
            page_info = parser
            get_all_reply(page_info)
# Flush everything collected into MySQL.
conndb = create_engine('mysql+pymysql://root:123456@localhost:3306/hj_bi')
for item in tieba:
    item['public_media'] = '贴吧'
    item['project_name'] = 'craw'
insert_data = pd.DataFrame(tieba)
try:
    pd.io.sql.to_sql(insert_data, 'spider_copy2', conndb, if_exists='append', index=False)
except Exception as e:
    logging.info(e)
# Scrape residue from the source article (was raw, non-Python text that
# broke parsing). Original text, translated: "Automatic crawler saving
# crawled text/audio to a database, with positive/negative recognition".
# Reposted from: blog.csdn.net/weixin_46046193/article/details/108407634