Automated crawler: scrape text, store it in a database, and recognize positive and negative sentiment
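
This script uses Selenium to crawl Baidu Tieba search results and thread pages, runs each post and nested reply through Baidu's sentiment classification API, and writes the labelled records to a MySQL table with pandas and SQLAlchemy.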

# coding:utf-8
from selenium import webdriver
from sqlalchemy import create_engine
import pandas as pd
import lxml.html, queue, logging
import time, json, datetime
import requests
import copy

# browser = webdriver.Firefox()

# headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0"}
driver = webdriver.Firefox()

tieba = []

# Current point in time
TIME = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# The same point in time one week ago
# LAST_WEEK = datetime.datetime.strptime((datetime.datetime.now()-datetime.timedelta(hours=168)).strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
LAST_WEEK = datetime.datetime.strptime('1990-01-01 00:00', '%Y-%m-%d %H:%M')
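# LAST_WEEK is pinned far in the past so that every reply passes the time filter;
# swap in the commented-out line above to keep only the past week's posts.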

App_Key = ''
Secret_Key = ''

logfile = "result.log"
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  line:%(lineno)d  %(levelname)s : %(message)s',
                    datefmt=' %Y-%m-%d %H:%M:%S',
                    filename=logfile,
                    filemode='a+')


# Wait for the page to load, then scroll down so all page data is rendered
def scroll_page(driver):
    time.sleep(10)
    driver.execute_script('var q=document.documentElement.scrollTop=0')
    for i in range(40):
        js = 'window.scrollBy(0,400)'
        driver.execute_script(js)
        time.sleep(0.5)


# Get the access token used to call the sentiment recognition API
def getToken():
    # client_id is the API Key (AK) and client_secret is the Secret Key (SK) obtained from the Baidu console
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=' + App_Key + '&client_secret=' + Secret_Key
    response = requests.get(host)

    if response.status_code == 200:  # request succeeded
        info = json.loads(response.text)  # parse the JSON string into a dict
        access_token = info['access_token']  # extract the access_token
        return access_token
    return ''


AccessToken = getToken()


# Sentiment recognition
def getEmotion(inputText, access_token):
    url = 'https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify?access_token=' + access_token
    # url = 'https://aip.baidubce.com/rpc/2.0/nlp/v2/comment_tag?access_token=' + access_token
    header = {'Content-Type': 'application/json'}
    body = {'text': inputText}
    requests.packages.urllib3.disable_warnings()
    res = requests.post(url=url, data=json.dumps(body), headers=header, verify=False)
    if res.status_code == 200:
        info = json.loads(res.text)
        print(info)  # print the API response; useful for inspecting errors, or they can be ignored and execution continued
        if 'items' in info and len(info['items']) > 0:
            sentiment = info['items'][0]['sentiment']
            if sentiment == 2:
                return '正面'  # positive
            elif sentiment == 1:
                return '中性'  # neutral
            else:
                return '负面'  # negative


# Collect all reply information from a thread page
def get_all_reply(page_info):
    parser = page_info
    # Tieba uses two variants of the post-container class name, so try both XPaths
    reply_list = parser.xpath('//div[starts-with(@class,"l_post j_l_post l_post_bright  ")]')
    if not reply_list:
        reply_list = parser.xpath('//div[starts-with(@class,"l_post l_post_bright j_l_post clearfix  ")]')

    for element in reply_list:
        data = {}
        data['tieba_name'] = tieba_name
        data['title'] = subject
        name = element.xpath('.//li[@class="d_name"]/a[starts-with(@class,"p_author_name")]/text()')
        if len(name) > 1:
            str_name = ''.join(name)
        else:
            str_name = name[0]
        # Reply content
        try:
            content = ''
            content1 = element.xpath('.//div[@class="d_post_content j_d_post_content  clearfix"]/text()')
            if len(content1) > 1:
                content = ''.join(content1)
            else:
                content = content1[0]
        except:
            try:
                content = element.xpath('.//div[starts-with(@class, "d_post_content j_d_post_content")]/text()')[
                    0].strip()
            except:
                content = element.xpath(
                    './/div[@class="d_post_content j_d_post_content d_post_content_bold clearfix"]/text()')[
                    0].strip()
            # Content sentiment type
            # data['content_type'] = getEmotion(content, AccessToken)
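        # The post's data-field attribute carries JSON metadata, including the publish
        # date; fall back to the visible timestamp in post-tail-wrap if that fails.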
        try:
            reply_time = json.loads(element.xpath('@data-field')[0])['content']['date']
        except:
            reply_time = element.xpath('.//div[@class="post-tail-wrap"]/span[4]/text()')[0]
        data['author_name'] = str_name
        data['content'] = content
        data['public_time'] = reply_time

        # Second-level (nested) replies
        lt_reply2 = element.xpath(
            './div[@class="d_post_content_main"]//div[@class="j_lzl_c_b_a core_reply_content"]/ul/li')
        if lt_reply2:
            data['content_type'] = getEmotion(content, AccessToken)
            time.sleep(3)
            reply2(data, lt_reply2)
        else:
            pb_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d %H:%M')
            if pb_time > LAST_WEEK:
                # Content sentiment type
                data['content_type'] = getEmotion(content, AccessToken)
                tieba.append(data)
            else:
                continue


# Second-level reply details
def reply2(data, lt_reply2):
    for reply_detail2 in lt_reply2:
        # Second-level reply content
        cp_data = copy.deepcopy(data)
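        # cp_data is a deep copy of the parent record, so each nested reply becomes
        # its own row without mutating the original post's fields.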
        reply_content = reply_detail2.xpath('.//span[@class="lzl_content_main"]//text()')
        if len(reply_content) > 1:
            reply_msg = ''.join(reply_content)
            cp_data['reply_content'] = reply_msg
            # cp_data['reply_type'] = getEmotion(reply_msg, AccessToken)
            time.sleep(3)
        else:
            try:
                cp_data['reply_content'] = reply_content[0]
                # cp_data['reply_type'] = getEmotion(reply_content[0], AccessToken)
                time.sleep(3)
            except:
                pass
        # Second-level reply author
        reply_name = reply_detail2.xpath('./div/a[@class="at j_user_card "]/text()')
        if reply_name:
            cp_data['reply_name'] = reply_name[0]
        # else:
        # cp_data['reply_name'] = ''
        # Second-level reply time
        reply2_time = reply_detail2.xpath('.//span[@class="lzl_time"]/text()')
        if reply2_time:
            reply_time1 = ' '.join(reply2_time[0].split())
            cp_data['reply_time'] = reply_time1
        # else:
        #     cp_data['reply_time'] = ''
        try:
            r_time = datetime.datetime.strptime(cp_data['reply_time'], '%Y-%m-%d %H:%M')
            p_time = datetime.datetime.strptime(cp_data['public_time'], '%Y-%m-%d %H:%M')
            if r_time > LAST_WEEK or p_time > LAST_WEEK:
                # content = cp_data['content']
                reply2_content = cp_data['reply_content']
                # cp_data['content_type'] = getEmotion(content, AccessToken)
                cp_data['reply_type'] = getEmotion(reply2_content, AccessToken)
                tieba.append(cp_data)
            else:
                continue
        except Exception as e:
            logging.info(e)


class DownloadItem(object):
    """
    下载类型对象
    """

    def __init__(self, url_str, type):
        """

        :param url_str: 下载地址
        :param type: 0:列表页,1:详情页
        """
        self.url = url_str
        self.type = type


# Create a download queue
download_queue = queue.Queue()
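# The queue drives the crawl: a list page (type 0) enqueues detail pages (type 1),
# which are then parsed for posts and replies.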

download_item = DownloadItem(
    "Baidu Tieba: crawl information related to a keyword across all of Tieba (put the URL of the first page of search results here; the element extraction below used the original search, which is not convenient to disclose). For example: https://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=lol&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn=1",
    0)
download_queue.put(download_item)

# Keep looping while the download queue is not empty
while not download_queue.empty():

    item = download_queue.get()
    driver.get(item.url)
    time.sleep(3)
    parser = lxml.html.fromstring(driver.page_source)
    if item.type == 0:
        # Pagination info
        pages = parser.xpath('.//div[@class="pager pager-search"]/a[@class="last"]/@href')
        page_num = pages[0].split('=')[-1]
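        # The total number of result pages is taken from the trailing page parameter
        # of the "last" pager link.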
        print(page_num)
        # for i in range(76, int(page_num) + 1):
        for i in range(1, 2):
            url = 'https://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=lol&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn={}'.format(
                1 + i)
            result = requests.get(url)
            # time.sleep(2)
            parser = lxml.html.fromstring(result.text)
            detail_url = parser.xpath('//span[@class="p_title"]/a/@href')
            for url in detail_url:
                real_url = 'https://tieba.baidu.com/' + url
                detail_item = DownloadItem(real_url, 1)
                download_queue.put(detail_item)
                break
            # Parse the next-page URL
            # next_item = DownloadItem('https:' + parser.xpath('//a[@class="next pagination-item "]/@href')[0], 0)
            # print(next_item)
            # download_queue.put(next_item)

    else:
        # Parse the detail page content
        scroll_page(driver)
        parser = lxml.html.fromstring(driver.page_source)
        tieba_name = driver.find_element_by_xpath('//div[@class="card_title "]/a').text
        # Thread title
        try:
            subject = parser.xpath('//h1/@title')[0]

        except:
            # subject = driver.find_element_by_xpath('//h3[@class="core_title_txt pull-left text-overflow  "]').text
            subject = parser.xpath('//h3/@title')[0]
        # Handle threads whose replies span multiple pages
        get_page = parser.xpath('//li[@class="l_reply_num"]/span[2]/text()')[0]
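        # get_page is the number of reply pages shown in the thread's pager;
        # when there is more than one, build one URL per page.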
        reply_urls = []
        if int(get_page) > 1:
            reply_url = \
                parser.xpath('//ul[@class="l_posts_num"]/li[@class="l_pager pager_theme_4 pb_list_pager"]/a[1]/@href')[
                    0].split(
                    "=")[0]
            for i in range(1, int(get_page) + 1):
                reply_urls.append('https://tieba.baidu.com' + reply_url + '=' + str(i))
        if reply_urls:
            for item in reply_urls:
                driver.get(item)
                scroll_page(driver)
                page_info = lxml.html.fromstring(driver.page_source)
                get_all_reply(page_info)
        else:
            page_info = parser
            get_all_reply(page_info)

conndb = create_engine('mysql+pymysql://root:123456@localhost:3306/hj_bi')
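# Assumes a local MySQL instance with a database named hj_bi; adjust the user,
# password and host in the connection string to match your environment.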

for item in tieba:
    item['public_media'] = '贴吧'  # Tieba
    item['project_name'] = 'craw'
insert_data = pd.DataFrame(tieba)
try:
    insert_data.to_sql('spider_copy2', conndb, if_exists='append', index=False)
except Exception as e:
    logging.info(e)

Reposted from: blog.csdn.net/weixin_46046193/article/details/108407634