# Weibo crawler: for a given keyword, collect each matching post's data and the posting user's profile.

#coding=utf-8
import os
import re
import time

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome()

def get_night(month, date):
    month = int(month)
    date = int(date)
    dates = (21, 20, 21, 21, 22, 22, 23, 24, 24, 24, 23, 22)
    constellations = ("摩羯", "水瓶", "双鱼", "白羊", "金牛", "双子", "巨蟹", "狮子", "处女", "天秤", "天蝎", "射手", "摩羯")
    if date < dates[month-1]:
    	return constellations[month-1] + '座'
    else:
    	return constellations[month] + '座'

def login_weibo(username, password):
    print('登录开始')
    driver.get('https://passport.weibo.cn/signin/login')
    time.sleep(4)
    driver.find_element_by_id("loginName").send_keys(username)
    driver.find_element_by_id("loginPassword").send_keys(password)
    driver.find_element_by_id("loginAction").click()
    stats = input("完成登录输入1\n")


def push_mysql(data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, data_content):
    data_zwho = "'" + str(data_zwho) + "'"
    data_ping = "'" + str(data_ping) + "'"
    data_zhuan = "'" + str(data_zhuan) + "'"
    data_zan = "'" + str(data_zan) + "'"
    u_wcnt = "'" + str(u_wcnt) + "'"
    u_gcnt = "'" + str(u_gcnt) + "'"
    u_fcnt = "'" + str(u_fcnt) + "'"
    u_id = "'" + str(u_id) + "'"
    u_name = "'" + str(u_name) + "'"
    u_renzheng = "'" + str(u_renzheng) + "'"
    u_sex = "'" + str(u_sex) + "'"
    u_place = "'" + str(u_place) + "'"
    u_birth = "'" + str(u_birth) + "'"
    u_night = "'" + str(u_night) + "'"
    u_jianjie = "'" + str(u_jianjie) + "'"
    u_gq = "'" + str(u_gq) + "'"
    u_learn = "'" + str(u_learn) + "'"
    u_work = "'" + str(u_work) + "'"
    data_content = "'" + str(data_content) + "'"
    
    db = pymysql.connect("","","","" )
    cursor = db.cursor()
    sql = 'insert into data_3 (data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, data_content)'
    sql += 'values('+str(data_zwho)+','+str(data_zan)+','+str(data_ping)+','+str(data_zhuan)+','+str(u_wcnt)+','+str(u_gcnt)+','+str(u_fcnt)+','+str(u_id)+','+str(u_name)+','+str(u_renzheng)+','+str(u_sex)+','+str(u_place)+','+str(u_birth)+','+str(u_night)+','+str(u_jianjie)+','+str(u_gq)+','+str(u_learn)+','+str(u_work)+','+str(data_content)+')'
    try:
       cursor.execute(sql)
       db.commit()
    except Exception as e:
       db.rollback()
       print(e)
       pass
    db.close()

def get_u_info(u_id, data_content,data_zwho, data_zan, data_ping, data_zhuan):
    if (u_id == ''):
        return
    
    driver.get('https://weibo.cn/'+str(u_id)+'/info')
    page = driver.page_source
    
    u_name = ''.join(re.compile('昵称:(.*?)<br>').findall(page))
    u_renzheng = ''.join(re.compile('认证:(.*?)<br>').findall(page))
    u_sex = ''.join(re.compile('性别:(.*?)<br>').findall(page))
    u_place = ''.join(re.compile('地区:(.*?)<br>').findall(page))
    u_birth = ''.join(re.compile('生日:(.*?)<br>').findall(page))
    if (len(u_birth.split('-')) != 3):
        u_night = ''
    else:
        u_night = get_night(u_birth.split('-')[1], u_birth.split('-')[2])
    u_jianjie = ''.join(re.compile('简介:(.*?)<br>').findall(page))
    u_gq = ''.join(re.compile('感情状况:(.*?)<br>').findall(page))
    
    u_learn = ''.join(re.compile('<div class="tip">学习经历</div><div class="c">(.*?)<br></div>').findall(page))
    u_learn = u_learn.replace('&nbsp;', '')

    u_work = ''.join(re.compile('<div class="tip">工作经历</div><div class="c">(.*?)<br></div>').findall(page))
    u_work = u_work.replace('&nbsp;', '')

    driver.get('https://weibo.cn/u/'+str(u_id))
    page = driver.page_source
    u_wcnt = ''.join(re.compile('微博\[(.*?)\]').findall(page))
    u_gcnt = ''.join(re.compile('关注\[(.*?)\]').findall(page))
    u_fcnt = ''.join(re.compile('粉丝\[(.*?)\]').findall(page))
    
    print(data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, len(data_content))
    push_mysql(data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, data_content)

def save_data(keyword, start, end):
    if not os.path.exists('D:/desktop/data/'+str(keyword)):
        os.mkdir('D:/desktop/data/'+str(keyword))
    for i in range(start, end + 1):
        print(i)
        driver.get('https://weibo.cn/search/mblog?hideSearchFrame=&keyword='+str(keyword)+'&page='+str(i))
        path = 'D:/desktop/data/'+str(keyword)+'/' + str(i) + '.txt'
        f = open(path,"w",encoding='utf-8')
        page = driver.page_source
        f.write(page)
        f.close()
        time.sleep(4)

def get_words(keyword):
    path = 'D:/desktop/data/'+str(keyword)+''
    files= os.listdir(path)
    for file in files: #遍历文件夹
        position = path+'/'+ file          
        with open(position, "r",encoding='utf-8') as f:    #打开文件
            data = f.read()   #读取文件
            print(position)
            #print(data)
        #<a class="nk" href="https://weibo.cn/u/1075019743">武汉公交集团</a>

        pat_all = '<div class="c" id="(.*?)</div></div>'
        data_all = re.compile(pat_all).findall(data)
        for each_data in data_all:
            pat_id = '<a class="nk" href="https://weibo.cn/u/(.*?)">'
            data_id = re.compile(pat_id).findall(each_data)

            pat_zan = '赞\[(.*?)\]'
            pat_ping = '评论\[(.*?)\]'
            pat_zhuan = '转发\[(.*?)\]'
            pat_zwho1 = '转发了(.*?)/a>'
            pat_zwho = '>(.*?)<'
            data_zan = ''.join(re.compile(pat_zan).findall(each_data))
            data_ping = ''.join(re.compile(pat_ping).findall(each_data))
            data_zhuan = ''.join(re.compile(pat_zhuan).findall(each_data))
            data_zwho1 = ''.join(re.compile(pat_zwho1).findall(each_data))
            data_zwho = ''.join(re.compile(pat_zwho).findall(data_zwho1))
            
            #print(data_zwho, data_zan, data_ping, data_zhuan)

            pat_content = '>(.*?)<'
            data_content = re.compile(pat_content).findall(each_data)
            data_content = ''.join(data_content)
            data_content = data_content.replace('&nbsp;',' ')

            get_u_info(''.join(data_id), data_content,data_zwho, data_zan, data_ping, data_zhuan)
            time.sleep(3)
            print(keyword)
        


login_weibo('', '')

keywords = ['']
for i in keywords:
    save_data(i, 1, 100)
    get_words(i)





# Adapted from: blog.csdn.net/weixin_43870649/article/details/104970380