#coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pymysql
import os
import re
# Module-level Chrome session shared by every function below; created once
# at import time so all page fetches reuse the same logged-in browser.
driver = webdriver.Chrome()
def get_night(month, date):
    """Return the Chinese constellation (zodiac-sign) label for a birth date.

    Both arguments may be ints or numeric strings (they come from a
    'YYYY-MM-DD' split). The result is e.g. '双鱼座'.
    """
    m = int(month)
    d = int(date)
    # Day of month m on which the next sign's period begins (author's table).
    cutoffs = (21, 20, 21, 21, 22, 22, 23, 24, 24, 24, 23, 22)
    # 13 entries: index 0 and 12 are both Capricorn, so `m` is a valid index
    # for a late-December birthday without wrapping.
    signs = ("摩羯", "水瓶", "双鱼", "白羊", "金牛", "双子", "巨蟹",
             "狮子", "处女", "天秤", "天蝎", "射手", "摩羯")
    idx = m - 1 if d < cutoffs[m - 1] else m
    return signs[idx] + '座'
def login_weibo(username, password):
    """Open the Weibo mobile login page and submit the given credentials.

    Blocks until the operator confirms at the console that login (including
    any captcha / SMS verification shown in the browser) has completed.
    """
    # Selenium 4 removed the find_element_by_* helpers; use By locators.
    from selenium.webdriver.common.by import By

    print('登录开始')
    driver.get('https://passport.weibo.cn/signin/login')
    time.sleep(4)  # crude fixed wait for the login form to render
    driver.find_element(By.ID, "loginName").send_keys(username)
    driver.find_element(By.ID, "loginPassword").send_keys(password)
    driver.find_element(By.ID, "loginAction").click()
    # Manual checkpoint: scraping continues only after the human confirms.
    input("完成登录输入1\n")
def push_mysql(data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, data_content):
data_zwho = "'" + str(data_zwho) + "'"
data_ping = "'" + str(data_ping) + "'"
data_zhuan = "'" + str(data_zhuan) + "'"
data_zan = "'" + str(data_zan) + "'"
u_wcnt = "'" + str(u_wcnt) + "'"
u_gcnt = "'" + str(u_gcnt) + "'"
u_fcnt = "'" + str(u_fcnt) + "'"
u_id = "'" + str(u_id) + "'"
u_name = "'" + str(u_name) + "'"
u_renzheng = "'" + str(u_renzheng) + "'"
u_sex = "'" + str(u_sex) + "'"
u_place = "'" + str(u_place) + "'"
u_birth = "'" + str(u_birth) + "'"
u_night = "'" + str(u_night) + "'"
u_jianjie = "'" + str(u_jianjie) + "'"
u_gq = "'" + str(u_gq) + "'"
u_learn = "'" + str(u_learn) + "'"
u_work = "'" + str(u_work) + "'"
data_content = "'" + str(data_content) + "'"
db = pymysql.connect("","","","" )
cursor = db.cursor()
sql = 'insert into data_3 (data_zwho, data_zan, data_ping, data_zhuan,u_wcnt,u_gcnt,u_fcnt,u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night, u_jianjie, u_gq, u_learn, u_work, data_content)'
sql += 'values('+str(data_zwho)+','+str(data_zan)+','+str(data_ping)+','+str(data_zhuan)+','+str(u_wcnt)+','+str(u_gcnt)+','+str(u_fcnt)+','+str(u_id)+','+str(u_name)+','+str(u_renzheng)+','+str(u_sex)+','+str(u_place)+','+str(u_birth)+','+str(u_night)+','+str(u_jianjie)+','+str(u_gq)+','+str(u_learn)+','+str(u_work)+','+str(data_content)+')'
try:
cursor.execute(sql)
db.commit()
except Exception as e:
db.rollback()
print(e)
pass
db.close()
def get_u_info(u_id, data_content, data_zwho, data_zan, data_ping, data_zhuan):
    """Scrape user u_id's profile page and counters, then persist the row.

    Visits weibo.cn/<id>/info for profile fields and weibo.cn/u/<id> for the
    post/follow/fan counters, then hands everything to push_mysql. A blank
    u_id (post without a parseable author link) is skipped.
    """
    if u_id == '':
        return

    def _field(pattern, text):
        # Join all matches so a missing field yields '' rather than raising.
        return ''.join(re.findall(pattern, text))

    driver.get('https://weibo.cn/' + str(u_id) + '/info')
    page = driver.page_source
    u_name = _field('昵称:(.*?)<br>', page)
    u_renzheng = _field('认证:(.*?)<br>', page)
    u_sex = _field('性别:(.*?)<br>', page)
    u_place = _field('地区:(.*?)<br>', page)
    u_birth = _field('生日:(.*?)<br>', page)
    # Birthday is 'YYYY-MM-DD' when present; anything else gets no sign.
    birth_parts = u_birth.split('-')
    if len(birth_parts) == 3:
        u_night = get_night(birth_parts[1], birth_parts[2])
    else:
        u_night = ''
    u_jianjie = _field('简介:(.*?)<br>', page)
    u_gq = _field('感情状况:(.*?)<br>', page)
    u_learn = _field('<div class="tip">学习经历</div><div class="c">(.*?)<br></div>',
                     page).replace(' ', '')
    u_work = _field('<div class="tip">工作经历</div><div class="c">(.*?)<br></div>',
                    page).replace(' ', '')

    driver.get('https://weibo.cn/u/' + str(u_id))
    page = driver.page_source
    # Raw strings: '[' is a regex metacharacter and must be escaped literally.
    u_wcnt = _field(r'微博\[(.*?)\]', page)
    u_gcnt = _field(r'关注\[(.*?)\]', page)
    u_fcnt = _field(r'粉丝\[(.*?)\]', page)

    print(data_zwho, data_zan, data_ping, data_zhuan, u_wcnt, u_gcnt, u_fcnt,
          u_id, u_name, u_renzheng, u_sex, u_place, u_birth, u_night,
          u_jianjie, u_gq, u_learn, u_work, len(data_content))
    push_mysql(data_zwho, data_zan, data_ping, data_zhuan, u_wcnt, u_gcnt,
               u_fcnt, u_id, u_name, u_renzheng, u_sex, u_place, u_birth,
               u_night, u_jianjie, u_gq, u_learn, u_work, data_content)
def save_data(keyword, start, end):
    """Fetch search-result pages start..end (inclusive) for `keyword`.

    Each page's raw HTML is written to D:/desktop/data/<keyword>/<page>.txt
    for later offline parsing by get_words.
    """
    out_dir = 'D:/desktop/data/' + str(keyword)
    # makedirs: also creates missing parents and tolerates an existing dir,
    # where the original os.mkdir raised in both cases.
    os.makedirs(out_dir, exist_ok=True)
    for page_no in range(start, end + 1):
        print(page_no)
        driver.get('https://weibo.cn/search/mblog?hideSearchFrame=&keyword='
                   + str(keyword) + '&page=' + str(page_no))
        path = out_dir + '/' + str(page_no) + '.txt'
        # `with` guarantees the handle is closed even if a write fails.
        with open(path, "w", encoding='utf-8') as f:
            f.write(driver.page_source)
        time.sleep(4)  # throttle to avoid triggering anti-crawl blocking
def get_words(keyword):
    """Parse every saved HTML page under D:/desktop/data/<keyword>.

    For each post <div> found, extracts the author id, like/comment/repost
    counters, the reposted-from name (when present) and the plain text, then
    hands the record to get_u_info for enrichment and storage.
    """
    # Compile once, outside the per-file/per-post loops; raw strings for
    # the literal '[' / ']' metacharacters.
    pat_all = re.compile('<div class="c" id="(.*?)</div></div>')
    # e.g. <a class="nk" href="https://weibo.cn/u/1075019743">武汉公交集团</a>
    pat_id = re.compile('<a class="nk" href="https://weibo.cn/u/(.*?)">')
    pat_zan = re.compile(r'赞\[(.*?)\]')
    pat_ping = re.compile(r'评论\[(.*?)\]')
    pat_zhuan = re.compile(r'转发\[(.*?)\]')
    pat_zwho1 = re.compile('转发了(.*?)/a>')
    pat_text = re.compile('>(.*?)<')  # text fragments between tags

    path = 'D:/desktop/data/' + str(keyword)
    for file in os.listdir(path):  # iterate the saved result pages
        position = path + '/' + file
        with open(position, "r", encoding='utf-8') as f:
            data = f.read()
        print(position)
        for each_data in pat_all.findall(data):
            data_id = ''.join(pat_id.findall(each_data))
            data_zan = ''.join(pat_zan.findall(each_data))
            data_ping = ''.join(pat_ping.findall(each_data))
            data_zhuan = ''.join(pat_zhuan.findall(each_data))
            # Name of the original poster when this post is a repost:
            # strip the markup from the '转发了…' fragment.
            data_zwho1 = ''.join(pat_zwho1.findall(each_data))
            data_zwho = ''.join(pat_text.findall(data_zwho1))
            data_content = ''.join(pat_text.findall(each_data))
            data_content = data_content.replace(' ', ' ')
            get_u_info(data_id, data_content, data_zwho, data_zan,
                       data_ping, data_zhuan)
            time.sleep(3)  # pace the per-user profile fetches
    print(keyword)
# --- script entry: log in once, then crawl each keyword in turn ---
login_weibo('', '')            # fill in Weibo username / password
keywords = ['']                # fill in the search keywords to crawl
for kw in keywords:
    save_data(kw, 1, 100)      # download search-result pages 1..100
    get_words(kw)              # parse them and store each post + author
# A Weibo crawler: for a given keyword, collects every matching post and the
# posting user's profile data.
# Adapted from blog.csdn.net/weixin_43870649/article/details/104970380