Python爬虫之微信数据爬取（十三）

原创不易，转载前请注明博主的链接地址：Blessy_Zhu https://blog.csdn.net/weixin_42555080
本次代码的环境：
运行平台： Windows
Python版本： Python3.x
IDE： PyCharm

一、前言

微信作为我们日常交流的软件，越来越深入到我们的生活。但是，随着微信好友数量的增加，真正可以联系的知心人却越来越少了。那么，怎样才能更清楚地认识自己微信朋友圈里好友的结构呢？接下来通过以下这些内容来认识自己微信朋友圈的结构信息。

二、将好友头像进行拼接

效果图如图1：
　

图1

代码如下，里面已经添加了注释，接下来就不再细讲了：

import itchat
import os
import math
from PIL import Image

# 获取数据
def _safe_file_name(name, fallback):
    """Return *name* with characters that are illegal in file names replaced.

    The characters ``\\ / : * ? " < > |`` and ASCII control characters are
    replaced by ``_``. If nothing usable is left, *fallback* is returned.
    """
    invalid_chars = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if ch in invalid_chars or ord(ch) < 32 else ch for ch in name
    ).strip()
    return cleaned or fallback


def download_image():
    """Log in to WeChat and save every contact's avatar into ``wechatImages``.

    Each avatar is written as ``<name>.jpg``, where the name is the contact's
    remark name or, when that is empty, the nickname. Characters that cannot
    appear in a file name are replaced, and an index-based name is used when
    the cleaned name is empty. Contacts that end up with the same file name
    overwrite one another.
    """
    # Scan the QR code to log in (web WeChat login).
    itchat.auto_login()
    # One record per contact; each record is indexed by key below.
    friends = itchat.get_friends(update=True)
    # Directory, relative to the current working directory, for the avatars.
    base_path = 'wechatImages'
    os.makedirs(base_path, exist_ok=True)

    for index, friend in enumerate(friends):
        # Raw avatar data; it is written to disk unchanged.
        img_data = itchat.get_head_img(userName=friend['UserName'])
        # Prefer the remark name; fall back to the nickname when it is empty.
        raw_name = friend['RemarkName'] or friend['NickName']
        img_name = _safe_file_name(raw_name, 'friend_%d' % index)
        img_file = os.path.join(base_path, img_name + '.jpg')
        print(img_file)
        with open(img_file, 'wb') as file:
            file.write(img_data)


# 拼接头像
def join_image():
    """Tile all avatars from ``wechatImages`` onto one square image.

    The canvas is 6400x6400 pixels with a black background. The grid has
    ``ceil(sqrt(n))`` cells per row so that all ``n`` files fit on the canvas.
    Files that cannot be opened as images are skipped. The result is saved as
    ``jointPic.jpg``; nothing is written when the directory is empty.
    """
    # Must be the directory that download_image() writes the avatars to.
    base_path = 'wechatImages'
    canvas_size = 6400
    files = os.listdir(base_path)
    print(len(files))
    if not files:
        # Nothing to tile; also avoids dividing by zero below.
        return

    # Cells per row (and rows): the smallest square grid holding every file.
    lines = int(math.ceil(math.sqrt(len(files))))
    # Edge length of one cell in pixels.
    each_size = canvas_size // lines
    print(lines)

    # Image.new(mode, size) fills with colour 0 (black) by default.
    image = Image.new('RGB', (canvas_size, canvas_size))
    placed = 0
    for file_name in files:
        try:
            with Image.open(os.path.join(base_path, file_name)) as img:
                # LANCZOS is the filter formerly exposed as Image.ANTIALIAS.
                tile = img.resize((each_size, each_size), Image.LANCZOS)
        except OSError:
            # Not an image, or a corrupt download: leave it out of the grid.
            print('skip %s' % file_name)
            continue
        # Row-major placement: row = placed // lines, column = placed % lines.
        y, x = divmod(placed, lines)
        image.paste(tile, (x * each_size, y * each_size))
        placed += 1
    image.save('jointPic.jpg')

# Script entry point: download the avatars first, then stitch them together.
if __name__ == '__main__':
    download_image()
    join_image()

三、统计微信好友性别比例

效果图如图2,因为本人是一个工科男，所以里面的好友的比例和我们专业的比例相仿“计算机、计算机，一对鸳鸯，一对基”，开了一个玩笑：
　

图2

代码如下:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import itchat
from pyecharts import Pie

# 获取数据
def get_data():
    """Log in to WeChat via itchat and return the refreshed contact list."""
    itchat.auto_login()
    # update=True asks itchat for a fresh list; the result is returned as-is.
    return itchat.get_friends(update=True)

# 处理数据
def _flatten_field(value):
    """Replace CR, LF and ASCII commas in a string with spaces.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return value.replace('\r', ' ').replace('\n', ' ').replace(',', ' ')


def parse_data(data):
    """Reduce raw contact records to the fields used by the statistics.

    Args:
        data: sequence of mapping-like contact records. The first element is
            skipped because it describes the logged-in user.

    Returns:
        A list of dicts with the keys ``NickName``, ``RemarkName``, ``Sex``,
        ``Province``, ``City`` and ``Signature``. Every text field has its
        commas and line breaks replaced by spaces, because the records are
        later stored one per line with comma-separated fields.
    """
    friends = []
    for item in data[1:]:
        friend = {
            'NickName': _flatten_field(item['NickName']),
            'RemarkName': _flatten_field(item['RemarkName']),
            'Sex': item['Sex'],  # 1 = male, 2 = female, 0 = not set
            'Province': _flatten_field(item['Province']),
            'City': _flatten_field(item['City']),
            'Signature': _flatten_field(item['Signature']),
        }
        print(friend)
        friends.append(friend)
    return friends
# 存储数据，存储到文本文件
def save_to_txt():
    """Fetch the contact list and write it to ``friends.txt``.

    Each contact becomes one UTF-8 line of six comma-separated fields:
    ``NickName,RemarkName,Sex,Province,City,Signature``.
    The file is rewritten on every call, so running the script again does
    not duplicate contacts.
    """
    friends = parse_data(get_data())
    # Open once for the whole batch instead of once per contact.
    with open('friends.txt', mode='w', encoding='utf-8') as f:
        for item in friends:
            # Six placeholders for six values; Sex is an integer code.
            f.write('%s,%s,%d,%s,%s,%s\n' % (
                item['NickName'], item['RemarkName'], item['Sex'],
                item['Province'], item['City'], item['Signature'],
            ))

def stastic_sex():
    """Render a pie chart of the gender split recorded in ``friends.txt``.

    The third comma-separated field of every line is taken as the gender
    code ('1', '2' or '0'). The chart is written to ``好友性别比例.html``.
    """
    # Gender code of every stored contact.
    with open('friends.txt', mode='r', encoding='utf-8') as source:
        codes = [line.split(',')[2] for line in source]

    # Labels and counts, in the order male / female / not set.
    labels = ['帅哥', '美女', '未知']
    counts = [codes.count(code) for code in ('1', '2', '0')]

    subtitle = '好友总人数：%d' % len(codes)
    chart = Pie('好友性别比例', subtitle, title_pos='center')
    chart.add(
        '', labels, counts,
        radius=[30, 75],
        rosetype='area',
        is_label_show=True,
        is_legend_show=True,
        legend_top='bottom',
        is_more_utils=True,
    )
    chart.render('好友性别比例.html')

# Script entry point: create friends.txt on the first run, then chart it.
if __name__ == '__main__':
    import os

    # stastic_sex() reads friends.txt, so the file must exist beforehand.
    if not os.path.exists('friends.txt'):
        save_to_txt()
    stastic_sex()

四、将好友备注信息生成词云

效果图如图3，因为本人喜欢足球，所以加了很多“球霸”；又因为自己以前参加过TMC，在那里认识的人更多，所以会出现下面的这个结果（出于信息安全的考虑，部分信息做了处理）：
　

图3

代码如下:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import jieba
from wordcloud import WordCloud

# Collect every non-empty remark name (second comma-separated field).
remarkNames = []
with open('friends.txt', 'r', encoding='utf-8') as f:
    for row in f:
        fields = row.rstrip('\n').split(',')
        # Skip blank or malformed lines that have no second field.
        if len(fields) > 1 and fields[1] != '':
            remarkNames.append(fields[1])
# Segment in precise mode (cut_all=False); cut_all=True would be full mode.
# The names are joined with spaces: str(remarkNames) would feed the list's
# repr (brackets, quotes, commas, escapes) into the segmenter.
split = jieba.cut(' '.join(remarkNames), cut_all=False)
words = ' '.join(split)  # space-separated tokens
print(words)
# Arguments: canvas width and height, background colour, font file,
# largest font size, and a fixed random seed.
wc = WordCloud(width=1024, height=768, background_color='white',  font_path='STKAITI.TTF',max_font_size=400, random_state=50)
# Build the cloud from the segmented text and save it as an image.
wc.generate_from_text(words)
wc.to_file('好友备注名词云.jpg')

五、好友备注词频统计

上面的词云可以很清晰地看到哪些词出现得最多，但是还是不知道它们的具体数量，接下来通过引入标准库collections的Counter类来统计各个词的出现次数。效果图如图4：
　

图4

代码如下：

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import jieba
# 导入Counter类，用于统计值出现的次数
from collections import Counter
from pyecharts import Bar

# Collect every non-empty remark name (second comma-separated field).
remarkNames = []
with open('friends.txt', mode='r', encoding='utf-8') as f:
    for row in f:
        fields = row.rstrip('\n').split(',')
        # Skip blank or malformed lines that have no second field.
        if len(fields) > 1 and fields[1] != '':
            remarkNames.append(fields[1])
# Tokens that carry no meaning: underscore, hyphen, comma, parentheses
# (ASCII and full-width), space and single quote.
excluded = ['_', '-', ',', '(', ')', '（', '）', ' ', "'"]
# Segment the names themselves, joined by spaces; str(remarkNames) would
# add the list's brackets, quotes and commas to the counted tokens.
words = [
    cutword
    for cutword in jieba.cut(' '.join(remarkNames), cut_all=False)
    if cutword not in excluded
]
data_top10 = Counter(words).most_common(10)  # the 10 most frequent tokens
print(data_top10)
bar = Bar('好友备注词频数量统计TOP10', '', title_pos='center', width=1200, height=600)
# Split the (token, count) pairs into an axis list and a value list.
attr, value = bar.cast(data_top10)
bar.add('', attr, value, visual_range=[0, 200], is_visualmap=True, is_label_show=True)
bar.render('好友备注词频数量统计TOP10.html')

这里主要用到了标准库collections的Counter类，其中Counter(words).most_common(10)返回的是words中出现次数最多的10个词及其各自的出现次数。

六、总结

这篇文章主要介绍了如何利用itchat获取微信好友数据，进而进行一些常见的数据统计：性别、备注、备注词频，另外还爬取了微信好友的头像并将其拼接在一张图片上。这篇文章就到这里了，欢迎大佬们多批评指正，也欢迎大家积极评论多多交流。