Naive Bayes Code

An application of Naive Bayes in NLP: filtering abusive message-board posts.
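For reference, the decision rule the code implements is the standard multinomial Naive Bayes comparison of log-posteriors (this is a summary of what classifyNB below computes), where $n_w$ is the count of word $w$ in the document:

$$\hat{c} = \arg\max_{c \in \{0,1\}} \Big( \log P(Y=c) + \sum_{w} n_w \log P(w \mid Y=c) \Big)$$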

# coding=utf-8
import numpy as np

# Filter abusive posts on a message board.  Abusive: 1, non-abusive: 0
# Create a toy training set
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                   ['maybe','not','take','him','to','dog','park','stupid'],
                   ['my','dalmation','is','so','cute','I','love','him'],
                   ['stop','posting','stupid','worthless','garbage'],
                   ['mr','licks','ate','my','steak','how','to','stop','him'],
                   ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]  # labels for the six posts above
    return postingList, classVec
# Build a list containing every word in the dataset, with no duplicates (the vocabulary)
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union with this document's words
    return list(vocabSet)

# Bag-of-words model: map a document to a vector of per-word counts over the vocabulary
def bagofWord2VecMN(vocablist, inputSet):
    returnVec = [0]*len(vocablist)
    for word in inputSet:
        if word in vocablist:
            returnVec[vocablist.index(word)] += 1  # count every occurrence, not just presence
    return returnVec
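
# For instance (illustrative values, not from the training data above):
# bagofWord2VecMN(['dog','stupid','not'], ['not','not','stupid']) -> [0, 1, 2]
# Recording counts is what distinguishes this bag-of-words model from the
# set-of-words model, which would only record presence flags: [0, 1, 1].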

# Train the Naive Bayes model
def trainNb0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # prior probability of the abusive class, P(Y=1)

    p0Num = np.ones(numWords)  # per-word counts for the non-abusive class, initialized to 1
    p1Num = np.ones(numWords)  # per-word counts for the abusive class

    p0Denom = 2.0  # denominators, initialized to 2 to match the add-one initialization above
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 0:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
        else:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
    p0Vect = np.log(p0Num/p0Denom)  # log P(w|Y=0) for every word w
    p1Vect = np.log(p1Num/p1Denom)  # log P(w|Y=1)
    return p0Vect, p1Vect, pAbusive
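
# With this initialization each conditional estimate is
#   P(w|c) = (count(w, c) + 1) / (total_words_in_c + 2)
# i.e. add-one (Laplace) smoothing: a word never seen in class c still gets a
# small nonzero probability, and taking logs turns later products into sums,
# which also avoids floating-point underflow.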

# Classify a document vector with the trained model
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # element-wise product then sum: each word's log-probability is weighted by
    # its count, so a word that occurs twice contributes differently than once
    p1 = sum(vec2Classify*p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + np.log(1 - pClass1)

    return 1 if p1 > p0 else 0

def tsNB():
    listOPosts, listClasses = loadDataSet()

    myVocabList = createVocabList(listOPosts)

    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagofWord2VecMN(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNb0(np.array(trainMat), np.array(listClasses))
    testEntry = ["stupid","not","not","dfdd","not","not","dfdd"]  # "dfdd" is out of vocabulary and simply ignored
    thisDoc = np.array(bagofWord2VecMN(myVocabList, testEntry))
    print(classifyNB(thisDoc, p0V, p1V, pAb))  # prints 1: classified as abusive
if __name__ == '__main__':
    tsNB()
    '''
    Approach:
        1. Build word vectors with a bag-of-words model; the vector length is the
           number of distinct words in the training vocabulary (no duplicates).
        2. Convert every training record into a vector, then stack the vectors
           into a matrix.
        3. To classify with Naive Bayes, first compute the frequencies of classes
           1 and 0, i.e. P(Y=Ci) in the formula; p0V and p1V hold each word's
           frequency within class 0 and class 1, with Laplace smoothing applied.
    '''
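
As a cross-check, the same pipeline can be sketched with scikit-learn's MultinomialNB (assuming scikit-learn is installed; check_with_sklearn is an illustrative name, not part of the code above):

# Minimal scikit-learn cross-check of the hand-rolled classifier above
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def check_with_sklearn():
    postingList, classVec = loadDataSet()
    docs = [' '.join(post) for post in postingList]
    vec = CountVectorizer()              # bag-of-words counts, like bagofWord2VecMN
    X = vec.fit_transform(docs)          # note: the default tokenizer drops 1-char tokens such as 'I'
    clf = MultinomialNB(alpha=1.0)       # add-one (Laplace) smoothing, as in trainNb0
    clf.fit(X, classVec)
    test = vec.transform(['stupid not not dfdd not not dfdd'])
    print(clf.predict(test))             # expected: [1] (abusive)

MultinomialNB's smoothing denominator adds alpha once per vocabulary word rather than the fixed 2.0 used in trainNb0, so the estimated probabilities differ slightly even when the predictions agree.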

Reposted from blog.csdn.net/weixin_40642306/article/details/83346899