一、概述
优点:在数据少的情况下仍然有效,可以处理多类别问题
缺点:对于输入数据的准备方式较为敏感
适用数据类型:标称型数据
二、原理
三、文档分类
A,B,C,D..为文档中单词。假设总词汇只有A,B,C,D四种。训练样本为5个
A B C D 类别
文档1 0 0 1 1 0
文档2 0 1 1 1 0
文档3 1 0 0 1 1
文档4 1 1 0 0 1
文档5 1 1 1 0 1
测试文档 1 0 1 0 ?
类别:C0,C1
测试文档:W
求:max{P(C0|W),P(C1|W)} ===> max{log[P(C0|W)],log[P(C1|W)]}
P(C0|W) = P(W|C0) * P(C0) / P(W)
P(C0) = 2 / 5 ==> 2个0类型的文档,3个1类型的文档
P(W|C0) = P(A*B*C*D|C0) ==> Naive Bayes(条件独立假设) ==> P(A|C0) * P(B|C0) * P(C|C0) * P(D|C0)
P(A|C0)=(0 + 0)/(0 + 0 + 1 + 1 + 0 + 1 + 1 + 1)=0 ==> A在类别0文档中出现的次数/ 类别0文档中的总词汇量
P(B|C0)=(0 + 1)/(0 + 0 + 1 + 1 + 0 + 1 + 1 + 1)=1/5 ==> B在类别0文档中出现的次数/ 类别0文档中的总词汇量
P(C|C0)=(1 + 1)/(0 + 0 + 1 + 1 + 0 + 1 + 1 + 1)=2/5 ==> C在类别0文档中出现的次数/ 类别0文档中的总词汇量
P(D|C0)=(1 + 1)/(0 + 0 + 1 + 1 + 0 + 1 + 1 + 1)=2/5 ==> D在类别0文档中出现的次数/ 类别0文档中的总词汇量
因为连乘中只要有一项为0,整个乘积就为0;同时多个小概率连乘容易下溢 ==> 取log,把连乘变成求和(log(0)的问题需要在计数初始化时做平滑处理)
log[P(W|C0) * P(C0)] = log[P(A|C0) * P(B|C0) * P(C|C0) * P(D|C0) * P(C0)]
=log[P(A|C0)] + log[P(B|C0)] + log[P(C|C0)] + log[P(D|C0) ] + log[P(C0)]
同理计算log[P(W|C1) * P(C1)]
测试样本:
log[P(C0|W)] = 1 * log(0/5) + 0 * log(1/5) + 1 * log(2/5) + 0 * log(2/5) + log(2/5) = -∞ (省略公共项P(W);出现log(0),因此实际计算前需要做平滑)
log[P(C1|W)] = 1 * log(3/7) + 0 * log(2/7) + 1 * log(1/7) + 0 * log(1/7) + log(1 - 2/5) ≈ -3.304 (省略公共项P(W),取自然对数)
# -*- coding:UTF-8
from numpy import *
'''
1.伯努利模型==>不考虑词在文档中出现的次数,只考虑出不出现。假定词是等权重的
2.多项式模型==>考虑词在文档中出现的次数
'''
def loadDataSet():
    """Return the built-in toy corpus together with its class labels.

    Returns:
        tuple: ``(postingList, classVec)`` where ``postingList`` is a list of
        six tokenised posts (lists of words) and ``classVec`` is the parallel
        list of integer labels (0 or 1), one per post.
    """
    # Each entry pairs a label with the tokens of one post, so a post and
    # its label cannot drift apart when the sample data is edited.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
def createVocabList(dataSet):
    """Return the list of distinct words that occur in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: every distinct token exactly once, in order of first
        appearance. A plain ``list(set(...))`` has an arbitrary order that
        can change from run to run, which would make the column layout of
        the feature vectors built from this vocabulary irreproducible.
    """
    seen = set()
    vocabulary = []
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabulary.append(word)
    return vocabulary
'''
vocabList = ['','',.....]
inputSet = ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
'''
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        list: ``len(vocabList)`` integers, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere. Words that are not in the vocabulary
        are reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)

    # Build the word -> column map once instead of scanning the list for
    # every word. setdefault keeps the first position of a repeated word,
    # which is what list.index() would have returned.
    column = {}
    for position, term in enumerate(vocabList):
        column.setdefault(term, position)

    for word in inputSet:
        slot = column.get(word)
        if slot is None:
            # Single-argument form prints the same text on Python 2 and 3.
            print('the word: %s is not in my vocabulary!' % word)
        else:
            returnVec[slot] = 1
    return returnVec
'''
P(c|w) = P(w|c) * P(c) / P(w)
1.P(c)
2.P(w|c)
trainMatrix
trainCategory===>[0,0,1,1,1] 标签集合的向量
pAbusive = (0 + 0 + 1 + 1 + 1) / 5 = 3/5
A B C D category
0 0 1 1 0
0 1 1 1 0
1 0 0 1 1
1 1 0 0 1
1 1 1 0 1
1 0 1 0 ?
numTrainDocs = 5 => 5个文档
numWords = 4 => 4个特征
pAbusive = (0 + 0 + 1 + 1 + 1) / 5 = 3/5 ==> 类别1的先验概率P(C1),P(C0) = 1 - 3/5 = 2/5
p0Num = [0,0,0,0]
p1Num = [0,0,0,0]
p0Denom = 0.0
p1Denom = 0.0
0 0 1 1 0 ===> p0Num=[0,0,1,1] p0Denom=2
0 1 1 1 0 ===> p0Num=[0,1,2,2] p0Denom=5
1 0 0 1 1 ===> p1Num=[1,0,0,1] p1Denom=2
1 1 0 0 1 ===> p1Num=[2,1,0,1] p1Denom=4
1 1 1 0 1 ===> p1Num=[3,2,1,1] p1Denom=7
P(C0|W) = P(W|C0) * P(C0) / P(W) = P(A*B*C*D|C0) * P(C0) / P(W) = P(A|C0) * P(B|C0) * P(C|C0) * P(D|C0) * P(C0) / P(W)
P(C1|W) = P(W|C1) * P(C1) / P(W) = P(A*B*C*D|C1) * P(C1) / P(W) = P(A|C1) * P(B|C1) * P(C|C1) * P(D|C1) * P(C1) / P(W)
P(W) ==> 无需再计算了
max{P(C0|W),P(C1|W)} ===> max{Log[P(C0|W)],Log[P(C1|W)]}
Log[P(C0|W)] = Log[P(A|C0)] + Log[P(B|C0)] + Log[P(C|C0)] + Log[P(D|C0)] + Log[P(C0)]
P(A|C0) = 0/(0+1+2+2) = 0/5
P(B|C0) = 1/(0+1+2+2) = 1/5
P(C|C0) = 2/(0+1+2+2) = 2/5
P(D|C0) = 2/(0+1+2+2) = 2/5
Log[P(C1|W)] = Log[P(A|C1)] + Log[P(B|C1)] + Log[P(C|C1)] + Log[P(D|C1)] + Log[P(C1)]
P(A|C1) = 3/(3+2+1+1) = 3/7
P(B|C1) = 2/(3+2+1+1) = 2/7
P(C|C1) = 1/(3+2+1+1) = 1/7
P(D|C1) = 1/(3+2+1+1) = 1/7
测试样本1 0 1 0 ?
Log[P(C0|W)] = 1 * Log[0/5] + 0 * Log[1/5] + 1 * Log[2/5] + 0 * Log[2/5] + Log[2/5]
Log[P(C1|W)] = 1 * Log[3/7] + 0 * Log[2/7] + 1 * Log[1/7]+ 0 * Log[1/7] + Log[1 - 2/5]
注意存在Log[0] ==> 所以初始化时,我们设置
p0Num = [1,1,1,1]
p1Num = [1,1,1,1]
p0Denom = 2.0
p1Denom = 2.0
'''
def trainNB0(trainMatrix, trainCategory):
    """Estimate unsmoothed Naive Bayes log-likelihoods for two classes.

    Args:
        trainMatrix: sequence of equally long numeric feature vectors, one
            per training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else class 0.

    Returns:
        tuple: ``(p0Vec, p1Vec, pAbusive)``. ``p0Vec`` / ``p1Vec`` hold, per
        feature, the log of (summed feature value in the class) divided by
        (sum of all feature values in the class). ``pAbusive`` is
        ``sum(trainCategory) / len(trainMatrix)``.

    Note:
        Counts start at zero, so a feature never seen in a class yields
        ``log(0)`` (``-inf``), and a class with no feature mass yields
        ``0/0``. ``trainNB1`` is the smoothed variant.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.
    featureTotals = [zeros(vocabSize), zeros(vocabSize)]
    classMass = [0.0, 0.0]
    for i, row in enumerate(trainMatrix):
        cls = 1 if trainCategory[i] == 1 else 0
        featureTotals[cls] += row
        classMass[cls] += sum(row)

    p1Vec = log(featureTotals[1] / classMass[1])
    p0Vec = log(featureTotals[0] / classMass[0])
    return p0Vec, p1Vec, pAbusive
def trainNB1(trainMatrix, trainCategory):
    """Estimate smoothed Naive Bayes log-likelihoods for two classes.

    Same computation as the unsmoothed trainer, except that every per-word
    count starts at 1 and every class denominator starts at 2.0, so no
    ratio is ever zero and ``log`` never receives 0.

    Args:
        trainMatrix: sequence of equally long numeric feature vectors, one
            per training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else class 0.

    Returns:
        tuple: ``(p0Vec, p1Vec, pAbusive)`` with the per-feature log ratios
        for class 0 and class 1, and ``pAbusive`` equal to
        ``sum(trainCategory) / len(trainMatrix)``.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docTotal)

    # Start counts at 1 and denominators at 2.0 to keep every ratio positive.
    hits0, hits1 = ones(vocabSize), ones(vocabSize)
    mass0 = mass1 = 2.0

    for i, row in enumerate(trainMatrix):
        if trainCategory[i] != 1:
            hits0 += row
            mass0 += sum(row)
            continue
        hits1 += row
        mass1 += sum(row)

    p1Vec = log(hits1 / mass1)
    p0Vec = log(hits0 / mass0)
    return p0Vec, p1Vec, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger Naive Bayes log-score.

    Args:
        vec2Classify: feature vector of the document (0/1 flags or counts).
        p0Vec: per-feature log-likelihoods for class 0.
        p1Vec: per-feature log-likelihoods for class 1.
        pClass1: prior probability of class 1; class 0 gets ``1 - pClass1``.

    Returns:
        int: 1 if the class-1 score is strictly greater, otherwise 0.
    """
    features = asarray(vec2Classify)

    # Score only the features that are present. A log-likelihood of -inf
    # (a word never seen in a class, as an unsmoothed trainer produces)
    # multiplied by an absent feature's 0 is NaN, which would poison the sum
    # and make every comparison False. Absent features contribute nothing,
    # so for finite inputs the score is the same as the full dot product.
    present = features != 0
    weights = features[present]

    p1 = sum(weights * asarray(p1Vec)[present]) + log(pClass1)
    p0 = sum(weights * asarray(p0Vec)[present]) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
def testingNB():
    """Train on the toy corpus and print the class of one sample entry.

    Builds the vocabulary and 0/1 training vectors from ``loadDataSet()``,
    trains the classifier, then classifies ``['love', 'my', 'dalmation']``
    and prints the result. Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postingDoc)
                for postingDoc in listOPosts]

    # Use the smoothed trainer: the unsmoothed one yields log(0) = -inf for
    # words unseen in a class, and 0 * -inf in the classifier's dot product
    # turns both scores into NaN.
    p0V, p1V, pAb = trainNB1(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    # One pre-formatted string prints identically on Python 2 and 3; the
    # multi-argument call form would print a tuple on Python 2.
    print('%s classified as: %s'
          % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))
四、过滤垃圾邮件
def textParse(bigString):
    """Split raw text into lower-cased word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        list: the lower-cased tokens of ``bigString`` in original order,
        keeping only tokens with more than two characters.
    """
    import re

    # Split on runs of non-word characters. The pattern must be `\W+`, not
    # `\W*`: a pattern that can match the empty string makes re.split cut
    # between every character on Python 3.7+, so no token would survive the
    # length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    # Drop empty strings and very short tokens (length <= 2).
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads files 1..25 from both the spam and the ham directory, tokenises
    them with ``textParse``, holds out 10 randomly chosen documents for
    testing, trains on the rest, then prints each misclassified document
    and the error rate on the held-out set. Returns nothing.

    NOTE(review): the data directory is spelled 'emial' in the paths below;
    this looks like a typo for 'email' -- confirm against the folder on disk.
    """
    docList = []
    classList = []
    # Label 1 for spam, 0 for ham; documents are appended spam-then-ham for
    # each file number.
    sources = (('emial/spam/%d.txt', 1), ('emial/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathTemplate, label in sources:
            # Context manager closes every file instead of leaking 50 handles.
            with open(pathTemplate % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # A real list is required: a Python 3 range object cannot have items
    # removed from it.
    trainSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # `random` is the name provided by the file-level numpy star import.
        randIndex = int(random.uniform(0, len(trainSet)))
        testSet.append(trainSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainSet]
    trainClasses = [classList[docIndex] for docIndex in trainSet]
    # Use the smoothed trainer: the unsmoothed one yields log(0) = -inf for
    # words unseen in a class, which turns the classifier's scores into NaN.
    p0V, p1V, pSpam = trainNB1(trainMat, trainClasses)

    errorCount = 0
    for docIndex in testSet:
        wordVector = array(setOfWords2Vec(vocabList, docList[docIndex]))
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Pre-formatted strings print the same text on Python 2 and 3.
            print('classification error %s' % (docList[docIndex],))
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
CDA数据分析师考试相关入口一览(建议收藏):
▷ 想报名CDA认证考试,点击>>> “CDA报名” 了解CDA考试详情;
▷ 想加入CDA考试题库,点击>>> “CDA题库” 了解CDA考试详情;
▷ 想学习CDA考试教材,点击>>> “CDA教材” 了解CDA考试详情;
▷ 想查询CDA考试成绩,点击>>> “CDA成绩” 了解CDA考试详情;
▷ 想了解CDA考试含金量,点击>>> “CDA含金量” 了解CDA考试详情;
▷ 想获取CDA考试时间/费用/条件/大纲/通过率,点击 >>>“CDA考试官网” 了解CDA考试详情;