A Naive Bayes Algorithm for Spam Detection
Author: Aliot
Published: 2019-02-12
import numpy as np
# Build a deduplicated vocabulary list.
def create_vocabulary_list(dataset):
    # dataset holds the text of several posts; each post is a list of words
    vocabset = set([])
    for document in dataset:
        vocabset = vocabset | set(document)  # union in this post's words
    return list(vocabset)
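A quick, hypothetical sanity check (the output order varies from run to run, since sets are unordered):

toy_posts = [['spam', 'offer'], ['offer', 'now']]
print(create_vocabulary_list(toy_posts))  # e.g. ['now', 'spam', 'offer']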
def wordsToVector(vocabset, inputset):
    # vocabset is the vocabulary list; inputset is the text to vectorize
    vector = [0] * len(vocabset)
    for word in inputset:
        if word in vocabset:
            vector[vocabset.index(word)] = 1  # set-of-words: record presence only
        else:
            print("The word: {} is not in my Vocabulary.".format(word))
    return vector
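wordsToVector implements a set-of-words model: the vector only records whether a word appears. A common variant is the bag-of-words model, which counts how many times each word occurs; a minimal sketch (words_to_bag_vector is not part of this post's pipeline):

def words_to_bag_vector(vocabset, inputset):
    # Bag-of-words variant: count occurrences instead of marking presence.
    vector = [0] * len(vocabset)
    for word in inputset:
        if word in vocabset:
            vector[vocabset.index(word)] += 1
    return vector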
def train_bayes(trainset, category):
    """Estimate the class prior and the per-word conditional probabilities."""
    # category is the vector of class labels (1 = abusive, 0 = normal)
    num_train = len(trainset)     # number of training documents (each is a word vector)
    num_vocab = len(trainset[0])  # every vector spans the whole vocabulary
    pAbusive = sum(category) / float(num_train)  # prior: abusive documents / total documents
    # The next two vectors count how often each word appears in each class.
    # Laplace smoothing: start every count at 1 and each denominator at 2,
    # so a word unseen in one class never produces a zero probability.
    pnorm_vector = np.ones(num_vocab)  # per-word counts for the normal class
    pabu_vector = np.ones(num_vocab)   # per-word counts for the abusive class
    pnorm_denom = 2.0  # total word count in the normal class
    pabu_denom = 2.0   # total word count in the abusive class
    for i in range(num_train):
        if category[i] == 1:  # abusive
            pabu_vector += trainset[i]
            pabu_denom += sum(trainset[i])
        else:                 # normal
            pnorm_vector += trainset[i]
            pnorm_denom += sum(trainset[i])
    # Work in log space so products of many small probabilities become sums.
    pnorm_vector = np.log(pnorm_vector / pnorm_denom)
    pabu_vector = np.log(pabu_vector / pabu_denom)
    return pnorm_vector, pabu_vector, pAbusive
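The final log step matters in practice: multiplying hundreds of small per-word probabilities underflows 64-bit floats to exactly 0.0, while summing their logs stays finite. A small demonstration:

probs = np.full(300, 0.01)     # 300 word probabilities of 0.01 each
print(np.prod(probs))          # 0.0 -- the direct product underflows
print(np.sum(np.log(probs)))   # about -1381.6 -- the log-domain sum is finite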
def classify(testVector, pnorm_vector, pabu_vector, pabusive):
    # testVector must already be the vectorized form of the test text
    pnorm = sum(testVector * pnorm_vector) + np.log(1 - pabusive)  # log-posterior score for the normal class
    pabu = sum(testVector * pabu_vector) + np.log(pabusive)        # log-posterior score for the abusive class
    if pabu > pnorm:
        return 1
    else:
        return 0
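Because the likelihood vectors are already in log space, multiplying element-wise by the 0/1 test vector and summing adds up log P(word | class) over exactly the words present; adding the log prior then gives an unnormalized log posterior, so classify is a maximum a posteriori (MAP) rule. A hypothetical variant that also exposes the two scores for inspection:

def classify_with_scores(testVector, pnorm_vector, pabu_vector, pabusive):
    # Same MAP decision as classify, but also returns the two log-posterior
    # scores so borderline decisions can be examined.
    pnorm = np.dot(testVector, pnorm_vector) + np.log(1 - pabusive)
    pabu = np.dot(testVector, pabu_vector) + np.log(pabusive)
    return (1 if pabu > pnorm else 0), pnorm, pabu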
post_list = [
    ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'ny', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
]  # training set
category = [0, 1, 0, 1, 0, 1]  # labels: 1 = abusive, 0 = normal
def test_bayes():
    vocab_list = create_vocabulary_list(post_list)  # deduplicated vocabulary of the training set
    trainset = []  # vectorized training set
    for post in post_list:
        trainset.append(wordsToVector(vocab_list, post))
    # Train to obtain the class prior and the per-word log likelihoods.
    pnorm_vector, pabu_vector, pAbusive = train_bayes(trainset, category)
    testEntry = ['love', 'my', 'dalmation']
    testVector = np.array(wordsToVector(vocab_list, testEntry))
    # Classify the test entries.
    print(testEntry, "classified as: {}".format(classify(testVector, pnorm_vector, pabu_vector, pAbusive)))
    testEntry = ['stupid', 'garbage']
    testVector = np.array(wordsToVector(vocab_list, testEntry))
    print(testEntry, "classified as: {}".format(classify(testVector, pnorm_vector, pabu_vector, pAbusive)))

test_bayes()
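The demo classifies pre-tokenized posts. To apply the same pipeline to raw email text, a tokenizer is needed first; a minimal sketch, where text_parse is a hypothetical helper not defined in this post:

import re

def text_parse(big_string):
    # Hypothetical tokenizer: split raw text on runs of non-word characters,
    # drop tokens shorter than 3 characters, and lower-case the rest.
    tokens = re.split(r'\W+', big_string)
    return [tok.lower() for tok in tokens if len(tok) > 2]

# A raw message could then be vectorized like the training posts:
# vocab_list = create_vocabulary_list(post_list)
# vec = np.array(wordsToVector(vocab_list, text_parse("Quit buying worthless dog food!")))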