# encoding=utf-8
import re
import pickle
import os

import jieba


# Naive-Bayes spam classifier
class spamEmailBayes:
    # Load the stop-word list
    def getStopWords(self):
        stopList = []
        for line in open("../data/stopwords"):
            stopList.append(line.rstrip('\n'))
        return stopList
    # Add the words of one email to wordsList (segmented, stop words filtered)
    def get_word_list(self, content, wordsList, stopList):
        # segmentation results go into res_list
        res_list = list(jieba.cut(content))
        for i in res_list:
            if i not in stopList and i.strip() != '':
                if i not in wordsList:
                    wordsList.append(i)
    # Increment the count of each listed word in the dictionary, adding it if absent
    def addToDict(self, wordsList, wordsDict):
        for item in wordsList:
            if item in wordsDict:
                wordsDict[item] += 1
            else:
                wordsDict[item] = 1
    # List the file names under filePath
    def get_File_List(self, filePath):
        return os.listdir(filePath)
    # Compute p(s|w) for every word in the file and keep the 15 words that
    # influence the classification most
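    # By Bayes' rule, p(s|w) = p(w|s) p(s) / (p(w|s) p(s) + p(w|n) p(n)); assuming
    # equal class priors p(s) = p(n), this reduces to the form computed below:
    #     p(s|w) = p(w|s) / (p(w|s) + p(w|n))
    # with p(w|s) and p(w|n) estimated as document frequencies over the spam and
    # normal training emails. The 0.01 for words seen in only one class and the
    # 0.4 default for unseen words are smoothing heuristics.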
    def getTestWords(self, testDict, spamDict, normDict, normFilelen, spamFilelen):
        wordProbList = {}
        for word, num in testDict.items():
            if word in spamDict and word in normDict:
                # the word occurs in both training sets
                pw_s = spamDict[word] / spamFilelen
                pw_n = normDict[word] / normFilelen
                ps_w = pw_s / (pw_s + pw_n)
                wordProbList.setdefault(word, ps_w)
            if word in spamDict and word not in normDict:
                pw_s = spamDict[word] / spamFilelen
                pw_n = 0.01
                ps_w = pw_s / (pw_s + pw_n)
                wordProbList.setdefault(word, ps_w)
            if word not in spamDict and word in normDict:
                pw_s = 0.01
                pw_n = normDict[word] / normFilelen
                ps_w = pw_s / (pw_s + pw_n)
                wordProbList.setdefault(word, ps_w)
            if word not in spamDict and word not in normDict:
                # the word appears in neither training set: default to 0.4
                wordProbList.setdefault(word, 0.4)
        # keep only the 15 words with the highest p(s|w)
        wordProbList = dict(sorted(wordProbList.items(), key=lambda d: d[1], reverse=True)[0:15])
        return wordProbList
    # Combine the per-word probabilities into one spam probability
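    # Under the naive independence assumption, the per-word probabilities p_i
    # combine into a single spam probability as
    #     P = (p_1 * ... * p_n) / (p_1 * ... * p_n + (1 - p_1) * ... * (1 - p_n)),
    # which is exactly what the running products ps_w and ps_n compute below.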
    def calBayes(self, wordList, spamdict, normdict):
        ps_w = 1
        ps_n = 1
        for word, prob in wordList.items():
            print(word + "/" + str(prob))
            ps_w *= prob
            ps_n *= (1 - prob)
        p = ps_w / (ps_w + ps_n)
        return p
    # Compute the prediction accuracy
    # (test files whose numeric name is below 1000 are normal emails)
    def calAccuracy(self, testResult):
        rightCount = 0
        errorCount = 0
        for name, category in testResult.items():
            if (int(name) < 1000 and category == 0) or (int(name) > 1000 and category == 1):
                rightCount += 1
            else:
                errorCount += 1
        return rightCount / (rightCount + errorCount)

spam = spamEmailBayes()
# word-frequency dictionaries
spamDict = {}
normDict = {}
testDict = {}
# words seen in the current email
wordsList = []
wordsDict = {}
# prediction results: key = file name, value = predicted class
testResult = {}
# file-name lists for the normal, spam, and test emails
normFileList = spam.get_File_List("../data/normal")
spamFileList = spam.get_File_List("../data/spam")
testFileList = spam.get_File_List("../data/test2")
# number of normal and spam emails in the training set
normFilelen = len(normFileList)
spamFilelen = len(spamFileList)
# stop-word list used to filter out stop words
stopList = spam.getStopWords()
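# Note: get_word_list de-duplicates words within one email, so after training
# wordsDict holds document frequencies (the number of emails containing each
# word); getTestWords divides them by the email counts to estimate p(w|class).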
# word frequencies over the normal emails
for fileName in normFileList:
    wordsList.clear()
    for line in open("../data/normal/" + fileName, encoding='gbk'):
        # strip every character that is not a Chinese character
        rule = re.compile(r"[^\u4e00-\u9fa5]")
        line = rule.sub("", line)
        # collect the words of this email into wordsList
        spam.get_word_list(line, wordsList, stopList)
    # count, per word, the number of emails it occurs in
    spam.addToDict(wordsList, wordsDict)
normDict = wordsDict.copy()
output = open('norm.pkl', 'wb')
pickle.dump(normDict, output, -1)
output.close()
# word frequencies over the spam emails
wordsDict.clear()
for fileName in spamFileList:
    wordsList.clear()
    for line in open("../data/spam/" + fileName, encoding='gbk'):
        rule = re.compile(r"[^\u4e00-\u9fa5]")
        line = rule.sub("", line)
        spam.get_word_list(line, wordsList, stopList)
    spam.addToDict(wordsList, wordsDict)
spamDict = wordsDict.copy()
output = open('spam.pkl', 'wb')
pickle.dump(spamDict, output, -1)
output.close()
output = open('model.pkl', 'wb')
pickle.dump(spam, output, -1)
output.close()
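# A later run could restore the trained state from the pickles above instead of
# re-scanning the training mail; a minimal sketch, assuming the .pkl files were
# written by a previous run of this script:
#
#     with open('norm.pkl', 'rb') as f:
#         normDict = pickle.load(f)
#     with open('spam.pkl', 'rb') as f:
#         spamDict = pickle.load(f)
#     with open('model.pkl', 'rb') as f:
#         spam = pickle.load(f)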
# classify the test emails
for fileName in testFileList:
    testDict.clear()
    wordsDict.clear()
    wordsList.clear()
    for line in open("../data/test2/" + fileName):
        # for line in open("../data/test/" + fileName, encoding='gbk'):
        rule = re.compile(r"[^\u4e00-\u9fa5]")
        line = rule.sub("", line)
        spam.get_word_list(line, wordsList, stopList)
    spam.addToDict(wordsList, wordsDict)
    testDict = wordsDict.copy()
    # compute p(s|w) per word and keep the 15 most influential words
    wordProbList = spam.getTestWords(testDict, spamDict, normDict, normFilelen, spamFilelen)
    # combine the 15 per-word probabilities with Bayes' rule
    p = spam.calBayes(wordProbList, spamDict, normDict)
    # emails with a spam probability above 0.9 are labelled spam (1)
    if p > 0.9:
        testResult.setdefault(fileName, 1)
    else:
        testResult.setdefault(fileName, 0)
# classification accuracy (test files named below 1000 are normal emails)
testAccuracy = spam.calAccuracy(testResult)
for i, ic in testResult.items():
    print(i + "/" + str(ic))
print(testAccuracy)