# -*- encoding : utf-8 -*-

# Classifier instance: keeps per-word and per-category counters built up by
# #train, and picks a category for a text in #classify.
#
# The counters live in two hashes:
#   @word_list[word]         => { :categories => { category => count }, :_total_word => count }
#   @category_list[category] => { :_total_word => count, :_count => count }
class BeyesClassifier::Base
  extend BeyesClassifier::Storage::ActAsStorable

  attr_reader :name
  attr_reader :word_list
  attr_reader :category_list
  attr_reader :training_count

  attr_accessor :tokenizer
  attr_accessor :language
  attr_accessor :thresholds
  attr_accessor :min_prob

  # Attributes handed to the `storable` macro (provided by ActAsStorable, which
  # is not visible here -- presumably the set persisted by the storage backend).
  storable :version, :word_list, :category_list, :training_count, :thresholds, :min_prob

  # name - identifier of this classifier
  # opts - options hash; keys read here:
  #   :storage     - storage backend (defaults to BeyesClassifier::Base.storage)
  #   :purge_state - when truthy, purge the stored state instead of loading it
  #   :thresholds  - per-category thresholds used by #classify (default {})
  #   :min_prob    - minimum probability a category must exceed (default 0.0)
  # The whole hash is also passed to BeyesClassifier::Tokenizer.new; the
  # original notes list :language, :stemming (true | false), :weight and
  # :assumed_prob as further options.
  def initialize(name, opts = {})
    @version = BeyesClassifier::VERSION
    @name = name

    # These values start empty and may be replaced when the state is loaded
    @word_list = {}
    @category_list = {}
    @training_count = 0

    @storage = opts[:storage] || BeyesClassifier::Base.storage
    if opts[:purge_state]
      @storage.purge_state(self)
    else
      @storage.load_state(self)
    end

    # These values can be set during initialization or overridden later
    # through the accessors.
    # NOTE(review): they are assigned after the storage step, so stored
    # thresholds / min_prob appear to be overwritten here -- confirm intended.
    @thresholds = opts[:thresholds] || {}
    @min_prob = opts[:min_prob] || 0.0

    @ignore_words = nil
    @tokenizer = BeyesClassifier::Tokenizer.new(opts)
  end

  # Record one occurrence of +word+ inside +category+.
  def incr_word(word, category)
    @word_list[word] ||= {}

    @word_list[word][:categories] ||= {}
    @word_list[word][:categories][category] ||= 0
    @word_list[word][:categories][category] += 1

    @word_list[word][:_total_word] ||= 0
    @word_list[word][:_total_word] += 1

    # word count by category
    @category_list[category] ||= {}
    @category_list[category][:_total_word] ||= 0
    @category_list[category][:_total_word] += 1
  end

  # Record one training document for +category+ and bump the overall
  # training counter.
  def incr_cat(category)
    @category_list[category] ||= {}
    @category_list[category][:_count] ||= 0
    @category_list[category][:_count] += 1

    @training_count ||= 0
    @training_count += 1
  end

  # Return the number of times the word appears in a category (Float,
  # 0.0 when the word or the category is unknown).
  def word_count(word, category)
    return 0.0 unless @word_list[word] && @word_list[word][:categories] && @word_list[word][:categories][category]

    @word_list[word][:categories][category].to_f
  end

  # Return the number of times the word appears in all categories (Float).
  def total_word_count(word)
    return 0.0 unless @word_list[word] && @word_list[word][:_total_word]

    @word_list[word][:_total_word].to_f
  end

  # Return the number of words recorded in a category (Float).
  def total_word_count_in_cat(cat)
    return 0.0 unless @category_list[cat] && @category_list[cat][:_total_word]

    @category_list[cat][:_total_word].to_f
  end

  # Return the number of training items.
  def total_cat_count
    @training_count
  end

  # Return the number of training documents for a category (Float).
  # An unknown category yields 0.0, like the other counters, instead of
  # raising NoMethodError on nil.
  def cat_count(category)
    return 0.0 unless @category_list[category] && @category_list[category][:_count]

    @category_list[category][:_count].to_f
  end

  # Return the number of categories in which the word appears.
  def categories_with_word_count(word)
    return 0 unless @word_list[word] && @word_list[word][:categories]

    @word_list[word][:categories].length
  end

  # Return the number of categories.
  def total_categories
    categories.length
  end

  # Return the list of categories.
  def categories
    @category_list.keys
  end

  # Train the classifier: every word the tokenizer yields for +text+ is
  # counted under +category+, then the document itself is counted.
  def train(category, text)
    @tokenizer.each_word(text) { |w| incr_word(w, category) }
    incr_cat(category)
  end

  # Classify a text. Returns the best category, or +default+ when no category
  # scores above @min_prob or when the threshold condition is not met.
  def classify(text, default = nil)
    # Find the category with the highest probability
    max_prob = @min_prob
    best = nil

    scores = cat_scores(text)
    scores.each do |cat, prob|
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    # Return the default category in case the threshold condition was
    # not met. For example, if the threshold for :spam is 1.2
    #
    #    :spam => 0.73, :ham => 0.40  (OK)
    #    :spam => 0.80, :ham => 0.70  (Fail, :ham is too close)
    return default unless best

    threshold = @thresholds[best] || 1.0

    scores.each do |cat, prob|
      next if cat == best
      return default if prob * threshold > max_prob
    end

    best
  end

  # Hand this instance to the storage backend for saving.
  def save_state
    @storage.save_state(self)
  end

  class << self
    attr_writer :storage

    # Default storage backend shared by instances created without :storage.
    # Falls back to an in-memory storage when none was assigned (or when nil
    # was assigned), so #initialize never receives a nil backend from here.
    def storage
      @storage ||= BeyesClassifier::InMemoryStorage.new
    end

    # Build a classifier named +name+. With a block, yield the instance and
    # save its state afterwards; without a block, return the instance.
    def open(name)
      inst = new(name)
      return inst unless block_given?

      yield inst
      inst.save_state
    end
  end
end
# Training function: hands every word yielded by each_word(text) to
# increment_word together with the category, then calls increment_cat.
def train(category, text)
  each_word(text) do |word|
    increment_word(word, category)
  end
  increment_cat(category)
end

# Usage
classifier.train(:spam, "Grow your penis to 20 inches in just 1 week")
classifier.train(:ham, "I'm hungry, no I don't want your penis")
# Pick a category for +text+ from the scores returned by cat_scores.
# Returns +default+ when no score is above 0.0 or when another category
# comes too close to the winner (see the threshold rule below).
def classify(text, default = nil)
  scores = cat_scores(text)

  # Find the category with the highest probability
  best = nil
  max_prob = 0.0
  scores.each do |cat, prob|
    next unless prob > max_prob

    max_prob = prob
    best = cat
  end

  return default unless best

  # Return the default category in case the threshold condition was
  # not met. For example, if the threshold for :spam is 1.2
  #
  #    :spam => 0.73, :ham => 0.40  (OK)
  #    :spam => 0.80, :ham => 0.70  (Fail, :ham is too close)
  threshold = @thresholds[best] || 1.0
  scores.each do |cat, prob|
    return default if cat != best && prob * threshold > max_prob
  end

  best
end