bayes

首页 > 代码库 > bayes

2024-11-16 22:23:02 203人阅读
# Naive Bayes text-classification demo: a small built-in posting corpus plus a
# spam/ham e-mail experiment that reads its documents from ``email/<label>/``.
#
# The star import is kept so that every numpy name this module used to expose
# stays available; the code below goes through the explicit ``np`` alias.
from numpy import *  # noqa: F401,F403

import os
import re
import time
from collections import Counter

import numpy as np

starttime = time.time()

# One or more non-word characters separate tokens.  The pattern must not be
# able to match the empty string: since Python 3.7 ``re.split`` also splits on
# empty matches, which would cut the text into single characters.
_WORD_SEPARATOR = re.compile(r"\W+")

# Tokens shorter than this are discarded when e-mails are read.
_MIN_WORD_LENGTH = 3

# Number of documents main() holds out for the error-rate estimate.
_HOLDOUT_SIZE = 20


def loadDataSet():
    """Return the built-in toy corpus and its labels.

    Returns:
        tuple: ``(postingList, classVec)`` where ``postingList`` is a list of
        six tokenised posts and ``classVec`` the parallel list of integer
        class labels.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return every distinct word of ``dataSet`` as a list.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        list: one entry per distinct word.  The order is that of a ``set``
        and therefore unspecified.
    """
    return list(set().union(*dataSet))


def _set_of_words_vector(words, vocabulary):
    """Return 2.0 for each vocabulary word present in ``words``, else 1.0."""
    present = set(words)
    return [2.0 if term in present else 1.0 for term in vocabulary]


def _bag_of_words_vector(words, vocabulary):
    """Return ``occurrences in words + 1.0`` for each vocabulary word."""
    occurrences = Counter(words)
    return [occurrences[term] + 1.0 for term in vocabulary]


def setOfWords2Vec(vocabList, inputSet):
    """Encode ``inputSet`` as a set-of-words vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; fixes the vector layout.
        inputSet: iterable of words of one document.

    Returns:
        list[float]: 2.0 where the vocabulary word occurs in ``inputSet``
        and 1.0 where it does not (the +1.0 offset is already applied).

    A warning line is printed for every input word that the vocabulary does
    not contain, mirroring what testx2vec() reports.
    """
    words = list(inputSet)
    known = set(vocabList)
    for word in words:
        if word not in known:
            print("the word: %s is not in my Vocabulary!" % word)
    return _set_of_words_vector(words, vocabList)


def txt2trainxy(filename1, filename2):
    """Load two labelled e-mail folders and vectorise their documents.

    Every file below ``email/<filename1>`` and ``email/<filename2>`` becomes
    one document labelled with its folder name.  Only the first line of a
    file is read; it is lower-cased, split on non-word characters and tokens
    shorter than three characters are dropped.  An empty file yields a
    document without words.

    Args:
        filename1: name of the first folder inside ``email/``.
        filename2: name of the second folder inside ``email/``.

    Returns:
        tuple: ``(trainxws, trainxwb, trainy, trainx, fulltext)`` where
        ``trainxws`` holds the set-of-words vectors, ``trainxwb`` the
        bag-of-words vectors, ``trainy`` the labels, ``trainx`` the token
        lists and ``fulltext`` the vocabulary that fixes the vector layout.
        Documents of ``filename1`` come first, in ``os.listdir`` order.

    Raises:
        OSError: if a folder or file cannot be read.
    """
    # step 1: loading data...
    print("stet 1: loading data...")
    trainx = []
    trainy = []
    for label in (filename1, filename2):
        folder = os.path.join("email", label)
        for name in os.listdir(folder):
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(folder, name)) as handle:
                first_line = handle.readline().lower()
            trainx.append([
                token
                for token in _WORD_SEPARATOR.split(first_line)
                if len(token) >= _MIN_WORD_LENGTH
            ])
            trainy.append(label)

    fulltext = list(set(word for document in trainx for word in document))
    trainxws = [_set_of_words_vector(doc, fulltext) for doc in trainx]
    trainxwb = [_bag_of_words_vector(doc, fulltext) for doc in trainx]
    return trainxws, trainxwb, trainy, trainx, fulltext


def testx2vec(testx, fulltext):
    """Vectorise one tokenised document against the vocabulary ``fulltext``.

    Args:
        testx: iterable of words of the document to classify.
        fulltext: list of vocabulary words; fixes the vector layout.

    Returns:
        tuple: ``(testxws, testxwb)``, the set-of-words vector (2.0 present,
        1.0 absent) and the bag-of-words vector (occurrences + 1.0).

    A warning line is printed for every word occurrence that the vocabulary
    does not contain.
    """
    words = list(testx)
    known = set(fulltext)
    for word in words:
        if word not in known:
            print("the word: %s is not in my fulltext!" % word)
    return (_set_of_words_vector(words, fulltext),
            _bag_of_words_vector(words, fulltext))


# The name starts with "test" but this is a helper, not a test case; the flag
# keeps pytest from collecting it when the module is star-imported by tests.
testx2vec.__test__ = False


def bayes(testx, trainx, trainy, fulltext):
    """Classify one vector with a naive Bayes model built from ``trainx``.

    For every class the per-word probability is the column sum of that
    class's training vectors divided by (sum of all their entries + 2.0).
    The class score is ``sum(log(probability) * testx)`` plus the log of the
    class prior; the label with the highest score wins.

    Args:
        testx: numeric vector to classify, laid out like ``fulltext``.
        trainx: sequence of numeric training vectors, one per document.
        trainy: sequence of hashable labels parallel to ``trainx``.
        fulltext: list of vocabulary words; only used for the report of the
            five most probable words per class.

    Returns:
        The predicted label (an element of ``trainy``).

    Classes are handled in order of first appearance in ``trainy``, so the
    printed report and tie-breaking are deterministic, and any number of
    classes (including a single one) is supported.
    """
    print("---Getting Prob...")
    features = np.asarray(trainx, dtype=float)
    sample = np.asarray(testx, dtype=float)
    total_docs = len(trainy)

    # Group the row indices by label, remembering first-appearance order.
    labels = []
    rows_by_label = {}
    for row, label in enumerate(trainy):
        if label not in rows_by_label:
            rows_by_label[label] = []
            labels.append(label)
        rows_by_label[label].append(row)

    logproby = np.array([
        np.log(len(rows_by_label[label]) / float(total_docs))
        for label in labels
    ])
    probx = np.array([
        features[rows_by_label[label]].sum(axis=0)
        / (features[rows_by_label[label]].sum() + 2.0)
        for label in labels
    ])
    logprobx = np.log(probx)

    print("---Printing Prob...")
    # argsort() sorts ascending, hence the negation to get the top words.
    top_columns = (-probx).argsort()[:, :5]
    for label, columns in zip(labels, top_columns):
        print([fulltext[i] for i in columns])
        print(label)

    # step 4: showing the result...
    print("---Showing the result...")
    # set of words
    sumlogpxws = (logprobx * sample).sum(axis=1)
    sumlogpxyws = sumlogpxws + logproby
    print(sumlogpxws)
    print((probx * sample).sum(axis=1))
    bestyws = labels[int(sumlogpxyws.argmax())]
    print("---From set of words: ", bestyws)
    # A bag-of-words prediction was disabled in the original; it would score
    # a bag-of-words vector in exactly the same way.
    return bestyws


def main():
    """Run the spam/ham experiment: hold-out error rate plus one sample query.

    Raises:
        ValueError: if the corpus has no more than ``_HOLDOUT_SIZE``
            documents, because nothing would be left to train on.
    """
    # Imported locally: the module-level star import can shadow the name
    # ``random`` with numpy's sub-module of the same name.
    import random

    # step 1: loading data...
    trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam', 'ham')
    print(fulltext)

    # step 2: training... (the model is built inside bayes() on every call)
    print("step 2: training...")

    # step 3: testing...
    print("step 3: testing...")
    print("---Preparing testdata...")
    total = len(trainy)
    if total <= _HOLDOUT_SIZE:
        raise ValueError(
            "need more than %d documents, found %d" % (_HOLDOUT_SIZE, total)
        )
    testid = random.sample(range(total), _HOLDOUT_SIZE)
    held_out = set(testid)
    testxxx = [trainxws[i] for i in testid]
    testyyy = [trainy[i] for i in testid]
    testtrainxws = [trainxws[i] for i in range(total) if i not in held_out]
    testtrainy = [trainy[i] for i in range(total) if i not in held_out]

    print("---Testing now...")
    errorcount = 0
    p = len(testid)
    for vector, expected in zip(testxxx, testyyy):
        if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
            errorcount += 1
    print(errorcount)
    print(p)
    print("---Errorrate is: ", (errorcount / float(p)))

    # step 4: showing the result
    print("step 4: using...")
    testx = ['love', 'my', 'dalmation']
    print("the testx is: ", testx)
    print("---Changing testx into vector...")
    testxws, testxwb = testx2vec(testx, fulltext)
    bayes(testxws, testtrainxws, testtrainy, fulltext)


if __name__ == "__main__":
    main()

# Disabled toy-data demo kept from the original script.  bayes() multiplies
# its first argument element-wise with the per-word log-probabilities, so the
# query words are vectorised before being classified:
#
#     trainx, trainy = loadDataSet()
#     fulltext = createVocabList(trainx)
#     print(fulltext)
#     print(setOfWords2Vec(fulltext, trainx[0]))
#     trainxws = [setOfWords2Vec(fulltext, t) for t in trainx]
#     testEntry1 = ['love', 'my', 'dalmation']
#     testEntry2 = ['stupid', 'garbage']
#     bayes(setOfWords2Vec(fulltext, testEntry1), trainxws, trainy, fulltext)
bayes
声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉：投诉/举报。工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。
联系我们
首页 > 代码库 > bayes

bayes

看完仍有疑问？有类似问题直接问程序猿