首页 > 代码库 > bayes

bayes

# Naive Bayes text classifier over "set of words" / "bag of words" vectors.
#
# Feature-vector convention used by every function in this file: each entry
# is ``count + 1.0`` (bag of words) or ``presence + 1.0`` (set of words), so a
# word that does not occur is stored as 1.0, never 0.0.
#
# The code is written to run unchanged on Python 2 and Python 3 (single
# argument ``print(...)`` calls, ``io.open``, %-formatting).
import io
import os
import random
import re
import time
from collections import Counter

import numpy as np

# Recorded when the module is loaded; nothing in this file reads it back.
starttime = time.time()

# Split on runs of non-word characters.  The pattern must be ``\W+`` and not
# ``\W*``: a pattern that can match the empty string makes ``re.split`` on
# Python 3.7+ cut between every single character.
_TOKEN_SPLIT = re.compile(r"\W+")


def loadDataSet():
    """Return a small hard-coded corpus of tokenised posts with class labels.

    Returns:
        A ``(postingList, classVec)`` pair: ``postingList`` is a list of six
        token lists and ``classVec`` the parallel list of integer labels
        (0 or 1).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return the distinct words of all documents in *dataSet* as a list.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list with every word exactly once, sorted so that the result (and
        therefore every vector built from it) is reproducible between runs.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)
    return sorted(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Encode one document as a set-of-words vector over *vocabList*.

    Args:
        vocabList: list of vocabulary words; fixes the vector layout.
        inputSet: the words of the document to encode.

    Returns:
        A list parallel to *vocabList* holding 2.0 where the word occurs in
        the document and 1.0 where it does not.

    A warning line is printed for every document word that the vocabulary
    does not contain (such words cannot be represented in the vector).
    """
    words = list(inputSet)
    present = set(words)
    known = set(vocabList)
    for word in words:
        if word not in known:
            print("the word: %s is not in my Vocabulary!" % (word,))
    return [2.0 if word in present else 1.0 for word in vocabList]


def _read_tokens(path):
    """Return the lower-cased tokens longer than two characters in *path*."""
    # Bytes that are not valid UTF-8 are dropped instead of aborting the load.
    with io.open(path, encoding="utf-8", errors="ignore") as handle:
        text = handle.read().lower()
    return [token for token in _TOKEN_SPLIT.split(text) if len(token) > 2]


def _vectorise(tokens, vocabulary):
    """Return (set-of-words, bag-of-words) vectors of *tokens*, offset by 1."""
    counts = Counter(tokens)
    set_vec = [2.0 if counts[word] else 1.0 for word in vocabulary]
    bag_vec = [counts[word] + 1.0 for word in vocabulary]
    return set_vec, bag_vec


def txt2trainxy(filename1, filename2, basedir="email"):
    """Load two folders of text files as a labelled training set.

    Every file below ``basedir/filename1`` and ``basedir/filename2`` is read
    completely, lower-cased and split into tokens; tokens of one or two
    characters are discarded.  The folder name is used as the class label.

    Args:
        filename1: name of the first class folder.
        filename2: name of the second class folder.
        basedir: directory containing both folders (defaults to ``"email"``,
            the location the original code hard-coded).

    Returns:
        ``(trainxws, trainxwb, trainy, trainx, fulltext)`` where

        * ``trainxws`` - set-of-words vector per document (1.0 / 2.0),
        * ``trainxwb`` - bag-of-words vector per document (count + 1.0),
        * ``trainy``   - class label (folder name) per document,
        * ``trainx``   - token list per document,
        * ``fulltext`` - sorted vocabulary that fixes the vector layout.
    """
    print("step 1: loading data...")
    trainx = []
    trainy = []
    for label in (filename1, filename2):
        folder = os.path.join(basedir, label)
        # listdir order is OS dependent; sort for a reproducible layout.
        for name in sorted(os.listdir(folder)):
            trainx.append(_read_tokens(os.path.join(folder, name)))
            trainy.append(label)

    fulltext = sorted(set(token for doc in trainx for token in doc))

    trainxws = []
    trainxwb = []
    for doc in trainx:
        set_vec, bag_vec = _vectorise(doc, fulltext)
        trainxws.append(set_vec)
        trainxwb.append(bag_vec)
    return trainxws, trainxwb, trainy, trainx, fulltext


def testx2vec(testx, fulltext):
    """Encode the words of one test document over the vocabulary *fulltext*.

    Args:
        testx: the words of the document.
        fulltext: vocabulary list that fixes the vector layout.

    Returns:
        ``(testxws, testxwb)``: the set-of-words vector (1.0 / 2.0) and the
        bag-of-words vector (count + 1.0), both parallel to *fulltext*.

    A warning line is printed for every word of *testx* that is missing from
    *fulltext*.
    """
    words = list(testx)
    known = set(fulltext)
    testxws, testxwb = _vectorise(words, fulltext)
    for word in words:
        if word not in known:
            print("the word: %s is not in my fulltext!" % (word,))
    return testxws, testxwb


# The name matches pytest's default ``test*`` pattern; mark it so that pytest
# does not try to collect it when it is star-imported into a test module.
testx2vec.__test__ = False


def bayes(testx, trainx, trainy, fulltext, verbose=True):
    """Classify one feature vector with a naive Bayes model fitted on the fly.

    Args:
        testx: feature vector of the document to classify (same layout and
            ``+ 1.0`` convention as the rows of *trainx*).
        trainx: training feature vectors, one row per document.
        trainy: class label per training row.
        fulltext: vocabulary list; only used to print the most probable
            words of each class.
        verbose: when true (the default) print the progress and diagnostic
            lines; pass ``False`` to classify silently.

    Returns:
        The label from *trainy* whose class has the highest score
        ``log P(class) + sum(log P(word | class) * testx)``.  Ties go to the
        class that appears first in *trainy*.

    Raises:
        ValueError: if the training set is empty, is not a 2-D table, or
            *trainx* and *trainy* differ in length.
    """
    labels = list(trainy)
    if not labels:
        raise ValueError("bayes() needs at least one training document")
    if len(trainx) != len(labels):
        raise ValueError("trainx and trainy must have the same length")
    features = np.asarray(trainx, dtype=float)
    if features.ndim != 2:
        raise ValueError("trainx must be a list of equally long vectors")

    if verbose:
        print("---Getting Prob...")

    # Distinct classes in order of first appearance: deterministic, unlike
    # iterating a set, and it does not require the labels to be sortable.
    classes = []
    seen = set()
    for label in labels:
        if label not in seen:
            seen.add(label)
            classes.append(label)

    n_docs = float(len(labels))
    logproby = np.empty(len(classes))
    probx = np.empty((len(classes), features.shape[1]))
    for k, cls in enumerate(classes):
        rows = [i for i, label in enumerate(labels) if label == cls]
        class_rows = features[rows]
        logproby[k] = np.log(len(rows) / n_docs)
        # Per-word totals over the class, normalised by the grand total of
        # the class plus 2.0 (the constant used by the original code).
        probx[k] = class_rows.sum(axis=0) / (class_rows.sum() + 2.0)
    logprobx = np.log(probx)

    if verbose:
        print("---Printing Prob...")
        # Five most probable words of every class, followed by its label.
        for k, cls in enumerate(classes):
            top = np.argsort(-probx[k])[:5]
            print([fulltext[i] for i in top])
            print(cls)
        print("---Showing the result...")

    # NOTE(review): testx carries the same "+ 1.0" offset as the training
    # vectors, so every vocabulary word contributes at least once to the
    # score, including words absent from the document.  Kept as in the
    # original; confirm whether raw counts were intended here.
    test_vec = np.asarray(testx, dtype=float)
    sumlogpx = np.sum(logprobx * test_vec, axis=1)
    scores = sumlogpx + logproby

    if verbose:
        print(sumlogpx)
        print(np.sum(probx * test_vec, axis=1))

    best = classes[int(np.argmax(scores))]
    if verbose:
        print("---From set of words:  %s" % (best,))
    return best


def main():
    """Load ``email/spam`` and ``email/ham``, run a hold-out test, classify a sample.

    Up to 20 randomly chosen documents are held out, each is classified with
    a model fitted on the remaining documents, and the error rate is printed.
    Finally the words ``['love', 'my', 'dalmation']`` are classified.
    """
    # step 1: loading data...
    trainxws, _trainxwb, trainy, _docs, fulltext = txt2trainxy("spam", "ham")
    print(fulltext)

    # step 2: training... (nothing to do: bayes() fits on every call)
    print("step 2: training...")

    # step 3: testing...
    print("step 3: testing...")
    print("---Preparing testdata...")
    total = len(trainy)
    if total < 2:
        raise ValueError("need at least two documents for a hold-out test")
    # Hold out 20 documents as the original did, but never the whole corpus.
    testid = random.sample(range(total), min(20, total - 1))
    held_out = set(testid)
    testxxx = [trainxws[i] for i in testid]
    testyyy = [trainy[i] for i in testid]
    testtrainxws = [trainxws[i] for i in range(total) if i not in held_out]
    testtrainy = [trainy[i] for i in range(total) if i not in held_out]

    print("---Testing now...")
    errorcount = 0
    p = len(testid)
    for vector, expected in zip(testxxx, testyyy):
        if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
            errorcount += 1
    print(errorcount)
    print(p)
    print("---Errorrate is:  %s" % (errorcount / float(p),))

    # step 4: showing the result
    print("step 4: using...")
    testx = ['love', 'my', 'dalmation']
    print("the testx is:  %s" % (testx,))
    print("---Changing testx into vector...")
    testxws, _testxwb = testx2vec(testx, fulltext)
    bayes(testxws, testtrainxws, testtrainy, fulltext)


if __name__ == "__main__":
    main()

# Toy-corpus usage, kept from the original file as a non-executed example:
#
#     trainx, trainy = loadDataSet()
#     fulltext = createVocabList(trainx)
#     print(fulltext)
#     print(setOfWords2Vec(fulltext, trainx[0]))
#     trainxws = [setOfWords2Vec(fulltext, t) for t in trainx]
#     testEntry1 = ['love', 'my', 'dalmation']
#     testEntry2 = ['stupid', 'garbage']
#     bayes(setOfWords2Vec(fulltext, testEntry1), trainxws, trainy, fulltext)

 

bayes