#!/sw/bin/python

import math
import sys
import glob
import pickle
import optimize
from dicts import DefaultDict

# In the documentation and variable names below "class" is the same
# as "category".

def train_maxent (dirs):
    """Train and return a MaxEnt classifier.

    The data structure returned is a dictionary whose keys are
    ('classname','word') tuples.  The values in the dictionary are the
    parameters (lambda weights) of the classifier.  Note that this
    method does not return the list of classnames; the caller already
    has those, since they are exactly the 'dirs' argument.  If you need
    to recover the classnames from the dictionary itself, you'd need to
    do something like:

        maxent = train_maxent(dirs)
        classes = list(set([c for (c,w) in maxent.keys()]))

    Some typical usage:

        dirs = ['spam','ham']  # where these are sub-directories of the CWD
        maxent = train_maxent(dirs)
        # Interested in seeing the weight of "nigerian" in the "spam" class?
        lambda_spam_nigerian = maxent[('spam','nigerian')]
        # To classify a document:
        scores = classify(maxent, dirs, "spam/file123")
    """
    classes = dirs
    maxent = DefaultDict(0)
    # Gather the "constraints" (the empirical feature counts, needed by
    # the gradient) and initialize the all-zero maxent dictionary.
    constraints = DefaultDict(0)
    for cls in classes:
        maxent[(cls,'DEFAULT')] = 0
        print cls
        for file in glob.glob(cls+"/*"):
            for word in open(file).read().split():
                word = word.lower()
                constraints[(cls,word)] += 1
                for clss in classes:
                    maxent[(clss,word)] = 0
    # Remember the maxent features, and get the starting point for
    # the optimization.
    features = maxent.keys()
    lambda0 = maxent.values()
    # Call an optimizer to find the best lambdas.
    lambdaopt = optimize.fminNCG(value, lambda0, gradient,
                                 args=(features,dirs), printmessg=1)
    # Put the final optimal parameters in the returned dictionary.
    assert maxent.keys() == features  # Make sure the keys have not changed order
    maxent2 = dict([(k,v) for (k,v) in zip(features,lambdaopt)])
    return maxent2

def gradient (lambdas, keys, dirs):
    # TO DO: Implement this!
    # (One illustrative approach is sketched in gradient_sketch() at the
    # bottom of this file.)
    return None

def value (lambdas, keys, dirs):
    """Return the negative log-likelihood of the true labels of the
    documents in the directories in the list 'dirs', using the
    parameters given in 'lambdas', where those lambdas correspond to
    the ('classname','word') keys given in 'keys'."""
    # Build a MaxEnt classifier dictionary from the keys and lambdas.
    maxent = dict([(k,v) for (k,v) in zip(keys,lambdas)])
    # Use this MaxEnt classifier to classify all the documents in dirs,
    # accumulating the log-likelihood of the correct class.
    classes = dirs
    total_log_prob = 0
    for c in classes:
        for file in glob.glob(c+"/*"):
            probs = classify(maxent, classes, file)
            # Pull out of 'probs' the probability of the true class c.
            # Remember, probs looks like [(0.85,'spam'), (0.15,'ham')]
            true_class = [x[0] for x in probs if x[1] == c]
            true_class_prob = true_class[0]
            total_log_prob += math.log(true_class_prob)
    # Return the NEGATIVE total_log_prob because fminNCG minimizes,
    # and we want to MAXIMIZE log probability.
    # TO DO: Incorporate a Gaussian prior on parameters here!
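    # A sketch of that prior (illustrative, not part of the skeleton):
    # with a zero-mean Gaussian prior of variance SIGMA_SQUARED on every
    # lambda (SIGMA_SQUARED is a hypothetical constant introduced here,
    # to be tuned on held-out data), you would subtract the penalty
    #     total_log_prob -= sum([l*l for l in lambdas]) / (2.0*SIGMA_SQUARED)
    # before returning.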
    return -total_log_prob

def classify (maxent, classes, filename):
    """Given a trained MaxEnt classifier dictionary returned by
    train_maxent(), the list of classes, and the filename of a test
    document d, return a list of (probability, classname) tuples,
    sorted by p(c|d) with the most probable class first."""
    scores = []
    print 'Classifying', filename
    for c in classes:
        # Put in the weight for the default feature.
        score = maxent[(c,'DEFAULT')]
        # Put in the weight for all the words in the document.
        # Words never seen in training get weight 0.
        for word in open(filename).read().split():
            word = word.lower()
            score += maxent.get((c,word), 0)
        scores.append(score)
    # exp() and normalize the scores to turn them into probabilities.
    # Subtract the maximum first so that exp() cannot overflow; the
    # shift cancels out in the normalization.
    maximum = max(scores)
    scores = [math.exp(x-maximum) for x in scores]
    normalizer = sum(scores)
    scores = [x/normalizer for x in scores]
    # Make the scores list actually contain tuples like (0.84,"spam"),
    # with the most probable class first.
    scores = zip(scores,classes)
    scores.sort()
    scores.reverse()
    return scores

if __name__ == '__main__':
    print 'argv', sys.argv
    print "Usage:", sys.argv[0], "classdir1 classdir2 [classdir3...] testfile"
    dirs = sys.argv[1:-1]
    testfile = sys.argv[-1]
    maxent = train_maxent(dirs)
    print classify(maxent, dirs, testfile)
    pickle.dump(maxent, open("maxent.pickle",'wb'))

# E.g. type at the command line:
#   python maxent.py spam ham spam/file123

# You will need the Numeric and MLab libraries to be installed.
# Otherwise you can implement your own conjugate gradient method,
# which isn't very hard either.  For example, see "Numerical Recipes in C".
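
# ----------------------------------------------------------------------
# A possible sketch of gradient(), for reference only (an illustrative,
# untested implementation, not the required solution; gradient_sketch is
# a name introduced here).  The partial derivative of the negative
# log-likelihood with respect to the lambda for feature (c,word) is
#     (expected count of the feature under the model)
#   - (empirical count of the feature in the training data),
# where the expected count sums p(c|d) times the count of 'word' in d
# over every training document d.  If you add a Gaussian prior to
# value(), remember to add its derivative, lambda/sigma^2, here too.

def gradient_sketch (lambdas, keys, dirs):
    maxent = dict([(k,v) for (k,v) in zip(keys,lambdas)])
    classes = dirs
    grad = dict([(k,0.0) for k in keys])
    for true_class in classes:
        for file in glob.glob(true_class+"/*"):
            words = [w.lower() for w in open(file).read().split()]
            probs = classify(maxent, classes, file)
            for (p, c) in probs:
                # Expected counts under the current model.
                grad[(c,'DEFAULT')] += p
                for word in words:
                    grad[(c,word)] += p
            # Empirical counts for the true class.
            grad[(true_class,'DEFAULT')] -= 1
            for word in words:
                grad[(true_class,word)] -= 1
    # Return the partial derivatives in the same order as 'keys'.
    return [grad[k] for k in keys]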