#!/sw/bin/python

import math
import sys
import glob
import pickle
import optimize
import numpy
from dicts import DefaultDict
from random import shuffle

# In the documentation and variable names below, "class" is the same
# as "category".

def train_maxent(train_instances, classes, gaussian_prior_variance):
    """Train and return a MaxEnt classifier.
    The data structure returned is a dictionary whose keys are
    ('classname', 'word') tuples.  The values in the dictionary are the
    parameters (lambda weights) of the classifier.
    Note that this method does not return the list of classnames, but the
    caller has those available already, since it is exactly the 'classes'
    argument.  If you need to recover the classnames from the dictionary
    itself, you'd need to do something like:
      maxent = train_maxent(instances, classes, variance)
      classes = list(set([c for (c, v) in maxent.keys()]))
    Some typical usage:
      classes = ['spam', 'ham']
      maxent = train_maxent(make_instances(classes), classes, 1.0)
      # Interested in seeing the weight of "nigerian" in the "spam" class?
      lambda_spam_nigerian = maxent[('spam', 'nigerian')]
      # To classify some documents in directories corresponding to classes:
      for label, doc, name in make_instances(dirs):
          scores = classify_doc(maxent, classes, doc, name)
    """
    # Build the feature set: a DEFAULT feature for each class, plus a
    # (class, word) feature for every word seen in the training data.
    maxent = DefaultDict(0)
    for cls in classes:
        maxent[(cls, 'DEFAULT')] = 0
        for label, doc, name in train_instances:
            for word, v in doc.iteritems():
                maxent[(cls, word)] = 0
    # Remember the maxent features, and get the starting point for optimization
    features = maxent.keys()
    lambda0 = [0] * len(features)
    # Here call an optimizer to find the best lambdas
    lambdaopt = optimize.fminNCG(value, lambda0, gradient,
                                 args=(features, classes, train_instances,
                                       gaussian_prior_variance),
                                 printmessg=1, maxiter=5, avextol=1e-4)
    # Put the final optimal parameters in the returned dictionary
    assert maxent.keys() == features  # Make sure the keys have not changed order
    maxent2 = DefaultDict(0)
    for k, v in zip(features, lambdaopt):
        maxent2[k] = v
    return maxent2

def make_instances(dirs):
    """Read every file under each directory in 'dirs' and return a list of
    (class, doc, filename) tuples, where doc maps each lowercased word
    appearing in the file to 1."""
    classes = dirs
    instances = []
    for cls in classes:
        for file in glob.glob(cls + "/*"):
            doc = DefaultDict(0)
            name = file
            for word in open(file).read().split():
                word = word.lower()
                doc[word] = 1
            instances.append((cls, doc, name))
    return instances

def gradient(lambdas, features, classes, instances, gaussian_prior_variance):
    feature_count = len(lambdas)
    grad = numpy.zeros(feature_count)  # optimize expects this to be a numpy array
    # TO DO: implement the gradient of the likelihood function (including prior);
    # remember to return the negative gradient because fminNCG minimizes.
    # Below is one possible (unvectorized) sketch -- an assumption about the
    # intended solution, not a reference implementation.  For each feature
    # (c, w), the gradient of the penalized log-likelihood is
    #   sum_i [f_(c,w)(doc_i, label_i) - P(c|doc_i) * f_(c,w)(doc_i)]
    #     - lambda_(c,w) / gaussian_prior_variance
    maxent = dict(zip(features, lambdas))
    index = dict((k, i) for (i, k) in enumerate(features))
    for label, doc, name in instances:
        for prob, cls in classify_doc(maxent, classes, doc, name):
            # Empirical count minus expected count, for every feature
            # that fires on this document under class cls
            delta = (1.0 if cls == label else 0.0) - prob
            grad[index[(cls, 'DEFAULT')]] += delta  # DEFAULT fires on every doc
            for word, v in doc.iteritems():
                grad[index[(cls, word)]] += delta * v
    # Gaussian prior: the derivative of -lambda^2/(2*variance) is -lambda/variance
    grad -= numpy.asarray(lambdas) / gaussian_prior_variance
    return -grad

def value(lambdas, features, classes, instances, gaussian_prior_variance):
    """Return the negated (prior-penalized) log-likelihood of the true
    labels of the instances, using the parameters given in lambdas, where
    those lambdas correspond to the (class, word) keys given in 'features'.
    The negation is because fminNCG minimizes."""
    # Build a MaxEnt classifier dictionary from the keys and lambdas
    maxent = dict(zip(features, lambdas))
    # Use this MaxEnt classifier to classify all the instances;
    # accumulate the log-likelihood of the correct class
    total_log_prob = 0
    for label, doc, name in instances:
        class_probs = classify_doc(maxent, classes, doc, name)
        classes_to_probs = dict([(cls, prob) for (prob, cls) in class_probs])
        true_class_prob = classes_to_probs[label]
        total_log_prob += math.log(true_class_prob)
    prior_log_prob = 0.0
    # TO DO: Incorporate a Gaussian prior on parameters here!
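    # One way to do it (a hedged sketch, assuming a zero-mean Gaussian
    # prior; not necessarily the intended reference solution): each
    # parameter contributes -lambda^2 / (2 * variance) to the log-likelihood.
    prior_log_prob = -sum(l * l for l in lambdas) / (2.0 * gaussian_prior_variance)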
print "value:" print (total_log_prob + prior_log_prob) # Return the NEGATIVE total_log_prob because fminNCG minimizes, # and we want to MAXIMIZE log probability return -(total_log_prob + prior_log_prob) def classify_doc(maxent, classes, doc, name): """Given a trained MaxEnt classifier returned by train_maxent(), and a test document tuple, doc, return an array of tuples, each containing a class label and the probability of the class according to the classifier.""" scores = [] # print 'Classifying', name for c in classes: # Put in the weight for the default feature score = maxent[(c, 'DEFAULT')] # Put in the weight for all the words in the document for word, v in doc.iteritems(): weight = maxent[(c, word)] score += weight * v scores.append(score) # exp() and normalize the scores to turn them into probabilities maximum = max(scores) scores = [math.exp(x - maximum) for x in scores] normalizer = sum(scores) scores = [x / normalizer for x in scores] # make the scores list actually contain tuples like (0.84, "spam") scores = zip(scores, classes) return scores def test_classifier(maxent, classes, instances, set_name): if len(instances) == 0: return correct = 0.0 for label, doc, name in instances: # print "=========" # print "true label: " + label res = classify_doc(maxent, dirs, doc, name) score, cls = max(res, key = lambda x: x[0]) # print "pred label: " + cls if cls == label: correct += 1.0 print set_name + " accuracy: " print (correct / len(instances)) if __name__ == '__main__': print 'argv', sys.argv print "Usage:", sys.argv[0], "classdir1 classdir2 [classdir3...] train_portion" dirs = sys.argv[1:-1] train_portion = sys.argv[-1] # dirs = ["spamham/easy_ham_2", "spamham/spam_2"] classes = dirs # train_portion = 0.7 gaussian_prior_variance = 10.0 instances = make_instances(dirs) shuffle(instances) num_train = int(train_portion * len(instances)) train_instances = instances[:num_train] test_instances = instances[num_train:] maxent = train_maxent (train_instances, classes, gaussian_prior_variance) test_classifier(maxent, classes, train_instances, "Train set") test_classifier(maxent, classes, test_instances, "Test set") pickle.dump(maxent, open("maxent.pickle", 'w')) # E.g. type at command line # python maxent.py spam ham 0.7 # where 0.7 is the portion of the data to use to train the model, # and the model's accuracy will be evaluated on the other unseen portion. # You will need the Numpy library to be installed. # Otherwise you can implement your own conjugate gradient method, # which isn't very hard either. For example, see "Numeric Recipes in C".