from __future__ import division

from collections import defaultdict
import os
import random

# Global class labels.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'

# Path to dataset - FILL IN THE FOLLOWING LINE
PATH_TO_DATA = "/PATH/TO/YOU/DATASET"
TRAIN_DIR = os.path.join(PATH_TO_DATA, "train")
TEST_DIR = os.path.join(PATH_TO_DATA, "test")


###################################
# Utilities

def dict_subtract(vec1, vec2):
    """Return vec1 - vec2, treating both dicts as sparse vectors.

    A key missing from vec1 is treated as 0.0, so a key that appears only in
    vec2 ends up in the result with the negated vec2 value (as a float).
    Neither input is modified; a new plain dict is returned.
    """
    difference = dict(vec1)
    for key, value in vec2.items():
        # 0.0 mirrors the float default the sparse representation assumes.
        difference[key] = difference.get(key, 0.0) - value
    return difference


def dict_argmax(dct):
    """Return the key whose value is largest.

    In other words: argmax_k dct[k].  Raises ValueError if dct is empty
    (this is what max() does on an empty iterable).
    """
    # Iterate the dict directly rather than via iterkeys(): iterkeys() only
    # exists on Python 2, while plain iteration works on Python 2 and 3.
    return max(dct, key=dct.get)


def dict_dotprod(d1, d2):
    """Return the dot product (aka inner product) of two sparse vectors.

    Each vector is represented as a dictionary of {index: weight} pairs,
    where indexes are any keys, potentially strings.  If a key does not
    exist in a dictionary, its value is assumed to be zero.
    """
    # NOTE(review): the source this was restored from was cut off right after
    # `smaller = d1 if len(d1)`.  Everything from the comparison onward is
    # reconstructed from the docstring contract - confirm against the
    # original file if it is available.
    # Only keys present in both dicts contribute, so walking the smaller
    # dict is sufficient and cheaper.
    smaller = d1 if len(d1) < len(d2) else d2
    return sum(d1.get(key, 0) * d2.get(key, 0) for key in smaller)