# This code should help get you started, but it is not guaranteed to
# be bug free!  If you find problems, please report to
# compling-class@cs.umass.edu

import sys
from random import choice

try:
    from dicts import DefaultDict
except ImportError:
    # Minimal stand-in so the script still runs when the helper module
    # ``dicts`` is not importable.
    # NOTE(review): assumes dicts.DefaultDict(default) creates missing
    # keys from a copy of ``default`` on lookup -- confirm against dicts.py.
    import copy

    class DefaultDict(dict):
        """Dictionary that fills in missing keys with a copy of a default."""

        def __init__(self, default):
            dict.__init__(self)
            self.default = default

        def __getitem__(self, key):
            if key in self:
                return dict.__getitem__(self, key)
            # Deep-copy so nested defaults are not shared between keys.
            return self.setdefault(key, copy.deepcopy(self.default))


# Emission key that stands in for every "unknown word".
# NOTE(review): the empty string looks like a placeholder whose original
# text may have been lost; the value is kept as-is -- confirm what the
# test-time code is expected to look up.
UNKNOWN_WORD = ''


def Dict(**args):
    """Return a dictionary with argument names as the keys,
    and argument values as the key values"""
    return args


def hmm(file):
    """Count HMM transitions and emissions from pre-tagged sentences.

    FILE is an open file (e.g. from the open(filename) function) or any
    other iterable of lines.  Each line holds one sentence of
    space-separated ``word/TAG`` tokens, as in the WSJ training data.

    Returns a tuple ``(transitions, emissions)``:
      transitions[prevtag][tag] -- number of times TAG followed PREVTAG,
                                   where the state before the first word
                                   of every sentence is 'START';
      emissions[tag][word]      -- number of times TAG was seen on WORD,
                                   with words seen only once in the whole
                                   input pooled under UNKNOWN_WORD.

    Both tables hold raw counts, not probabilities.

    Raises ValueError if a token contains no '/' separator.
    """
    transitions = DefaultDict(DefaultDict(0))
    emissions = DefaultDict(DefaultDict(0))
    wordcounts = DefaultDict(0)

    # For each sentence (one per line)
    for line in file:
        prevtag = 'START'   # Before each sentence, begin in START state
        # for each word in the sentence (space separated)
        for taggedword in line.split():
            # Split on the LAST slash only: the word itself may contain
            # slashes (e.g. "1\/2/CD"), the tag is whatever follows the
            # final one.
            word, tag = taggedword.rsplit('/', 1)
            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
            # Advance the state; without this every transition would be
            # counted as leaving START.
            prevtag = tag

    # At test time we will need estimates for "unknown words"---the
    # words that never occurred in the training data.  One recommended
    # way to do this is to turn all training words occurring just once
    # into UNKNOWN_WORD and use this as the stand-in for all "unknown
    # words" at test time.  Below we make all the necessary
    # transformations to UNKNOWN_WORD.
    for tag, tag_emissions in emissions.items():
        # Iterate over a snapshot of the keys: the loop body deletes from
        # and inserts into the same dictionary.
        for word in list(tag_emissions):
            if wordcounts[word] == 1:
                del tag_emissions[word]
                tag_emissions[UNKNOWN_WORD] += 1

    # Here you need to add code that will turn these dictionaries
    # of counts into dictionaries of smoothed conditional probabilities
    return (transitions, emissions)


def viterbi_tags(untagged_sentence):
    """Given a string containing the space-separated words of a sentence
    (there should even be spaces on either side of punctuation, as in the
    WSJ training data), return an array containing the most likely
    sequence of part-of-speech tags.

    Not implemented yet: the function currently splits the sentence and
    then falls off the end, so it returns None.
    """
    wordarray = untagged_sentence.split()
    # Implement Viterbi here
    # return the most likely sequence of part-of-speech tags


def true_tags(tagged_sentence):
    """Given a string containing the space-separated word/POS tokens of a
    sentence (there should even be spaces on either side of punctuation,
    as in the WSJ training data), pull out and return the tag sequence
    as a list.

    The tag of a token is the text after its last '/', so words that
    themselves contain slashes are handled.  Raises IndexError if a token
    contains no '/'.
    """
    return [token.rsplit('/', 1)[1] for token in tagged_sentence.split()]


if __name__ == '__main__':
    # The statements below deliberately stay at module level, so that
    # h, dirs and testfile remain globals when run as a script.
    print("Usage: %s wsjtrainfile wsjtestfile" % sys.argv[0])
    # NOTE(review): dirs and testfile are parsed but not used yet; the
    # training data is read from standard input.
    dirs = sys.argv[1:-1]
    testfile = sys.argv[-1]
    h = hmm(sys.stdin)
    print(h[0])
    print('------')
    print(h[1])
    print(true_tags('The/DT August/NNP deficit/NN and/CC the/DT #/# 2.2/CD billion/CD gap/NN'))