# Run this cell! It sets some things up for you.

# pyplot is needed below to set the default figure size.
import matplotlib.pyplot as plt

# This code imports your work from hw1.py. The star import also supplies names
# that later cells use without importing them (e.g. tokenize_doc, NaiveBayes).
from hw1 import *

# This magic makes plots appear inline in this document rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (5, 4) # set default size of plots

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


# download the IMDB large movie review corpus to a file location on your computer

# Imported explicitly: this cell uses `os` directly and should not depend on
# `from hw1 import *` happening to re-export it.
import os

PATH_TO_DATA = 'large_movie_review_dataset'  # set this variable to point to the location of the IMDB corpus on your computer
POS_LABEL = 'pos'
NEG_LABEL = 'neg'
TRAIN_DIR = os.path.join(PATH_TO_DATA, "train")
TEST_DIR = os.path.join(PATH_TO_DATA, "test")

# Sanity check: each training label directory should hold exactly 12500 reviews.
for label in [POS_LABEL, NEG_LABEL]:
    label_dir = TRAIN_DIR + "/" + label
    try:
        n_reviews = len(os.listdir(label_dir))
    except OSError as err:
        # A wrong PATH_TO_DATA is the most likely mistake here; report it
        # instead of aborting the cell with a traceback.
        print("Oh no! Something is wrong. Could not list {} ({}). Check PATH_TO_DATA".format(label_dir, err))
        continue
    if n_reviews == 12500:
        print("Great! You have 12500 {} reviews in {}".format(label, label_dir))
    else:
        print("Oh no! Something is wrong. Found {} {} reviews in {} (expected 12500). Check PATH_TO_DATA".format(n_reviews, label, label_dir))


# Actually reading the data you are working with is an important part of NLP! Let's look at one of these reviews

# Use a context manager so the file handle is closed after reading, and decode
# as UTF-8 to match how the corpus files are opened elsewhere in this notebook.
with open(TRAIN_DIR + "/neg/3740_2.txt", encoding='utf8') as review_file:
    print(review_file.read())


# We have provided a tokenize_doc function in hw1.py. Here is a short demo of how it works

d1 = "This SAMPLE doc has   words tHat  repeat repeat"
bow = tokenize_doc(d1)

# Expected count for each token, keyed by the lower-cased form that the
# returned bag of words is indexed with.
expected_counts = {
    'this': 1,
    'sample': 1,
    'doc': 1,
    'has': 1,
    'words': 1,
    'that': 1,
    'repeat': 2,
}
for token, expected in expected_counts.items():
    assert bow[token] == expected

# Show what iterating over a bag of words yields for a second sentence.
bow2 = tokenize_doc("Computer science is both practical and abstract.")
for b in bow2:
    print(b)


import glob
import codecs
from collections import defaultdict, Counter
word_counts = Counter() # Counters are often useful for NLP in python. Similar to dicts (you can also use those)

# Visit every review file: both labels, in both the train and test splits.
for label in [POS_LABEL, NEG_LABEL]:
    for directory in [TRAIN_DIR, TEST_DIR]:
        for fn in glob.glob(directory + "/" + label + "/*txt"):
            # The context manager closes each handle as soon as the file has
            # been processed, so open files do not pile up across the corpus.
            with open(fn, 'r', encoding='utf8') as doc: # Open the file with UTF-8 encoding
                # IMPLEMENT ME
                pass


# Sanity check on the counts gathered above: "movie" should occur 61492 times.
if word_counts["movie"] != 61492:
    print ("hmm. Something seems off. Double check your code")
else:
    print ("yay! there are {} total instances of the word type movie in the corpus".format(word_counts["movie"]))


# Report the helper results for word_counts, types first and then tokens.
type_count = n_word_types(word_counts)
print ("there are {} word types in the corpus".format(type_count))
token_count = n_word_tokens(word_counts)
print ("there are {} word tokens in the corpus".format(token_count))


# IMPLEMENT ME!


# Build and train the classifier with the provided tokenizer.
nb = NaiveBayes(PATH_TO_DATA, tokenizer=tokenize_doc)
nb.train_model()

# The trained vocabulary is expected to contain exactly 251637 entries.
if len(nb.vocab) != 251637:
    print("Oh no! Something seems off. Double check your code before continuing. Maybe a mistake in update_model?")
else:
    print("Great! The vocabulary size is {}".format(251637))


# Print the ten entries returned by nb.top_n for each class, positive first.
for label in (POS_LABEL, NEG_LABEL):
    print("TOP 10 WORDS FOR CLASS " + label + ":")
    for tok, count in nb.top_n(label, 10):
        print('', tok, count)
    print()


# Each row is (printed heading, word, class label); rows are evaluated and
# printed one at a time, in this order.
for heading, word, sentiment in (
    ("P('amazing'|pos):", "amazing", POS_LABEL),
    ("P('amazing'|neg):", "amazing", NEG_LABEL),
    ("P('dull'|pos):", "dull", POS_LABEL),
    ("P('dull'|neg):", "dull", NEG_LABEL),
    ("P('car-thievery'|pos):", "car-thievery", POS_LABEL),
    ("P('car-thievery'|neg):", "car-thievery", NEG_LABEL),
):
    print(heading, nb.p_word_given_label(word, sentiment))


# Same kind of query, but through the variant that also takes an alpha argument.
print(
    "P('stop-sign.'|pos):",
    nb.p_word_given_label_and_alpha("stop-sign.", POS_LABEL, 0.2),
)


# Implement the nb.likelihood_ratio function and use it to investigate the likelihood ratio of "amazing" and "dull"
# Each row is (printed heading, word); every ratio is requested with 0.2 as
# the second argument.
for ratio_heading, ratio_word in (
    ("LIKELIHOOD RATIO OF 'amazing':", 'amazing'),
    ("LIKELIHOOD RATIO OF 'dull':", 'dull'),
    ("LIKELIHOOD RATIO OF 'and':", 'and'),
    ("LIKELIHOOD RATIO OF 'to':", 'to'),
):
    print (ratio_heading, nb.likelihood_ratio(ratio_word, 0.2))


# Implement me!


# Print the classifier accuracy for two argument values, small first.
# NOTE(review): the argument is presumably the smoothing pseudocount alpha - confirm in hw1.py.
for alpha_value in (0.2, 1000.0):
    print(nb.evaluate_classifier_accuracy(alpha_value))


# in this cell, print out a review that your classifier got wrong. Print out the text of the review along with the label


# from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
from nltk.stem.snowball import SnowballStemmer 
from nltk.util import ngrams
# Snowball stemmer configured for 'english', created once at module level.
# Nothing in the code visible here uses it yet.
stemmer = SnowballStemmer('english')
# stopset = set(stopwords.words('english'))
def tokenize_doc_and_more(doc):
    """Build a custom representation of one document.

    Tokenization is the minimum that is required; any further processing is
    left to the implementer. As it stands this is a placeholder: ``doc`` is
    ignored and a fresh, empty ``defaultdict(float)`` is returned.
    """
    # Implement me! Your code goes here: fill the mapping from ``doc``
    # before returning it.
    return defaultdict(float)


# Build and train a new classifier with the custom tokenizer. This rebinds
# `nb`, replacing the model trained earlier with tokenize_doc.
nb = NaiveBayes(PATH_TO_DATA, tokenizer=tokenize_doc_and_more)
nb.train_model()
# Bare expression: the value is shown only as the notebook cell's result,
# unlike the earlier accuracy checks, which call print().
nb.evaluate_classifier_accuracy(1.0)


# Your experiments and explanations go here

# Problem set 1, Applications of Natural Language Processing, Fall 2023

# This is due on September 29th at 11:59pm. Please see detailed submission instructions below. 110 points total.

# How to do this problem set:

# How to submit this problem set:

# Academic honesty:

# Part One: Intro to NLP in Python: types, tokens and Unix commands

# Types and tokens

# Unix Text Processing

# Part Two: Naive Bayes

# Exploratory analysis