"""
this was the code used to create the ngram count tables.
it is NOT needed to do the assignment.
"""

from __future__ import division
from collections import defaultdict
from collections import Counter
import os
import re
import nltk
import json

def load_files(PATH_TO_DATA):
    """Read every regular file directly inside PATH_TO_DATA into memory.

    Args:
        PATH_TO_DATA: path of the directory that holds the documents.

    Returns:
        A dict mapping each file's name (as listed by os.listdir, without
        the directory part) to the full text content of that file.

    Raises:
        OSError / IOError: if the directory cannot be listed or one of the
            files cannot be opened or read.
    """
    fname2content = {}
    for fname in os.listdir(PATH_TO_DATA):
        full_path = os.path.join(PATH_TO_DATA, fname)
        # os.listdir also returns sub-directories; open() would raise on
        # them and abort the whole load, so only regular files are read.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as doc:
            fname2content[fname] = doc.read()
    return fname2content

# Tokens that collect_all_sentences drops from every tokenized sentence
# (compared against the token before it is lower-cased).
filter_tokens_set = {"br", "/", ">", "<"}

def remove_non_ASCII(content):
    """Return content with every character outside code points 32..126 removed.

    Only the printable ASCII range (space through '~') is kept. Besides
    non-ASCII characters this also drops ASCII control characters such as
    tab and newline; nothing is inserted in their place.
    """
    kept_chars = []
    for ch in content:
        if 32 <= ord(ch) <= 126:
            kept_chars.append(ch)
    return ''.join(kept_chars)


def collect_all_sentences(fname2content,sentence_splitter):
    """Turn every document into lower-cased, tokenized sentences.

    Args:
        fname2content: dict mapping a file name to that file's text; only
            the text values are used.
        sentence_splitter: object whose sentences_from_text(text) method
            returns the sentences of a text.

    Returns:
        One flat list with a token list per sentence, across all documents.
        Each text is first passed through remove_non_ASCII, split into
        sentences, and tokenized with nltk.word_tokenize. Tokens found in
        filter_tokens_set are dropped; the remaining ones are lower-cased.
    """
    gathered = []
    for raw_text in fname2content.values():
        printable_text = remove_non_ASCII(raw_text)
        for sentence in sentence_splitter.sentences_from_text(printable_text):
            kept = []
            for word in nltk.word_tokenize(sentence):
                # Membership is tested on the original-case token, so only
                # exact matches of the filter entries are removed.
                if word in filter_tokens_set:
                    continue
                kept.append(word.lower())
            gathered.append(kept)
    return gathered


def make_ngrams(tokens, ngram_size):
    """Build all ngrams of length ngram_size over a padded token list.

    The token list is padded with (ngram_size - 1) '**START**' markers in
    front and the same number of '**END**' markers behind, then every
    contiguous window of ngram_size items is returned as a tuple, in order.

    Args:
        tokens: list of tokens of one sentence.
        ngram_size: length of each ngram.

    Returns:
        A list of tuples, one per window.
    """
    pad_width = ngram_size - 1
    padded = ['**START**'] * pad_width + tokens + ['**END**'] * pad_width
    window_count = len(padded) - ngram_size + 1
    return [tuple(padded[start:start + ngram_size])
            for start in range(window_count)]

class NgramModelCounts:
    """Plain container for the data of one ngram count model."""

    def __init__(self):
        # Order of the model; None until a caller assigns it.
        self.ngram_size = None
        # Two-level count table: ngram_counts[outer][inner] -> int.
        # Missing entries are created on access and start at 0.
        self.ngram_counts = defaultdict(lambda: defaultdict(int))
        # Set of distinct tokens, initially empty.
        self.vocabulary = set()

def get_ngram_counts(sentences, ngram_size):
    """Count the ngrams of one fixed order over tokenized sentences.

    Args:
        sentences: iterable of token lists, one per sentence.
        ngram_size: order of the model.

    Returns:
        An NgramModelCounts whose vocabulary holds '**START**', '**END**'
        and every token seen, and whose ngram_counts is:
          * for ngram_size == 1: a flat mapping token -> count, where
            '**START**' and '**END**' are each counted once per sentence;
          * otherwise: a nested mapping counts[first][last] -> count built
            from make_ngrams, keyed by the first and the last token of each
            ngram (tokens in between are not part of the key).
    """
    start_tok, end_tok = "**START**", "**END**"

    model = NgramModelCounts()
    model.ngram_size = ngram_size
    model.vocabulary.update((start_tok, end_tok))

    unigram_mode = (ngram_size == 1)
    if unigram_mode:
        # Unigrams have no context, so the nested table is swapped for a
        # flat one.
        model.ngram_counts = defaultdict(int)
    counts = model.ngram_counts

    for sent_tokens in sentences:
        if unigram_mode:
            counts[start_tok] += 1
            counts[end_tok] += 1
            for word in sent_tokens:
                counts[word] += 1
        else:
            for gram in make_ngrams(sent_tokens, ngram_size):
                counts[gram[0]][gram[-1]] += 1
        model.vocabulary.update(sent_tokens)
    return model


# Sentence splitter loaded from NLTK's Punkt English model, see
# http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')

# Hard-coded absolute path to the directory of review files that are read.
PATH_TO_DATA="/Users/ken77921/Dropbox/course/NLP_TA/for hw1,hw2 - old wordstats, lm assignment/imdb_pos_sample"
fname2content=load_files(PATH_TO_DATA)
movie_review_sents=collect_all_sentences(fname2content,sentence_splitter)

# Single-argument print(...) parses and prints the same text on Python 2 and
# Python 3; the former print statements were a SyntaxError on Python 3.
# "%s %s" reproduces the space the print statement put between its two items.
print("%s %s" % ("Total number of sentences we get: ", len(movie_review_sents)))
print(movie_review_sents[:5])

# Build the unigram (order 1) and bigram (order 2) count tables.
uni_gram = get_ngram_counts(movie_review_sents,1)
bi_gram = get_ngram_counts(movie_review_sents,2)


# Write each count table as JSON into the current working directory.
with open('unigram_count_IMDB.json', 'w') as fp:
    json.dump(uni_gram.ngram_counts, fp)

with open('bigram_count_IMDB.json', 'w') as fp:
    json.dump(bi_gram.ngram_counts, fp)