/* general.h */
/* Header file for Bayesian Model Merging and Hidden Markov Model code */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <netinet/in.h>
#include <netdb.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdarg.h>

/* Compile-time debug switch; defined as 0 in this header. */
#define DEBUG 0

/*
 * File-scope constants shared through this header.
 *
 * Both are `static` definitions, so every translation unit that includes
 * this header gets its own private copy.  No include guard is visible in
 * this file, so including it twice in one translation unit would redefine
 * them (and the types declared further down).
 *
 * no_prob:  NOTE(review): presumably a "no probability available" sentinel
 *           (10.5 cannot be a log probability) - confirm against its users.
 * unk_word: the literal token "<UNK>"; presumably the replacement for
 *           out-of-vocabulary words (cf. map_words_to_unk) - confirm.
 */
static float no_prob = 10.5f;
static char  unk_word[] = "<UNK>";

/*
 * Count-based distribution.  Elsewhere in this header it is the type of
 * multinomial.prior and the element type handled by the load_prior_counts()
 * and read_counts() prototypes.
 * Field notes marked "presumably" are inferred from names only.
 */
typedef struct count_dist_t count_dist;

struct count_dist_t {
    char   *label;       /* label of this distribution */
    char   *path;        /* presumably the file the counts came from - confirm */
    int     num_types;   /* presumably the number of distinct ids seen - confirm */
    double  num_tokens;  /* token total; a double here, an int in multinomial */
    int     vocab_size;
    double *alphas;      /* presumably per-id prior counts - TODO confirm indexing */
    double  log_beta;    /* presumably a cached log-Beta term (cf. calculate_log_beta_factors) */
};

/*
 * Multinomial distribution; a state refers to one through its `O` member.
 * Field notes marked "presumably" are inferred from names only.
 */
typedef struct multinomial_t multinomial;

struct multinomial_t {
    char   *label;
    char   *path;
    int     num_types;
    int     num_tokens;
    int     vocab_size;              /* inclusive: vocab_size == max_id */
    double  uni_alpha;
    double  total_count;             /* denominator count under the appropriate mode */
    struct tc_t *counts;             /* head of a linked list of tc nodes (id, count) */
    float  *lprobs;                  /* linear array; presumably log probs by id - confirm */
    struct count_dist_t *prior;      /* original comment: "linear array".
                                        NOTE(review): create_distribution() takes a single
                                        count_dist *prior_dist, so this may be one
                                        distribution rather than an array - confirm */
    double  prior_weight_adjustment;
};

/* Singly linked list node pairing an id with a probability value. */
typedef struct tp_t tp;

struct tp_t {
    float  lprob;  /* probability value; presumably a log prob, going by the name */
    int    id;
    tp    *next;   /* following node in the list */
};

/* Singly linked list node pairing an id with an integer count. */
typedef struct tc_t tc;

struct tc_t {
    int  count;
    int  id;
    tc  *next;   /* following node in the list */
};

/*
 * Doubly linked list node holding a state pointer.  The queue prototypes
 * (push_on_top, pop_from_bottom, ...) take a `ts **` as their queue handle.
 */
typedef struct ts_t ts;

struct ts_t {
    struct state_t *s1;    /* state carried by this node */
    ts             *next;
    ts             *prev;
};


/*
 * Transition between two states.  Each transition carries two pairs of
 * list links; presumably one list threads the transitions that share a
 * source state and the other those that share a destination state
 * (a state holds `in` and `out` trans pointers) - confirm in basic.c.
 */
typedef struct trans_t trans;

struct trans_t {
    struct state_t *source;
    struct state_t *dest;
    double          prob;
    int             count;        /* observation count */
    double          alpha;        /* prior count */
    trans          *next_source;
    trans          *prev_source;
    trans          *next_dest;
    trans          *prev_dest;
};

/* Singly linked list node that refers to a transition. */
typedef struct ltrans_t ltrans;

struct ltrans_t {
    struct trans_t *trans;  /* transition referenced by this node */
    ltrans         *next;   /* following node in the list */
};

/*
 * One model state: identity, an emission distribution and the heads of
 * its incoming and outgoing transition lists.
 * Field notes marked "presumably" are inferred from names only.
 */
typedef struct state_t state;

struct state_t {
    int          id;
    int          seen;       /* presumably a traversal mark (cf. set_seen_to_zero) - confirm */
    char        *label;
    int          duration;
    double       lprior;     /* presumably the state's log prior - confirm */
    double       llikelihood; /* presumably the state's log likelihood - confirm */
    double       trans_prior_weight_adjustment;
    multinomial *O;          /* emission distribution of this state */
    trans       *in;         /* transitions arriving at this state */
    trans       *out;        /* transitions leaving this state */
};

/*
 * Singly linked list node holding a pair of states and a value.
 * presumably a merge candidate and its score (cf. compute_all_candidates,
 * which returns a state_pairs *) - confirm in merge.c.
 */
typedef struct state_pairs_t state_pairs;

struct state_pairs_t {
    state       *s1;
    state       *s2;
    double       value;
    state_pairs *next;   /* following pair in the list */
};

/* One element of an observation string: a label, a word, and a link to the next element. */
typedef struct sdata_t sdata;

struct sdata_t {
    char  *label;
    char  *word;
    sdata *next;   /* following element of the string */
};

/* Head of an observation string: the first sdata element plus a count. */
typedef struct shead_t shead;

struct shead_t {
    sdata *string;  /* first element of the string */
    int    count;   /* presumably how many times the string was observed - confirm */
};

/* One step of a path through the model, chained to the following step. */
typedef struct path_t path;

struct path_t {
    state  *current;
    char   *word;
    double  ltprob;   /* prob of transitioning from the previous state to the current state */
    double  leprob;   /* prob of the current state emitting word */
    path   *next;     /* following step of the path */
};

/* Head of a path: a probability value plus the first step of the path. */
typedef struct path_head_t path_head;

struct path_head_t {
    double  lprob;   /* presumably the log prob of the whole path - confirm */
    path   *first;   /* first step of the path */
};


/* Record used by the Viterbi search. */
typedef struct delta_t delta;

struct delta_t {
    state  *source;    /* backpointer to the previous state, same as the gamma variable */
    state  *dest;      /* current state */
    int     duration;  /* count of how long the path has been in the dest state */
    double  lprob;     /* log prob of being in the current state at the current time */
    double  leprob;    /* NOTE(review): by its name, and by the comments on path_t, this
                          is the emission term (dest state emitting the word); the
                          original comment here described the transition instead -
                          confirm against viterbi.c */
    double  ltprob;    /* NOTE(review): by its name, and by the comments on path_t, this
                          is the transition term (source state to dest state); the
                          original comment here described the emission instead -
                          confirm against viterbi.c */
    double  value;     /* log prob of being in the previous state and transitioning
                          to the current state - the value to maximize */
    delta  *next;      /* next path state to check */
};

/* Record used by the forward search. */
typedef struct alpha_t alpha;

struct alpha_t {
    state  *current;  /* current state */
    double  prob;     /* alpha value */
    double  eprob;    /* NOTE(review): the original comment just repeated "alpha value";
                         presumably the emission component (cf. alpha_evalue in
                         create_alpha) - confirm */
    double  tprob;    /* NOTE(review): the original comment just repeated "alpha value";
                         presumably the transition component (cf. alpha_tvalue in
                         create_alpha) - confirm */
    alpha  *next;     /* alpha for the next state possible at the current step */
};

/* Record used by the backward search. */
typedef struct beta_t beta;

struct beta_t {
    state  *current;  /* current state */
    double  prob;     /* beta value */
    beta   *next;     /* beta for the next state possible at the current step */
};




/*
 * In basic.c:
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in basic.c.
 */
multinomial *create_distribution(char *label, char *dist_file_path, char *dist_word, int word_count, count_dist *prior_dist, int vocab_size, double obs_uni_alpha, int closed_vocab);
int get_dist_index(multinomial **dists, char *label, int num_dists);
int get_state_index(state **states, int id, int num_states);
state *create_state(int id, char *label, multinomial *dist, int duration);
void add_trans(state *from_state, state *to_state, int count, double alpha, double prob);
void remove_trans(state *from_state, state *to_state);
multinomial *combine_emissions(multinomial *O1, multinomial *O2, int id);
/* NOTE(review): the parameter name `new` is a C++ keyword, so this header
   cannot be included from C++ as written. */
void combine_trans(state *new, state *s1, state *s2, trans **p_trans_list, double uni_alpha);
void free_state(state *s1);
int set_trans_alpha(state *from_state, state *to_state, double uni_alpha);
double calc_emis_alpha(multinomial *O, int id, double prior_weight_adjustment);
double calc_trans_alpha(state *s1, double alpha);
int read_vocab(char *vocab_file_path, int closed_vocab);
double retrieve_count(multinomial *O, int id);
float calc_lprob(multinomial *O, int id, int mode);
double get_count(multinomial *O, int id, int mode);
/* Allocation helper; note that it returns char *, not void *. */
char *kmalloc(size_t size);
/* File-opening helpers returning FILE *; the suffixes presumably stand for
   read, write, gzip-compressed write and append - confirm. */
FILE *kopen_r(char *file_path);
FILE *kopen_w(char *file_path);
FILE *kopen_wgz(char *file_path);
FILE *kopen_a(char *file_path);
/* Variadic: `msg` is followed by `...`; presumably a printf-style format
   used on an exit path with code `rc` - confirm. */
int quit(int rc, char *msg, ...);

/*
 * In model.c:
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in model.c.
 */
/* Reading observation strings (shead / sdata lists) from a FILE *. */
shead **read_strings(FILE *string_file, int *p_num_strings, int read_count, int read_label, int read_id);
shead *read_one_string(FILE *string_file, int read_id, int read_label, int read_count);
int count_sample_strings(FILE *string_file, int read_count);
void map_words_to_unk(shead *new_string);
void free_string(shead *start);
/* Building or loading a model. */
void create_initial_model_from_data(shead **strings, int *num_of_states, int num_strings, double trans_uni_alpha, double obs_uni_alpha, state **p_initial, state **p_end, count_dist **priors, int num_prior_dists, int vocab_size);
void add_string_to_model(shead *new_string, state *start, state *end, int *ptr_num_states, int *ptr_max_state_label, count_dist **priors, int num_prior_dists, int vocab_size, double trans_uni_alpha, double obs_uni_alpha);
state *read_model_from_file(char *model_file_path, int *num_states);
/* Printing a model or its distributions. */
void print_state_info(state *current_s, int mode);
void print_children_of(state *current_s, int *printed, int mode);
void print_model(state **states, int num_states, int mode);
void print_model_to_file(FILE *outfile, char *output_dir, char *vocab_file, state **states, int num_states, char *emissions_dir, int smooth, int mode, int iteration);
void print_distribution(char *dist_file_path, char *vocab_file, multinomial *dist, int smooth, int mode);
/* Shares its trailing (O, vocab_size, ..., mode, ptr_zeroton_lprob) shape
   with the smooth.c prototypes; `smooth` presumably selects among them. */
double *smooth_counts(multinomial *O, int vocab_size, int smooth, int mode, double *ptr_zeroton_lprob);
void collect_state_info(state *current_s, int num_states, multinomial **dists, ltrans **first_trans);
void add_children_of(state *current_s, int num_states, state **states, multinomial **dists, ltrans **first_trans);
void free_linked_transitions(ltrans *first_trans);
/* Setting and updating parameters, priors and likelihoods. */
void set_state_prior_adjustment(state *s1, int trans_only, int narrow_emis, double dir_prior_weight);
void set_state_parameters(state *s1, int mode, int trans_only, int narrow_emis);
void set_model_parameters(state **states, int num_states, int mode, int narrow_emis, double dir_prior_weight);
void set_model_priors_and_likelihoods(state **states, int num_states, int mode, int use_prior, int narrow_emis);
/* NOTE(review): the parameter name `new` in the next two prototypes is a
   C++ keyword, so this header cannot be included from C++ as written. */
void update_model_parameters(state *new, int mode, int narrow_emis, double dir_prior_weight);
void update_model_priors_and_likelihoods(state *new, int num_states, int mode, int use_prior, int narrow_emis);
void print_observation_string(shead *start);
void run_initial_model_exit(char *output_dir, char *vocab_file, state **states, int num_states, char *emissions_dir, int smooth, int mode, int narrow_emis, int iteration, double dir_prior_weight);
void evaluate_intermediate_model(char *obs_file_path, char *output_dir, state *initial, state **states, int num_states, int vocab_size, int smooth, int mode, int iteration);
int path_already_exists(state *initial, shead *current_string);
void update_emission_count(state *s1, int count);
void update_transition_count(state *from_state, state *to_state, int count);
void add_incremental_strings(FILE *string_file, int *ptr_num_states, state *initial, state *end, int read_id, int read_label, int read_count, int *ptr_max_state_label, int num_to_add, count_dist  **priors, int num_prior_dists, int vocab_size, double trans_uni_alpha, double obs_uni_alpha, int *ptr_more_strings, int *ptr_num_added);

/*
 * In viterbi.c:
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in viterbi.c.
 * The section holds both the Viterbi path search (path_head / delta) and
 * the forward-probability routines (alpha).
 */
path_head *find_vit_path(shead *data, state *initial, int details, int punc_trans, int read_label, float trans_weight);
void print_path(path_head *max_path);
void print_path_to_file(path_head *max_path, FILE *output_file, shead *start, int obs_with_id, int print_state_id, int print_probs);
/* Results are written through the ptr_* output parameters. */
void collect_pp_stats(path_head *max_path, int *ptr_total_obs, double *ptr_total_obs_logprob, double *ptr_total_t_logprob, double *ptr_total_e_logprob);
double get_emission_lprob(state *dest, char *word);
int count_num_symbols(shead *data);
/* trans_weight is a float while every other numeric argument is a double. */
alpha *create_alpha(state *dest, double trans_prob, float trans_weight, double emission_prob, double alpha_value, double alpha_tvalue, double alpha_evalue);
void update_alpha(alpha *temp_alpha, double trans_prob, float trans_weight, double alpha_value, double alpha_tvalue, double alpha_evalue);
void scale_alphas(int current_step, alpha **alphas, double *scale, double *escale, double *tscale);
/* Takes the same leading arguments as find_vit_path plus two double *
   outputs (elprob, tlprob). */
double calc_forward_prob(shead *data, state *initial, int details, int punc_trans, int read_label, float trans_weight, double *elprob, double *tlprob);
void free_path(path_head *vit_path);
void free_deltas(delta **deltas, int num_steps);
void print_forward_prob_to_file(double prob, FILE *output_file, shead *start, int obs_with_id);
void free_alphas(alpha **alphas, int num_steps);
/* Both take a non-const char *; remove_punc presumably edits the word in
   place - confirm. */
int ends_in_punc(char *prev_word);
void remove_punc(char *prev_word);
path_head *find_max_path(shead *data, state **states, int num_states, int read_label);

/*
 * In lm.c:
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in lm.c.
 */
multinomial *read_arpa_1gram(char *dist_file_path, int closed_vocab);
/* The loaders below return an array of pointers and report a count through
   p_num_dists. */
multinomial **load_dists(char *dist_file_path, int *p_num_dists);
count_dist **load_prior_counts(char *dist_file_path, int *p_num_dists, int vocab_size);
count_dist *read_counts(char *dist_path, char *dist_label, int vocab_size);
count_dist *matching_prior(char *label, count_dist **priors, int num_prior_dists);
int get_max_id(char *dist_path);
void calculate_log_beta_factors(count_dist **priors, int num_dists, double obs_uni_alpha, int vocab_size, double dir_prior_weight);
/* Note the differing element types: float *p here, double *p in the
   abbreviated variant below. */
void print_arpa_unigram(FILE *outfile, float *p, int vocab_size);
void print_abbrev_arpa_unigram(FILE *outfile, char *vocab_file, double *p, int vocab_size, double zeroton_lprob);

/*
 * In command.c
 *
 * Prototypes only; the definitions are not visible in this header.
 * Every read_com_* function takes (int *p_argc, char **argv, char *label);
 * presumably each looks up the option named `label` in argv, stores its
 * value(s) through the p_value pointers and updates *p_argc - confirm in
 * command.c, including the meaning of the int return values.
 */
char *read_com_string(int *p_argc, char **argv, char *label);
int read_com_float(int *p_argc, char **argv, char *label, float *p_value);
int read_com_double(int *p_argc, char **argv, char *label, double *p_value);
int read_com_int(int *p_argc, char **argv, char *label, int *p_value);
int read_com_two_int(int *p_argc, char **argv, char *label, int *p_value1, int *p_value2);
char *read_com_one_string_two_int(int *p_argc, char **argv, char *label, int *p_value1, int *p_value2);
int read_com_noarg(int *p_argc, char **argv, char *label);
void check_extra_args(int *p_argc, char **argv);
void remove_arg(int *p_argc, char **argv, int to_remove);

/*
 * In socket.c
 *
 * Prototypes only; the definitions are not visible in this header.
 * hmm_query takes FILE *in and FILE *out followed by exactly the same
 * argument list as rainbow_serve; presumably rainbow_serve forwards its
 * arguments to hmm_query for each request - confirm in socket.c.
 */
void rainbow_socket_init (const char *socket_name, int use_unix_socket);
void rainbow_serve (state *hmm, int obs_with_id, int run_vit, int print_probs, int run_forward, int vit_details, int forward_details, int punc_trans, int max_class, state **states, int num_states, int read_label, float trans_weight, int print_state_id);
int hmm_query(FILE *in, FILE *out, state *hmm, int obs_with_id, int run_vit, int print_probs, int run_forward, int vit_details, int forward_details, int punc_trans, int max_class, state **states, int num_states, int read_label, float trans_weight, int print_state_id);

/*
 * In merge.c
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in merge.c.
 */
/* Candidate pairs are passed around as state_pairs lists. */
state_pairs *compute_all_candidates(state *model, state **states, int num_states, int same_label, int neighbors_only);
/* Returns a state ** and reports a count through ptr_num_states. */
state **collect_states(state *initial, int *ptr_num_states);
void collect_children_of(state *current_s, int num_states, state **states);
void update_candidates(state_pairs **p_candidates, state_pairs *max_cand, state **states, int num_states, state *new_state, int same_label, int neighbors_only);
/* merge_states fills *p_trans_list; unmerge_states takes a trans list back,
   presumably the one produced by the merge - confirm. */
state *merge_states(state *s1, state *s2, trans **p_trans_list, double trans_uni_alpha);
void unmerge_states(state *new_state, trans *trans_list);
void collapse_same_tags(state *s1, int *p_num_states, double trans_uni_alpha);
int num_different_dest_states(state *s1);
int num_different_source_states(state *s1);
void free_trans_list(trans *trans_list);
void free_candidates(state_pairs *candidates);
/* Scoring functions; prior_weight is a float in the two below. */
double compute_candidate_contribution(state *s1, state *s2, int num_states, float prior_weight, int use_prior, int mode);
double compute_new_state_contribution(state *new_state, int num_states, float prior_weight, int use_prior, int mode);
double compute_state_prior(state *s1, int num_states, int mode, int use_prior, int narrow_emis);
double compute_structure_prior(state *s1, int num_states);
double mdl_prior(state *s1, int num_states);
double narrow_structure_prior(state *s1, int num_states);
double compute_parameter_prior(state *s1, int num_states, int narrow_emis, int mode);
double compute_state_likelihood(state *s1, int num_states, int mode, int narrow_emis);
double compute_vit_likelihood(state *s1, int mode);
double compute_struct_state_likelihood(state *s1, int num_states, int narrow_emis);
/* This function shares its name with the count_dist.log_beta field. */
double log_beta(double *values, int num_values);
/* presumably the log-gamma function - confirm. */
double gammln(double xx);
int have_same_label(state *s1, state *s2);
int are_neighbors(state *s1, state *s2);
int set_seen_to_zero(state **states, int num_states);
/* Both return a state ** and take the state count by pointer. */
state **collapse_adjacent_states(state **states, int *ptr_num_states, state *initial, double trans_uni_alpha);
state **collapse_V_states(state **states, int *ptr_num_states, state *initial, state *end, double trans_uni_alpha);

/*
 * In queue.c
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in queue.c.
 */
/* Forward/backward collapsing passes; the per-state variants also take the
   state queue handle. */
void collapse_V_tags_forward(state *initial, int *p_num_states, double trans_uni_alpha);
void collapse_V_tags_backward(state *end, int *p_num_states, double trans_uni_alpha);
void collapse_state_V_tags_forward(state *s1, ts **p_state_q, int *p_num_states, double trans_uni_alpha);
void collapse_state_V_tags_backward(state *s1, ts **p_state_q, int *p_num_states, double trans_uni_alpha);
/* Queue operations on a list of ts nodes; the queue is passed as ts ** so
   the callee can change which node the caller's handle points at. */
void push_on_top(ts **p_state_q, state *s1);
void push_on_bottom(ts **p_state_q, state *s1);
state *pop_from_top(ts **p_state_q);
state *pop_from_bottom(ts **p_state_q);
void remove_from_q(ts **p_state_q, state *s1);

/*
 * In bw.c
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in bw.c.
 * presumably "bw" stands for Baum-Welch re-estimation - confirm.
 * In this section transition values travel as double **aij and emission
 * values as float **bj.
 */
state *iterate_bw(shead **strings, int num_strings, state *hmm, int num_states, int details, int punc_trans, int uniform, int random, int vocab_size, int trans_only, char *emissions_dir, FILE *outfile);
void set_uniform_model_parameters(double **aij, int num_states, state **states);
void set_random_model_parameters(double **aij, int num_states, state **states, int random);
void fill_in_trans_probs(double **aij, int num_states, state **states);
void initialize_double(double **var, int tot_i, int tot_j);
/* compute_alphas and compute_betas take identical argument lists apart from
   their first (output) matrix. */
void compute_alphas(double **alphas, double **aij_old, float **bj, double *scale, char **obs, int num_obs, state **states, int num_states, int first_state_index, int last_state_index, int punc_trans, int vocab_size);
void compute_betas(double **betas, double **aij_old, float **bj, double *scale, char **obs, int num_obs, state **states, int num_states, int first_state_index, int last_state_index, int punc_trans, int vocab_size);
void print_betas(double **betas, char **obs, int num_obs, state **states, int num_states);
void print_alphas(double **alphas, char **obs, int num_obs, state **states, int num_states, double *scale);
int find_last_state_index(state **states, int num_states);
void copy_model_trans(state **states, int num_states, double **aij);
void update_parameters(double **num_aij_local, double **num_bj_local, double *denom_local, state **states, int num_states, double **alphas, double **betas, double *scale, char **obs, int num_obs, double **aij_old, float **bj, int first_state_index, int last_state_index, int details, int vocab_size, int trans_only);
/* Takes two char * with no length argument; `word` is presumably the output
   buffer, which the caller must size - confirm. */
void get_word(char *orig_word, char *word);
/* presumably a random number generator whose state is kept in *idum -
   confirm. */
double ran0(int *idum);
void print_trans(double **var, int num_states, state **states);
double get_bj_prob(float **bj, state **states, int state_id, char *word, int vocab_size);
void print_emission_dists(char *emissions_dir, float **bj, int num_states, int vocab_size, state **states);
void print_reestimated_model_to_file(FILE *outfile, state **states, int num_states, char *emissions_dir);
float **set_emission_probs(int num_states, int vocab_size, state **states);

/*
 * In smooth.c
 *
 * Prototypes only; the definitions are not visible in this header, so the
 * notes below are read off the signatures and names - confirm in smooth.c.
 * The first three functions share one signature: they take a multinomial,
 * return a double * and have a double *ptr_zeroton_lprob output parameter.
 * Who owns (and frees) the returned arrays cannot be told from here.
 */
double *maximum_likelihood(multinomial *O, int vocab_size, int mode, double *ptr_zeroton_lprob);
double *absolute_discounting(multinomial *O, int vocab_size, int mode, double *ptr_zeroton_lprob);
double *three_way_linear_interpolation(multinomial *O, int vocab_size, int mode, double *ptr_zeroton_lprob);
/* The two below work on raw count matrices (double **counts) instead of a
   multinomial. */
double *loo_linear_interpolation(double **counts, int num_dists, int vocab_size);
double *combine_mixture_model(double **counts, int num_dists, int vocab_size, double *lambda);


/* More functions to write:

update_viterbi_paths(); - reparse strings through current model to reset the transition and emission counts

"smarter" creation of model by recruiting new states only when viterbi search fails

beam search

*/


