# HMM.py

'''
Hidden markov model, implemented by Emily de Jong for CS155 homework #6,

Modified for Project #3 by Emily de Jong & Daniel Mukasa
'''

import random

class HiddenMarkovModel:
    '''
    Class implementation of Hidden Markov Models.
    '''

    def __init__(self, A, O, dictionary=None):
        '''
        Initializes an HMM. Assumes the following:
            - States and observations are integers starting from 0.
            - There is a start state (see notes on A_start below). There
              is no integer associated with the start state, only
              probabilities in the vector A_start.
            - There is no end state.

        Arguments:
            A:          Transition matrix with dimensions L x L.
                        The (i, j)^th element is the probability of
                        transitioning from state i to state j. Note that
                        this does not include the starting probabilities.

            O:          Observation matrix with dimensions L x D.
                        The (i, j)^th element is the probability of
                        emitting observation j given state i. Must have
                        at least one row, since D is read from O[0].

            dictionary: Optional mapping with unique words as keys and
                        their corresponding integer observation indices
                        as values. It is stored unchanged on the
                        instance. Defaults to None so that models that
                        work directly on integer observations can be
                        built from A and O alone.

        Parameters:
            L:          Number of states.

            D:          Number of observations.

            A:          The transition matrix.

            O:          The observation matrix.

            dictionary: The word -> observation index mapping (or None).

            A_start:    Starting transition probabilities. The i^th element
                        is the probability of transitioning from the start
                        state to state i. For simplicity, we assume that
                        this distribution is uniform.
        '''

        self.L = len(A)
        self.D = len(O[0])
        self.A = A
        self.O = O
        self.dictionary = dictionary
        # Uniform start distribution over the L states.
        self.A_start = [1. / self.L for _ in range(self.L)]


    def viterbi(self, x):
        '''
        Uses the Viterbi algorithm to find the max probability state
        sequence corresponding to a given input sequence.

        Probabilities are multiplied directly (no log-space), so for very
        long sequences they may underflow to 0.

        Arguments:
            x:          Input sequence in the form of a list of length M,
                        consisting of integers ranging from 0 to D - 1.

        Returns:
            max_seq:    State sequence corresponding to x with the highest
                        probability, as a string made by concatenating the
                        decimal state indices (so it is only unambiguous
                        for L <= 10). An empty input gives ''. If every
                        path has probability 0, state 0 is used as the
                        fallback at each undecidable position.
        '''

        M = len(x)      # Length of sequence.
        if M == 0:
            return ''

        states = range(self.L)

        # probs[t][s] is the max probability of any state path for the
        # observations x[0..t] that ends in state s; back[t][s] is the
        # state at position t - 1 on that best path.
        probs = [[0. for _ in states] for _ in range(M)]
        back = [[0 for _ in states] for _ in range(M)]

        # First observation: start distribution times emission.
        for s in states:
            probs[0][s] = self.A_start[s] * self.O[s][x[0]]

        for t in range(1, M):
            prev = probs[t - 1]
            for s in states:
                # Defaulting to state 0 keeps the back-pointer a valid
                # index even when every candidate has probability 0.
                best_k = 0
                best_p = 0.
                for k in states:
                    p = prev[k] * self.A[k][s] * self.O[s][x[t]]
                    if p > best_p:
                        best_p = p
                        best_k = k
                probs[t][s] = best_p
                back[t][s] = best_k

        # Pick the most probable final state (first one on ties).
        final = probs[M - 1]
        best_last = 0
        best_p = 0.
        for k in states:
            if final[k] > best_p:
                best_p = final[k]
                best_last = k

        # Walk the back-pointers from the end to recover the path.
        path = [0 for _ in range(M)]
        path[M - 1] = best_last
        for t in range(M - 1, 0, -1):
            path[t - 1] = back[t][path[t]]

        return ''.join(str(s) for s in path)


    def forward(self, x, normalize=False):
        '''
        Uses the forward algorithm to calculate the alpha probability
        vectors corresponding to a given input sequence.

        Arguments:
            x:          Input sequence in the form of a list of length M,
                        consisting of integers ranging from 0 to D - 1.

            normalize:  Whether to normalize each set of alpha_j(i) vectors
                        at each i. This is useful to avoid underflow in
                        unsupervised learning. A vector whose entries sum
                        to 0 is left as it is instead of being divided.

        Returns:
            alphas:     List of M alpha vectors, each of length L (an
                        empty list when x is empty).

                        Indexing is 0-based: alphas[t][j] corresponds to
                        observing the prefix x[0], ..., x[t] together
                        with the state at position t being j.

                        e.g. alphas[0][0] corresponds to the probability
                        of observing only the first observation with the
                        first state being 0.
        '''

        M = len(x)      # Length of sequence.
        if M == 0:
            return []

        states = range(self.L)
        alphas = [[0. for _ in states] for _ in range(M)]

        def _rescale(row):
            # Normalize in place; skip all-zero rows so that an
            # impossible observation cannot cause a division by zero.
            total = sum(row)
            if total > 0:
                for s in states:
                    row[s] /= total

        # First position: start distribution times emission probability.
        for s in states:
            alphas[0][s] = self.A_start[s] * self.O[s][x[0]]
        if normalize:
            _rescale(alphas[0])

        # Propagate forward one observation at a time.
        for t in range(1, M):
            prev = alphas[t - 1]
            for s in states:
                # Total probability mass flowing into state s.
                inflow = 0.
                for j in states:
                    inflow += prev[j] * self.A[j][s]
                alphas[t][s] = self.O[s][x[t]] * inflow
            if normalize:
                _rescale(alphas[t])

        return alphas

        
    def backward(self, x, normalize=False):
        '''
        Runs the backward algorithm and returns the beta vectors for the
        given input sequence.

        Arguments:
            x:          Input sequence in the form of a list of length M,
                        consisting of integers ranging from 0 to D - 1.

            normalize:  Whether to rescale each beta vector so that it
                        sums to 1 (only done when its sum is positive for
                        the vectors before the last). Useful to avoid
                        underflow in unsupervised learning.

        Returns:
            betas:      List of M + 1 beta vectors, each of length L.

                        betas[M] is the terminal vector (all ones before
                        normalization). For i < M, betas[i][s] combines
                        betas[i + 1] with the transitions out of s and
                        the emission of x[i], i.e. it covers the suffix
                        x[i], ..., x[M - 1] given state s just before it.
                        betas[0] therefore plays the role of the start
                        position.
        '''

        M = len(x)      # Length of sequence.
        n_states = self.L

        # Rows 0..M-1 start at zero; row M is the all-ones base case.
        betas = [[0.] * n_states for _ in range(M)]
        betas.append([1] * n_states)

        if normalize:
            base_total = sum(betas[M])
            betas[M] = [value / base_total for value in betas[M]]

        # Fill the table from the end of the sequence to the front.
        for t in reversed(range(M)):
            symbol = x[t]
            later = betas[t + 1]
            current = betas[t]
            for src in range(n_states):
                out_row = self.A[src]
                acc = 0.
                for dst in range(n_states):
                    acc += later[dst] * out_row[dst] * self.O[dst][symbol]
                current[src] = acc

            if not normalize:
                continue
            row_total = sum(current)
            if row_total > 0:
                for src in range(n_states):
                    current[src] /= row_total

        return betas


    def supervised_learning(self, X, Y):
        '''
        Trains the HMM using the Maximum Likelihood closed form solutions
        for the transition and observation matrices on a labeled
        dataset (X, Y). Note that this method does not return anything, but
        instead updates the attributes of the HMM object.

        A[b][a] becomes (# transitions b -> a) / (# times b is followed by
        any state) and O[z][w] becomes (# times state z emits w) /
        (# times state z occurs). Rows for states that never supply a
        denominator are left untouched.

        Arguments:
            X:          A dataset consisting of input sequences in the form
                        of lists of variable length, consisting of integers
                        ranging from 0 to D - 1. In other words, a list of
                        lists.

            Y:          A dataset consisting of state sequences in the form
                        of lists of variable length, consisting of integers
                        ranging from 0 to L - 1. In other words, a list of
                        lists.

                        Note that the elements in X line up with those in Y.
        '''
        from collections import Counter

        # Gather every count in a single pass over the data instead of
        # rescanning the whole dataset for each matrix entry.
        transitions = Counter()     # (state, next state) -> count
        departures = Counter()      # state -> times it has a successor
        emissions = Counter()       # (state, observation) -> count
        visits = Counter()          # state -> times it occurs

        for x, y in zip(X, Y):
            M = len(x)
            for i in range(M):
                visits[y[i]] += 1
                emissions[(y[i], x[i])] += 1
                # The last position has no outgoing transition.
                if i < M - 1:
                    departures[y[i]] += 1
                    transitions[(y[i], y[i + 1])] += 1

        for state in range(self.L):
            n_out = departures[state]
            if n_out != 0:
                for nxt in range(self.L):
                    self.A[state][nxt] = transitions[(state, nxt)] / n_out

            # States index the rows of O (0..L-1) and observations index
            # its columns (0..D-1).
            n_seen = visits[state]
            if n_seen != 0:
                for obs in range(self.D):
                    self.O[state][obs] = emissions[(state, obs)] / n_seen


    def unsupervised_learning(self, X, N_iters):
        '''
        Trains the HMM using the Baum-Welch algorithm on an unlabeled
        dataset X. Note that this method does not return anything, but
        instead updates the attributes of the HMM object.

        Each iteration accumulates, over all sequences, the state
        marginals (gamma) and the pairwise transition marginals (xi)
        computed from the normalized forward/backward vectors, and then
        re-estimates A and O from those sums. A_start is not re-estimated.
        A progress line is printed after every iteration.

        Empty sequences are skipped. Positions whose marginals cannot be
        normalized (zero total) contribute nothing, and a row of A or O
        whose denominator is 0 keeps its previous values.

        Arguments:
            X:          A dataset consisting of input sequences in the form
                        of lists of length M, consisting of integers ranging
                        from 0 to D - 1. In other words, a list of lists.

            N_iters:    The number of iterations to train on.
        '''
        import time

        start_time = time.time()
        states = range(self.L)

        for iteration in range(N_iters):
            # Numerators are per matrix entry; the denominators depend only
            # on the source state, so one value per state is enough.
            A_num = [[0. for _ in states] for _ in states]
            A_denom = [0. for _ in states]
            O_num = [[0. for _ in range(self.D)] for _ in states]
            O_denom = [0. for _ in states]

            for x in X:
                M = len(x)
                if M == 0:
                    continue

                alphas = self.forward(x, normalize=True)
                # Drop the start-position row so that betas[t] lines up
                # with alphas[t].
                betas = self.backward(x, normalize=True)[1:]

                # gamma[t][i]: weight of being in state i at position t.
                gammas = []
                for t in range(M):
                    weights = [alphas[t][i] * betas[t][i] for i in states]
                    total = sum(weights)
                    if total > 0:
                        gammas.append([w / total for w in weights])
                    else:
                        gammas.append([0. for _ in states])

                # xi[i][j]: weight of the transition i -> j between
                # positions t and t + 1, normalized over all (i, j).
                for t in range(M - 1):
                    nxt_obs = x[t + 1]
                    xi = [[alphas[t][i] * self.A[i][j] * betas[t + 1][j]
                           * self.O[j][nxt_obs] for j in states]
                          for i in states]
                    total = sum(v for row in xi for v in row)
                    if total <= 0:
                        continue
                    for i in states:
                        A_denom[i] += gammas[t][i]
                        for j in states:
                            A_num[i][j] += xi[i][j] / total

                # Emission statistics use every position of the sequence.
                for t in range(M):
                    obs = x[t]
                    for i in states:
                        O_num[i][obs] += gammas[t][i]
                        O_denom[i] += gammas[t][i]

            # Re-estimate the matrices from the accumulated statistics.
            for i in states:
                if A_denom[i] > 0:
                    for j in states:
                        self.A[i][j] = A_num[i][j] / A_denom[i]
                if O_denom[i] > 0:
                    for obs in range(self.D):
                        self.O[i][obs] = O_num[i][obs] / O_denom[i]

            print(iteration, ' out of ', N_iters, ' iterations complete; time elapsed = ', time.time() - start_time)


    def generate_emission(self, M, start_state=-1):
        '''
        Randomly samples a state path and an emission sequence from the
        model.

        Arguments:
            M:           Length of the emission to generate.

            start_state: State to start from. The default of -1 means
                         the first state is drawn from A_start instead.

        Returns:
            emission:   The randomly generated emission as a list of M
                        observation indices.

            states:     The randomly generated states as a list. It holds
                        the initial state followed by M further sampled
                        states, i.e. M + 1 entries; only the first M of
                        them are used to draw the emissions.
        '''

        from random import choices as draw

        state_ids = list(range(self.L))
        symbol_ids = list(range(self.D))

        # Either honour the requested start state or sample one.
        if start_state != -1:
            current = start_state
        else:
            current = draw(state_ids, self.A_start)[0]

        # Sample M transitions after the initial state.
        states = [current]
        for _ in range(M):
            current = draw(state_ids, self.A[int(current)])[0]
            states.append(current)

        # Emit one observation for each of the first M states.
        emission = [draw(symbol_ids, self.O[states[t]])[0] for t in range(M)]

        return emission, states


    def probability_alphas(self, x):
        '''
        Computes the total probability of a given input sequence using
        the forward algorithm.

        Arguments:
            x:          Input sequence in the form of a list of length M,
                        consisting of integers ranging from 0 to D - 1.

        Returns:
            prob:       Total probability that x can occur.
        '''

        # The last alpha vector holds, for each state j, the probability
        # of the whole sequence together with ending in j; adding those
        # up over all j marginalizes out the state path.
        return sum(self.forward(x)[-1])


    def probability_betas(self, x):
        '''
        Computes the total probability of a given input sequence using
        the backward algorithm.

        Arguments:
            x:          Input sequence in the form of a list of length M,
                        consisting of integers ranging from 0 to D - 1.

        Returns:
            prob:       Total probability that x can occur.
        '''

        first_betas = self.backward(x)[1]

        # For every possible first state j, weight the beta value of the
        # remaining observations by the chance of starting in j and of j
        # emitting the first observation, then add the terms up.
        return sum(beta * self.A_start[j] * self.O[j][x[0]]
                   for j, beta in enumerate(first_betas))


def supervised_HMM(X, Y):
    '''
    Helper function to train a supervised HMM. The function determines the
    number of unique states and observations in the given data, initializes
    the transition and observation matrices, creates the HMM, and then runs
    the training function for supervised learning.

    Arguments:
        X:          A dataset consisting of input sequences in the form
                    of lists of variable length, consisting of integers
                    ranging from 0 to D - 1. In other words, a list of lists.

        Y:          A dataset consisting of state sequences in the form
                    of lists of variable length, consisting of integers
                    ranging from 0 to L - 1. In other words, a list of lists.
                    Note that the elements in X line up with those in Y.

    Returns:
        HMM:        The trained HiddenMarkovModel. It is created without a
                    word dictionary because the data is already integer
                    encoded.
    '''

    def _random_stochastic(n_rows, n_cols):
        # Random n_rows x n_cols matrix whose rows each sum to 1.
        matrix = [[random.random() for _ in range(n_cols)]
                  for _ in range(n_rows)]
        for row in matrix:
            norm = sum(row)
            for j in range(len(row)):
                row[j] /= norm
        return matrix

    # Collect the distinct observations and states that occur in the data.
    observations = set()
    for x in X:
        observations |= set(x)

    states = set()
    for y in Y:
        states |= set(y)

    # Compute L and D.
    L = len(states)
    D = len(observations)

    # Randomly initialize and normalize matrices A and O (A is drawn first).
    A = _random_stochastic(L, L)
    O = _random_stochastic(L, D)

    # Train an HMM with labeled data. The constructor takes a dictionary
    # argument; None is passed explicitly since there is none here.
    HMM = HiddenMarkovModel(A, O, None)
    HMM.supervised_learning(X, Y)

    return HMM

def unsupervised_HMM(seqs, n_states, observation_dict, N_iters):
    '''
    Helper function to train an unsupervised HMM. The function randomly
    initializes the transition and observation matrices (with fixed
    seeds), creates the HMM, converts the word sequences to integer
    sequences and then runs the training function for unsupervised
    learning.

    Note that this reseeds the module-level random generator.

    Arguments:
        seqs:             A dataset consisting of input sequences in the
                          form of lists of variable length whose items are
                          keys of observation_dict. In other words, a list
                          of lists.

        n_states:         Number of hidden states to use in training.

        observation_dict: Dictionary mapping every item that can appear in
                          seqs to its integer observation index. Its size
                          is used as the number of observations D.

        N_iters:          The number of iterations to train on.

    Returns:
        HMM:              The trained HiddenMarkovModel.
    '''

    n_obs = len(observation_dict)

    # Transition matrix: random rows, each scaled to sum to 1.
    random.seed(2020)
    A = []
    for _ in range(n_states):
        raw = [random.random() for _ in range(n_states)]
        norm = sum(raw)
        A.append([value / norm for value in raw])

    # Observation matrix: same construction with its own seed.
    random.seed(155)
    O = []
    for _ in range(n_states):
        raw = [random.random() for _ in range(n_obs)]
        norm = sum(raw)
        O.append([value / norm for value in raw])

    # create the HMM
    HMM = HiddenMarkovModel(A, O, observation_dict)

    # Translate every sequence into observation indices via the
    # dictionary stored on the model.
    X = [[HMM.dictionary[word] for word in seq] for seq in seqs]

    # Train an HMM with unlabeled data.
    HMM.unsupervised_learning(X, N_iters)

    return HMM