# -*- coding: utf-8 -*-
"""
Created on Thu May 14 18:26:18 2020
@author: RobF
"""
from scipy.linalg import block_diag
from scipy.stats import multivariate_normal as mvnorm
from copy import deepcopy
import autograd.numpy as np
from autograd.numpy import transpose as t
from autograd.numpy import newaxis as n_axis
from autograd.numpy.random import multivariate_normal
from autograd.numpy.linalg import pinv
#=============================================================================
# MC Step functions
#=============================================================================
def draw_z_s(mu_s, sigma_s, eta, M, center_last_layer = False):
''' Draw from f(z^{l} | s) for all s in Omega and return the centered and
non-centered draws
mu_s (list of nd-arrays): The means of the Gaussians starting at each layer
sigma_s (list of nd-arrays): The covariance matrices of the Gaussians
starting at each layer
eta (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu parameters
for each layer
    M (dict of int): The number of MC points to draw on each layer
    center_last_layer (Bool): Whether or not to also return the centered
                              draws of the last layer
-------------------------------------------------------------------------
returns (list of ndarrays): z^{l} | s for all s in Omega and all l in L
'''
L = len(mu_s)
r = [mu_s[l].shape[1] for l in range(L)]
S = [mu_s[l].shape[0] for l in range(L)]
z_s = []
    zc_s = [] # z centered (denoted c) for all l
for l in range(L):
zl_s = multivariate_normal(size = (M[l], 1), \
mean = mu_s[l].flatten(order = 'C'), cov = block_diag(*sigma_s[l]))
zl_s = zl_s.reshape(M[l], S[l], r[l], order = 'C')
z_s.append(t(zl_s, (0, 2, 1)))
if (l < L - 1) or center_last_layer:
kl = eta[l].shape[0]
eta_ = np.repeat(t(eta[l], (2, 0, 1)), S[l] // kl, axis = 1)
zc_s.append(zl_s - eta_)
return z_s, zc_s
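# Hedged usage sketch (added, not part of the original API): exercises
# draw_z_s on a one-layer toy problem so the expected shapes are visible.
# The helper name and every parameter value below are illustrative assumptions.
def _example_draw_z_s():
    ''' Toy call of draw_z_s with S = 3 paths of dimension r = 2. '''
    S, r, M = 3, 2, [10]                       # 3 paths of dimension 2, 10 MC points
    mu_s = [np.zeros((S, r, 1))]               # one mean per path
    sigma_s = [np.stack([np.eye(r)] * S)]      # one covariance matrix per path
    eta = [np.zeros((S, r, 1))]                # component means used for centering
    z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M, center_last_layer = True)
    assert z_s[0].shape == (M[0], r, S)        # draws, shape (M, r, S)
    assert zc_s[0].shape == (M[0], S, r)       # centered draws, shape (M, S, r)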
def draw_z2_z1s(chsi, rho, M, r):
''' Draw from f(z^{l+1} | z^{l}, s, Theta)
chsi (list of nd-arrays): The chsi parameters for all paths starting at each layer
rho (list of ndarrays): The rho parameters (covariance matrices) for
all paths starting at each layer
    M (dict): The number of MC points to draw on each layer
r (dict): The dimension of each layer
---------------------------------------------------------------------------
returns (list of nd-arrays): z^{l+1} | z^{l}, s, Theta for all (l,s)
'''
L = len(chsi)
S = [chsi[l].shape[0] for l in range(L)]
z2_z1s = []
for l in range(L):
z2_z1s_l = np.zeros((M[l], M[l + 1], S[l], r[l + 1]))
for s in range(S[l]):
for m in range(M[l]):
z2_z1s_l[m,:, s] = multivariate_normal(size = M[l + 1], \
mean = rho[l][m, s].flatten(order = 'C'), \
cov = chsi[l][s])
z2_z1s.append(z2_z1s_l)
return z2_z1s
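# Hedged usage sketch (added): exercises draw_z2_z1s between two toy layers.
# The helper name and the rho/chsi values are illustrative assumptions.
def _example_draw_z2_z1s():
    ''' Toy call of draw_z2_z1s: draw z^{l+1} | z^l, s on a two-layer setup. '''
    S, M, r = [2], [4, 5], [3, 2]              # 2 paths, 4 then 5 MC points
    chsi = [np.stack([np.eye(r[1])] * S[0])]   # conditional covariances, (S[0], r[1], r[1])
    rho = [np.zeros((M[0], S[0], r[1], 1))]    # conditional means, (M[0], S[0], r[1], 1)
    z2_z1s = draw_z2_z1s(chsi, rho, M, r)
    assert z2_z1s[0].shape == (M[0], M[1], S[0], r[1])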
def draw_z_s_all_network(mu_s_c, sigma_s_c, mu_s_d, sigma_s_d, yc, eta_c, \
eta_d, S_1L, L, M):
''' Draw z^{(l)h} from both heads and then from the tail
mu_s_* (list of nd-arrays): The means of the Gaussians starting at each layer of the head *
sigma_s_* (list of nd-arrays): The covariance matrices of the Gaussians of the head *
yc (n x p_continuous): The continuous data
eta_* (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu parameters
for each layer of head *
S_1L (dict): The number of paths starting at each layer
L (dict): The number of layers on each head or tail
M (dict): The number of MC points to draw on each head or tail
--------------------------------------------------------------------------
returns (tuple of length 4): The latent variables existing in the network
'''
#============================
# Continuous head.
#============================
# The first z for the continuous head is actually the data,
# which we do not resimulate
z_s_c, zc_s_c = draw_z_s(mu_s_c[1:], sigma_s_c[1:],\
eta_c[1:], M['c'][1:], False)
yc_rep = np.repeat(yc[..., n_axis], S_1L['c'][0], -1)
z_s_c = [yc_rep] + z_s_c
eta_rep = np.repeat(eta_c[0], S_1L['c'][1], axis = 0)
yc_centered_rep = t(z_s_c[0] - eta_rep.T, (0, 2, 1))
zc_s_c = [yc_centered_rep] + zc_s_c
#============================
# Discrete head.
#============================
z_s_d, zc_s_d = draw_z_s(mu_s_d[:L['d']], sigma_s_d[:L['d']],\
eta_d[:L['d']], M['d'][:L['d']], True)
#============================
# Common tail
#============================
# The samples of the tail are shared by both heads
z_s_d = z_s_d + z_s_c[(L['c'] + 1):]
zc_s_d = zc_s_d + zc_s_c[(L['c'] + 1):]
return z_s_c, zc_s_c, z_s_d, zc_s_d
def draw_z2_z1s_network(chsi_c, chsi_d, rho_c, rho_d, M, r_1L, L):
''' Draw z^{(l + 1)h} | z^{(l)h} from both heads and then from the tail
chsi_* (list of nd-arrays): The chsi parameters for all paths starting at each layer of head *
rho_* (list of ndarrays): The rho parameters (covariance matrices) for
all paths starting at each layer of head *
    M (dict of lists): The number of MC points to draw on each layer
r_1L (dict of lists): The dimension of each layer
L (dict): The number of layers in the networks
---------------------------------------------------------------------
returns (tuple of length 2): The latent variables of z^{(l + 1)h} | z^{(l)h} existing in the network
'''
# Draw z^{l+1} | z^l, s from head to tail for the continuous head
z2_z1s_c = draw_z2_z1s(chsi_c, rho_c, M['c'], r_1L['c'])
# Draw z^{l+1} | z^l, s from head to the first common layer for the discrete head
z2_z1s_d = draw_z2_z1s(chsi_d[:L['d']], rho_d[:L['d']], \
M['d'][:(L['d'] + 1)], r_1L['d'][:(L['d'] + 1)])
# Common z^{l+1} | z^l, s draws are shared by the two heads
z2_z1s_d = z2_z1s_d + z2_z1s_c[(L['c'] + 1):]
return z2_z1s_c, z2_z1s_d
#=============================================================================
# E Step functions
#=============================================================================
def fz_s(z_s, mu, sigma):
''' Compute p(z | s)
z_s (list of ndarrays): The latent variables on each path
mu (list of nd-arrays): The means of the Gaussians starting at each layer
sigma (list of nd-arrays): The covariance matrices of the Gaussians
---------------------------------------------------------------------
    returns (list of ndarrays): The density estimates of p(z^{(l)} | s^{(l)})
'''
epsilon = 1E-16
L = len(z_s)
M = [z_s[l].shape[0] for l in range(L)]
S = [z_s[l].shape[-1] for l in range(L)]
pz_s = []
for l in range(L):
pz_sl = np.zeros((M[l], S[l]))
for s in range(S[l]):
pz_sl[:, s] = mvnorm.pdf(z_s[l][:,:, s], \
mean = mu[l][s, :, 0], \
cov = sigma[l][s], allow_singular = True)
        # Normalization over the MC draws (TODO: check)
norm_cste = pz_sl.sum(0, keepdims = True)
norm_cste = np.where(norm_cste <= epsilon, epsilon, norm_cste)
pz_sl = pz_sl / norm_cste
pz_sl = np.tile(pz_sl, (1, S[0]//S[l]))
pz_s.append(pz_sl)
return pz_s
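# Hedged usage sketch (added): evaluates fz_s on draws produced with the same
# toy configuration as _example_draw_z_s; all values are illustrative assumptions.
def _example_fz_s():
    ''' Toy call of fz_s on one layer. '''
    S, r, M = 3, 2, [10]
    mu = [np.zeros((S, r, 1))]
    sigma = [np.stack([np.eye(r)] * S)]
    eta = [np.zeros((S, r, 1))]
    z_s, _ = draw_z_s(mu, sigma, eta, M, center_last_layer = True)
    pz_s = fz_s(z_s, mu, sigma)
    assert pz_s[0].shape == (M[0], S)          # one density estimate per draw and path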
def fz2_z1s(pzl1_ys, z2_z1s, chsi, rho, S):
''' Compute p(z^{(l)}| z^{(l-1)}, y)
pzl1_ys (ndarray): p(z1 |y, s)
z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
chsi (list of nd-arrays): The chsi parameters for all paths starting at each layer
rho (list of ndarrays): The rho parameters (covariance matrices) for
all paths starting at each layer
S (list of int): The number of paths starting at each layer
-------------------------------------------------------------------------
returns (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
'''
epsilon = 1E-16
L = len(z2_z1s)
M = [z2_z1s[l].shape[0] for l in range(L)] + [z2_z1s[-1].shape[1]]
if len(pzl1_ys) > 0: # For discrete head
pz2_z1s = [pzl1_ys]
else: # For continuous head
pz2_z1s = []
for l in range(L):
pz2_z1sm = np.zeros((M[l], M[l + 1], S[l]))
for s in range(S[l]):
for m in range(M[l]):
                try:
                    pz2_z1sm[m, :, s] = mvnorm.pdf(z2_z1s[l][m, :, s], \
                                    mean = rho[l][m, s, :, 0], \
                                    cov = chsi[l][s], allow_singular = True)
                except Exception as err:
                    print(chsi[l][s])
                    raise RuntimeError('Singular matrix') from err
norm_cste = pz2_z1sm.sum(1, keepdims = True)
norm_cste = np.where(norm_cste <= epsilon, epsilon, norm_cste)
pz2_z1sm = pz2_z1sm / norm_cste
pz2_z1sm = np.tile(pz2_z1sm, (1, 1, S[0]//S[l]))
pz2_z1s.append(pz2_z1sm)
return pz2_z1s
def continuous_lik(yc, mu, sigma, w):
''' Compute p(y^C) and p(s^C | y^C)
yc (n x p_continuous): The continuous data
mu (list of nd-arrays): The means of the Gaussians starting at each layer
sigma (list of nd-arrays): The covariance matrices of the Gaussians
    w (list): The flattened path probabilities
-------------------------------------------------------------------------
returns (tuple of length 2): p(y^C) and p(s^C | y^C)
'''
epsilon = 1E-16
#==========================
# p(y|s), p(y,s)
#==========================
numobs = yc.shape[0]
S0 = mu.shape[0]
py_s = np.zeros((numobs, S0))
for s in range(S0):
        try:
            py_s[:,s] = mvnorm.logpdf(yc, mu[s][:,0], sigma[s],\
                                      allow_singular = True)
        except Exception as err:
            print(sigma[s])
            raise RuntimeError('Singular matrix') from err
pys = np.log(w)[n_axis] + py_s
#==========================
# Numeric stability block
#==========================
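    # Descriptive note (added): this is the log-sum-exp trick. Subtracting the
    # per-observation max of log p(y, s) before exponentiating avoids overflow;
    # the max is multiplied back below when recovering p(y).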
pys_max = np.max(pys, axis = 1, keepdims = True)
pys = pys - pys_max # For numeric stability
pys = np.exp(pys) # Max-normalized p(y,s)
#==========================
# p(s|y), p(y)
#==========================
norm_cste = np.sum(pys, axis = 1, keepdims = True)
norm_cste = np.where(norm_cste <= epsilon, epsilon, norm_cste)
ps_y = pys / norm_cste # Normalized p(s|y)
py = np.exp(pys_max)[:, 0] * np.sum(pys, axis = 1) # p(y) = sum_{s} p(y,s)
py_s = np.exp(py_s)
# Numeric stability issues handling
ps_y = np.where(ps_y <= epsilon, epsilon, ps_y)
py = np.where(py == 0, epsilon, py)
return ps_y, py_s, py[..., n_axis]
def fz_ys(pzl1_ys, pz2_z1s):
''' Compute p(z^{l} | y, s) in a recursive manner
pzl1_ys (ndarray): p(z1 |y, s)
pz2_z1s (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
------------------------------------------------------------
returns (list of ndarrays): p(z^{l} | y, s)
'''
L = len(pz2_z1s) - 1
if len(pzl1_ys) > 0: # For discrete head
pz_ys = [pzl1_ys]
else: # For continuous head
pz_ys = [pz2_z1s[0]]
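    # Descriptive note (added): chain the conditionals recursively,
    # p(z^{l+1} | y, s) = sum over the MC draws of z^{l} of
    # p(z^{l} | y, s) * p(z^{l+1} | z^{l}, s).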
for l in range(L):
pz_ys_l = np.expand_dims(pz_ys[l], 2)
pz2_z1s_l = pz2_z1s[l + 1][n_axis]
        # Sum over the MC draws of z^{l} (this was erroneously a mean in a
        # previous version)
pz_ys.append((pz_ys_l * pz2_z1s_l).sum(1))
return pz_ys
def fy_zs(pz_ys, py_s):
''' Compute p(y^h | z^{(l)h}, s^h)
pz_ys (list of ndarrays): p(z^{(l)h} |y^h, s^h)
py_s (ndarray): p(y^h| s^h)
----------------------------------------------------------------
returns (list of ndarrays): p(y^h | z^{(l)h}, s^h)
'''
L = len(pz_ys)
epsilon = 1E-16
fy_zs = []
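    # Descriptive note (added): Bayes inversion, p(y | z, s) is proportional to
    # p(z | y, s) * p(y | s); the sum over the observations below plays the
    # role of the normalizing constant.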
for l in range(L):
norm_cste = (pz_ys[l] * np.expand_dims(py_s, 1)).sum(0, keepdims = True)
norm_cste = np.where(norm_cste <= epsilon, epsilon, norm_cste)
        # TODO: check this normalizing constant
fy_zsl = pz_ys[l] * np.expand_dims(py_s, 1) / norm_cste
fy_zs.append(fy_zsl)
return fy_zs
def fz_yCyDs(py_zs_c, pz_ys_d, py_s_c, M, S_1L, L):
''' Compute p(zt | yC, yD, sC, sD) for all common layers
py_zs_c (list of ndarrays): p(y^C | z^C, s^C)
pz_ys_d (list of ndarrays): p(z^D | y^D, s^D)
py_s_c (ndarray): p(y^C | s^C)
M (dict): The number of MC points to draw on each head or tail
S_1L (dict): The number of paths starting at each layer
L (dict): The number of layers on each head or tail
----------------------------------------------------------------
returns (list of ndarrays): p(zt | yC, yD, sC, sD)
'''
epsilon = 1E-16
numobs = py_zs_c[0].shape[0]
fz_yCyDs = []
for l in range(L['t']):
#====================================================
# Reshaping
#====================================================
py_zs_c_l = py_zs_c[L['c'] + l].reshape(numobs, M['c'][L['c'] + l + 1], \
S_1L['c'][0]// S_1L['t'][0], S_1L['t'][0],\
order = 'C')
pz_ys_d_l = pz_ys_d[L['d'] + l].reshape(numobs, M['d'][L['d'] + l], \
S_1L['d'][0]// S_1L['t'][0], S_1L['t'][0],\
order = 'C')
py_zs_c_l = np.expand_dims(py_zs_c_l, 3)
pz_ys_d_l = np.expand_dims(pz_ys_d_l, 2)
# The shape is (numobs, M[l], S^C, S^D, S^t)
fz_yCyDs_l = py_zs_c_l * pz_ys_d_l
        # TODO: check the normalization over axis 4
norm_cste = fz_yCyDs_l.sum((1, 3, 4), keepdims = True)
norm_cste = np.where(norm_cste <= epsilon, epsilon, norm_cste)
fz_yCyDs_l = fz_yCyDs_l / norm_cste
fz_yCyDs.append(fz_yCyDs_l)
return fz_yCyDs
def fst_yCyD(py_zs_c, py_zs_d, pz_s_d, w_s_c, w_s_d, k_1L, L):
    ''' Compute p(s^t | y^C, y^D)
py_zs_c (list of ndarrays): p(y^C | z^C, s^C)
py_zs_d (list of ndarrays): p(y^D | z^D, s^D)
pz_s_d (list of ndarrays): p(z^D | s^D)
w_s_* (list of length s1): The path probabilities for all s in S^h
of the given head or tail *
    k_1L (list of int): The number of components on each layer including the common layers
L (dict): The number of layers on each head or tail
----------------------------------------------------------------
returns (list of ndarrays): p(s^t | y^C, y^D)
'''
epsilon = 1E-16
numobs = py_zs_c[0].shape[0]
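    # Descriptive note (added): the tail posterior is assembled as
    # p(s^t | y^C, y^D) proportional to the sum over the z^t draws of
    # [sum_{s^C} p(y^C | z^t, s^C, s^t) p(s^C, s^t)] *
    # [sum_{s^D} p(y^D | z^t, s^D, s^t) p(s^D)] * p(z^t | s^t).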
# 1st and 2nd term
# p(y^h | zt^{(1)}, s^h, Theta) for h in {C, D}.
py_zts_c = py_zs_c[L['c']] # shape = (numobs, Mt, Sc)
Mt = py_zts_c.shape[1] # Nb of MC points of the first tail layer
py_zts_c = py_zts_c.reshape(numobs, Mt, np.prod(k_1L['c'][:L['c'] + 1]),\
np.prod(k_1L['t']), order = 'C')
py_zts_d = py_zs_d[L['d']] # shape = (numobs, Mt, Sd)
# shape = (numobs, Mt, k['d'], S['t'])
py_zts_d = py_zts_d.reshape(numobs, Mt, np.prod(k_1L['d'][:L['d']]),\
np.prod(k_1L['t']), order = 'C')
# 3rd term
# p(zt^{(1)} | s^t, Theta)
pz_s_t = pz_s_d[L['d']]
pz_s_t = pz_s_t.reshape(Mt, *k_1L['d'], order = 'C')
    # The quantities are tiled, hence take the mean rather than the sum
idx_to_mean = tuple(range(1, L['d'] + 1))
pz_s_t = pz_s_t.mean(idx_to_mean).reshape(Mt, -1, order = 'C')[n_axis]
# 4th term
# p(s^D | Theta)
ps_d = w_s_d.reshape(*k_1L['d'], order = 'C')
idx_to_sum = tuple(range(L['d'], L['d'] + L['t']))
ps_d = ps_d.sum(idx_to_sum).reshape(-1, order = 'C')[n_axis, n_axis, ..., n_axis]
# sum_s^C p(y^C | z^t, s^C, s^t) p(s^C, s^t)
ps_ct = w_s_c.reshape(np.prod(k_1L['c'][:L['c'] + 1]),\
np.prod(k_1L['t']), order = 'C')
pyst_zt_c = (py_zts_c * ps_ct[n_axis, n_axis]).sum(2)
# sum_s^D p(y^C | z^t, s^D, s^t) p(s^D)
pysD_ztst_d = (py_zts_d * ps_d).sum(2)
pst_yCyD = (pyst_zt_c * pysD_ztst_d * pz_s_t).sum(1)
# Normalization
norm_cste = np.where(pst_yCyD == 0.0, epsilon, pst_yCyD)
pst_yCyD = pst_yCyD / norm_cste.sum(1, keepdims = True)
return pst_yCyD
def E_step_DGMM_c(H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, Sc, Lc):
''' Compute the expectations of the E step for all DGMM layers of the continuous head
H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda parameters
for each layer
z_s (list of nd-arrays): z^{(l)} | s^l for all s^l and all l.
zc_s (list of nd-arrays): (z^{(l)C} | s^{(l)C} - eta{k_l}^{(l)C} for all s^l and all l.
z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
pz_ys (list of ndarrays): p(z^{l} | y, s)
pz2_z1s (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
Sc (list of int): The number of paths starting at each layer of the continuous head
Lc (list of int): The number of layers on the continuous head
------------------------------------------------------------
returns (tuple of ndarrays): E(z^{(l)} | y, s), E(z^{(l)}z^{(l+1)T} | y, s),
E(z^{(l+1)}z^{(l+1)T} | y, s),
                    E(ee^T | y, s) with e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
'''
k = [H[l].shape[0] for l in range(Lc + 1)]
    Ez_ys = [t(z_s[0], (0, 2, 1))] # E(y | y, s) = y
E_z1z2T_ys = []
E_z2z2T_ys = []
EeeT_ys = []
for l in range(Lc + 1):
# Broadcast the quantities to the right shape
z1_s = t(z_s[l], (0, 2, 1))
z1_s = np.tile(z1_s, (1, Sc[0] // Sc[l], 1))[..., n_axis]
z1c_s = np.tile(zc_s[l], (1, np.prod(k[:l]), 1))
z2_s = z_s[l + 1].transpose((0, 2, 1))
z2_s = np.tile(z2_s, (1, np.prod(k[:l + 1]), 1))[..., n_axis]
pz1_ys = pz_ys[l - 1][..., n_axis]
pz2_ys = pz_ys[l][..., n_axis]
H_formated = np.tile(H[l], (np.prod(k[:l]), 1, 1))
H_formated = np.repeat(H_formated, Sc[l + 1], axis = 0)[n_axis]
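        # Shape bookkeeping (added note, as read from the code, with S0 = Sc[0]):
        # z1_s: (M[l], S0, r[l], 1), z2_s: (M[l+1], S0, r[l+1], 1),
        # H_formated: (1, S0, r[l], r[l+1])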
#=========================================================
# E(z^{l + 1} | z^{l}, s) = sum_M^{l + 1} z^{l + 1}
#=========================================================
E_z2_z1s = z2_z1s[l].mean(1)
E_z2_z1s = np.tile(E_z2_z1s, (1, Sc[0] // Sc[l], 1))
if l == 0:
Ez_ys_l = E_z2_z1s
else:
Ez_ys_l = (pz1_ys * E_z2_z1s[n_axis]).sum(1)
Ez_ys.append(Ez_ys_l)
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | z^{l}, s)
#=========================================================
E_z2z2T_z1s = (z2_z1s[l][..., n_axis] @ \
np.expand_dims(z2_z1s[l], 3)).mean(1)
E_z2z2T_z1s = np.tile(E_z2z2T_z1s, (1, Sc[0] // Sc[l], 1, 1))
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | y, s)
#=========================================================
if l == 0:
E_z2z2T_ys.append(E_z2z2T_z1s)
else:
E_z2z2T_ys_l = (pz1_ys[..., n_axis] * \
E_z2z2T_z1s[n_axis]).sum(1)
E_z2z2T_ys.append(E_z2z2T_ys_l)
#=========================================================
# E(z^{l}z^{l + 1}^T | y, s)
#=========================================================
if l == 0: # E(y, z^{1} | y, s) = y @ E(z^{1} | y, s)
            # TODO: check that dropping the sum here is correct
E_z1z2T_ys_l = z1_s @ np.expand_dims(Ez_ys_l, axis = 2)
else:
            # TODO: check this expression
E_z1z2T_ys_l = (pz1_ys[..., n_axis] * (z1_s[n_axis] @ \
np.expand_dims(np.expand_dims(Ez_ys_l, axis = 1), 3))).sum(1)
E_z1z2T_ys.append(E_z1z2T_ys_l)
#=========================================================
# E[((z^l - eta^l) - Lambda z^{l + 1})((z^l - eta^l) - Lambda z^{l + 1})^T | y, s]
#=========================================================
e = (np.expand_dims(z1c_s, 1) - t(H_formated @ z2_s, (3, 0, 1, 2)))[..., n_axis]
eeT = e @ t(e, (0, 1, 2, 4, 3))
if l == 0:
EeeT_ys_l = (pz2_ys[...,n_axis] * eeT).sum(1)
else:
pz1z2_ys = np.expand_dims(pz_ys[l - 1], 2) * pz2_z1s[l][n_axis]
pz1z2_ys = pz1z2_ys[..., n_axis, n_axis]
EeeT_ys_l = (pz1z2_ys * eeT[n_axis]).sum((1, 2))
EeeT_ys.append(EeeT_ys_l)
return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys
def E_step_DGMM_d(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, Sd, Ld):
''' Compute the expectations of the E step for all DGMM layers of the discrete head
zl1_ys ((M1, numobs, r1, S1) nd-array): z^{(1)} | y, s
H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda parameters
for each layer
z_s (list of nd-arrays): zl | s^l for all s^l and all l.
zc_s (list of nd-arrays): (zl | s^l) - eta{k_l}^{(l)} for all s^l and all l.
z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
pz_ys (list of ndarrays): p(z^{l} | y, s)
pz2_z1s (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
Sd (list of int): The number of paths starting at each layer of the discrete head
Ld (list of int): The number of layers on the discrete head
------------------------------------------------------------
returns (tuple of ndarrays): E(z^{(l)} | y, s), E(z^{(l)}z^{(l+1)T} | y, s),
E(z^{(l+1)}z^{(l+1)T} | y, s),
                    E(ee^T | y, s) with e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
'''
k = [H[l].shape[0] for l in range(Ld)]
Ez_ys = []
E_z1z2T_ys = []
E_z2z2T_ys = []
EeeT_ys = []
if len(zl1_ys) != 0:
Ez_ys.append(t(np.mean(zl1_ys, axis = 0), (0, 2, 1)))
for l in range(Ld):
#=========================================================
# Broadcast the quantities to the right shape
#=========================================================
z1_s = z_s[l].transpose((0, 2, 1))[..., n_axis]
        z1_s = np.tile(z1_s, (1, np.prod(k[:l]), 1, 1)) # TODO: recheck when L > 3
z1c_s = np.tile(zc_s[l], (1, np.prod(k[:l]), 1))
z2_s = t(z_s[l + 1], (0, 2, 1))
z2_s = np.tile(z2_s, (1, Sd[0] // Sd[l + 1], 1))[..., n_axis]
pz1_ys = pz_ys[l][..., n_axis]
H_formated = np.tile(H[l], (np.prod(k[:l]), 1, 1))
H_formated = np.repeat(H_formated, Sd[l + 1], axis = 0)[n_axis]
#=========================================================
# E(z^{l + 1} | z^{l}, s) = sum_M^{l + 1} z^{l + 1}
#=========================================================
E_z2_z1s = z2_z1s[l].mean(1)
E_z2_z1s = np.tile(E_z2_z1s, (1, Sd[0] // Sd[l], 1))
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | z^{l}, s)
#=========================================================
E_z2z2T_z1s = (z2_z1s[l][..., n_axis] @ \
np.expand_dims(z2_z1s[l], 3)).mean(1)
E_z2z2T_z1s = np.tile(E_z2z2T_z1s, (1, Sd[0] // Sd[l], 1, 1))
#==========================================================
#### E(z^{l + 1} | y, s) = integral_z^l [ p(z^l | y, s) * E(z^{l + 1} | z^l, s) ]
#==========================================================
E_z2_ys_l = (pz1_ys * E_z2_z1s[n_axis]).sum(1)
Ez_ys.append(E_z2_ys_l)
#=========================================================
# E(z^{l}z^{l + 1}^T | y, s)
#=========================================================
E_z1z2T_ys_l = (pz1_ys[..., n_axis] * \
(z1_s @ np.expand_dims(E_z2_z1s, 2))[n_axis]).sum(1)
E_z1z2T_ys.append(E_z1z2T_ys_l)
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | y, s)
#=========================================================
E_z2z2T_ys_l = (pz1_ys[..., n_axis] * E_z2z2T_z1s[n_axis]).sum(1)
E_z2z2T_ys.append(E_z2z2T_ys_l)
#=========================================================
# E[((z^l - eta^l) - Lambda z^{l + 1})((z^l - eta^l) - Lambda z^{l + 1})^T | y, s]
#=========================================================
pz1z2_ys = np.expand_dims(pz_ys[l], 2) * pz2_z1s[l + 1][n_axis]
pz1z2_ys = pz1z2_ys[..., n_axis, n_axis]
e = (np.expand_dims(z1c_s, 1) - t(H_formated @ z2_s, (3, 0, 1, 2)))[..., n_axis]
eeT = e @ t(e, (0, 1, 2, 4, 3))
EeeT_ys_l = (pz1z2_ys * eeT[n_axis]).sum((1, 2))
EeeT_ys.append(EeeT_ys_l)
return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys
def E_step_DGMM_t(H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1st, S_1L, L, k_1L):
    ''' Compute the expectations of the E step for all DGMM layers of the common tail
H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda parameters
for each layer
z_s (list of nd-arrays): zl | s^l for all s^l and all l.
zc_s (list of nd-arrays): (zl | s^l) - eta{k_l}^{(l)} for all s^l and all l.
z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
pz_ys (list of ndarrays): p(z^{l} | y, s)
pz2_z1st (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
S_1L (dict): The number of paths starting at each layer in the network
L (dict): The number of layers on each head or tail
    k_1L (list of int): The number of components on each layer including the common layers
------------------------------------------------------------
returns (tuple of ndarrays): E(z^{(l)} | y, s), E(z^{(l)}z^{(l+1)T} | y, s),
E(z^{(l+1)}z^{(l+1)T} | y, s),
                    E(ee^T | y, s) with e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
'''
Ez_ys = []
E_z1z2T_ys = []
E_z2z2T_ys = []
EeeT_ys = []
kt = k_1L['t']
for l in range(L['t'] - 1):
#===============================================
# Broadcast the quantities to the right shape
#===============================================
z1_s = z_s[l].transpose((0, 2, 1))
z1_s = np.expand_dims(np.expand_dims(z1_s, 1), 2)
z1_s = np.tile(z1_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l], 1))
z1c_s = np.expand_dims(np.expand_dims(zc_s[l], 1), 2)
z1c_s = np.tile(z1c_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l], 1))
z2_s = t(z_s[l + 1], (0, 2, 1))
z2_s = np.expand_dims(np.expand_dims(z2_s, 1), 2)
z2_s = np.tile(z2_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l + 1], 1))
pz1_ys = pz_ys[l][..., n_axis]
pz2_ys = pz_ys[l + 1][..., n_axis]
pz2_z1s = np.expand_dims(np.expand_dims(pz2_z1st[l], 2), 2)[n_axis]
if l == 0:
Ez_ys_l = (pz1_ys * z1_s[n_axis]).sum(1)
Ez_ys.append(Ez_ys_l)
H_formated = np.tile(H[l], (np.prod(kt[: l]), 1, 1))
H_formated = np.repeat(H_formated, S_1L['t'][l + 1], axis = 0)
H_formated = H_formated[n_axis, n_axis, n_axis]
#===============================================
# Compute the expectations
#===============================================
#=========================================================
# E(z^{l + 1} | z^{l}, s) = sum_M^{l + 1} z^{l + 1}
#=========================================================
E_z2_z1s = z2_z1s[l].mean(1)
E_z2_z1s = np.tile(E_z2_z1s, (1, S_1L['t'][0] // S_1L['t'][l], 1))
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | z^{l}, s)
#=========================================================
E_z2z2T_z1s = (z2_z1s[l][..., n_axis] @ \
np.expand_dims(z2_z1s[l], 3)).mean(1)
E_z2z2T_z1s = np.tile(E_z2z2T_z1s, (1, S_1L['t'][0] // S_1L['t'][l], 1, 1))
# Create a new axis for the information coming from the discrete head
E_z2_z1s = np.expand_dims(np.expand_dims(E_z2_z1s, 1), 1)
E_z2z2T_z1s = np.expand_dims(np.expand_dims(E_z2z2T_z1s, 1), 1)
#==========================================================
# E(z^{l + 1} | y, s) = integral_z^l [ p(z^l | y, s) * E(z^{l + 1} | z^l, s) ]
#==========================================================
E_z2_ys_l = (pz2_ys * z2_s[n_axis]).sum(1)
Ez_ys.append(E_z2_ys_l)
#=========================================================
# E(z^{l}z^{l + 1}^T | y, s)
#=========================================================
E_z1z2T_ys_l = (pz1_ys[..., n_axis] * \
(z1_s[..., n_axis] @ np.expand_dims(E_z2_z1s, 4))[n_axis]).sum(1)
E_z1z2T_ys.append(E_z1z2T_ys_l)
#=========================================================
# E(z^{l + 1}z^{l + 1}^T | y, s)
#=========================================================
E_z2z2T_ys_l = (pz1_ys[..., n_axis] * E_z2z2T_z1s[n_axis]).sum(1)
E_z2z2T_ys.append(E_z2z2T_ys_l)
#=========================================================
# E[((z^l - eta^l) - Lambda z^{l + 1})((z^l - eta^l) - Lambda z^{l + 1})^T | y, s]
#=========================================================
pz1z2_ys = np.expand_dims(pz_ys[l], 2) * pz2_z1s
pz1z2_ys = pz1z2_ys[..., n_axis, n_axis]
e = np.expand_dims(z1c_s, 1) - t(H_formated @ z2_s[..., n_axis], (5, 0, 1, 2, 3, 4))
e = e[..., n_axis]
eeT = e @ t(e, (0, 1, 2, 3, 4, 6, 5))
eeT = eeT[n_axis]
EeeT_ys_l = (pz1z2_ys * eeT).sum((1, 2))
EeeT_ys.append(EeeT_ys_l)
return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys
def M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y, H_old, k, L_1Lh, rh):
'''
Compute the estimators of eta, Lambda and Psi for all components and all layers of both heads
Ez_ys (list of ndarrays): E(z^{(l)} | y, s) for all (l,s)
E_z1z2T_ys (list of ndarrays): E(z^{(l)}z^{(l+1)T} | y, s)
E_z2z2T_ys (list of ndarrays): E(z^{(l+1)}z^{(l+1)T} | y, s)
    EeeT_ys (list of ndarrays): E(ee^T | y, s) with
                e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
ps_y ((numobs, S) nd-array): p(s | y) for all s in Omega
H_old (list of ndarrays): The previous iteration values of Lambda estimators
    k (dict): The number of components on each layer
    L_1Lh (list of int): The number of layers starting from the head h until the end of the common tail
rh (list): The dimension of each layer of the head h
--------------------------------------------------------------------------
returns (list of ndarrays): The new estimators of eta, Lambda and Psi
for all components and all layers
'''
epsilon = 1E-16
Lh = len(E_z1z2T_ys)
numobs = len(Ez_ys[0])
eta = []
H = []
psi = []
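    # Descriptive note (added): the closed-form updates implemented below are,
    # per component, eta = weighted mean of E(z1|y,s) - Lambda_old E(z2|y,s),
    # Lambda = weighted solution of the normal equations via pinv(E(z2 z2^T|y,s)),
    # and Psi = weighted mean of E(ee^T|y,s), with weights p(s|y).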
for l in range(Lh):
#===============================================
# Broadcast the quantities to the right shape
#===============================================
Ez1_ys_l = Ez_ys[l].reshape(numobs, *k, rh[l], order = 'C')
Ez2_ys_l = Ez_ys[l + 1].reshape(numobs, *k, rh[l + 1], order = 'C')
E_z1z2T_ys_l = E_z1z2T_ys[l].reshape(numobs, *k, rh[l], rh[l + 1], order = 'C')
E_z2z2T_ys_l = E_z2z2T_ys[l].reshape(numobs, *k, rh[l + 1], rh[l + 1], order = 'C')
EeeT_ys_l = EeeT_ys[l].reshape(numobs, *k, rh[l], rh[l], order = 'C')
# Sum all the path going through the layer
idx_to_sum = tuple(set(range(1, L_1Lh)) - set([l + 1]))
ps_yl = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)[..., n_axis, n_axis]
# Compute common denominator
den = ps_yl.sum(0)
den = np.where(den < epsilon, epsilon, den)
#===============================================
# eta estimator
#===============================================
eta_num = Ez1_ys_l.sum(idx_to_sum)[..., n_axis] -\
H_old[l][n_axis] @ Ez2_ys_l.sum(idx_to_sum)[..., n_axis]
eta_new = (ps_yl * eta_num).sum(0) / den
eta.append(eta_new)
#===============================================
# Lambda estimator
#===============================================
H_num = E_z1z2T_ys_l.sum(idx_to_sum) - \
eta_new[n_axis] @ np.expand_dims(Ez2_ys_l.sum(idx_to_sum), 2)
H_new = (ps_yl * H_num @ pinv(E_z2z2T_ys_l.sum(idx_to_sum))).sum(0) / den
H.append(H_new)
#===============================================
# Psi estimator
#===============================================
psi_new = (ps_yl * EeeT_ys_l.sum(idx_to_sum)).sum(0) / den
psi.append(psi_new)
return eta, H, psi
# TODO: L and L_1L are redundant; merge them
def M_step_DGMM_t(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, \
ps_y_c, ps_y_d, pst_yCyD, H_old, S_1L, k_1L, L_1L, L, rh):
'''
Compute the estimators of eta, Lambda and Psi for all components and all layers of the tail
Ez_ys (list of ndarrays): E(z^{(l)} | y, s) for all (l,s)
E_z1z2T_ys (list of ndarrays): E(z^{(l)}z^{(l+1)T} | y, s)
E_z2z2T_ys (list of ndarrays): E(z^{(l+1)}z^{(l+1)T} | y, s)
    EeeT_ys (list of ndarrays): E(ee^T | y, s) with
                e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
ps_y_* ((numobs, S) nd-array): p(s | y) for all s in Omega^*
pst_yCyD ((numobs, S) nd-array): p(s^t | y) for all s in the tail
H_old (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
estimators of the previous iteration for each layer
S_1L (dict): The number of paths starting at each layer until the common tail
    k_1L (list of int): The number of components on each layer including the common tail
L_1L (dict): The number of layers where the lists include the heads and the tail layers
L (dict): The number of layers in the networks (to delete in future versions)
rh (list): The dimension of each layer of the head h
--------------------------------------------------------------------------
returns (tuple of length 4): The new estimators of eta, Lambda and Psi
for all components and all layers
                                    and the associated latent expectations
'''
Lh = len(E_z1z2T_ys)
numobs = len(Ez_ys[0])
eta = []
H = []
psi = []
#==========================================================================
# Broadcast path probabilities
#==========================================================================
psc_y = ps_y_c.reshape(numobs, S_1L['c'][0] // S_1L['t'][0],\
S_1L['t'][0], order = 'C')
psd_y = ps_y_d.reshape(numobs, S_1L['d'][0] // S_1L['t'][0],\
S_1L['t'][0], order = 'C')
psc_y = psc_y.sum(-1, keepdims = True)
psc_y = np.expand_dims(psc_y, 2)
psd_y = psd_y.sum(-1, keepdims = True)
psd_y = np.expand_dims(psd_y, 1)
# p(sC, sD, st) = p(sC | yC) p(sD | yD) p(st | yC, yD)
    # TODO: decide whether a normalization is needed here
psCsDst_y = psc_y * psd_y * np.expand_dims(np.expand_dims(pst_yCyD, 1), 1)
Ezst_y = []
for l in range(Lh):
#======================================================================
# Compute the full expectations multiplying by p(sC, sD, st | yC, yD)
#======================================================================
Ez1_yst = (psCsDst_y[..., n_axis] * Ez_ys[l]).sum((1, 2))
Ezst_y.append(deepcopy(Ez1_yst))
Ez2_yst = (psCsDst_y[..., n_axis] * Ez_ys[l + 1]).sum((1, 2))
E_z1z2T_yst = (psCsDst_y[..., n_axis, n_axis] * \
E_z1z2T_ys[l]).sum((1, 2))
E_z2z2T_yst = (psCsDst_y[..., n_axis, n_axis] * \
E_z2z2T_ys[l]).sum((1, 2))
EeeT_yst = (psCsDst_y[..., n_axis, n_axis] * \
EeeT_ys[l]).sum((1, 2))
#======================================================================
# Broadcast the expectations
#======================================================================
Ez1_yst = Ez1_yst.reshape(numobs, *k_1L['t'], rh[l], order = 'C')
Ez2_yst = Ez2_yst.reshape(numobs, *k_1L['t'], rh[l + 1], order = 'C')
E_z1z2T_yst = E_z1z2T_yst.reshape(numobs, *k_1L['t'], rh[l], rh[l + 1], order = 'C')
E_z2z2T_yst = E_z2z2T_yst.reshape(numobs, *k_1L['t'], rh[l + 1], rh[l + 1], order = 'C')
EeeT_yst = EeeT_yst.reshape(numobs, *k_1L['t'], rh[l], rh[l], order = 'C')
# Sum all the paths going through the components of the layer
idx_to_sum = tuple(set(range(1, L_1L['t'] + 1)) - set([l + 1]))
#======================================================================
# Compute the estimators
#======================================================================
# One common denominator for all estimators
den = psCsDst_y.sum((1, 2))
den = den.reshape(numobs, *k_1L['t'], order = 'C').sum(idx_to_sum)
den = den.sum(0)[..., n_axis, n_axis]
den = np.where(den < 1E-14, 1E-14, den)
# eta estimator
eta_num = Ez1_yst.sum(idx_to_sum)[..., n_axis] -\
H_old[l][n_axis] @ Ez2_yst.sum(idx_to_sum)[..., n_axis]
eta_new = eta_num.sum(0) / den
eta.append(eta_new)
# Lambda estimator
H_num = E_z1z2T_yst.sum(idx_to_sum) - \
eta_new[n_axis] @ np.expand_dims(Ez2_yst.sum(idx_to_sum), 2)
        try:
            H_new = (H_num @ pinv(E_z2z2T_yst.sum(idx_to_sum), rcond=1e-2)).sum(0) / den
        except Exception as err:
            print(E_z2z2T_yst.sum(idx_to_sum))
            raise RuntimeError('Overflow in the Lambda update') from err
H.append(H_new)
# Psi estimator
psi_new = EeeT_yst.sum(idx_to_sum).sum(0) / den
psi.append(psi_new)
return eta, H, psi, Ezst_y
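#=============================================================================
# Hedged usage sketch
#=============================================================================
if __name__ == '__main__':
    # Minimal smoke test on toy parameters (added); every value below is an
    # illustrative assumption, not a setting from the original experiments.
    _example_draw_z_s()
    _example_draw_z2_z1s()
    _example_fz_s()
    # continuous_lik on standard-normal toy data with S0 = 3 equal-weight paths
    n, p, S0 = 20, 2, 3
    yc = multivariate_normal(mean = np.zeros(p), cov = np.eye(p), size = n)
    mu = np.zeros((S0, p, 1))
    sigma = np.stack([np.eye(p)] * S0)
    w = np.full(S0, 1 / S0)
    ps_y, py_s, py = continuous_lik(yc, mu, sigma, w)
    assert ps_y.shape == (n, S0) and py_s.shape == (n, S0) and py.shape == (n, 1)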