diff --git a/examples/20231102_multinom.qmd b/examples/20231102_multinom.qmd
new file mode 100644
index 0000000..fa1d0c8
--- /dev/null
+++ b/examples/20231102_multinom.qmd
@@ -0,0 +1,109 @@
+## Experimentation with a multinomial model
+
+```{python}
+import pandas as pd
+import pymc as pm
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import arviz as az
+import statsmodels.api as sm
+
+import matplotlib.dates as mdates
+import matplotlib.ticker as ticker
+import matplotlib.cm as cm
+
+import covvfit as cv
+```
+
+Load the data:
+```{python}
+data_path = '../private/data/robust_deconv2_noisy13.csv'
+
+variants = [
+#    'B.1.1.7', 'B.1.351', 'P.1', 'undetermined',
+    'B.1.617.2', 'BA.1', 'BA.2', 'BA.4', 'BA.5', 'BA.2.75',
+    'BQ.1.1', 'XBB.1.5', 'XBB.1.9', 'XBB.1.16', 'XBB.2.3', 'EG.5', "BA.2.86"
+]
+
+cities = ['Lugano (TI)', 'Zürich (ZH)', 'Chur (GR)', 'Altenrhein (SG)',
+          'Laupen (BE)', 'Genève (GE)', 'Basel (BS)', 'Porrentruy (JU)',
+          'Lausanne (VD)', 'Bern (BE)', 'Luzern (LU)', 'Solothurn (SO)',
+          'Neuchâtel (NE)', 'Schwyz (SZ)']
+
+
+data = cv.load_data(data_path)
+data2 = cv.preprocess_df(data, cities, variants, date_min='2021-11-01')
+
+ts_lst, ys_lst = cv.make_data_list(data2, cities, variants)
+```
+
+
+Let's select one city only:
+```{python}
+ys = ys_lst[1]
+ys = ys / ys.sum(0)
+ts = ts_lst[1]
+```
+
+Now we can create a model for this one city:
+```{python}
+from pymc.distributions.dist_math import factln
+
+# model for just one city
+def create_model5(
+    ts_lst,
+    ys_lst,
+    n=1.0,
+    coords={
+#        "cities": cities,
+        "variants": variants,
+    },
+    n_pred=60,
+):
+    ts_pred = np.arange(n_pred) + ts_lst.max()
+    with pm.Model(coords=coords) as model:
+#        sigma_var = pm.InverseGamma("sigma", alpha=2.1, beta=0.015, dims=["cities", "variants"])
+        midpoint_var = pm.Normal("midpoint", mu=0.0, sigma=500.0, dims="variants")
+#        midpoint_sig = pm.InverseGamma("midpoint_sig", alpha=7.0, beta=60.0)
+        rate_var = pm.Gamma("rate", mu=0.15, sigma=0.1, dims="variants")
+#        rate_sig = pm.InverseGamma("rate_sigma", alpha=2.0005, beta=0.05)
+        n_eff_inv = pm.InverseGamma("n_eff_inv", alpha=20.0, beta=2.0)
+        n_eff = pm.Deterministic("n_eff", 1 / n_eff_inv)
+#        n_eff = pm.TruncatedNormal("n_eff", mu=10, sigma=10, lower=1.0)
+#        n_eff = pm.Gamma("n_eff", alpha=1000, beta=100)
+
+        # Kaan's trick to avoid overflows
+        def softmax(x, rates, midpoints):
+            E = rates[:, None] * (x - midpoints[:, None])
+            E_max = E.max(axis=0)
+            un_norm = pm.math.exp(E - E_max)
+            return un_norm / (pm.math.sum(un_norm, axis=0))
+
+        ys_smooth = pm.Deterministic("ys_ideal", softmax(ts_lst, rate_var, midpoint_var), dims="variants")
+        ys_pred = pm.Deterministic("ys_pred", softmax(ts_pred, rate_var, midpoint_var), dims="variants")
+#        ys_wiggly = pm.Beta("ys_wiggly", mu=ys_smooth, nu=n_eff)
+
+        # make Multinom/n likelihood
+        def log_likelihood(y, p, n):
+            return n * pm.math.sum(y * pm.math.log(p) - factln(n * y), axis=0) + pm.math.log(n) + factln(n)
+
+        ys_noisy = pm.DensityDist(
+            "ys_noisy",
+            ys_smooth,
+            n_eff,
+            logp=log_likelihood,
+            observed=ys_lst,
+        )
+
+    return model
+
+
+
+with create_model5(ts, ys, coords={
+        "variants": variants,
+    }):
+    idata_posterior = pm.sample(random_seed=65, chains=2, tune=500, draws=500)
+```
+
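+Let's have a quick look at the posterior. This is only a minimal sketch, assuming the sampling chunk above ran and produced `idata_posterior`:
+
+```{python}
+# Illustrative posterior checks (not part of the model itself):
+# summarize the growth rates, midpoints and the `n_eff` parameter,
+# and inspect the traces for mixing problems.
+az.summary(idata_posterior, var_names=["rate", "midpoint", "n_eff"])
+az.plot_trace(idata_posterior, var_names=["rate", "n_eff"])
+plt.tight_layout()
+plt.show()
+```
+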
diff --git a/examples/frequentist_fitting.qmd b/examples/frequentist_fitting.qmd
new file mode 100644
index 0000000..629a891
--- /dev/null
+++ b/examples/frequentist_fitting.qmd
@@ -0,0 +1,332 @@
+## Frequentist fitting
+
+```{python}
+import pandas as pd
+import pymc as pm
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import arviz as az
+import statsmodels.api as sm
+
+import matplotlib.dates as mdates
+import matplotlib.ticker as ticker
+import matplotlib.cm as cm
+import matplotlib.lines as mlines
+import matplotlib.patches as mpatches
+
+from scipy.special import expit
+
+import covvfit
+
+from covvfit.plotting import colors_covsp
+
+data_path = "../private/data/robust_deconv2_noisy14.csv"
+
+cities = ['Lugano (TI)', 'Zürich (ZH)', 'Chur (GR)', 'Altenrhein (SG)',
+          'Laupen (BE)', 'Genève (GE)', 'Basel (BS)', 'Porrentruy (JU)',
+          'Lausanne (VD)', 'Bern (BE)', 'Luzern (LU)', 'Solothurn (SO)',
+          'Neuchâtel (NE)', 'Schwyz (SZ)']
+
+variants_full = [
+    'B.1.1.7', 'B.1.351', 'P.1',
+    'B.1.617.2', 'BA.1', 'BA.2', 'BA.4', 'BA.5', 'BA.2.75',
+    'BQ.1.1',
+    'XBB.1.5', 'XBB.1.9', 'XBB.1.16', 'XBB.2.3', 'EG.5', "BA.2.86", "JN.1"
+]
+
+variants = [
+    'XBB.1.5', 'XBB.1.9', 'XBB.1.16', 'XBB.2.3', 'EG.5', "BA.2.86", "JN.1"
+]
+
+variants_other = [var for var in variants_full if var not in variants]
+```
+
+Now, let's load the data:
+```{python}
+data = covvfit.load_data(data_path)
+variants2 = ['other'] + variants
+data2 = covvfit.preprocess_df(data, cities, variants_full, date_min='2023-04-01')
+data2["other"] = data2[variants_other].sum(axis=1)
+data2[variants2] = data2[variants2].div(data2[variants2].sum(axis=1), axis=0)
+
+ts_lst, ys_lst = covvfit.make_data_list(data2, cities=cities, variants=variants)
+ts_lst2, ys_lst2 = covvfit.make_data_list(data2, cities=cities, variants=variants2)
+```
+
+This model takes into account the complement of the monitored variants and fixes its fitness to zero. However, due to the `pm.math.concatenate` operation, we cannot use it for finding the Hessian matrix:
+
+```{python}
+def create_model_fixed2(
+    ts_lst,
+    ys_lst,
+    n=1.0,
+    coords={
+        "cities": [],
+        "variants": [],
+    },
+    n_pred=60,
+):
+    """function to create a fixed effect model with varying intercepts and one rate vector"""
+    with pm.Model(coords=coords) as model:
+        midpoint_var = pm.Normal("midpoint", mu=0.0, sigma=300.0, dims=["cities", "variants"])
+        rate_var = pm.Gamma("rate", mu=0.15, sigma=0.1, dims="variants")
+
+        # Kaan's trick to avoid overflows
+        def softmax(x, rates, midpoints):
+            E = rates[:, None] * x + midpoints[:, None]
+            E_max = E.max(axis=0)
+            un_norm = pm.math.exp(E - E_max)
+            return un_norm / (pm.math.sum(un_norm, axis=0))
+
+        ys_smooth = [softmax(
+            ts_lst[i],
+            pm.math.concatenate([[0], rate_var]),
+            pm.math.concatenate([[0], midpoint_var[i, :]]),
+        ) for i, city in enumerate(coords["cities"])]
+
+        # make Multinom/n likelihood
+        def log_likelihood(y, p, n):
+            # return n*pm.math.sum(y * pm.math.log(p), axis=0) + n*(1-pm.math.sum(y, axis=0))*pm.math.log(1-pm.math.sum(p, axis=0))
+            return n * pm.math.sum(y * pm.math.log(p), axis=0)
+
+        ys_noisy = [pm.DensityDist(
+            f"ys_noisy_{city}",
+            ys_smooth[i],
+            n,
+            logp=log_likelihood,
+            observed=ys_lst[i],
+        ) for i, city in enumerate(coords["cities"])]
+
+    return model
+```
+
+Let's run this model:
+```{python}
+
+n_iterations = 5_000
+
+# TODO(Pawel, David): The shapes don't work
+# if we use ts_lst and ys_lst...
+
+with create_model_fixed2(ts_lst2, ys_lst2, coords={
+        "cities": cities,
+        "variants": variants,
+    }):
+    model_map_fixed = pm.find_MAP(maxeval=n_iterations, seed=12313)
+```
+
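+Before quantifying uncertainty, it is worth sanity-checking the MAP estimate. The chunk below is an illustrative sketch (it assumes `find_MAP` above converged): it turns the per-day rates into approximate weekly growth advantages over the reference class `'other'`, using the same `g = 7` days as `make_fitness_confints` later on:
+
+```{python}
+# Quick sanity check of the MAP fit (illustrative sketch).
+# Rates are per day; exp(7 * rate) - 1 is the approximate weekly
+# growth advantage over the zero-rate reference class ('other').
+rates_map = model_map_fixed["rate"]
+for variant, rate in zip(variants, rates_map):
+    print(f"{variant}: rate/day = {rate:.3f}, weekly advantage ~ {100 * (np.exp(7 * rate) - 1):.1f}%")
+```
+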
+This model, too, takes into account the complement of the monitored variants and fixes its fitness to zero. However, it has some numerical instabilities that make it unsuitable for finding the MAP or MLE, so we use it only for the Hessian.
+
+```{python}
+def create_model_fixed3(
+    ts_lst,
+    ys_lst,
+    n=1.0,
+    coords={
+        "cities": [],
+        "variants": [],
+    },
+    n_pred=60,
+):
+    """function to create a fixed effect model with varying intercepts and one rate vector"""
+    with pm.Model(coords=coords) as model:
+        midpoint_var = pm.Normal("midpoint", mu=0.0, sigma=1500.0, dims=["cities", "variants"])
+        rate_var = pm.Gamma("rate", mu=0.15, sigma=0.1, dims="variants")
+
+        # Kaan's trick to avoid overflows
+        def softmax_1(x, rates, midpoints):
+            E = rates[:, None] * x + midpoints[:, None]
+            E_max = E.max(axis=0)
+            un_norm = pm.math.exp(E - E_max)
+            return un_norm / (pm.math.exp(-E_max) + pm.math.sum(un_norm, axis=0))
+
+        ys_smooth = [softmax_1(ts_lst[i], rate_var, midpoint_var[i, :]) for i, city in enumerate(coords["cities"])]
+
+        # make Multinom/n likelihood
+        def log_likelihood(y, p, n):
+            return n * pm.math.sum(y * pm.math.log(p), axis=0) + n * (1 - pm.math.sum(y, axis=0)) * pm.math.log(1 - pm.math.sum(p, axis=0))
+#            return n*pm.math.sum(y * pm.math.log(p), axis=0)
+
+        ys_noisy = [pm.DensityDist(
+            f"ys_noisy_{city}",
+            ys_smooth[i],
+            n,
+            logp=log_likelihood,
+            observed=ys_lst[i],
+        ) for i, city in enumerate(coords["cities"])]
+
+    return model
+
+
+with create_model_fixed3(ts_lst2, ys_lst2, coords={
+        "cities": cities,
+        "variants": variants2,
+    }):
+    model_hessian_fixed = pm.find_hessian(
+        model_map_fixed
+    )
+
+# TODO(Pawel): I get an error stemming from `pm.find_hessian`...
+```
+
+Let's calculate the predictions:
+```{python}
+y_fit_lst = covvfit.freq.fitted_values(ts_lst, model_map_fixed, cities)
+ts_pred_lst, y_pred_lst = covvfit.freq.pred_values([i.max() - 1 for i in ts_lst], model_map_fixed, cities, horizon=60)
+```
+
+Now we'll try to calculate the uncertainty (but it won't work if the Hessian couldn't be calculated):
+```{python}
+pearson_r_lst, overdisp_list, overdisp_fixed = covvfit.freq.compute_overdispersion(ys_lst2, y_fit_lst, cities)
+fitness_diff, fitness_diff_se, fitness_diff_lower, fitness_diff_upper = covvfit.freq.make_fitness_confints(model_map_fixed['rate'], model_hessian_fixed, overdisp_fixed, g=7.0)
+```
+
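+The matrix `fitness_diff` holds pairwise advantages: entry `(j, i)` is `exp(g * (rate_j - rate_i)) - 1`, the weekly growth advantage of the row variant over the column variant. A quick way to inspect it (an illustrative sketch, assuming the chunk above ran) is to label it with the variant names:
+
+```{python}
+# Pairwise weekly growth advantages (row variant vs. column variant).
+# Illustrative only: assumes `fitness_diff` was computed above.
+pd.DataFrame(fitness_diff, index=variants, columns=variants).round(3)
+```
+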
+Finally, we will visualize the plots:
+
+```{python}
+fig, axes_tot = plt.subplots(5, 3, figsize=(22, 15))
+# colors = default_cmap = plt.cm.get_cmap('tab10').colors
+colors = [colors_covsp[var] for var in variants]
+# axes=[axes_tot]
+axes = axes_tot.flatten()
+p_variants = len(variants)
+p_params = model_hessian_fixed.shape[0]
+model_hessian_inv = np.linalg.inv(model_hessian_fixed)
+
+for k, city in enumerate(cities):
+    ax = axes[k + 1]
+    y_fit = y_fit_lst[k]
+    ts = ts_lst[k]
+    ts_pred = ts_pred_lst[k]
+    y_pred = y_pred_lst[k]
+    ys = ys_lst2[k]
+    hessian_indices = np.concatenate([np.arange(p_variants) + k * p_variants, np.arange(model_hessian_fixed.shape[0] - p_variants, p_params)])
+    tmp_hessian = model_hessian_inv[hessian_indices, :][:, hessian_indices]
+    y_fit_logit = (np.log(y_fit) - np.log(1 - y_fit))
+    logit_se = np.array([covvfit.freq.project_se(model_map_fixed['rate'], model_map_fixed['midpoint'][k, :], t, tmp_hessian, overdisp_list[k]) for t in ts]).T
+    y_pred_logit = (np.log(y_pred) - np.log(1 - y_pred))
+    logit_se_pred = np.array([covvfit.freq.project_se(model_map_fixed['rate'], model_map_fixed['midpoint'][k, :], t, tmp_hessian, overdisp_list[k]) for t in ts_pred]).T
+
+    for i, variant in enumerate(variants):
+
+        # grid
+        ax.vlines(x=(pd.date_range(start='2021-11-01', end='2024-02-01', freq='MS') - pd.to_datetime('2023-01-01')).days,
+                  ymin=-0.05, ymax=1.05, color="grey", alpha=0.02)
+        ax.hlines(y=[0, 0.25, 0.5, 0.75, 1],
+                  xmin=(pd.to_datetime('2021-10-10') - pd.to_datetime('2023-01-01')).days,
+                  xmax=(pd.to_datetime('2024-02-20') - pd.to_datetime('2023-01-01')).days,
+                  color="grey", alpha=0.02)
+        ax.fill_between(x=ts_pred, y1=0, y2=1, color="grey", alpha=0.01)
+
+        # plot fitted
+        sorted_indices = np.argsort(ts)
+        ax.plot(ts[sorted_indices], y_fit[i, :][sorted_indices], color=colors[i], label="fit")
+        # plot pred
+        ax.plot(ts_pred, y_pred[i, :], color=colors[i], linestyle="--", label="predict")
+        # plot confints
+        ax.fill_between(
+            ts[sorted_indices],
+            expit(y_fit_logit[i, :][sorted_indices] - 1.96 * logit_se[i, :][sorted_indices]),
+            expit(y_fit_logit[i, :][sorted_indices] + 1.96 * logit_se[i, :][sorted_indices]),
+            color=colors[i], alpha=0.2, label='Confidence band'
+        )
+        ax.fill_between(
+            ts_pred,
+            expit(y_pred_logit[i, :] - 1.96 * logit_se_pred[i, :]),
+            expit(y_pred_logit[i, :] + 1.96 * logit_se_pred[i, :]),
+            color=colors[i], alpha=0.2, label='Confidence band'
+        )
+
+        # plot empirical
+        ax.scatter(ts, ys[i, :], label="observed", alpha=0.5, color=colors[i], s=4)
+
+    ax.set_ylim((-0.05, 1.05))
+    ax.set_xticks((pd.date_range(start='2021-11-01', end='2023-12-01', freq='MS') - pd.to_datetime('2023-01-01')).days)
+    date_formatter = ticker.FuncFormatter(covvfit.plot.num_to_date)
+    ax.xaxis.set_major_formatter(date_formatter)
+    tick_positions = [0, 0.5, 1]
+    tick_labels = ['0%', '50%', '100%']
+    ax.set_yticks(tick_positions)
+    ax.set_yticklabels(tick_labels)
+    ax.set_ylabel("relative abundances")
+    ax.set_xlim((pd.to_datetime(['2023-03-15', '2024-01-05']) - pd.to_datetime('2023-01-01')).days)
+    ax.set_title(city)
+
+## Plot estimates
+
+ax = axes[0]
+
+fitness_diff, fitness_diff_se, fitness_diff_lower, fitness_diff_upper = covvfit.freq.make_fitness_confints(model_map_fixed['rate'], model_hessian_fixed, overdisp_fixed, g=7.0)
+
+fitness_diff = fitness_diff * 100
+fitness_diff_lower = fitness_diff_lower * 100
+fitness_diff_upper = fitness_diff_upper * 100
+
+# Get the indices for the upper triangle, starting at the diagonal (k=0)
+upper_triangle_indices = np.triu_indices_from(fitness_diff, k=0)
+
+# Assign np.nan to the upper triangle including the diagonal
+fitness_diff[upper_triangle_indices] = np.nan
+fitness_diff_lower[upper_triangle_indices] = np.nan
+fitness_diff_upper[upper_triangle_indices] = np.nan
+
+fitness_diff[:-2, :] = np.nan
+fitness_diff_lower[:-2, :] = np.nan
+fitness_diff_upper[:-2, :] = np.nan
+
+# Calculate the error (distance from the point to the error bar limit)
+error = np.array([
+    fitness_diff - fitness_diff_lower,  # Lower error
+    fitness_diff_upper - fitness_diff   # Upper error
+])
+
+# Define the width of the offset
+offset_width = 0.1
+num_sets = fitness_diff.shape[0]
+# num_sets = 2
+mid = (num_sets - 1) / 2
+
+# grid
+ax.vlines(x=np.arange(len(variants) - 1),
+          ymin=np.nanmin(fitness_diff_lower), ymax=np.nanmax(fitness_diff_upper),
+          color="grey", alpha=0.2)
+ax.hlines(y=np.arange(-25, 126, step=25),
+          xmin=-0.5,
+          xmax=len(variants) - 2 + 0.5,
+          color="grey", alpha=0.2)
+
+# Plot each set of points with error bars
+for i, y_vals in enumerate(fitness_diff):
+    # Calculate offset for each set
+    offset = (i - mid) * offset_width
+    # Create an array of x positions for this set
+#    x_positions = np.arange(len(variants)) + offset
+    x_positions = np.arange(len(variants)) + offset - 0.25
+    # Select the lower/upper errors for this row of fitness_diff (shape (2, n))
+    ax.errorbar(x_positions, y_vals, yerr=error[:, i, :], fmt='o', label=variants[i], color=colors_covsp[variants[i]])
+
+# Set the x-ticks to be at the middle of the groups of points
+ax.set_xticks(np.arange(len(variants) - 1))
+ax.set_xticklabels(variants[:-1])
+
+# Add some labels and a legend
+ax.set_xlabel('Variants')
+ax.set_ylabel('% weekly growth advantage')
+ax.set_title("growth advantages")
+
+
+
+
+fig.tight_layout()
+fig.legend(handles=covvfit.plot.make_legend(colors, variants), loc='lower center', ncol=9, bbox_to_anchor=(0.5, -0.04), frameon=False)
+
+
+fig.savefig("growth_rates20231108.pdf", bbox_inches='tight')
+
+plt.show()
+```
\ No newline at end of file
diff --git a/src/covvfit/__init__.py b/src/covvfit/__init__.py
index 1a1a6c1..1a9545c 100644
--- a/src/covvfit/__init__.py
+++ b/src/covvfit/__init__.py
@@ -1,6 +1,18 @@
 from covvfit._splines import create_spline_matrix
+from covvfit._preprocess_abundances import make_data_list, preprocess_df, load_data
+
+import covvfit._frequentist as freq
+import covvfit.plotting as plot
 
 VERSION = "0.1.0"
 
-__all__ = ["create_spline_matrix", "VERSION"]
+__all__ = [
+    "create_spline_matrix",
+    "make_data_list",
+    "preprocess_df",
+    "load_data",
+    "VERSION",
+    "freq",
+    "plot",
+]
diff --git a/src/covvfit/_frequentist.py b/src/covvfit/_frequentist.py
new file mode 100644
index 0000000..4520dae
--- /dev/null
+++ b/src/covvfit/_frequentist.py
@@ -0,0 +1,202 @@
+"""utilities to fit frequentist models"""
+import numpy as np
+import pymc as pm
+
+
+def create_model_fixed(
+    ts_lst,
+    ys_lst,
+    n: float = 1.0,
+    coords: dict | None = None,
+) -> pm.Model:
+    """Creates a fixed effect model
+    with varying intercepts and one rate vector."""
+    if n < 0:
+        raise ValueError("n must be positive")
+
+    if coords is None:
+        coords = {
+            "cities": [],
+            "variants": [],
+        }
+
+    with pm.Model(coords=coords) as model:
+        midpoint_var = pm.Normal(
+            "midpoint", mu=0.0, sigma=1500.0, dims=["cities", "variants"]
+        )
+        rate_var = pm.Gamma("rate", mu=0.15, sigma=0.1, dims="variants")
+
+        # Kaan's trick to avoid overflows
+        def softmax_1(x, rates, midpoints):
+            E = rates[:, None] * x + midpoints[:, None]
+            E_max = E.max(axis=0)
+            un_norm = pm.math.exp(E - E_max)  # pyright: ignore
+            return un_norm / (
+                pm.math.exp(-E_max) + pm.math.sum(un_norm, axis=0)  # pyright: ignore
+            )
+
+        ys_smooth = [
+            softmax_1(ts_lst[i], rate_var, midpoint_var[i, :])
+            for i, city in enumerate(coords["cities"])
+        ]
+
+        # make Multinom/n likelihood
+        def log_likelihood(y, p, n):
+            return n * pm.math.sum(y * pm.math.log(p), axis=0)  # pyright: ignore
+
+        [
+            pm.DensityDist(
+                f"ys_noisy_{city}",
+                ys_smooth[i],
+                n,
+                logp=log_likelihood,
+                observed=ys_lst[i],
+            )
+            for i, city in enumerate(coords["cities"])
+        ]
+
+    return model
+
+
+def softmax(E):
+    """Softmax with the overflow trick, including an implicit reference class
+    with score 0 (the extra exp(-E_max) term in the denominator)."""
+    E_max = E.max(axis=0)
+    un_norm = np.exp(E - E_max)
+    return un_norm / (np.exp(-E_max) + np.sum(un_norm, axis=0))
+
+
+def softmax_1(x, rates, midpoints):
+    """Softmax with the trick to avoid overflows,
+    assuming an additional class with rate fixed to 0"""
+    E = rates[:, None] * x + midpoints[:, None]
+    E_max = E.max(axis=0)
+    un_norm = np.exp(E - E_max)
+    return un_norm / (np.exp(-E_max) + np.sum(un_norm, axis=0))
+
+
+def fitted_values(ts_lst, model_map_fixed, cities):
+    """function to make the fitted values of a model"""
+    y_fit_lst = [
+        softmax_1(ts_lst[i], model_map_fixed["rate"], model_map_fixed["midpoint"][i, :])
+        for i, city in enumerate(cities)
+    ]
+
+    return y_fit_lst
+
+
+def pred_values(ts_lst, model_map_fixed, cities, horizon=60):
"""function to make the fitted values of a model""" + ts_pred_lst = [np.arange(horizon) + tt.max() + 1 for tt in ts_lst] + y_pred_lst = [ + softmax_1( + ts_pred_lst[i], model_map_fixed["rate"], model_map_fixed["midpoint"][i, :] + ) + for i, city in enumerate(cities) + ] + + return ts_pred_lst, y_pred_lst + + +def compute_overdispersion(ys_lst, y_fit_lst, cities): + """compute overdispersion from a quasimultinom model""" + pearson_r_lst = [ + (y_fit_lst[k] - ys_lst[k]) ** 2 / (y_fit_lst[k] * (1 - y_fit_lst[k])) + for k, city in enumerate(cities) + ] + overdisp_list2 = [ + i.sum() / ((i.shape[0]) * (i.shape[1] - 1)) for i in pearson_r_lst + ] + overdisp_fixed = np.concatenate(pearson_r_lst, axis=1).sum() / np.sum( + [((i.shape[0]) * (i.shape[1] - 1)) for i in pearson_r_lst] + ) + + return pearson_r_lst, overdisp_list2, overdisp_fixed + + +def make_jacobian(rate, midpoint, t): + """function to make the jacobian of logit(y), with the +1""" + linear_predict = rate * t + midpoint + tiled_array = np.tile(linear_predict, (linear_predict.shape[0], 1)) + np.fill_diagonal(tiled_array, 0) + softmax_array = np.apply_along_axis(softmax, 1, tiled_array) * (-1) + np.fill_diagonal(softmax_array, 1) + array_out = np.concatenate([softmax_array, softmax_array], axis=1) + return array_out + + +def project_se(rate, midpoint, t, model_hessian_inv, overdisp=1.0): + """function to project the standard errors on the logit(y) scale""" + jacobian = make_jacobian(rate, midpoint, t) + return np.sqrt(np.diag(jacobian.dot(model_hessian_inv).dot(jacobian.T))) * overdisp + + +def make_ratediff_confints( + t_rate, + model_hessian_fixed, + p_variants: int, + overdisp_fixed: float = 1.0, + g: float = 7.0, +): + """Projects the standard errors on the rate difference scale, + computes Wald confidence intervals, and transform to fitness scale""" + # TODO(Pawel, David): p_variants was not an argument before, so I added it naively + # but it should be checked if it is correct. + # Feel free to remove this exception once you take a look :) + raise NotImplementedError( + "The formulae with p_variants need to be " + "validated before this function can be used!" 
+def make_ratediff_confints(
+    t_rate,
+    model_hessian_fixed,
+    p_variants: int,
+    overdisp_fixed: float = 1.0,
+    g: float = 7.0,
+):
+    """Projects the standard errors on the rate difference scale,
+    computes Wald confidence intervals, and transforms to the fitness scale"""
+    # TODO(Pawel, David): p_variants was not an argument before, so I added it naively
+    # but it should be checked if it is correct.
+    # Feel free to remove this exception once you take a look :)
+    raise NotImplementedError(
+        "The formulae with p_variants need to be "
+        "validated before this function can be used!"
+    )
+
+    t_hess_inv = np.linalg.inv(model_hessian_fixed)[-p_variants:, -p_variants:]
+
+    rate_diff = np.array([[j - i for i in t_rate] for j in t_rate])
+    fitness_diff = np.exp(rate_diff * g) - 1
+
+    rate_diff_se = np.sqrt(
+        np.array(
+            [
+                [
+                    t_hess_inv[[i, j], :][:, [i, j]].diagonal().sum()
+                    - np.fliplr(t_hess_inv[[i, j], :][:, [i, j]]).diagonal().sum()
+                    for i, _ in enumerate(t_rate)
+                ]
+                for j, _ in enumerate(t_rate)
+            ]
+        )
+    )
+    rate_diff_se = overdisp_fixed * rate_diff_se
+    rate_diff_lower = rate_diff - 1.96 * rate_diff_se
+    rate_diff_upper = rate_diff + 1.96 * rate_diff_se
+    fitness_diff_lower = np.exp(rate_diff_lower * g) - 1
+    fitness_diff_upper = np.exp(rate_diff_upper * g) - 1
+
+    return fitness_diff, rate_diff_se, fitness_diff_lower, fitness_diff_upper
+
+
+def make_fitness_confints(t_rate, model_hessian_fixed, overdisp_fixed=1.0, g=7.0):
+    """Project the standard errors on the fitness scale and compute Wald confidence intervals."""
+    p_variants = t_rate.shape[0]
+    t_hess_inv = np.linalg.inv(model_hessian_fixed)[-p_variants:, -p_variants:]
+
+    rate_diff = np.array([[j - i for i in t_rate] for j in t_rate])
+    fitness_diff = np.exp(rate_diff * g) - 1
+
+    fitness_diff_se = []
+    for i, r1 in enumerate(t_rate):
+        t_lst = []
+        for j, r2 in enumerate(t_rate):
+            t_jacobian = np.array(
+                [g * np.exp(g * (r2 - r1)), -g * np.exp(g * (r2 - r1))]
+            )
+            se = np.sqrt(
+                t_jacobian.dot(t_hess_inv[[i, j], :][:, [i, j]]).dot(t_jacobian.T)
+            )
+            t_lst.append(se)
+        fitness_diff_se.append(t_lst)
+    fitness_diff_se = np.array(fitness_diff_se)
+
+    fitness_diff_se = overdisp_fixed * fitness_diff_se
+    fitness_diff_lower = fitness_diff - 1.96 * fitness_diff_se
+    fitness_diff_upper = fitness_diff + 1.96 * fitness_diff_se
+
+    return fitness_diff, fitness_diff_se, fitness_diff_lower, fitness_diff_upper
diff --git a/src/covvfit/_preprocess_abundances.py b/src/covvfit/_preprocess_abundances.py
new file mode 100644
index 0000000..181bd8d
--- /dev/null
+++ b/src/covvfit/_preprocess_abundances.py
@@ -0,0 +1,57 @@
+"""utilities to preprocess relative abundances"""
+import pandas as pd
+
+
+def load_data(file) -> pd.DataFrame:
+    wwdat = pd.read_csv(file)
+    wwdat = wwdat.rename(columns={wwdat.columns[0]: "time"})  # pyright: ignore
+    return wwdat
+
+
+def preprocess_df(
+    df: pd.DataFrame,
+    cities: list[str],
+    variants: list[str],
+    undetermined_thresh: float = 0.01,
+    zero_date: str = "2023-01-01",
+    date_min: str | None = None,
+    date_max: str | None = None,
+) -> pd.DataFrame:
+    """Preprocess the abundance table: parse dates, filter rows, and add day offsets."""
+    df = df.copy()
+
+    # Convert the 'time' column to datetime
+    df["time"] = pd.to_datetime(df["time"])
+
+    # Remove days with too high an undetermined fraction
+    df = df[df["undetermined"] < undetermined_thresh]  # pyright: ignore
+
+    # Keep only the time, city, and variant columns
+    df = df[["time", "city"] + variants]  # pyright: ignore
+
+    # Subset only the specified cities
+    df = df[df["city"].isin(cities)]  # pyright: ignore
+
+    # Create a new column which is the difference in days between zero_date and the date
+    df["days_from"] = (df["time"] - pd.to_datetime(zero_date)).dt.days
+
+    # Subset dates
+    if date_min is not None:
+        df = df[df["time"] >= pd.to_datetime(date_min)]  # pyright: ignore
+    if date_max is not None:
+        df = df[df["time"] < pd.to_datetime(date_max)]  # pyright: ignore
+
+    return df
+
+
+def make_data_list(
+    df: pd.DataFrame,
+    cities: list[str],
+    variants: list[str],
+) -> tuple[list, list]:
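+    """Build per-city arrays from the preprocessed table: a list of day-offset
+    vectors and a list of abundance matrices of shape (n_variants, n_timepoints),
+    both ordered as in `cities`."""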
+    ts_lst = [df[(df.city == city)].days_from.values for city in cities]
+    ys_lst = [
+        df[(df.city == city)][variants].values.T for city in cities  # pyright: ignore
+    ]
+
+    return (ts_lst, ys_lst)
diff --git a/src/covvfit/plotting/__init__.py b/src/covvfit/plotting/__init__.py
index 1998f18..0a01bc9 100644
--- a/src/covvfit/plotting/__init__.py
+++ b/src/covvfit/plotting/__init__.py
@@ -1,5 +1,13 @@
 """Plotting functionalities."""
 from covvfit.plotting._grid import plot_grid, set_axis_off
 from covvfit.plotting._simplex import plot_on_simplex
+from covvfit.plotting._timeseries import make_legend, num_to_date, colors_covsp
 
-__all__ = ["plot_on_simplex", "plot_grid", "set_axis_off"]
+__all__ = [
+    "plot_on_simplex",
+    "plot_grid",
+    "set_axis_off",
+    "make_legend",
+    "num_to_date",
+    "colors_covsp",
+]
diff --git a/src/covvfit/plotting/_timeseries.py b/src/covvfit/plotting/_timeseries.py
new file mode 100644
index 0000000..b0130f1
--- /dev/null
+++ b/src/covvfit/plotting/_timeseries.py
@@ -0,0 +1,62 @@
+"""utilities to plot time series of variant abundances"""
+import pandas as pd
+
+import matplotlib.lines as mlines
+import matplotlib.patches as mpatches
+
+
+colors_covsp = {
+    "B.1.1.7": "#D16666",
+    "B.1.351": "#FF6666",
+    "P.1": "#FFB3B3",
+    "B.1.617.1": "#A3FFD1",
+    "B.1.617.2": "#66C266",
+    "BA.1": "#A366A3",
+    "BA.2": "#CFAFCF",
+    "BA.4": "#8467F6",
+    "BA.5": "#595EF6",
+    "BA.2.75": "#DE9ADB",
+    "BQ.1.1": "#8fe000",
+    "XBB.1.9": "#dd6bff",
+    "XBB.1.5": "#ff5656",
+    "XBB.1.16": "#e99b30",
+    "XBB.2.3": "#b4b82a",
+    "EG.5": "#359f99",
+    "BA.2.86": "#FF20E0",
+    "JN.1": "#00e9ff",
+    "undetermined": "#999696",
+}
+
+
+def make_legend(colors, variants):
+    """make a shared legend for the plot"""
+    # Create a patch (i.e., a colored box) for each variant
+    variant_patches = [
+        mpatches.Patch(color=color, label=variants[i]) for i, color in enumerate(colors)
+    ]
+
+    # Create lines for "fitted", "predicted", and "observed" labels
+    fitted_line = mlines.Line2D([], [], color="black", linestyle="-", label="fitted")
+    predicted_line = mlines.Line2D(
+        [], [], color="black", linestyle="--", label="predicted"
+    )
+    observed_points = mlines.Line2D(
+        [], [], color="black", marker="o", linestyle="None", label="daily estimates"
+    )
+    blank_line = mlines.Line2D([], [], color="white", linestyle="", label="")
+
+    # Combine all the legend handles
+    handles = variant_patches + [
+        blank_line,
+        fitted_line,
+        predicted_line,
+        observed_points,
+    ]
+
+    return handles
+
+
+def num_to_date(num, pos=None, date_min="2023-01-01", fmt="%b. '%y"):
+    """Convert a day offset (relative to `date_min`) into a formatted date string."""
+    date = pd.to_datetime(date_min) + pd.to_timedelta(num, "D")
+    return date.strftime(fmt)
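+
+
+# Usage sketch (illustrative only; assumes a matplotlib Axes `ax`, a Figure `fig`,
+# and `colors`/`variants` lists as in the example notebooks):
+#
+#     import matplotlib.ticker as ticker
+#
+#     ax.xaxis.set_major_formatter(ticker.FuncFormatter(num_to_date))
+#     fig.legend(handles=make_legend(colors, variants), loc="lower center", ncol=9)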