from statsmodels.stats import multitest as multi
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
#Import data for Vendor A
genotek=pd.read_csv('genotek_complete.csv')
#set index
genotek.set_index('public_client_id',inplace=True)
#generate new variables sqrt min
genotek['min_bray_sqrt']=np.sqrt(genotek['min_bray'])
genotek['min_bray_g_sqrt']=np.sqrt(genotek['min_bray_genus'])
print(genotek['age'].min())
print(genotek['age'].max())
print(genotek['age'].mean())
print(genotek['age'].std())
print(genotek.groupby(by='sex').size())
21
82
47.45844820795589
12.054126130560013
sex
F 1588
M 951
dtype: int64
#correlation between age and Bray-Curtis Vendor A Fig. 2B
print("Vendor A")
print('ASV level Bray-Curtis',scipy.stats.spearmanr(genotek['min_bray'],genotek['age']))
print('Genus level Bray-Curtis',scipy.stats.spearmanr(genotek['min_bray_genus'],genotek['age']))
Vendor A
ASV level Bray-Curtis SpearmanrResult(correlation=0.1572768840256516, pvalue=1.577842568588764e-15)
Genus level Bray-Curtis SpearmanrResult(correlation=0.18208894446581123, pvalue=2.2855154835446022e-20)
# Import data from Vendor B
second_genome=pd.read_csv('second_genome_complete.csv')
#set index
second_genome.set_index('public_client_id',inplace=True)
#generate new variables sqrt min
second_genome['min_bray_sqrt']=np.sqrt(second_genome['min_bray'])
second_genome['min_bray_g_sqrt']=np.sqrt(second_genome['min_bray_genus'])
print(second_genome['age'].min())
print(second_genome['age'].max())
print(second_genome['age'].mean())
print(second_genome['age'].std())
print(second_genome.groupby(by='sex').size())
18
87
49.37881508078995
12.52067975055304
sex
F 590
M 524
dtype: int64
#correlation between age and Bray-Curtis Vendor A Fig. 2B
print("Vendor B")
print('ASV level Bray-Curtis',scipy.stats.spearmanr(second_genome['min_bray'],second_genome['age']))
print('Genus level Bray-Curtis',scipy.stats.spearmanr(second_genome['min_bray_genus'],second_genome['age']))
Vendor B
ASV level Bray-Curtis SpearmanrResult(correlation=0.18196533005049664, pvalue=9.49260731657714e-10)
Genus level Bray-Curtis SpearmanrResult(correlation=0.18897948927799696, pvalue=2.046764790654971e-10)
#Generate Figure 2A ASV
sns.set(font_scale=2.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[15,22.5], dpi=100)
#combine data
frames=[genotek,second_genome]
combined=pd.concat(frames,axis=0,sort=True)
#adjust for vendor
results=smf.ols('min_bray~vendor',data=combined).fit()
combined['corr']=results.resid+combined['min_bray'].mean()
#generate figure
ax=sns.boxplot(y=(combined['corr']),x=combined['age_1'],notch=False,fliersize=10.0, order=['<30','30-39','40-49','50-59','60-69','70-79','80+'],palette='Greys',showfliers=True,linewidth=4, meanline=False,showmeans=False)
ax.set_xlabel('Age')
ax.set_ylabel('Uniqueness (Bray-Curtis)')
plt.show()
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
![png](/PriceLab/AgingMicrobiome/raw/master/output_7_1.png)
#Generate Figure 2A Genus
sns.set(font_scale=2.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[15,22.5], dpi=100)
#adjust for vendor
results=smf.ols('min_bray_genus~vendor',data=combined).fit()
combined['corr_genus']=results.resid+combined['min_bray_genus'].mean()
#generate boxplot
ax=sns.boxplot(y=(combined['corr_genus']),x=combined['age_1'],notch=False,fliersize=10.0, order=['<30','30-39','40-49','50-59','60-69','70-79','80+'],palette='Blues',showfliers=True,linewidth=4, meanline=False,showmeans=False)
ax.set_xlabel('Age')
ax.set_ylabel('Uniqueness (Bray-Curtis)')
plt.show()
![png](/PriceLab/AgingMicrobiome/raw/master/output_8_0.png)
#Kernel density plots Fig.2
sns.set(font_scale=2.0,context='poster',font='Arial',style='white')
plt.figure(figsize=[15,10], dpi=100)
sns.distplot([combined['min_bray_genus']], hist = False, kde = True,color='lightblue',
kde_kws = {'shade': True, 'linewidth': 2})
sns.distplot(combined['min_bray'], hist = False, kde = True,color='black',
kde_kws = {'shade': True, 'linewidth': 2})
<matplotlib.axes._subplots.AxesSubplot at 0x7f37a096d748>
![png](/PriceLab/AgingMicrobiome/raw/master/output_9_1.png)
weight=pd.read_csv('weight_arivale.csv')
weight=weight.set_index('public_client_id')
combined['BMI']=weight['BMI_CALC']
sex=[]
for x in combined['sex']:
if x=='F':
sex.append(1)
elif x=='M':
sex.append(0)
else:
sex.append(9)
combined['sex_num']=sex
#Statistical analysis figure 1A
combined['min_bray_sqrt_genus']=np.sqrt(combined['min_bray_genus'])
results = smf.ols('min_bray_sqrt_genus ~ vendor+C(age_1, Treatment("<30"))+sex+BMI+Shannon_genus', data=combined).fit()
results.pvalues
Intercept 6.349282e-227
C(age_1, Treatment("<30"))[T.30-39] 9.087249e-01
C(age_1, Treatment("<30"))[T.40-49] 7.150687e-02
C(age_1, Treatment("<30"))[T.50-59] 3.570698e-03
C(age_1, Treatment("<30"))[T.60-69] 4.331771e-07
C(age_1, Treatment("<30"))[T.70-79] 8.157767e-09
C(age_1, Treatment("<30"))[T.80+] 7.903359e-03
sex[T.M] 6.851984e-05
vendor 2.306016e-31
BMI 6.676141e-04
Shannon_genus 7.394479e-119
dtype: float64
combined['min_bray_sqrt']=np.sqrt(combined['min_bray'])
results = smf.ols('min_bray_sqrt ~ vendor+C(age_1, Treatment("<30"))+sex+BMI+Shannon', data=combined).fit()
results.pvalues
Intercept 0.000000e+00
C(age_1, Treatment("<30"))[T.30-39] 4.018527e-01
C(age_1, Treatment("<30"))[T.40-49] 5.719993e-01
C(age_1, Treatment("<30"))[T.50-59] 3.517325e-02
C(age_1, Treatment("<30"))[T.60-69] 1.877557e-05
C(age_1, Treatment("<30"))[T.70-79] 1.470618e-09
C(age_1, Treatment("<30"))[T.80+] 1.123212e-02
sex[T.M] 1.277220e-11
vendor 5.504748e-106
BMI 4.594970e-01
Shannon 2.320058e-04
dtype: float64
#save df for metabolomics analysis
combined.to_csv('results_merged.csv')
#import demographic data on Arivale participants
whole_df=pd.read_csv('demographics_df.csv')
whole_df.set_index('public_client_id',inplace=True)
#missing responses are coded as 9 in the df, these will be converted to nan in the next cell
whole_df.groupby(by='Antibiotics').size()
Antibiotics
0.0 2193
1.0 360
9.0 15
dtype: int64
whole_df['sex']=combined['sex_num']
for x in ['Prescription Med','Sweets','Fruits','Vegetables','Grains','Alcohol','Tobacco','Sleep','Diarrhea','Antibiotics']:
whole_df[x].fillna(9,inplace=True)
#test associations between clinical, demographic, lifestyle factors and bray-curtis uniqueness
p=[]
analyte=[]
test_value=[]
r_squared=[]
coef=[]
missing=[]
variable=[]
results_ols=pd.DataFrame()
#exclude outliers
whole_df['min_bray_sqrt_genus']=np.sqrt(combined['min_bray_genus'])
no_outliers=whole_df[whole_df['min_bray_sqrt_genus']<(whole_df['min_bray_sqrt_genus'].mean()+3*whole_df['min_bray_sqrt_genus'].std())]
no_outliers=no_outliers[no_outliers['min_bray_sqrt_genus']>(whole_df['min_bray_sqrt_genus'].mean()-3*whole_df['min_bray_sqrt_genus'].std())]
print('size after outlier removal',no_outliers.shape)
demographics=['age','sex','Race(ref.white)','BMI']
for x in demographics:
variable.append('demographics')
no_outliers['response']=no_outliers[x]
missing.append(no_outliers['response'].isnull().sum())
results = smf.ols('min_bray_sqrt_genus ~ vendor+response', data=no_outliers).fit()
just_vendor = smf.ols('min_bray_sqrt_genus ~ vendor', data=no_outliers[no_outliers['response'].isnull()==False]).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[2]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[2]
coef.append(param)
r_squared.append((results.rsquared-just_vendor.rsquared)*100)
test_value.append(param)
clinical_tests=['Globulin','Sodium','Homocysteine','GGT','ALAT','ALP','HDL','CRP','LDL','Triglycerides','n6/n3','Insulin','Glucose','HOMA-IR','HbA1c','Creatinine','Vitamin D']
diet_list=['Fruits','Vegetables','Grains','Alcohol','Tobacco','Sleep','Diarrhea','Antibiotics']
for x in diet_list:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
df=no_outliers[no_outliers['response']<9]
missing.append(len(no_outliers['response'])-len(df))
results = smf.ols('min_bray_sqrt_genus ~ vendor+response', data=df).fit()
just_vendor = smf.ols('min_bray_sqrt_genus ~ vendor', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[2]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[2]
coef.append(param)
r_squared.append((results.rsquared-just_vendor.rsquared)*100)
test_value.append(param)
for x in ['Prescription Med','Sweets']:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
df=no_outliers[no_outliers['response']<9]
missing.append(len(no_outliers['response'])-len(df))
results = smf.ols('min_bray_sqrt_genus ~ response', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[1]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[1]
coef.append(param)
r_squared.append(results.rsquared*100)
test_value.append(param)
for x in clinical_tests:
variable.append('clinical_labs')
no_outliers['response']=no_outliers[x]
missing.append(no_outliers['response'].isnull().sum())
results = smf.ols('min_bray_sqrt_genus ~ vendor+response', data=no_outliers).fit()
just_vendor = smf.ols('min_bray_sqrt_genus ~ vendor', data=no_outliers[no_outliers['response'].isnull()==False]).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[2]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[2]
coef.append(param)
r_squared.append((results.rsquared-just_vendor.rsquared)*100)
test_value.append(param)
results_ols=pd.DataFrame()
results_ols['analyte']=analyte
results_ols['pvalue']=p
results_ols.set_index('analyte')
results_ols['r_squared']=r_squared
results_ols['coefficient']=coef
results_ols['missing']=missing
results_ols['variable']=variable
#multiple hypothesis correction
results_ols['corr_pval']=multi.multipletests(results_ols['pvalue'], alpha=0.05, method='bonferroni', is_sorted=False,returnsorted=False)[1]
#confirm no. of metaoblites tested
results_ols.sort_values(by='r_squared',ascending=False,inplace=True)
size after outlier removal (3618, 70)
direction=[]
for x in results_ols['coefficient']:
if x<0:
direction.append(-1)
else:
direction.append(1)
results_ols['direction']=direction
results_ols['r_squared_dir']=results_ols['r_squared']*results_ols['direction']
results_ols.sort_values(by='corr_pval')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
analyte |
pvalue |
r_squared |
coefficient |
missing |
variable |
corr_pval |
direction |
r_squared_dir |
0 |
age |
2.357540e-28 |
3.046453 |
0.000869 |
0 |
demographics |
7.308375e-27 |
1 |
3.046453 |
30 |
Vitamin D |
2.701616e-08 |
0.802681 |
0.005422 |
104 |
clinical_labs |
8.375009e-07 |
1 |
0.802681 |
7 |
Alcohol |
8.987587e-07 |
0.658101 |
-0.007863 |
269 |
diet/lifestyle |
2.786152e-05 |
-1 |
-0.658101 |
20 |
HDL |
2.030378e-05 |
0.472444 |
0.004181 |
104 |
clinical_labs |
6.294172e-04 |
1 |
0.472444 |
1 |
sex |
1.324238e-04 |
0.370316 |
0.007561 |
0 |
demographics |
4.105137e-03 |
1 |
0.370316 |
12 |
Prescription Med |
4.076586e-04 |
1.404503 |
0.014805 |
2732 |
diet/lifestyle |
1.263742e-02 |
1 |
1.404503 |
23 |
Triglycerides |
7.894477e-04 |
0.293367 |
-0.003296 |
104 |
clinical_labs |
2.447288e-02 |
-1 |
-0.293367 |
10 |
Diarrhea |
1.739344e-03 |
0.263366 |
-0.004061 |
204 |
diet/lifestyle |
5.391967e-02 |
-1 |
-0.263366 |
28 |
HbA1c |
8.454809e-03 |
0.180688 |
0.002618 |
104 |
clinical_labs |
2.620991e-01 |
1 |
0.180688 |
6 |
Grains |
1.234112e-02 |
0.169699 |
-0.003138 |
237 |
diet/lifestyle |
3.825746e-01 |
-1 |
-0.169699 |
24 |
n6/n3 |
1.567014e-02 |
0.152180 |
-0.002428 |
104 |
clinical_labs |
4.857742e-01 |
-1 |
-0.152180 |
3 |
BMI |
1.751712e-02 |
0.150074 |
-0.000390 |
170 |
demographics |
5.430308e-01 |
-1 |
-0.150074 |
11 |
Antibiotics |
2.031116e-02 |
0.195616 |
0.007551 |
1092 |
diet/lifestyle |
6.296458e-01 |
1 |
0.195616 |
22 |
LDL |
2.534290e-02 |
0.130326 |
-0.002186 |
104 |
clinical_labs |
7.856300e-01 |
-1 |
-0.130326 |
17 |
GGT |
2.980023e-02 |
0.123045 |
-0.002116 |
104 |
clinical_labs |
9.238072e-01 |
-1 |
-0.123045 |
14 |
Globulin |
3.009083e-02 |
0.122611 |
-0.002121 |
104 |
clinical_labs |
9.328157e-01 |
-1 |
-0.122611 |
8 |
Tobacco |
3.130034e-02 |
0.129973 |
-0.010575 |
359 |
diet/lifestyle |
9.703106e-01 |
-1 |
-0.129973 |
29 |
Creatinine |
8.225649e-01 |
0.001312 |
-0.000219 |
104 |
clinical_labs |
1.000000e+00 |
-1 |
-0.001312 |
16 |
Homocysteine |
5.360479e-02 |
0.097103 |
0.001899 |
104 |
clinical_labs |
1.000000e+00 |
1 |
0.097103 |
18 |
ALAT |
6.189030e-02 |
0.090877 |
-0.001825 |
104 |
clinical_labs |
1.000000e+00 |
-1 |
-0.090877 |
13 |
Sweets |
4.698986e-01 |
0.059903 |
-0.000889 |
2744 |
diet/lifestyle |
1.000000e+00 |
-1 |
-0.059903 |
19 |
ALP |
1.816952e-01 |
0.046511 |
0.001319 |
104 |
clinical_labs |
1.000000e+00 |
1 |
0.046511 |
25 |
Insulin |
2.018528e-01 |
0.042476 |
-0.001248 |
104 |
clinical_labs |
1.000000e+00 |
-1 |
-0.042476 |
21 |
CRP |
2.249516e-01 |
0.038398 |
-0.001186 |
104 |
clinical_labs |
1.000000e+00 |
-1 |
-0.038398 |
26 |
Glucose |
2.873025e-01 |
0.029526 |
0.001049 |
104 |
clinical_labs |
1.000000e+00 |
1 |
0.029526 |
4 |
Fruits |
3.120022e-01 |
0.027416 |
0.001187 |
198 |
diet/lifestyle |
1.000000e+00 |
1 |
0.027416 |
9 |
Sleep |
4.018094e-01 |
0.027333 |
0.002036 |
1257 |
diet/lifestyle |
1.000000e+00 |
1 |
0.027333 |
2 |
Race(ref.white) |
4.696943e-01 |
0.013621 |
0.001749 |
88 |
demographics |
1.000000e+00 |
1 |
0.013621 |
15 |
Sodium |
4.710159e-01 |
0.013551 |
0.000708 |
104 |
clinical_labs |
1.000000e+00 |
1 |
0.013551 |
27 |
HOMA-IR |
5.391534e-01 |
0.009835 |
-0.000599 |
104 |
clinical_labs |
1.000000e+00 |
-1 |
-0.009835 |
5 |
Vegetables |
8.625670e-01 |
0.000804 |
-0.000197 |
198 |
diet/lifestyle |
1.000000e+00 |
-1 |
-0.000804 |
#significant results after multiple hypothesis correction (0.05 two-sided)
results_ols[results_ols['corr_pval']<0.05]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
analyte |
pvalue |
r_squared |
coefficient |
missing |
variable |
corr_pval |
direction |
r_squared_dir |
0 |
age |
2.357540e-28 |
3.046453 |
0.000869 |
0 |
demographics |
7.308375e-27 |
1 |
3.046453 |
12 |
Prescription Med |
4.076586e-04 |
1.404503 |
0.014805 |
2732 |
diet/lifestyle |
1.263742e-02 |
1 |
1.404503 |
30 |
Vitamin D |
2.701616e-08 |
0.802681 |
0.005422 |
104 |
clinical_labs |
8.375009e-07 |
1 |
0.802681 |
7 |
Alcohol |
8.987587e-07 |
0.658101 |
-0.007863 |
269 |
diet/lifestyle |
2.786152e-05 |
-1 |
-0.658101 |
20 |
HDL |
2.030378e-05 |
0.472444 |
0.004181 |
104 |
clinical_labs |
6.294172e-04 |
1 |
0.472444 |
1 |
sex |
1.324238e-04 |
0.370316 |
0.007561 |
0 |
demographics |
4.105137e-03 |
1 |
0.370316 |
23 |
Triglycerides |
7.894477e-04 |
0.293367 |
-0.003296 |
104 |
clinical_labs |
2.447288e-02 |
-1 |
-0.293367 |
#Generate figure 2d
sns.set(font_scale=1.95,context='poster',font='Arial',style='white')
sns.set_color_codes("dark")
plt.figure(figsize=[35,30], dpi=100)
plt.ylim(-2.5,4.0)
clr=[]
for x in results_ols['variable']:
if x=='demographics':
clr.append('lightgreen')
elif x=='clinical_labs':
clr.append('lightblue')
elif x=='diet/lifestyle':
clr.append('gold')
results_ols['clr']=clr
ax=sns.barplot(x='analyte', y='r_squared_dir',data=results_ols,palette=results_ols['clr'].tolist(),label="Total", edgecolor='k')
for item in ax.get_xticklabels():
item.set_rotation(85)
ax.set(ylabel="Percent Variance Explained",
xlabel="")
sns.despine(left=True, bottom=True)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
![png](/PriceLab/AgingMicrobiome/raw/master/output_23_1.png)
#run linear regression assessing relationship of each variable with uniqueness adjusting for age
demographics=['sex','Race(ref.white)','BMI']
p=[]
analyte=[]
test_value=[]
results=pd.DataFrame()
lower=[]
upper=[]
for x in demographics:
variable.append('demographics')
no_outliers['response']=no_outliers[x]
no_outliers['response']=(no_outliers['response']-no_outliers['response'].mean())/no_outliers['response'].std()
results = smf.ols('min_bray_sqrt_genus ~ vendor+age+response', data=no_outliers).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in clinical_tests:
variable.append('clinical_labs')
no_outliers['response']=no_outliers[x]
no_outliers['response']=(no_outliers['response']-no_outliers['response'].mean())/no_outliers['response'].std()
missing.append(no_outliers['response'].isnull().sum())
results = smf.ols('min_bray_sqrt_genus ~ vendor+age+response', data=no_outliers).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in diet_list:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
df=no_outliers[no_outliers['response']<9]
df['response']=(df['response']-df['response'].mean())/df['response'].std()
results = smf.ols('min_bray_sqrt_genus ~ vendor+Age+response', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in ['Prescription Med','Sweets']:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
missing.append(len(no_outliers['response'][no_outliers['response']<9]))
df=no_outliers[no_outliers['response']<9]
df['response']=(df['response']-df['response'].mean())/df['response'].std()
results = smf.ols('min_bray_sqrt_genus ~ Age+response', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[2]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[2]
test_value.append(param)
results_coef=pd.DataFrame()
lower=[ x[0] for x in lower]
upper=[ x[0] for x in upper]
results_coef['analyte']=analyte
results_coef['pvalue']=p
results_coef['Beta_coeff']=test_value
results_coef.set_index('analyte')
results_coef['corr_pval']=multi.multipletests(results_coef['pvalue'], alpha=0.05, method='bonferroni', is_sorted=False,returnsorted=False)[1]
results_coef.sort_values(by='pvalue',inplace=True)
results_coef.sort_values(by='analyte',ascending=True,inplace=True)
results_coef=results_coef.set_index('analyte')
results_age_adj=results_coef
results_age_adj['adjusted']=1
#run linear regression assessing relationship of each variable with ASV-level uniqueness adjusting for age
demographics=['sex','Race(ref.white)','BMI']
p=[]
analyte=[]
test_value=[]
results_age_ASV=pd.DataFrame()
for x in demographics:
variable.append('demographics')
no_outliers['response']=no_outliers[x]
#scale/standardize
no_outliers['response']=(no_outliers['response']-no_outliers['response'].mean())/no_outliers['response'].std()
results = smf.ols('min_bray_sqrt ~ vendor+age+response', data=no_outliers).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in clinical_tests:
variable.append('clinical_labs')
no_outliers['response']=no_outliers[x]
no_outliers['response']=(no_outliers['response']-no_outliers['response'].mean())/no_outliers['response'].std()
results = smf.ols('min_bray_sqrt ~ vendor+age+response', data=no_outliers).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in diet_list:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
df=no_outliers[no_outliers['response']<9]
df['response']=(df['response']-df['response'].mean())/df['response'].std()
results = smf.ols('min_bray_sqrt ~ vendor+Age+response', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[3]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[3]
test_value.append(param)
for x in ['Prescription Med','Sweets']:
variable.append('diet/lifestyle')
no_outliers['response']=no_outliers[x]
df=no_outliers[no_outliers['response']<9]
df['response']=(df['response']-df['response'].mean())/df['response'].std()
results = smf.ols('min_bray_sqrt ~ Age+response', data=df).fit()
p_extract=results.pvalues.tolist()
p_test=p_extract[2]
analyte.append(x)
p.append(p_test)
parameters=results.params.tolist()
param=parameters[2]
test_value.append(param)
results_age_ASV['analyte']=analyte
results_age_ASV['pvalue']=p
results_age_ASV['Beta_coeff']=test_value
results_age_ASV.set_index('analyte')
results_age_ASV['corr_pval']=multi.multipletests(results_age_ASV['pvalue'], alpha=0.05, method='bonferroni', is_sorted=False,returnsorted=False)[1]
results_age_ASV.sort_values(by='analyte',ascending=True,inplace=True)
results_age_ASV['adjusted']=1
results_age_ASV
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
analyte |
pvalue |
Beta_coeff |
corr_pval |
adjusted |
7 |
ALAT |
3.108881e-01 |
-0.000684 |
1.000000e+00 |
1 |
8 |
ALP |
3.947624e-01 |
-0.000582 |
1.000000e+00 |
1 |
23 |
Alcohol |
9.378565e-02 |
-0.001167 |
1.000000e+00 |
1 |
27 |
Antibiotics |
5.016965e-05 |
0.003300 |
1.505089e-03 |
1 |
2 |
BMI |
1.416517e-02 |
-0.001678 |
4.249551e-01 |
1 |
10 |
CRP |
7.447765e-02 |
-0.001205 |
1.000000e+00 |
1 |
18 |
Creatinine |
4.216653e-04 |
-0.002378 |
1.264996e-02 |
1 |
26 |
Diarrhea |
4.187928e-02 |
0.001398 |
1.000000e+00 |
1 |
20 |
Fruits |
6.198069e-01 |
0.000340 |
1.000000e+00 |
1 |
6 |
GGT |
6.573135e-02 |
-0.001248 |
1.000000e+00 |
1 |
3 |
Globulin |
1.261844e-02 |
-0.001691 |
3.785532e-01 |
1 |
15 |
Glucose |
7.700407e-01 |
0.000200 |
1.000000e+00 |
1 |
22 |
Grains |
2.420615e-03 |
-0.002087 |
7.261846e-02 |
1 |
9 |
HDL |
1.806700e-05 |
0.002932 |
5.420100e-04 |
1 |
16 |
HOMA-IR |
6.052503e-01 |
-0.000349 |
1.000000e+00 |
1 |
17 |
HbA1c |
5.991392e-01 |
0.000364 |
1.000000e+00 |
1 |
5 |
Homocysteine |
5.301023e-03 |
-0.001931 |
1.590307e-01 |
1 |
14 |
Insulin |
2.348506e-01 |
-0.000802 |
1.000000e+00 |
1 |
11 |
LDL |
2.240127e-02 |
-0.001548 |
6.720381e-01 |
1 |
28 |
Prescription Med |
2.855336e-02 |
0.002816 |
8.566009e-01 |
1 |
1 |
Race(ref.white) |
2.157719e-01 |
-0.000841 |
1.000000e+00 |
1 |
25 |
Sleep |
1.294242e-01 |
0.001276 |
1.000000e+00 |
1 |
4 |
Sodium |
2.067305e-02 |
-0.001573 |
6.201915e-01 |
1 |
29 |
Sweets |
3.923745e-01 |
0.001076 |
1.000000e+00 |
1 |
24 |
Tobacco |
5.115015e-02 |
-0.001362 |
1.000000e+00 |
1 |
12 |
Triglycerides |
1.945830e-03 |
-0.002095 |
5.837490e-02 |
1 |
21 |
Vegetables |
8.262994e-01 |
0.000150 |
1.000000e+00 |
1 |
19 |
Vitamin D |
7.846881e-07 |
0.003380 |
2.354064e-05 |
1 |
13 |
n6/n3 |
4.194554e-02 |
-0.001438 |
1.000000e+00 |
1 |
0 |
sex |
3.005534e-12 |
0.004647 |
9.016602e-11 |
1 |
results_age_ASV.set_index('analyte',inplace=True)
results_age_ASV[results_age_ASV['corr_pval']<0.05]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
pvalue |
Beta_coeff |
corr_pval |
adjusted |
analyte |
|
|
|
|
Antibiotics |
5.016965e-05 |
0.003300 |
1.505089e-03 |
1 |
Creatinine |
4.216653e-04 |
-0.002378 |
1.264996e-02 |
1 |
HDL |
1.806700e-05 |
0.002932 |
5.420100e-04 |
1 |
Vitamin D |
7.846881e-07 |
0.003380 |
2.354064e-05 |
1 |
sex |
3.005534e-12 |
0.004647 |
9.016602e-11 |
1 |
#Generate supplementary table
results_final=results_ols[results_ols.columns[0:7]]
results_final.set_index('analyte',inplace=True)
results_final['age_adjusted_coeff']=results_age_adj['Beta_coeff']
results_final['age_adjusted_corr_pvalue']=results_age_adj['corr_pval']
results_final['age_adjusted_coeff_ASV_level']=results_age_ASV['Beta_coeff']
results_final['age_adjusted_corr_pvalue_ASV_level']=results_age_ASV['corr_pval']
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
pvalue |
r_squared |
coefficient |
missing |
variable |
corr_pval |
age_adjusted_coeff |
age_adjusted_corr_pvalue |
age_adjusted_coeff_ASV_level |
age_adjusted_corr_pvalue_ASV_level |
analyte |
|
|
|
|
|
|
|
|
|
|
age |
2.357540e-28 |
3.046453 |
0.000869 |
0 |
demographics |
7.308375e-27 |
NaN |
NaN |
NaN |
NaN |
Prescription Med |
4.076586e-04 |
1.404503 |
0.014805 |
2732 |
diet/lifestyle |
1.263742e-02 |
0.004069 |
1.000000 |
0.002816 |
0.856601 |
Vitamin D |
2.701616e-08 |
0.802681 |
0.005422 |
104 |
clinical_labs |
8.375009e-07 |
0.003676 |
0.004944 |
0.003380 |
0.000024 |
Alcohol |
8.987587e-07 |
0.658101 |
-0.007863 |
269 |
diet/lifestyle |
2.786152e-05 |
-0.003640 |
0.007137 |
-0.001167 |
1.000000 |
HDL |
2.030378e-05 |
0.472444 |
0.004181 |
104 |
clinical_labs |
6.294172e-04 |
0.002747 |
0.145723 |
0.002932 |
0.000542 |
#correlation in beta-coefficients for all variables between asv and genus level analysis
results_sig=results_final[results_final['pvalue']<0.05]
scipy.stats.spearmanr(results_sig['age_adjusted_coeff_ASV_level'].dropna(),results_sig['age_adjusted_coeff'].dropna())
SpearmanrResult(correlation=0.711764705882353, pvalue=0.0019840432477699634)
#save results
results_final.to_csv('demographics_results.csv')