Skip to content

Commit

Permalink
extra-trees-regressor model for evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
gihanpanapitiya committed Jul 15, 2024
1 parent 007aee6 commit 4785962
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 0 deletions.
60 changes: 60 additions & 0 deletions et/cl_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import coderdata as cd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import umap
import seaborn as sns
import matplotlib.patches as mpatches
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing


# Load the five coderdata datasets, in the same order as before.
hcmi, beataml, cptac, depmap, mpnst = (
    cd.DatasetLoader(dataset_name)
    for dataset_name in ('hcmi', 'beataml', 'cptac', 'broad_sanger', 'mpnst')
)

# Pairwise joins: (BeatAML + HCMI), (DepMap + CPTAC), then both pairs
# together, and finally MPNST on top of the combined result.
joined_dataset0 = cd.join_datasets(beataml, hcmi)
joined_dataset1 = cd.join_datasets(depmap, cptac)
joined_dataset2 = cd.join_datasets(joined_dataset0, joined_dataset1)
joined_dataset3 = cd.join_datasets(joined_dataset2, mpnst)

# Keep only the transcriptomics columns used downstream.
transcriptomics_columns = [
    "improve_sample_id",
    "transcriptomics",
    "entrez_id",
    "source",
    "study",
]
joined_dataset3.transcriptomics = joined_dataset3.transcriptomics[transcriptomics_columns]


# Restrict the experiments table to rows whose response metric is 'auc'.
experiments = joined_dataset3.experiments
data = experiments[experiments['dose_response_metric'] == 'auc']

# Of those, keep the rows belonging to the CTRPv2 study.
ctrpv2 = data[data['study'] == 'CTRPv2']

# Lookup table: improve_sample_id -> cancer_type.
samples = joined_dataset3.samples
cancer_type_sample_map = dict(
    zip(samples['improve_sample_id'], samples['cancer_type'])
)

# Reshape the long transcriptomics table into one row per sample and one
# column per entrez_id (pivot_table aggregates duplicates with its default).
expression_wide = joined_dataset3.transcriptomics.pivot_table(
    values='transcriptomics',
    index='improve_sample_id',
    columns='entrez_id',
)

# Discard every gene column that has a missing value for any sample.
cl = expression_wide.dropna(axis=1)
cl.columns.name = None
cl = cl.reset_index()

# Attach the cancer type of each sample.
cl['cancer_type'] = cl['improve_sample_id'].map(cancer_type_sample_map)

# Encode the string cancer-type labels as integers.
le = preprocessing.LabelEncoder()
cl['cancer_type_int'] = le.fit_transform(cl['cancer_type'])

cl.to_csv('cl_features/cl.csv', index=False)



108 changes: 108 additions & 0 deletions et/et.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import coderdata as cd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import umap
import seaborn as sns
import matplotlib.patches as mpatches
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor


# Load the five coderdata datasets, in the same order as before.
hcmi, beataml, cptac, depmap, mpnst = (
    cd.DatasetLoader(dataset_name)
    for dataset_name in ('hcmi', 'beataml', 'cptac', 'broad_sanger', 'mpnst')
)

# Pairwise joins: (BeatAML + HCMI), (DepMap + CPTAC), then both pairs
# together, and finally MPNST on top of the combined result.
joined_dataset0 = cd.join_datasets(beataml, hcmi)
joined_dataset1 = cd.join_datasets(depmap, cptac)
joined_dataset2 = cd.join_datasets(joined_dataset0, joined_dataset1)
joined_dataset3 = cd.join_datasets(joined_dataset2, mpnst)

# Keep only the transcriptomics columns used downstream.
transcriptomics_columns = [
    "improve_sample_id",
    "transcriptomics",
    "entrez_id",
    "source",
    "study",
]
joined_dataset3.transcriptomics = joined_dataset3.transcriptomics[transcriptomics_columns]

# Restrict the experiments table to rows whose response metric is 'auc'.
experiments = joined_dataset3.experiments
data = experiments[experiments['dose_response_metric'] == 'auc']
data['improve_drug_id'].nunique()  # value is not used; kept from the original script

# Keep the CTRPv2 study, write it to disk, and reload it from the CSV so the
# rest of the script works on the on-disk representation.
ctrpv2 = data[data['study'] == 'CTRPv2']
ctrpv2.to_csv('cl_features/data.csv', index=False)
ctrpv2 = pd.read_csv('cl_features/data.csv')



#### load cell-line data

# Wide cell-line table written by cl_features.py.
cl = pd.read_csv('cl_features/cl.csv')

# Pickled collection of selected feature identifiers; they are converted to
# strings so they can be used as column labels of the CSV-loaded table.
cl_features = pd.read_pickle('cl_features/cl_1000.pkl')
cl_features = list(map(str, cl_features))

# Keep the sample id plus the selected feature columns only.
selected_columns = ['improve_sample_id', *cl_features]
cl = cl.loc[:, selected_columns]



def _all_floatable(values):
    """Return True when every element of *values* can be converted by float()."""
    try:
        for value in values:
            float(value)
    except (TypeError, ValueError):
        # Only conversion failures mark a column as non-numeric; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True


# Drug descriptor table.
md = pd.read_csv('drugs/mdm.csv')

# The last column is excluded from the numeric scan so that it is never
# dropped (presumably the 'smiles' column read later -- TODO confirm).
v = md.iloc[:, :-1]

# Names of descriptor columns containing at least one non-numeric entry.
remove = [
    column for column in v.columns
    if not _all_floatable(v.loc[:, column].values)
]

# Drop the non-numeric descriptor columns from the full table.
v = md.drop(remove, axis=1)

# Unique (improve_drug_id, canSMILES) pairs from the joined drug table.
drug_pairs = joined_dataset3.drugs[['improve_drug_id', 'canSMILES']].drop_duplicates()
id2smiles = dict(drug_pairs.values)

# Inverse lookup: canonical SMILES -> improve_drug_id.
smiles2id = {smiles: drug_id for drug_id, smiles in id2smiles.items()}

# Tag every descriptor row with its drug id; an unknown SMILES raises KeyError.
v['improve_drug_id'] = [smiles2id[smiles] for smiles in v['smiles']]

# The SMILES string itself is not a model feature.
v2 = v.drop(columns=['smiles'])

# np.where(np.isnan(v3))
# sc = StandardScaler()


# Response table: sample id, drug id and the response value.
response_columns = ['improve_sample_id', 'improve_drug_id', 'dose_response_value']
data2 = ctrpv2[response_columns]

# Left-join the cell-line features, then the drug descriptors.
d2 = data2.merge(cl, how='left', on='improve_sample_id')
d3 = d2.merge(v2, how='left', on='improve_drug_id')

# Drop every row that has a missing value after the joins.
d3 = d3.dropna(axis=0)


# Fixed seed so the split, the training sub-sample and the model are
# reproducible from run to run.
RANDOM_STATE = 0

d3.reset_index(drop=True, inplace=True)

# Hold out 10% of the rows for testing.
train, test = train_test_split(d3, test_size=.1, random_state=RANDOM_STATE)

# Fit on a 10% sub-sample of the training rows only.
train_tmp = train.sample(frac=.1, random_state=RANDOM_STATE)

# The first three columns are improve_sample_id, improve_drug_id and
# dose_response_value; everything after them is a feature.
train_x = train_tmp.iloc[:, 3:]
test_x = test.iloc[:, 3:]

train_y = train_tmp.dose_response_value.values
test_y = test.dose_response_value.values

# Scale using statistics of the training features only, then apply the same
# transform to the test features.
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)

et = ExtraTreesRegressor(random_state=RANDOM_STATE)
et.fit(train_x, train_y)

# Keep the predictions and report test metrics; previously the prediction
# result was discarded, so the script produced no evaluation output.
test_pred = et.predict(test_x)
test_r2 = r2_score(test_y, test_pred)
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print(f"test R2: {test_r2:.4f}")
print(f"test RMSE: {test_rmse:.4f}")

0 comments on commit 4785962

Please sign in to comment.