You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have been researching learning graph neural networks recently and saw that your project ESPALOMA was replicated using your PHALKETHOH database. However, when I tried to replicate your ZINC dataset (which contains SDF files), I used a training script adapted from collection.py to read and convert SDFs. However, when adding the graph, I encountered an error stating that the dictionary xyz could not be found. I also encountered the same problem when using the PDBBIND dataset, After printing the dictionary for the key, it was found that there was no xyz. The image on the left shows phalkethoh imported as binary, while the image on the right shows the imported sdf of ligands in the pdbbind dataset. There is a lot of missing information on the right side of the graph
I don't know how to properly write the code for importing from sdf files to espgraph. I can import all the parameters of the molecules. I tried the method in collection.py in the project, but it doesn't work. Can you please advise me on how to write it?
Best wishes
`import sys
from os.path import exists
import os
from openff.toolkit.topology import Molecule
from rdkit import Chem
from rdkit.Chem import AllChem
import espaloma as esp
import torch
import csv
firs=-1
count = 0
fi = 0
gs = []
loss_fn = esp.metrics.GraphMetric(
base_metric=torch.nn.MSELoss(), # use mean-squared error loss
between=['u', "u_ref"], # between predicted and QM energies
level="g", # compare on graph level
)
for idx_epoch in range(10):
for g in ds_tr_loader:
optimizer.zero_grad()
if torch.cuda.is_available():
g = g.to("cuda:0")
g = espaloma_model(g)
loss = loss_fn(g)
loss.backward()
optimizer.step()
torch.save(espaloma_model.state_dict(), "%s.th" % idx_epoch)
# 保存模型参数
torch.save(espaloma_model.state_dict(), f"/home/twyuan/test/4/{idx_epoch}.th")
inspect_metric = esp.metrics.GraphMetric(
base_metric=torch.nn.L1Loss(), # use mean-squared error loss
between=['u', "u_ref"], # between predicted and QM energies
level="g", # compare on graph level
)
if torch.cuda.is_available():
g_vl = g_vl.to("cuda:0")
g_tr = g_tr.to("cuda:0")
loss_tr = []
loss_vl = []
for idx_epoch in range(10):
espaloma_model.load_state_dict(
torch.load("%s.th" % idx_epoch)
)
from matplotlib import pyplot as plt
plt.plot(loss_tr, label="train")
plt.plot(loss_vl, label="valid")
plt.yscale("log")
plt.legend()
plt.show()
plt.savefig('/home/twyuan/test/4/train_valid.png')
import torch
from openff.toolkit.topology import Molecule
from rdkit import Chem
import espaloma as esp
from espaloma.data.collection import zinc
import dgl
import numpy as np
def main():
# 假设你的数据文件在当前工作目录下,文件名为 data.zinc
ds = zinc(first=10) # 仅加载前10个分子
for item in ds:
print(item)
# 处理返回的 GraphDataset 对象
if isinstance(ds, esp.data.dataset.GraphDataset):
print("GraphDataset loaded successfully!")
# 这里可以根据需要使用 result 进行进一步的操作
typing = esp.graphs.legacy_force_field.LegacyForceField('gaff-1.81')
ds.apply(typing, in_place=True) # this modify the original data
ds_tr, ds_vl, ds_te = ds.split([8, 1, 1])
ds_tr = ds_tr.view('graph', batch_size=100, shuffle=True)
ds_te = ds_te.view('graph', batch_size=100)
ds_vl = ds_vl.view('graph', batch_size=100)
# define a layer
layer = esp.nn.layers.dgl_legacy.gn("SAGEConv")
# define a representation
representation = esp.nn.Sequential(
layer,
[128, "relu", 128, "relu", 128, "relu"],
)
# define a readout
readout = esp.nn.readout.node_typing.NodeTyping(
in_features=128,
n_classes=100
)
net = torch.nn.Sequential(
representation,
readout
)
loss_fn = esp.metrics.TypingAccuracy()
# define optimizer
optimizer = torch.optim.Adam(net.parameters(), 1e-5)
# train the model
for _ in range(3000):
for g in ds_tr:
optimizer.zero_grad()
net(g.heterograph)
loss = loss_fn(g.heterograph)
loss.backward()
optimizer.step()
if name == "main":
main()
`
The text was updated successfully, but these errors were encountered:
Dear espaloma devs,
I have been researching learning graph neural networks recently and saw that your project ESPALOMA was replicated using your PHALKETHOH database. However, when I tried to replicate your ZINC dataset (which contains SDF files), I used a training script adapted from collection.py to read and convert SDFs. However, when adding the graph, I encountered an error stating that the dictionary xyz could not be found. I also encountered the same problem when using the PDBBIND dataset, After printing the dictionary for the key, it was found that there was no xyz. The image on the left shows phalkethoh imported as binary, while the image on the right shows the imported sdf of ligands in the pdbbind dataset. There is a lot of missing information on the right side of the graph
I don't know how to properly write the code for importing from sdf files to espgraph. I can import all the parameters of the molecules. I tried the method in collection.py in the project, but it doesn't work. Can you please advise me on how to write it?
Best wishes
`import sys
from os.path import exists
import os
from openff.toolkit.topology import Molecule
from rdkit import Chem
from rdkit.Chem import AllChem
import espaloma as esp
import torch
import csv
firs=-1
count = 0
fi = 0
gs = []
定义解压目标文件夹
extract_folder = r'/home/twyuan/test/pdb/refined_set'
for root, dirs, files in os.walk(extract_folder):
for file in files:
# 确保文件是.sdf文件
if file.endswith('.sdf'):
sdf_file_path = os.path.join(root, file)
print(fi)
print(len(gs))
ds = esp.data.dataset.GraphDataset(gs)
#print (g.dict)
ds.shuffle(seed=2666)
ds_tr, ds_vl, ds_te = ds.split([8, 1, 1])
print(len(ds_tr)) # 打印数据集中的样本数量
print(len(ds_vl))
print(len(ds_te))
ds_tr_loader = ds_tr.view(batch_size=100, shuffle=True)
g_tr = next(iter(ds_tr.view(batch_size=len(ds_tr))))
g_vl = next(iter(ds_vl.view(batch_size=len(ds_vl))))
representation = esp.nn.Sequential(
layer=esp.nn.layers.dgl_legacy.gn("SAGEConv"), # use SAGEConv implementation in DGL
config=[128, "relu", 128, "relu", 128, "relu"], # 3 layers, 128 units, ReLU activation
)
readout = esp.nn.readout.janossy.JanossyPooling(
in_features=128, config=[128, "relu", 128, "relu", 128, "relu"],
out_features={ # define modular MM parameters Espaloma will assign
1: {"e": 1, "s": 1}, # atom hardness and electronegativity
2: {"log_coefficients": 2}, # bond linear combination, enforce positive
3: {"log_coefficients": 2}, # angle linear combination, enforce positive
4: {"k": 6}, # torsion barrier heights (can be positive or negative)
},
)
espaloma_model = torch.nn.Sequential(
representation, readout, esp.nn.readout.janossy.ExpCoefficients(),
esp.mm.geometry.GeometryInGraph(),
esp.mm.energy.EnergyInGraph(),
esp.mm.energy.EnergyInGraph(suffix="_ref"),
esp.nn.readout.charge_equilibrium.ChargeEquilibrium(),
)
if torch.cuda.is_available():
espaloma_model = espaloma_model.cuda()
loss_fn = esp.metrics.GraphMetric(
base_metric=torch.nn.MSELoss(), # use mean-squared error loss
between=['u', "u_ref"], # between predicted and QM energies
level="g", # compare on graph level
)
optimizer = torch.optim.Adam(espaloma_model.parameters(), 1e-4)
for idx_epoch in range(10):
for g in ds_tr_loader:
optimizer.zero_grad()
if torch.cuda.is_available():
g = g.to("cuda:0")
g = espaloma_model(g)
loss = loss_fn(g)
loss.backward()
optimizer.step()
torch.save(espaloma_model.state_dict(), "%s.th" % idx_epoch)
# 保存模型参数
torch.save(espaloma_model.state_dict(), f"/home/twyuan/test/4/{idx_epoch}.th")
画学习率和损失曲线图
plt.plot(learning_rates, label="Learning Rate")
plt.plot(losses, label="Loss")
plt.xlabel("Epoch")
plt.legend()
plt.show()
plt.savefig('/home/twyuan/test/4/learning_curve.png')
inspect_metric = esp.metrics.GraphMetric(
base_metric=torch.nn.L1Loss(), # use mean-squared error loss
between=['u', "u_ref"], # between predicted and QM energies
level="g", # compare on graph level
)
if torch.cuda.is_available():
g_vl = g_vl.to("cuda:0")
g_tr = g_tr.to("cuda:0")
loss_tr = []
loss_vl = []
for idx_epoch in range(10):
espaloma_model.load_state_dict(
torch.load("%s.th" % idx_epoch)
)
import numpy as np
loss_tr = np.array(loss_tr) * 627.5
loss_vl = np.array(loss_vl) * 627.5
from matplotlib import pyplot as plt
plt.plot(loss_tr, label="train")
plt.plot(loss_vl, label="valid")
plt.yscale("log")
plt.legend()
plt.show()
plt.savefig('/home/twyuan/test/4/train_valid.png')
from openff.toolkit.topology import Molecule
from rdkit import Chem
import espaloma as esp
from espaloma.data.collection import zinc
import dgl
import numpy as np
def main():
# 假设你的数据文件在当前工作目录下,文件名为 data.zinc
ds = zinc(first=10) # 仅加载前10个分子
for item in ds:
print(item)
# 处理返回的 GraphDataset 对象
if isinstance(ds, esp.data.dataset.GraphDataset):
print("GraphDataset loaded successfully!")
# 这里可以根据需要使用 result 进行进一步的操作
if name == "main":
main()
`
The text was updated successfully, but these errors were encountered: