-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
182 lines (152 loc) · 5.77 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from ase.db import connect
from ase import Atoms
import ase.db
from ase.data import covalent_radii
from ase.utils import basestring
from ase.constraints import FixAtoms
import itertools
import re
import numpy as np
import math
#Extracts molecular data
#Returns an array of dictionaries containing molecular formula, positions, and energies
#Indices
def extract_molecular_data(dbName, dx=0, useAseDistance=True, filterSigma=0, removeOutliers = True):
db = connect(dbName)
# use ase get_mic_distance() to get neighboring atoms
if useAseDistance == True:
return get_molecular_aseDist(db, dx, filterSigma)
data = []
for row in db.select(relaxed = True):
atoms = []
sz = len(row.numbers)
hydrogenIdx = sz - 1 # hydrogen is always the last one
for i in range(sz):
if dx > 0 and calculate_distance(row, i, hydrogenIdx) > dx:
continue
atom = {'num': row.numbers[i], 'position':row.positions[i]}
atoms.append(atom)
# print("orig size: %d after my filtering %d" % (sz, len(atoms)))
# print(atoms)
molecule = {'atoms': atoms, 'energy': row.energy}
data.append(molecule)
return data
def calculate_distance(row, atomA, atomB):
if atomA == atomB:
return 0
p1 = row.positions[atomA]
p2 = row.positions[atomB]
d = 0.0;
for i in range(3):
d = d + (p1[i]-p2[i])**2
# print ("dist %5.2f" % (math.sqrt(d)))
return math.sqrt(d)
#Calculates distance between origin and all the points in array points
def calculateDistances(origin, points):
distances = []
for point in points:
distances.append(calculateDistance(origin, point))
return distances[:-1]
#Calculates the distance between 2 3D points
def calculateDistance(p1, p2):
distance = None
squared_deltas = []
for i in range(len(p1)):
delta = math.fabs(np.subtract(p1[i], p2[i]))
delta_squared = delta * delta
squared_deltas.append(delta_squared)
delta_sums = 0
for delta in squared_deltas:
delta_sums += delta
return math.sqrt(delta_sums)
def get_mic_distance(p1, p2, cell, pbc):
""" This method calculates the shortest distance between p1 and p2
through the cell boundaries defined by cell and pbc.
This method works for reasonable unit cells, but not for extremely
elongated ones.
"""
ct = cell.T
pos = np.mat((p1, p2))
scaled = np.linalg.solve(ct, pos.T).T
for i in range(3):
if pbc[i]:
scaled[:, i] %= 1.0
scaled[:, i] %= 1.0
P = np.dot(scaled, cell)
pbc_directions = [[-1, 1] * int(direction) + [0] for direction in pbc]
translations = np.mat(list(itertools.product(*pbc_directions))).T
p0r = np.tile(np.reshape(P[0, :], (3, 1)), (1, translations.shape[1]))
p1r = np.tile(np.reshape(P[1, :], (3, 1)), (1, translations.shape[1]))
dp_vec = p0r + ct * translations
d = np.min(np.power(p1r - dp_vec, 2).sum(axis=0))**0.5
return d
def get_atom_neighborlist(atoms, centerAtom, dx=0.2, no_count_types=None):
"""
Method to get the a dict with list of neighboring
atoms defined as the two covalent radii + fixed distance.
Option added to remove neighbors between defined atom types.
"""
cell = atoms.get_cell()
pbc = atoms.get_pbc()
if no_count_types is None:
no_count_types = []
conn = []
atomi = centerAtom;
for atomj in atoms:
if atomi.index != atomj.index:
if atomi.number not in no_count_types:
if atomj.number not in no_count_types:
d = get_mic_distance(atomi.position,
atomj.position,
cell,
pbc)
cri = covalent_radii[atomi.number]
crj = covalent_radii[atomj.number]
d_max = crj + cri + dx
if d < d_max:
conn.append(atomj)
conn.append(centerAtom)
return conn
# use ase get_mic_distance() to get neighboring atoms
def get_molecular_aseDist(db, dx, filterSigma=0, removeOutliers=True):
data = []
for row in db.select(relaxed = True):
atoms = row.toatoms()
# H atom is always the last
nbAtoms = get_atom_neighborlist(atoms, atoms[len(atoms)-1], dx=dx)
atomData = []
for a in nbAtoms:
atomData.append( {'num': a.number, 'position':a.position} )
# print("orig size: %d after ase filtering %d" % (len(atoms), len(atomData)))
molecule = {'atoms': atomData, 'energy': row.energy}
if removeOutliers == True and math.fabs(molecule["energy"]) > 2:
print("Removed molecule with energy %s" % molecule["energy"])
continue
else:
data.append(molecule)
# filter out bad data if needed
if filterSigma > 0:
data = filter_by_sigma(data, filterSigma)
print('total number of data: %d' % (len(data)))
return data
# check if need to filter out bad data points out of number sigma
def filter_by_sigma(db, nSigma):
# save starting iterator
# calculate mean and sigma of atomization_energy
energyArr = []
for row in db:
energyArr.append(row["energy"])
mean = np.mean(energyArr)
sigma = np.std(energyArr)
# cut off at nSigma
leftCutOff = mean - nSigma*sigma
rightCutOff = mean + nSigma*sigma
# filter and create new db
newDb = []
for row in db:
en = row["energy"]
if ( en >= leftCutOff and en <= rightCutOff):
newDb.append(row)
else:
print('filter out row with engergy %f with sigma %f' % (en, nSigma))
return newDb