Skip to content
This repository was archived by the owner on Dec 8, 2024. It is now read-only.

Commit 9143138

Browse files
authored
Qmlearn (#82)
* Made base representations * started CM and data class * Working on generate routine * Working basic example * Mostly hacked the searchcv routines to work * Implementing atomic gaussian kernel * working atomic krr * Restructure and started global slatm * Slatm * Started acsf * stash before merging acsf bugfix * acsf bugfix cherrypick * sigma='auto' option added to kernels * Started fchl * Working fchl * Started preprocessing * Mostly working atom scaler * Made several attributes private * Restructured how the data object is passed, to avoid possible memory issues * Started alchemy in kernels * Minor change to kernel alchemy * Working feature trick in kernels * Cleaned up code * daily * Finished examples
1 parent 2f47e85 commit 9143138

30 files changed

+2621
-81
lines changed

docs/source/qml.rst

+11
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,14 @@ qml\.aglaia module
113113
:inherited-members:
114114

115115

116+
qml\.qmlearn.representations module
117+
-----------------------------------
118+
119+
.. automodule:: qml.qmlearn.representations
120+
:inherited-members:
121+
122+
qml\.qmlearn.kernels module
123+
---------------------------
124+
125+
.. automodule:: qml.qmlearn.kernels
126+
:inherited-members:

examples/qmlearn.py

+287
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
import glob
2+
import numpy as np
3+
from qml import qmlearn
4+
import sklearn.pipeline
5+
import sklearn.model_selection
6+
7+
def data():
    """
    Demonstrate the Data object.

    Loads a subset of the qm7 xyz files twice (once from an explicit list
    of filenames, once from a glob pattern), attaches energies to the
    resulting object and prints the length of each stored property.
    """
    print("*** Begin data examples ***")

    # Data plays the same role as the Compound class, except that
    # Compound describes one compound and Data describes many.

    # Variant 1: pass an explicit (sorted) list of xyz files
    xyz_files = sorted(glob.glob("../test/qm7/00*.xyz"))
    dataset = qmlearn.Data(xyz_files)
    print("length of filenames", len(xyz_files))
    print("length of nuclear_charges", len(dataset.nuclear_charges))
    print("length of coordinates", len(dataset.coordinates))

    # Variant 2: pass the glob pattern itself
    dataset = qmlearn.Data("../test/qm7/00*.xyz")
    print("length of nuclear_charges", len(dataset.nuclear_charges))

    # Energies (or other molecular properties) can be attached afterwards.
    # NOTE(review): 98 is presumably the number of files matching the
    # pattern above -- confirm against the test data.
    all_energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    dataset.set_energies(all_energies[:98])
    print("length of energies", len(dataset.energies))

    print("*** End data examples ***")
    print()
35+
36+
def preprocessing():
    """
    Demonstrate rescaling of energies with AtomScaler.

    Shows the two supported call styles: passing nuclear charges and
    energies explicitly, or passing a Data object that carries energies.
    """

    print("*** Begin preprocessing examples ***")

    # AtomScaler does a linear fit of the number of each element to the energy.
    dataset = qmlearn.Data("../test/qm7/*.xyz")
    raw_energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)

    # Call style 1: nuclear charges and energies as separate arguments
    print("Energies before rescaling", raw_energies[:3])
    scaler = qmlearn.preprocessing.AtomScaler()
    scaled = scaler.fit_transform(dataset.nuclear_charges, raw_energies)
    print("Energies after rescaling", scaled[:3])

    # Call style 2: a Data object holding the energies
    dataset.set_energies(raw_energies)
    scaled_dataset = qmlearn.preprocessing.AtomScaler().fit_transform(dataset)
    print("Energies after rescaling", scaled_dataset.energies[:3])

    print("*** End preprocessing examples ***")
    print()
59+
60+
def representations():
    """
    Demonstrate how representations are created.

    Currently implemented representations are CoulombMatrix,
    AtomicCoulombMatrix, AtomicSLATM, GlobalSLATM, FCHLRepresentations,
    AtomCenteredSymmetryFunctions. (BagOfBonds is still missing)
    """

    print("*** Begin representations examples ***")

    dataset = qmlearn.Data("../test/qm7/*.xyz")

    # Variant 1: configure the representation first, then generate it
    # from a data object.
    generator = qmlearn.representations.CoulombMatrix(sorting='row-norm')
    features = generator.generate(dataset)
    print("Shape of representations:", features.shape)

    # Variant 2: pass the data object at initialization and generate
    # representations for selected molecule indices only.
    generator = qmlearn.representations.CoulombMatrix(dataset)
    selected = [0, 5, 7, 16]
    features = generator.generate(selected)
    print("Shape of representations:", features.shape)

    print("*** End representations examples ***")
    print()
86+
87+
def kernels():
    """
    Demonstrate how kernels are created.

    Currently implemented kernels are GaussianKernel, LaplacianKernel,
    FCHLKernel. Kernels are built here both from global (CoulombMatrix)
    and from atomic (AtomicCoulombMatrix) representations.
    """

    print("*** Begin kernels examples ***")

    dataset = qmlearn.Data("../test/qm7/*.xyz")
    hof = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    dataset.set_energies(hof)

    # --- Kernels from global representations ---
    generator = qmlearn.representations.CoulombMatrix(dataset)
    subset = np.arange(100)
    features = generator.generate(subset)
    first, rest = features[:80], features[80:]

    kernel = qmlearn.kernels.GaussianKernel(sigma='auto')
    # A single argument gives the kernel of that set with itself
    k_sym = kernel.generate(first)
    print("Shape of symmetric kernels:", k_sym.shape)

    # Two arguments give the kernel between the two sets
    k_asym = kernel.generate(first, rest)
    print("Shape of asymmetric kernels:", k_asym.shape)

    # --- Kernels from atomic representations ---
    generator = qmlearn.representations.AtomicCoulombMatrix(dataset)
    subset = np.arange(100)
    features = generator.generate(subset)
    first, rest = features[:80], features[80:]

    kernel = qmlearn.kernels.GaussianKernel(sigma='auto')
    # The representation type has to be stated explicitly here
    k_sym = kernel.generate(first, representation_type='atomic')
    print("Shape of symmetric kernels:", k_sym.shape)

    k_asym = kernel.generate(first, rest, representation_type='atomic')
    print("Shape of asymmetric kernels:", k_asym.shape)

    print("*** End kernels examples ***")
    print()
125+
126+
def models():
    """
    Regression models. Only KernelRidgeRegression implemented so far.

    Generates Coulomb matrix representations for 1000 shuffled molecule
    indices, builds Gaussian kernels with the first 800 as training set
    and the remaining 200 as test set, fits a KernelRidgeRegression model
    and prints weights, predictions and the score.
    """

    print("*** Begin models examples ***")

    filenames = sorted(glob.glob("../test/qm7/*.xyz"))
    data = qmlearn.Data(filenames)
    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    model = qmlearn.representations.CoulombMatrix(data)

    # Create 1000 random indices and split them 800/200
    indices = np.arange(1000)
    np.random.shuffle(indices)
    train_indices = indices[:800]
    test_indices = indices[800:]

    # Rows of `representations` follow the order of `indices`
    representations = model.generate(indices)
    model = qmlearn.kernels.GaussianKernel(sigma='auto')
    symmetric_kernels = model.generate(representations[:800])
    asymmetric_kernels = model.generate(representations[:800], representations[800:])

    # Model can be fit giving kernel matrix and energies
    model = qmlearn.models.KernelRidgeRegression()
    model.fit(symmetric_kernels, energies[train_indices])
    print("Fitted KRR weights:", model.alpha[:3])

    # Predictions can be had from an asymmetric kernel
    predictions = model.predict(asymmetric_kernels)
    print("Predicted energies:", predictions[:3])
    # The predictions belong to the held-out molecules, so the reference
    # values must be taken from the test split, not the training split.
    print("True energies:", energies[test_indices[:3]])

    # Or the score (default negative mae) can be had directly
    scores = model.score(asymmetric_kernels, energies[test_indices])
    print("Negative MAE:", scores)

    print("*** End models examples ***")
    print()
163+
164+
def pipelines():
    """
    Demonstrate qmlearn inside scikit-learn pipelines.

    Two pipelines are fitted on 800 shuffled molecule indices and scored
    on 200 more: one with the default GaussianKernel and one with
    alchemy=False.
    """

    print("*** Begin pipelines examples ***")

    # Chaining the steps in a scikit-learn pipeline is much easier than
    # calling them one by one.

    # Create data
    dataset = qmlearn.Data("../test/qm7/*.xyz")
    hof = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    dataset.set_energies(hof)

    # Passing alchemy=False to kernels makes sure that the atomic kernel only
    # compares C to C, H to H etc. This will speed up kernels of some
    # representations dramatically, but only works in pipelines.
    runs = (
        ("Negative MAE:", {}),
        ("Negative MAE without alchemy:", {"alchemy": False}),
    )

    for label, kernel_options in runs:
        # Create model
        pipeline = sklearn.pipeline.make_pipeline(
            qmlearn.preprocessing.AtomScaler(dataset),
            qmlearn.representations.CoulombMatrix(),
            qmlearn.kernels.GaussianKernel(**kernel_options),
            qmlearn.models.KernelRidgeRegression(),
        )

        # Create 1000 random indices (reshuffled for every run)
        shuffled = np.arange(1000)
        np.random.shuffle(shuffled)

        pipeline.fit(shuffled[:800])
        print(label, pipeline.score(shuffled[800:]))

    print("*** End pipelines examples ***")
    print()
215+
216+
def cross_validation():
    """
    Doing cross validation with qmlearn

    Runs a 3-fold cross validation of a pipeline, a grid search over
    kernel width and regularization, and a second grid search that also
    varies which kernel class is used.
    """

    print("*** Begin CV examples ***")

    # Create data
    data = qmlearn.Data("../test/qm7/*.xyz")
    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    data.set_energies(energies)

    # Create model
    # Adding memory='/dev/shm/' to make_pipeline will cache the previous
    # steps to the virtual memory and might speed up gridsearch
    model = sklearn.pipeline.make_pipeline(
        qmlearn.preprocessing.AtomScaler(data),
        qmlearn.representations.CoulombMatrix(),
        qmlearn.kernels.GaussianKernel(),
        qmlearn.models.KernelRidgeRegression(),
    )

    # Create 1000 random indices
    indices = np.arange(1000)
    np.random.shuffle(indices)

    # 3-fold CV of a given model can easily be done
    scores = sklearn.model_selection.cross_validate(model, indices, cv=3)
    print("Cross-validated scores:", scores['test_score'])

    # Doing a grid search over hyper parameters.
    # make_pipeline names each step after its lowercased class name.
    params = {
        'gaussiankernel__sigma': [10, 30, 100],
        'kernelridgeregression__l2_reg': [1e-8, 1e-4],
    }

    grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
    grid.fit(indices)
    print("Best hyper parameters:", grid.best_params_)
    print("Best score:", grid.best_score_)

    # As an alternative the pipeline can be constructed slightly different,
    # which allows more complex CV
    # Create model
    # Adding memory='/dev/shm/' to Pipeline will cache the previous
    # steps to the virtual memory and might speed up gridsearch
    model = sklearn.pipeline.Pipeline([
        ('preprocess', qmlearn.preprocessing.AtomScaler(data)),
        ('representations', qmlearn.representations.CoulombMatrix()),
        ('kernel', qmlearn.kernels.GaussianKernel()),
        ('model', qmlearn.models.KernelRidgeRegression()),
    ])

    # Doing a grid search over hyper parameters
    # including which kernel to use.
    # Each sigma is listed once: a repeated value only re-fits the same
    # candidates and cannot change the best parameters found.
    params = {
        'kernel': [qmlearn.kernels.LaplacianKernel(), qmlearn.kernels.GaussianKernel()],
        'kernel__sigma': [10, 30, 100, 1000, 3000],
        'model__l2_reg': [1e-8, 1e-4],
    }

    grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
    grid.fit(indices)
    print("Best hyper parameters:", grid.best_params_)
    print("Best score:", grid.best_score_)

    print("*** End CV examples ***")
279+
280+
if __name__ == '__main__':
    # Run every example section, in the order they are defined above.
    for example in (
        data,
        preprocessing,
        representations,
        kernels,
        models,
        pipelines,
        cross_validation,
    ):
        example()

qml/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
from . import arad
4141
from . import fchl
4242
from . import representations
43+
from . import qmlearn
44+
from . import utils
4345

4446
__author__ = "Anders S. Christensen"
4547
__copyright__ = "Copyright 2016"

0 commit comments

Comments
 (0)