Skip to content

Commit 8a43e06

Browse files
authoredSep 10, 2018
Qmlearn (qmlcode#82)
* Made base representations * started CM and data class * Working on generate routine * Working basic example * Mostly hacked the searchcv routines to work * Implementing atomic gaussian kernel * working atomic krr * Restructure and started global slatm * Slatm * Started acsf * stash before merging acsf bugfix * acsf bugfix cherrypick * sigma='auto' option added to kernels * Started fchl * Working fchl * Started preprocessing * Mostly working atom scaler * Made several attributes private * Restructured how the data object is passed, to avoid possible memory issues * Started alchemy in kernels * Minor change to kernel alchemy * Working feature trick in kernels * Cleaned up code * daily * Finished examples
1 parent df37c92 commit 8a43e06

30 files changed

+2621
-81
lines changed
 

‎docs/source/qml.rst

+11
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,14 @@ qml\.aglaia module
113113
:inherited-members:
114114

115115

116+
qml\.qmlearn\.representations module
117+
------------------------------------
118+
119+
.. automodule:: qml.qmlearn.representations
120+
:inherited-members:
121+
122+
qml\.qmlearn\.kernels module
123+
----------------------------
124+
125+
.. automodule:: qml.qmlearn.kernels
126+
:inherited-members:

‎examples/qmlearn.py

+287
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
import glob
2+
import numpy as np
3+
from qml import qmlearn
4+
import sklearn.pipeline
5+
import sklearn.model_selection
6+
7+
def data():
    """
    Show how the Data object is created and filled.

    Data plays the same role as the Compound class, except that it holds
    many compounds at once instead of a single one.
    """
    print("*** Begin data examples ***")

    xyz_pattern = "../test/qm7/00*.xyz"

    # Option 1: hand over an explicit (sorted) list of xyz files
    xyz_files = glob.glob(xyz_pattern)
    xyz_files.sort()
    from_list = qmlearn.Data(xyz_files)
    print("length of filenames", len(xyz_files))
    print("length of nuclear_charges", len(from_list.nuclear_charges))
    print("length of coordinates", len(from_list.coordinates))

    # Option 2: hand over the glob pattern itself
    from_pattern = qmlearn.Data(xyz_pattern)
    print("length of nuclear_charges", len(from_pattern.nuclear_charges))

    # Energies (or other molecular properties) can be attached to the object
    all_energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    from_pattern.set_energies(all_energies[:98])
    print("length of energies", len(from_pattern.energies))

    print("*** End data examples ***")
    print()
35+
36+
def preprocessing():
    """
    Show how energies are rescaled with the AtomScaler.
    """
    print("*** Begin preprocessing examples ***")

    # AtomScaler does a linear fit of the number of each element to the energy.
    dataset = qmlearn.Data("../test/qm7/*.xyz")
    raw_energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)

    # Variant 1: pass nuclear charges and energies explicitly
    print("Energies before rescaling", raw_energies[:3])
    scaler = qmlearn.preprocessing.AtomScaler()
    scaled_energies = scaler.fit_transform(dataset.nuclear_charges, raw_energies)
    print("Energies after rescaling", scaled_energies[:3])

    # Variant 2: pass a Data object that already carries its energies
    dataset.set_energies(raw_energies)
    scaled_dataset = qmlearn.preprocessing.AtomScaler().fit_transform(dataset)
    print("Energies after rescaling", scaled_dataset.energies[:3])

    print("*** End preprocessing examples ***")
    print()
59+
60+
def representations():
    """
    Show how representations are generated.

    Currently implemented representations are CoulombMatrix,
    AtomicCoulombMatrix, AtomicSLATM, GlobalSLATM, FCHLRepresentations and
    AtomCenteredSymmetryFunctions (BagOfBonds is still missing).
    """
    print("*** Begin representations examples ***")

    dataset = qmlearn.Data("../test/qm7/*.xyz")

    # Variant 1: configure the representation first, then feed it the data object
    generator = qmlearn.representations.CoulombMatrix(sorting='row-norm')
    all_representations = generator.generate(dataset)
    print("Shape of representations:", all_representations.shape)

    # Variant 2: give the data object at construction time and pass only the
    # indices of the molecules of interest to generate()
    generator = qmlearn.representations.CoulombMatrix(dataset)
    selected = [0, 5, 7, 16]
    subset_representations = generator.generate(selected)
    print("Shape of representations:", subset_representations.shape)

    print("*** End representations examples ***")
    print()
86+
87+
def kernels():
    """
    Show how kernel matrices are built.

    Currently implemented kernels are GaussianKernel, LaplacianKernel and
    FCHLKernel.
    """
    print("*** Begin kernels examples ***")

    dataset = qmlearn.Data("../test/qm7/*.xyz")
    dataset.set_energies(np.loadtxt("../test/data/hof_qm7.txt", usecols=1))

    # --- Global representations -------------------------------------------
    generator = qmlearn.representations.CoulombMatrix(dataset)
    molecule_ids = np.arange(100)
    reps = generator.generate(molecule_ids)

    kernel = qmlearn.kernels.GaussianKernel(sigma='auto')
    # A single argument gives the symmetric (train x train) kernel ...
    square = kernel.generate(reps[:80])
    print("Shape of symmetric kernels:", square.shape)

    # ... two arguments give the asymmetric kernel between the two sets
    rectangular = kernel.generate(reps[:80], reps[80:])
    print("Shape of asymmetric kernels:", rectangular.shape)

    # --- Atomic representations -------------------------------------------
    generator = qmlearn.representations.AtomicCoulombMatrix(dataset)
    molecule_ids = np.arange(100)
    reps = generator.generate(molecule_ids)

    kernel = qmlearn.kernels.GaussianKernel(sigma='auto')
    # The kernel has to be told that the representations are atomic
    square = kernel.generate(reps[:80], representation_type='atomic')
    print("Shape of symmetric kernels:", square.shape)

    rectangular = kernel.generate(reps[:80], reps[80:], representation_type='atomic')
    print("Shape of asymmetric kernels:", rectangular.shape)

    print("*** End kernels examples ***")
    print()
125+
126+
def models():
    """
    Regression models. Only KernelRidgeRegression implemented so far.

    Fits kernel ridge regression on 800 randomly chosen molecules and
    evaluates it on 200 held-out ones.
    """
    print("*** Begin models examples ***")

    n_train = 800
    n_total = 1000

    filenames = sorted(glob.glob("../test/qm7/*.xyz"))
    data = qmlearn.Data(filenames)
    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    model = qmlearn.representations.CoulombMatrix(data)

    # Shuffle the first 1000 molecule indices and split them into train/test
    indices = np.arange(n_total)
    np.random.shuffle(indices)
    train_indices = indices[:n_train]
    test_indices = indices[n_train:]

    # Rows of `representations` follow the shuffled order of `indices`
    representations = model.generate(indices)
    model = qmlearn.kernels.GaussianKernel(sigma='auto')
    symmetric_kernels = model.generate(representations[:n_train])
    asymmetric_kernels = model.generate(representations[:n_train], representations[n_train:])

    # Model can be fit giving kernel matrix and energies
    model = qmlearn.models.KernelRidgeRegression()
    model.fit(symmetric_kernels, energies[train_indices])
    print("Fitted KRR weights:", model.alpha[:3])

    # Predictions can be had from an asymmetric kernel
    predictions = model.predict(asymmetric_kernels)
    print("Predicted energies:", predictions[:3])
    # The predictions belong to the held-out molecules, so the reference
    # values must be taken from the test indices (not the training ones).
    print("True energies:", energies[test_indices[:3]])

    # Or the score (default negative mae) can be had directly
    scores = model.score(asymmetric_kernels, energies[test_indices])
    print("Negative MAE:", scores)

    print("*** End models examples ***")
    print()
163+
164+
def pipelines():
    """
    Show how the qmlearn steps are chained in scikit-learn pipelines.
    """
    print("*** Begin pipelines examples ***")

    # Chaining the steps in a scikit-learn pipeline is much easier than
    # calling them one by one.

    # Create data
    dataset = qmlearn.Data("../test/qm7/*.xyz")
    dataset.set_energies(np.loadtxt("../test/data/hof_qm7.txt", usecols=1))

    # The first run uses the default kernel settings.
    # Passing alchemy=False to kernels makes sure that the atomic kernel only compares C to C, H to H etc.
    # This will speed up kernels of some representations dramatically, but only works in pipelines
    runs = (
        ({}, "Negative MAE:"),
        ({'alchemy': False}, "Negative MAE without alchemy:"),
    )

    for kernel_options, label in runs:
        # Create model
        pipeline = sklearn.pipeline.make_pipeline(
            qmlearn.preprocessing.AtomScaler(dataset),
            qmlearn.representations.CoulombMatrix(),
            qmlearn.kernels.GaussianKernel(**kernel_options),
            qmlearn.models.KernelRidgeRegression(),
        )

        # Shuffle the first 1000 molecule indices
        shuffled = np.arange(1000)
        np.random.shuffle(shuffled)

        # Train on 800 molecules, score on the remaining 200
        pipeline.fit(shuffled[:800])
        score = pipeline.score(shuffled[800:])
        print(label, score)

    print("*** End pipelines examples ***")
    print()
215+
216+
def cross_validation():
    """
    Doing cross validation with qmlearn

    Runs a plain 3-fold cross validation, a grid search over hyper
    parameters, and a grid search that also selects the kernel type.
    """
    print("*** Begin CV examples ***")

    # Create data
    data = qmlearn.Data("../test/qm7/*.xyz")
    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
    data.set_energies(energies)

    # Create model
    model = sklearn.pipeline.make_pipeline(
        qmlearn.preprocessing.AtomScaler(data),
        qmlearn.representations.CoulombMatrix(),
        qmlearn.kernels.GaussianKernel(),
        qmlearn.models.KernelRidgeRegression(),
        # memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch
    )

    # Shuffle the first 1000 molecule indices
    indices = np.arange(1000)
    np.random.shuffle(indices)

    # 3-fold CV of a given model can easily be done
    scores = sklearn.model_selection.cross_validate(model, indices, cv=3)
    print("Cross-validated scores:", scores['test_score'])

    # Doing a grid search over hyper parameters.
    # make_pipeline names each step after its lowercased class name.
    params = {
        'gaussiankernel__sigma': [10, 30, 100],
        'kernelridgeregression__l2_reg': [1e-8, 1e-4],
    }

    grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
    grid.fit(indices)
    print("Best hyper parameters:", grid.best_params_)
    print("Best score:", grid.best_score_)

    # As an alternative the pipeline can be constructed with explicit step
    # names, which allows more complex CV (e.g. swapping out a whole step).
    # Create model
    model = sklearn.pipeline.Pipeline(
        [
            ('preprocess', qmlearn.preprocessing.AtomScaler(data)),
            ('representations', qmlearn.representations.CoulombMatrix()),
            ('kernel', qmlearn.kernels.GaussianKernel()),
            ('model', qmlearn.models.KernelRidgeRegression()),
        ],
        # memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch
    )

    # Doing a grid search over hyper parameters
    # including which kernel to use.
    # Each sigma is listed once: a repeated value only makes the search
    # refit an identical setting for every kernel/l2_reg/fold combination.
    params = {
        'kernel': [qmlearn.kernels.LaplacianKernel(), qmlearn.kernels.GaussianKernel()],
        'kernel__sigma': [10, 30, 100, 1000, 3000],
        'model__l2_reg': [1e-8, 1e-4],
    }

    grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
    grid.fit(indices)
    print("Best hyper parameters:", grid.best_params_)
    print("Best score:", grid.best_score_)

    print("*** End CV examples ***")
279+
280+
if __name__ == '__main__':
    # Run every example section, in the order they are defined above.
    example_sections = (
        data,
        preprocessing,
        representations,
        kernels,
        models,
        pipelines,
        cross_validation,
    )
    for run_section in example_sections:
        run_section()

‎qml/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
from . import arad
4141
from . import fchl
4242
from . import representations
43+
from . import qmlearn
44+
from . import utils
4345

4446
__author__ = "Anders S. Christensen"
4547
__copyright__ = "Copyright 2016"

‎qml/aglaia/aglaia.py

+43-43
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@
3030
import tensorflow as tf
3131
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
3232
from sklearn.base import BaseEstimator
33-
from qml.aglaia.symm_funct import generate_parkhill_acsf
34-
from qml.aglaia.utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \
35-
is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, is_none, \
33+
from .symm_funct import generate_parkhill_acsf
34+
from ..utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \
35+
is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, \
3636
check_global_representation, check_y, check_sizes, check_dy, check_classes, is_numeric_array, is_non_zero_integer, \
3737
is_positive_integer_or_zero_array, check_local_representation
3838

39-
from qml.aglaia.tf_utils import TensorBoardLogger
39+
from .tf_utils import TensorBoardLogger
4040

4141
try:
4242
from qml.data import Compound
@@ -580,7 +580,7 @@ def _set_slatm_parameters(self, params):
580580
self.slatm_parameters = {'slatm_sigma1': 0.05, 'slatm_sigma2': 0.05, 'slatm_dgrid1': 0.03, 'slatm_dgrid2': 0.03,
581581
'slatm_rcut': 4.8, 'slatm_rpower': 6, 'slatm_alchemy': False}
582582

583-
if not is_none(params):
583+
if params is not None:
584584
for key, value in params.items():
585585
if key in self.slatm_parameters:
586586
self.slatm_parameters[key] = value
@@ -597,7 +597,7 @@ def _set_acsf_parameters(self, params):
597597
self.acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2),
598598
'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0}
599599

600-
if not is_none(params):
600+
if params is not None:
601601
for key, value in params.items():
602602
if key in self.acsf_parameters:
603603
self.acsf_parameters[key] = value
@@ -658,7 +658,7 @@ def generate_compounds(self, filenames):
658658
"""
659659

660660
# Check that the number of properties match the number of compounds if the properties have already been set
661-
if is_none(self.properties):
661+
if self.properties is None:
662662
pass
663663
else:
664664
if self.properties.size == len(filenames):
@@ -683,18 +683,18 @@ def generate_representation(self, xyz=None, classes=None):
683683
:return: None
684684
"""
685685

686-
if is_none(self.compounds) and is_none(xyz) and is_none(classes):
686+
if self.compounds is None and xyz is None and classes is None:
687687
raise InputError("QML compounds need to be created in advance or Cartesian coordinates need to be passed in "
688688
"order to generate the representation.")
689689

690-
if not is_none(self.representation):
690+
if self.representation is not None:
691691
raise InputError("The representations have already been set!")
692692

693-
if is_none(self.compounds):
693+
if self.compounds is None:
694694

695695
self.representation, self.classes = self._generate_representations_from_data(xyz, classes)
696696

697-
elif is_none(xyz):
697+
elif xyz is None:
698698
# Make representations from compounds
699699

700700
self.representation, self.classes = self._generate_representations_from_compounds()
@@ -708,7 +708,7 @@ def set_properties(self, properties):
708708
:param y: array of properties of size (nsamples,)
709709
:type y: array
710710
"""
711-
if is_none(properties):
711+
if properties is None:
712712
raise InputError("Properties cannot be set to none.")
713713
else:
714714
if is_numeric_array(properties) and np.asarray(properties).ndim == 1:
@@ -725,10 +725,10 @@ def set_representations(self, representations):
725725
:type representations: numpy array of shape (n_samples, n_features) or (n_samples, n_atoms, n_features)
726726
"""
727727

728-
if not is_none(self.representation):
728+
if self.representation is not None:
729729
raise InputError("The representations have already been set!")
730730

731-
if is_none(representations):
731+
if representations is None:
732732
raise InputError("Descriptor cannot be set to none.")
733733
else:
734734
if is_numeric_array(representations):
@@ -745,7 +745,7 @@ def set_gradients(self, gradients):
745745
:return: None
746746
"""
747747

748-
if is_none(gradients):
748+
if gradients is None:
749749
raise InputError("Gradients cannot be set to none.")
750750
else:
751751
if is_numeric_array(gradients):
@@ -762,7 +762,7 @@ def set_classes(self, classes):
762762
:type classes: numpy array of shape (n_samples, n_atoms) of ints
763763
:return: None
764764
"""
765-
if is_none(classes):
765+
if classes is None:
766766
raise InputError("Classes cannot be set to none.")
767767
else:
768768
if is_positive_integer_array(classes):
@@ -1050,7 +1050,7 @@ def _initialise_representation(self, representation, parameters):
10501050
raise InputError("Unknown representation %s" % representation)
10511051
self.representation_name = representation.lower()
10521052

1053-
if not is_none(parameters):
1053+
if parameters is not None:
10541054
if not type(parameters) is dict:
10551055
raise InputError("The representation parameters passed should be either None or a dictionary.")
10561056

@@ -1060,7 +1060,7 @@ def _initialise_representation(self, representation, parameters):
10601060

10611061
else:
10621062

1063-
if not is_none(parameters):
1063+
if parameters is not None:
10641064
raise InputError("The representation %s does not take any additional parameters." % (self.representation_name))
10651065

10661066
def _set_representation(self, representation):
@@ -1098,7 +1098,7 @@ def _generate_representations_from_compounds(self):
10981098
:rtype: numpy array of shape (n_samples, n_features) and None
10991099
"""
11001100

1101-
if is_none(self.compounds):
1101+
if self.compounds is None:
11021102
raise InputError("This should never happen.")
11031103

11041104
n_samples = len(self.compounds)
@@ -1368,18 +1368,18 @@ def _check_inputs(self, x, y, dy, classes):
13681368
if not is_array_like(x):
13691369
raise InputError("x should be an array either containing indices or data.")
13701370

1371-
if not is_none(dy) and not is_none(classes):
1371+
if dy is not None and classes is not None:
13721372
raise InputError("MRMP estimator cannot predict gradients and do atomic decomposition.")
13731373

13741374
# Check if x is made up of indices or data
13751375
if is_positive_integer_or_zero_array(x):
13761376

1377-
if is_none(self.representation):
1378-
if is_none(self.compounds):
1377+
if self.representation is None:
1378+
if self.compounds is None:
13791379
raise InputError("No representations or QML compounds have been set yet.")
13801380
else:
13811381
self.representation, _ = self._generate_representations_from_compounds()
1382-
if is_none(self.properties):
1382+
if self.properties is None:
13831383
raise InputError("The properties need to be set in advance.")
13841384

13851385
approved_x = self.representation[x]
@@ -1391,7 +1391,7 @@ def _check_inputs(self, x, y, dy, classes):
13911391

13921392
else:
13931393

1394-
if is_none(y):
1394+
if y is None:
13951395
raise InputError("y cannot be of None type.")
13961396

13971397
approved_x = check_global_representation(x)
@@ -1420,18 +1420,18 @@ def _check_predict_input(self, x, classes):
14201420
if not is_array_like(x):
14211421
raise InputError("x should be an array either containing indices or data.")
14221422

1423-
if not is_none(classes):
1423+
if classes is not None:
14241424
raise InputError("MRMP estimator cannot do atomic decomposition.")
14251425

14261426
# Check if x is made up of indices or data
14271427
if is_positive_integer_or_zero_array(x):
14281428

1429-
if is_none(self.representation):
1430-
if is_none(self.compounds):
1429+
if self.representation is None:
1430+
if self.compounds is None:
14311431
raise InputError("No representations or QML compounds have been set yet.")
14321432
else:
14331433
self.representation, _ = self._generate_representations_from_compounds()
1434-
if is_none(self.properties):
1434+
if self.properties is None:
14351435
raise InputError("The properties need to be set in advance.")
14361436

14371437
approved_x = self.representation[x]
@@ -1586,7 +1586,7 @@ def _initialise_representation(self, representation, parameters):
15861586
raise InputError("Unknown representation %s" % representation)
15871587
self.representation_name = representation.lower()
15881588

1589-
if not is_none(parameters):
1589+
if parameters is not None:
15901590
if not type(parameters) is dict:
15911591
raise InputError("The representation parameters passed should be either None or a dictionary.")
15921592
self._check_representation_parameters(parameters)
@@ -1601,7 +1601,7 @@ def _initialise_representation(self, representation, parameters):
16011601

16021602
else:
16031603

1604-
if not is_none(parameters):
1604+
if parameters is not None:
16051605
raise InputError("The representation %s does not take any additional parameters." % (self.representation_name))
16061606

16071607
def _set_representation(self, representation):
@@ -1624,7 +1624,7 @@ def _generate_representations_from_data(self, xyz, classes):
16241624
:rtype: numpy arrays of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms)
16251625
"""
16261626

1627-
if is_none(classes):
1627+
if classes is None:
16281628
raise InputError("The classes need to be provided for the ARMP estimator.")
16291629
else:
16301630
if len(classes.shape) > 2 or np.all(xyz.shape[:2] != classes.shape):
@@ -1743,7 +1743,7 @@ def _generate_representations_from_compounds(self):
17431743
:rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms)
17441744
"""
17451745

1746-
if is_none(self.compounds):
1746+
if self.compounds is None:
17471747
raise InputError("QML compounds needs to be created in advance")
17481748

17491749
if self.representation_name == 'slatm':
@@ -2028,22 +2028,22 @@ def _check_inputs(self, x, y, dy, classes):
20282028
if not is_array_like(x):
20292029
raise InputError("x should be an array either containing indices or data.")
20302030

2031-
if not is_none(dy):
2031+
if dy is not None:
20322032
raise InputError("ARMP estimator cannot be used to predict gradients. Use ARMP_G estimator.")
20332033

20342034
# Check if x is made up of indices or data
20352035
if is_positive_integer_or_zero_array(x):
20362036

2037-
if is_none(self.representation):
2037+
if self.representation is None:
20382038

2039-
if is_none(self.compounds):
2039+
if self.compounds is None:
20402040
raise InputError("No representations or QML compounds have been set yet.")
20412041
else:
20422042
self.representation, self.classes = self._generate_representations_from_compounds()
20432043

2044-
if is_none(self.properties):
2044+
if self.properties is None:
20452045
raise InputError("The properties need to be set in advance.")
2046-
if is_none(self.classes):
2046+
if self.classes is None:
20472047
raise InputError("The classes need to be set in advance.")
20482048

20492049
approved_x = self.representation[x]
@@ -2055,9 +2055,9 @@ def _check_inputs(self, x, y, dy, classes):
20552055

20562056
else:
20572057

2058-
if is_none(y):
2058+
if y is None:
20592059
raise InputError("y cannot be of None type.")
2060-
if is_none(classes):
2060+
if classes is None:
20612061
raise InputError("ARMP estimator needs the classes to do atomic decomposition.")
20622062

20632063
approved_x = check_local_representation(x)
@@ -2089,12 +2089,12 @@ def _check_predict_input(self, x, classes):
20892089
# Check if x is made up of indices or data
20902090
if is_positive_integer_or_zero_array(x):
20912091

2092-
if is_none(self.representation):
2093-
if is_none(self.compounds):
2092+
if self.representation is None:
2093+
if self.compounds is None:
20942094
raise InputError("No representations or QML compounds have been set yet.")
20952095
else:
20962096
self.representation, self.classes = self._generate_representations_from_compounds()
2097-
if is_none(self.properties):
2097+
if self.properties is None:
20982098
raise InputError("The properties need to be set in advance.")
20992099

21002100
approved_x = self.representation[x]
@@ -2104,7 +2104,7 @@ def _check_predict_input(self, x, classes):
21042104

21052105
else:
21062106

2107-
if is_none(classes):
2107+
if classes is None:
21082108
raise InputError("ARMP estimator needs the classes to do atomic decomposition.")
21092109

21102110
approved_x = check_local_representation(x)

‎qml/arad/arad.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from .farad_kernels import fget_atomic_kernels_arad
3434
from .farad_kernels import fget_atomic_symmetric_kernels_arad
3535

36-
from qml.data.alchemy import PTP
36+
from qml.utils.alchemy import PTP
3737

3838
def getAngle(sp,norms):
3939
epsilon = 10.* np.finfo(float).eps

‎qml/data/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,3 @@
2323

2424
from .xyzdataprovider import XYZDataProvider
2525
from .compound import Compound
26-
from .alchemy import ELEMENT_NAME, NUCLEAR_CHARGE

‎qml/data/compound.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2016-2017 Anders Steen Christensen, Felix Faber, Lars Andersen Bratholm
3+
# Copyright (c) 2016-2018 Anders Steen Christensen, Felix Faber, Lars Andersen Bratholm
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@
2525
import numpy as np
2626
import collections
2727

28-
from .alchemy import NUCLEAR_CHARGE
28+
from ..utils import NUCLEAR_CHARGE
2929

3030
from ..representations import generate_coulomb_matrix
3131
from ..representations import generate_atomic_coulomb_matrix

‎qml/data/dataprovider.py

-1
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,3 @@ def get_properties(self, idx=None):
4444
def read_database(self, db_filename):
4545

4646
self.compounds = connect(db_filename)
47-

‎qml/data/xyzdataprovider.py

-1
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,3 @@ def add_structures(self, xyz_filenames):
4040
print(i, xyz_filename, self.properties[i])
4141
compound = read(xyz_filename)
4242
self.compounds.write(compound)
43-

‎qml/fchl/fchl_electric_field_kernels.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from .fchl_kernel_functions import get_kernel_parameters
3333

34-
from qml.data.alchemy import get_alchemy
34+
from qml.utils.alchemy import get_alchemy
3535

3636

3737
# def get_local_kernels_ef(A, B, verbose=False, df=0.01, ef_scaling=0.01,\

‎qml/fchl/fchl_force_kernels.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
from .fchl_kernel_functions import get_kernel_parameters
3939

40-
from qml.data.alchemy import get_alchemy
40+
from qml.utils.alchemy import get_alchemy
4141

4242

4343
def get_gaussian_process_kernels(A, B, verbose=False, dx=0.005, \

‎qml/fchl/fchl_kernels.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
from .fchl_kernel_functions import get_kernel_parameters
5454

55-
from qml.data.alchemy import get_alchemy
55+
from qml.utils.alchemy import get_alchemy
5656

5757

5858
def get_local_kernels(A, B, \

‎qml/fchl/fchl_representations.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
import numpy as np
2626
import copy
2727

28-
from qml.data.alchemy import get_alchemy
29-
from qml.data.alchemy import ELEMENT_NAME
28+
from qml.utils.alchemy import get_alchemy
29+
from qml.utils import ELEMENT_NAME
3030

3131
def generate_representation(coordinates, nuclear_charges,
3232
max_size=23, neighbors=23, cut_distance = 5.0, cell=None):

‎qml/fchl/fchl_scalar_kernels.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from .ffchl_module import fget_atomic_local_kernels_fchl
3737

3838
from .fchl_kernel_functions import get_kernel_parameters
39-
from qml.data.alchemy import get_alchemy
39+
from qml.utils.alchemy import get_alchemy
4040

4141

4242
def get_local_kernels(A, B, verbose=False,\

‎qml/kernels/kernels.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
import numpy as np
2626

27-
from .fkernels import fgaussian_kernel
27+
from .fkernels import fgaussian_kernel, fgaussian_kernel_symmetric
2828
from .fkernels import flaplacian_kernel
2929
from .fkernels import fgaussian_kernel_symmetric
3030
from .fkernels import flaplacian_kernel_symmetric
@@ -34,6 +34,7 @@
3434

3535
from .fkernels import fget_local_kernels_gaussian
3636
from .fkernels import fget_local_kernels_laplacian
37+
from .fkernels import fget_vector_kernels_gaussian, fget_vector_kernels_gaussian_symmetric
3738

3839
def laplacian_kernel(A, B, sigma):
3940
""" Calculates the Laplacian kernel matrix K, where :math:`K_{ij}`:
@@ -304,7 +305,7 @@ def get_local_kernels_gaussian(A, B, na, nb, sigmas):
304305

305306
nma = len(na)
306307
nmb = len(nb)
307-
308+
308309
sigmas = np.asarray(sigmas)
309310
nsigmas = len(sigmas)
310311

‎qml/models/kernelridge.py

-2
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,3 @@ def _save(self, path, save_kernel=False):
110110
if save_kernel:
111111
np.save(path + "/K.npy")
112112

113-
114-

‎qml/qmlearn/__init__.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2018 Lars Andersen Bratholm
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from .data import Data
24+
from . import representations
25+
from . import kernels
26+
from . import preprocessing
27+
from . import models

‎qml/qmlearn/data.py

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
2+
from __future__ import print_function
3+
4+
import glob
5+
import numpy as np
6+
from ..utils import NUCLEAR_CHARGE
7+
import copy
8+
9+
10+
class Data(object):
    """
    Container for the geometries and properties of a set of compounds.

    Temporary data class which should be replaced at some point by the ASE-interface.
    This could in principle also be replaced by a dictionary.

    ``shape``, ``take``, ``__getitem__`` and ``__len__`` exist so that the
    object can be handed to the scikit-learn cross-validation routines.
    """

    def __init__(self, filenames=None, property_type="energy"):
        """
        :param filenames: list of filenames or a string to be read by glob. e.g. 'dir/*.xyz'
        :type filenames: list, tuple or string
        :param property_type: What kind of property will be predicted ('energy')
        :type property_type: string
        """

        self.property_type = property_type

        self._set_ncompounds(0)
        self.coordinates = None
        self.nuclear_charges = None
        self.natoms = None
        self.energies = None

        if isinstance(filenames, str):
            filenames = sorted(glob.glob(filenames))
        # Tuples are accepted as well as lists; anything else leaves the
        # object empty.
        if isinstance(filenames, (list, tuple)):
            self._parse_xyz_files(filenames)

        # Overwritten in various parts of a standard prediction pipeline
        # so don't use these within the class
        #self._has_transformed_labels
        #self._representations
        #self._kernel
        #self._indices
        #self._representation_type
        #self._representation_short_name
        #self._representation_cutoff
        #self._representation_alchemy

    def _set_ncompounds(self, n):
        """
        Store the number of compounds together with the matching ``shape``.
        """
        self.ncompounds = n
        # Hack for sklearn CV
        self.shape = (n,)

    def take(self, i, axis=None):
        """
        Hack for sklearn CV.

        Returns a shallow copy of the object with ``_indices`` set to `i`.
        `axis` is accepted but not used.
        """
        other = copy.copy(self)
        other._indices = i
        return other

    # Hack for sklearn CV
    def __getitem__(self, i):
        """
        Return the index `i` itself rather than any stored data.
        """
        return i

    # Hack for sklearn CV but also convenience
    def __len__(self):
        """
        Number of selected compounds: ``len(_indices)`` when indices have
        been set, otherwise the total number of compounds.
        """
        if hasattr(self, '_indices'):
            return len(self._indices)
        return self.ncompounds

    # Hack for sklearn CV but also convenience
    def __eq__(self, other):
        """
        Overrides the == operator.

        Two objects are equal when they are of the same type, carry the same
        attribute names and every attribute refers to the identical object.
        """

        if type(self) != type(other):
            return False

        self_vars = vars(self)
        other_vars = vars(other)

        # Comparing the names (not just the number of attributes) avoids a
        # KeyError when both objects hold equally many, but different,
        # attributes.
        if set(self_vars) != set(other_vars):
            return False

        return all(val is other_vars[key] for key, val in self_vars.items())

    # Hack for sklearn CV but also convenience
    def __ne__(self, other):
        """
        Overrides the != operator (unnecessary in Python 3)
        """
        return not self.__eq__(other)

    def set_energies(self, energies):
        """
        Store `energies` as the ``energies`` attribute.
        """
        self.energies = energies

    def _parse_xyz_files(self, filenames):
        """
        Parse a list of xyz files.

        The first line of each file is read as the number of atoms, the
        second line is skipped and each of the following lines is expected
        to hold an element symbol followed by three coordinates.

        :param filenames: paths of the xyz files
        :type filenames: list
        :raises ValueError: if a file holds fewer or shorter atom lines than
            its first line announces
        """

        self._set_ncompounds(len(filenames))
        self.coordinates = np.empty(self.ncompounds, dtype=object)
        self.nuclear_charges = np.empty(self.ncompounds, dtype=object)
        self.natoms = np.empty(self.ncompounds, dtype=int)

        for i, filename in enumerate(filenames):
            with open(filename, "r") as f:
                lines = f.readlines()

            natoms = int(lines[0])
            atom_lines = lines[2:natoms + 2]

            # The arrays below are allocated uninitialised, so every row has
            # to be filled. Refuse incomplete files instead of keeping
            # arbitrary memory content as charges and coordinates.
            if len(atom_lines) < natoms:
                raise ValueError("Expected %d atoms in '%s' but found only %d atom lines"
                                 % (natoms, filename, len(atom_lines)))

            charges = np.empty(natoms, dtype=int)
            coordinates = np.empty((natoms, 3), dtype=float)

            for j, line in enumerate(atom_lines):
                tokens = line.split()

                if len(tokens) < 4:
                    raise ValueError("Could not parse atom %d in '%s': %r"
                                     % (j + 1, filename, line))

                charges[j] = NUCLEAR_CHARGE[tokens[0]]
                coordinates[j] = np.asarray(tokens[1:4], dtype=float)

            self.natoms[i] = natoms
            self.nuclear_charges[i] = charges
            self.coordinates[i] = coordinates

        # Try to convert dtype to int/float in cases where you have the
        # same molecule, just different conformers
        try:
            charges = np.asarray(list(self.nuclear_charges), dtype=int)
            coordinates = np.asarray(list(self.coordinates), dtype=float)
        except ValueError:
            # Molecules of different size cannot be stacked; keep the
            # object arrays.
            return

        # Assign only once both conversions succeeded so the two attributes
        # never end up with different layouts.
        self.nuclear_charges = charges
        self.coordinates = coordinates

‎qml/qmlearn/kernels.py

+845
Large diffs are not rendered by default.

‎qml/qmlearn/models.py

+161
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2018 Lars A. Bratholm
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from __future__ import division, absolute_import, print_function
24+
25+
import numpy as np
26+
from sklearn.base import BaseEstimator
27+
from sklearn.metrics import mean_absolute_error
28+
29+
from ..utils import is_numeric_array
30+
from .data import Data
31+
from ..math import cho_solve
32+
33+
class _BaseModel(BaseEstimator):
    """
    Base class for all regression models

    Subclasses implement `fit` and `predict` and provide a ``scoring``
    attribute, which `score` reads to select the metric.
    """

    _estimator_type = "regressor"

    def fit(self, X):
        """
        Fit the model. Has to be implemented by subclasses.
        """
        raise NotImplementedError

    def predict(self, X):
        """
        Make predictions. Has to be implemented by subclasses.
        """
        # Raise (rather than return) the exception so that a missing
        # implementation is not mistaken for a prediction.
        raise NotImplementedError

    def score(self, X, y=None):
        """
        Make predictions on `X` and return a score

        :param X: Data object
        :type X: object
        :param y: Energies
        :type y: array
        :return: score
        :rtype: float
        """

        # Make predictions
        y_pred = self.predict(X)

        # Get the true values
        if is_numeric_array(y):
            pass

        elif isinstance(X, Data):
            try:
                y = X.energies[X._indices]
            except Exception:
                print("No kernel energies found in data object in module %s" % self.__class__.__name__)
                raise SystemExit

        else:
            print("Expected variable 'X' to be Data object. Got %s" % str(X))
            raise SystemExit

        # Return the score
        scoring = self.scoring

        if scoring in ('mae', 'neg_mae', 'neg_log_mae'):
            mae = mean_absolute_error(y, y_pred)
            if scoring == 'mae':
                return mae
            if scoring == 'neg_mae':
                return - mae
            return - np.log(mae)

        if scoring in ('rmsd', 'neg_rmsd'):
            # Computed with numpy since no mean squared error function is
            # imported in this module.
            residuals = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
            rmsd = np.sqrt(np.mean(residuals ** 2))
            if scoring == 'rmsd':
                return rmsd
            return - rmsd

        # Fail loudly instead of silently returning None
        print("Unknown scoring function '%s' in module %s" % (str(scoring), self.__class__.__name__))
        raise SystemExit
87+
88+
class KernelRidgeRegression(_BaseModel):
    """
    Standard Kernel Ridge Regression using a cholesky solver
    """

    def __init__(self, l2_reg=1e-10, scoring='neg_mae'):
        """
        :param l2_reg: l2 regularization
        :type l2_reg: float
        :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae')
        :type scoring: string
        """
        self.l2_reg = l2_reg
        self.scoring = scoring

        # Regression coefficients; None until the model has been fit
        self.alpha = None

    def fit(self, X, y=None):
        """
        Fit the Kernel Ridge Regression model using a cholesky solver

        :param X: Data object or kernel
        :type X: object or array
        :param y: Energies
        :type y: array
        :return: The fitted model
        :rtype: object
        """

        if isinstance(X, Data):
            try:
                K, y = X._kernel, X.energies[X._indices]
            except Exception:
                print("No kernel matrix and/or energies found in data object in module %s" % self.__class__.__name__)
                raise SystemExit
        elif is_numeric_array(X) and X.ndim == 2 and X.shape[0] == X.shape[1] and y is not None:
            K = X
        else:
            print("Expected variable 'X' to be kernel matrix or Data object. Got %s" % str(X))
            raise SystemExit

        # The regularization is added to the diagonal in place to avoid
        # copying the full kernel. The original diagonal is put back
        # afterwards, so that fitting repeatedly on the same kernel does not
        # accumulate regularization in the caller's matrix.
        # NOTE(review): assumes cho_solve leaves the rest of K untouched - confirm.
        diagonal = np.diag_indices_from(K)
        original_diagonal = K[diagonal]  # fancy indexing returns a copy
        K[diagonal] += self.l2_reg

        try:
            self.alpha = cho_solve(K, y)
        finally:
            K[diagonal] = original_diagonal

        return self

    def predict(self, X):
        """
        Predict with the fitted Kernel Ridge Regression model

        :param X: Data object or kernel between the samples to predict and
            the training samples. A kernel whose second dimension does not
            match the number of coefficients is transposed if its first
            dimension does.
        :type X: object or array
        :return: Predictions
        :rtype: array
        """

        # Check if model has been fit
        if self.alpha is None:
            print("Error: The %s model has not been trained yet" % self.__class__.__name__)
            raise SystemExit

        if isinstance(X, Data):
            try:
                K = X._kernel
            except Exception:
                print("No kernel matrix found in data object in module %s" % self.__class__.__name__)
                raise SystemExit
        elif is_numeric_array(X) and X.ndim == 2 and X.shape[1] == self.alpha.size:
            K = X
        elif is_numeric_array(X) and X.ndim == 2 and X.shape[0] == self.alpha.size:
            K = X.T
        else:
            print("Expected variable 'X' to be kernel matrix or Data object. Got %s" % str(X))
            raise SystemExit

        return np.dot(K, self.alpha)

‎qml/qmlearn/preprocessing.py

+237
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2018 Lars Andersen Bratholm
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from __future__ import print_function
24+
25+
import copy
26+
27+
import numpy as np
28+
from sklearn.base import BaseEstimator
29+
from sklearn.linear_model import LinearRegression
30+
31+
from .data import Data
32+
from ..utils import is_numeric_array, get_unique, is_positive_integer_or_zero_array
33+
34+
class AtomScaler(BaseEstimator):
    """
    Subtracts any constant offset or linear dependency on the number of atoms of each element from the property
    """

    def __init__(self, data=None, elements='auto'):
        """
        :param data: Data object (optional)
        :type data: Data object
        :param elements: Elements to support. If `elements='auto'` try to determine this automatically.
        :type elements: array
        """
        # The Data object is stored by reference; shallow copies are made
        # whenever it is used.
        self._set_data(data)
        self.elements = elements

        # Initialize model
        self.model = LinearRegression()

    def _preprocess_input(self, X):
        """
        Convenience function that processes X in a way such that
        X can both be a data object, or an array of indices.

        :param X: Data object or integer array of indices
        :type X: Data object or array
        :return: Shallow copy of the Data object with `_indices` and
            `_has_transformed_labels` set
        :rtype: Data object
        """

        if isinstance(X, Data):

            self._check_data(X)

            data = copy.copy(X)

            # Part of the sklearn CV hack.
            if not hasattr(data, '_indices'):
                data._indices = np.arange(len(data))

            if hasattr(data, '_has_transformed_labels'):
                print("Error: Target data has already been transformed by %s" % self.__class__.__name__)
                raise SystemExit

            data._has_transformed_labels = self._new_label_mask(data)

        # Compare against None explicitly: the truth value of a Data object
        # is its length, which is zero when `_indices` is empty.
        elif self.data is not None and self.data.natoms is not None \
                and is_positive_integer_or_zero_array(X) \
                and np.max(X) < self.data.natoms.size:
            # A copy here might avoid some unintended behaviour
            # if multiple models is used sequentially.
            data = copy.copy(self.data)
            data._indices = np.asarray(X, dtype=int).ravel()

            if hasattr(data, '_has_transformed_labels'):
                if data._has_transformed_labels[data._indices].any():
                    print("Error: Target data has already been transformed by %s" % self.__class__.__name__)
                    raise SystemExit
                # The copy above is shallow, so this array is shared with
                # the stored Data object.
                data._has_transformed_labels[data._indices] = True
            else:
                data._has_transformed_labels = self._new_label_mask(data)

        else:
            print("Expected X to be array of indices or Data object. Got %s" % str(X))
            raise SystemExit

        return data

    def _new_label_mask(self, data):
        """
        Create a boolean mask over all energies that is True for `data._indices`.
        """
        # Sized by the energies and not by len(data), since the latter is
        # the number of indices once `_indices` has been set.
        mask = np.zeros(len(data.energies), dtype=bool)
        mask[data._indices] = True
        return mask

    def _check_data(self, X):
        """
        Exit with an error message unless `X` holds both molecules and energies.
        """
        if X.natoms is None:
            print("Error: Empty Data object passed to the %s transformer" % self.__class__.__name__)
            raise SystemExit

        if X.energies is None:
            print("Error: Expected Data object passed to the %s transformer to have non-empty attribute 'energies'" % self.__class__.__name__)
            raise SystemExit

    def _set_data(self, data):
        """
        Store `data` after validating it.
        """
        # The truth value of a Data object is its length, so None as well
        # as a Data object of length zero is stored without being checked.
        if data:
            self._check_data(data)
        self.data = data

    def _resolve_input(self, X, y):
        """
        Split the supported kinds of input into the Data object (or None),
        the nuclear charges and the values to transform.
        """
        if not isinstance(X, Data) and y is not None:
            return None, X, y

        data = self._preprocess_input(X)
        return data, data.nuclear_charges[data._indices], data.energies[data._indices]

    def _package_output(self, data, delta_y):
        """
        Return `delta_y` directly, or stored in `data` if a Data object is used.
        """
        if data is None:
            return delta_y

        # Force copy
        data.energies = data.energies.copy()
        data.energies[data._indices] = delta_y
        return data

    def fit_transform(self, X, y=None):
        """
        Fit and transform the data with a linear model.
        Supports three different types of input.
        1) X is a list of nuclear charges and y is values to transform.
        2) X is an array of indices of which to transform.
        3) X is a data object

        :param X: List with nuclear charges or Data object.
        :type X: list
        :param y: Values to transform
        :type y: array or None
        :return: Array of transformed values or Data object, depending on input
        :rtype: array or Data object
        """

        data, nuclear_charges, y = self._resolve_input(X, y)

        # np.ndim guards the comparison against array-like `elements`,
        # for which `== 'auto'` is not a single truth value.
        if np.ndim(self.elements) == 0 and self.elements == 'auto':
            self.elements = get_unique(nuclear_charges)
        else:
            self._check_elements(nuclear_charges)

        features = self._featurizer(nuclear_charges)

        delta_y = y - self.model.fit(features, y).predict(features)

        return self._package_output(data, delta_y)

    def _check_elements(self, nuclear_charges):
        """
        Check that the elements in the given nuclear_charges was
        included in the fit.
        """

        elements_transform = get_unique(nuclear_charges)
        if not np.isin(elements_transform, self.elements).all():
            print("Warning: Trying to transform molecules with elements",
                  "not included during fit in the %s method." % self.__class__.__name__,
                  "%s used in training but trying to transform %s" % (str(self.elements), str(elements_transform)))

    def _featurizer(self, X):
        """
        Get the counts of each element as features.

        Elements that are not part of `self.elements` are ignored.
        """

        element_to_index = {v: i for i, v in enumerate(self.elements)}
        features = np.zeros((len(X), len(self.elements)), dtype=int)

        for i, x in enumerate(X):
            for element, count in zip(*np.unique(x, return_counts=True)):
                j = element_to_index.get(element)
                if j is not None:
                    features[i, j] = count

        return features

    def transform(self, X, y=None):
        """
        Transform the data with the fitted linear model.
        Supports three different types of input.
        1) X is a list of nuclear charges and y is values to transform.
        2) X is an array of indices of which to transform.
        3) X is a data object

        :param X: List with nuclear charges or Data object.
        :type X: list
        :param y: Values to transform
        :type y: array or None
        :return: Array of transformed values or Data object, depending on input
        :rtype: array or Data object
        """

        data, nuclear_charges, y = self._resolve_input(X, y)

        self._check_elements(nuclear_charges)

        features = self._featurizer(nuclear_charges)

        delta_y = y - self.model.predict(features)

        return self._package_output(data, delta_y)
237+

‎qml/qmlearn/representations.py

+800
Large diffs are not rendered by default.

‎qml/representations/representations.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from .frepresentations import fgenerate_eigenvalue_coulomb_matrix
3333
from .frepresentations import fgenerate_bob
3434

35-
from qml.data.alchemy import NUCLEAR_CHARGE
35+
from qml.utils import NUCLEAR_CHARGE
3636

3737
from .slatm import get_boa
3838
from .slatm import get_sbop

‎qml/utils/__init__.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2017-2018 Silvia Amabilino, Lars Andersen Bratholm
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from . import alchemy
24+
from .utils import *
25+
from .alchemy import ELEMENT_NAME, NUCLEAR_CHARGE
File renamed without changes.

‎qml/aglaia/utils.py ‎qml/utils/utils.py

+25-17
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@ def is_positive_integer_or_zero(x):
4141
def is_string(x):
4242
return isinstance(x, str)
4343

44-
def is_none(x):
45-
return isinstance(x, type(None))
46-
4744
def is_dict(x):
4845
return isinstance(x, dict)
4946

@@ -88,10 +85,28 @@ def _is_integer_array(x):
8885
def is_positive_integer_array(x):
8986
return (_is_integer_array(x) and _is_positive_array(x))
9087

91-
9288
def is_positive_integer_or_zero_array(x):
9389
return (_is_integer_array(x) and _is_positive_or_zero_array(x))
9490

91+
def get_unique(x):
    """
    Collect the distinct items of a list of lists.

    :param x: iterable of iterables
    :return: list holding every distinct item once
    """
    unique = set()
    for sublist in x:
        unique.update(sublist)
    return list(unique)
97+
98+
def get_pairs(x):
    """
    Build every pair of an item with itself or a later item, e.g.
    x = [1,2,3] gives
    [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]]
    """
    return [[first, second]
            for start, first in enumerate(x)
            for second in x[start:]]
108+
109+
95110
# ------------- ** Checking inputs ** --------------------------
96111

97112
def check_global_representation(x):
@@ -157,14 +172,14 @@ def check_sizes(x, y=None, dy=None, classes=None):
157172
:return: None
158173
"""
159174

160-
if is_none(dy) and is_none(classes):
175+
if dy is None and classes is None:
161176

162177
if x.shape[0] != y.shape[0]:
163178
raise InputError("The descriptor and the properties should have the same first number of elements in the "
164179
"first dimension. Got %s and %s" % (x.shape[0], y.shape[0]))
165180

166-
elif is_none(y) and is_none(dy):
167-
if is_none(classes):
181+
elif y is None and dy is None:
182+
if classes is None:
168183
raise InputError("Only x is not none.")
169184
else:
170185
if x.shape[0] != classes.shape[0]:
@@ -173,7 +188,7 @@ def check_sizes(x, y=None, dy=None, classes=None):
173188
if x.shape[1] != classes.shape[1]:
174189
raise InputError("The number of atoms in the descriptor and in the classes is different: %s and %s." % (x.shape[1], classes.shape[1]))
175190

176-
elif is_none(dy) and not is_none(classes):
191+
elif dy is None and classes is not None:
177192

178193
if x.shape[0] != y.shape[0] or x.shape[0] != classes.shape[0]:
179194
raise InputError("All x, y and classes should have the first number of elements in the first dimension. Got "
@@ -202,7 +217,7 @@ def check_dy(dy):
202217
:return: numpy array of floats of shape (n_samples, n_atoms, 3)
203218
"""
204219

205-
if is_none(dy):
220+
if dy is None:
206221
approved_dy = dy
207222
else:
208223
if not is_array_like(dy):
@@ -227,7 +242,7 @@ def check_classes(classes):
227242
:return: numpy array of ints of shape (n_samples, n_atoms)
228243
"""
229244

230-
if is_none(classes):
245+
if classes is None:
231246
approved_classes = classes
232247
else:
233248
if not is_array_like(classes):
@@ -244,13 +259,6 @@ def check_classes(classes):
244259

245260
return approved_classes
246261

247-
248-
249-
250-
251-
252-
253-
254262
#
255263
#def _is_numeric_array(x):
256264
# try:

‎setup.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,9 @@ def setup_qml():
146146
'qml.kernels',
147147
'qml.math',
148148
'qml.representations',
149-
'qml.models',
149+
'qml.qmlearn',
150+
'qml.utils',
151+
'qml.models'
150152
],
151153

152154
# metadata

‎test/test_armp.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import numpy as np
2929
from qml.aglaia.aglaia import ARMP
30-
from qml.aglaia.utils import InputError
30+
from qml.utils import InputError
3131
import glob
3232
import os
3333

‎test/test_mrmp.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import numpy as np
2929
from qml.aglaia.aglaia import MRMP
30-
from qml.aglaia.utils import InputError
30+
from qml.utils import InputError
3131
import glob
3232
import os
3333
import shutil

‎test/test_neural_network.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
# TODO relative imports
3131
from qml.aglaia.aglaia import MRMP
32-
from qml.aglaia.utils import InputError
32+
from qml.utils import InputError
3333

3434

3535
# ------------ ** All functions to test the inputs to the classes ** ---------------

0 commit comments

Comments
 (0)
Please sign in to comment.