
Commit 44b6c3e

Silviaandersx and Silvia authored and committed
MRMP changes (#112)
* Corrected small bug in predict function
* Started updating so that the model can be trained after it has been reloaded
* Minor modifications
* Updated model so one can predict from xyz, and disabled shuffling in training because it leads to a problem with predictions
* Fix for the problem of shuffling
* Added some tests to make sure the predictions work
* Fixed a tensorboard problem
* The saving of the model doesn't cause an error if the directory already exists
* Fixed a bug that made a test fail
* Modified the name of a parameter
* Made modifications to make the symmetry functions more numerically stable
* Added a hack that makes ARMP work with fortran ACSF when there are padded representations. Currently works *ONLY* when there is one molecule for the whole data set.
* Corrected bug in score function for padded molecules
* Changes that make the model work quickly even when there is padding.
* Fixed discrepancies between fortran and TF acsf
* Corrected bug in setting of ACSF parameters
* Attempt at fixing issue #10
* Another attempt at fixing #10
* Removed a pointless line
* Set-up
* Added the graceful killer
* Modifications which prevent installation from breaking on BC4
* Modification to add neural networks to qmlearn
* Fix for issue #8
* Random comment
* Started including the atomic model
* Made the atomic neural network work
* Fixed a bug with the indices
* Now training and predictions don't use the default graph, to avoid problems
* Uncommented examples
* Removed unique_elements in data class. This can be stored in the NN class, but I might reverse the change later.
* Made tensorflow an optional dependency. The reason for this approach is that pip would just auto-install tensorflow and you might want the GPU version or your own compiled one.
* Made is_numeric non-private and removed legacy code
* Added 1d array util function
* Removed QML check and moved functions from utils to tf_utils
* Support for linear models (no hidden layers)
* Fixed import bug in tf_utils
* Added text to explain that you are scoring on the training set
* Restructure, but elements are still not working. Sorted elements.
* Moved documentation from init to class
* Constant features will now be removed at fit/predict time
* Moved get_batch_size back into utils, since it doesn't depend on tf
* Made the NeuralNetwork class compliant with sklearn. Cannot be any transforms of the input data.
* Fixed tests that didn't pass
* Fixed mistake in checks of set_classes() in ARMP
* Started fixing ARMP bugs for QM7
* Fixed bug in padding and added examples that give low errors
* Attempted fix to make representations single precision
* Hot fix for AtomScaler
* Minor bug fixes
* More bug fixes to make sure tests run
* Fixed some tests that had failures
* Reverted the fchl tests to original
* Fixed path in acsf test
* Re-added changes to tests
* Modifications after code review
* Version with the ACSF basis functions starting at 0.8 A
* Updated ACSF representations so that the minimum distance at which to start the binning can be set by the user
* Modified the name of the new parameter (minimum distance of the binning in ACSF)
* Added a function to the AtomScaler that enables reverting back
* Relaxed tolerance in tests
* Fixed bug in the padding of the representation in the ARMP network used in the pipeline
* Made a modification to how the Fortran ACSF are generated that helps with how much memory is used. Currently only float32 ACSF are available.
* Added a check to make sure there are no NaNs in the representations
* Small mistake corrected in aglaia
* Fixed extra space before -lpthread flag
* Removed what I added
* Implemented MRMP representations from xyz
* Generate atomic slatm from data
* Fixed typo
* Fixed problem with slatm and ARMP
* Fixed bug for MRMP tensorboard logger
* Actually fixed the tensorboard bug for MRMP and added tests to catch future errors
* Fixed another tensorboard bug
* Changed the behaviour of logging to tensorboard in MRMP
1 parent 3bd256b commit 44b6c3e
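
For orientation, here is a minimal usage sketch of the behaviour these changes target: fitting an MRMP model with tensorboard logging enabled, in the spirit of the new test_fit_4 added to test/test_mrmp.py below. The import path and the toy data are assumptions made for illustration, not something documented by this commit.

import shutil

import numpy as np

from qml.aglaia.aglaia import MRMP  # assumed import path, based on the file list below

# Toy data just to exercise the call: 20 "molecules" with a 10-feature
# representation each, and one energy per molecule.
descriptor = np.random.rand(20, 10)
energies = np.random.rand(20)

# Fit with tensorboard logging enabled, then clean up the log directory,
# mirroring the new test_fit_4 in test/test_mrmp.py.
estimator = MRMP(tensorboard=True, tensorboard_subdir="./tb_example")
estimator.fit(descriptor, energies)
shutil.rmtree("./tb_example")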

7 files changed: +110 -18 lines


qml/aglaia/aglaia.py

+39 -14

@@ -982,6 +982,22 @@ def _get_classes(self, indices):
 
         return np.asarray(zs, dtype=np.float32)
 
+    def _generate_compounds_from_data(self, xyz, classes):
+        """
+        This function generates the compounds from xyz data and nuclear charges.
+
+        :param xyz: cartesian coordinates
+        :type xyz: numpy array of shape (n_samples, n_atoms, 3)
+        :param classes: classes for atomic decomposition
+        :type classes: numpy array of shape (n_samples, n_atoms)
+        :return: array of compound objects
+        """
+        compounds = np.empty(xyz.shape[0], dtype=object)
+        for i in range(xyz.shape[0]):
+            compounds[i] = Compound()
+            compounds[i].set_compounds(xyz=xyz[i], zs=classes[i])
+        return compounds
+
     def predict(self, x, classes=None):
         """
         This function calls the predict function for either ARMP or MRMP.
@@ -1088,8 +1104,13 @@ def _generate_representations_from_data(self, xyz, classes, method):
         :type method: string
         :return: numpy array of shape (n_samples, n_features) and None
         """
-        # TODO implement
-        raise InputError("Not implemented yet. Use compounds.")
+
+        if method != "fortran":
+            raise NotImplementedError
+
+        self.compounds = self._generate_compounds_from_data(xyz, classes)
+
+        return self._generate_representations_from_compounds('fortran')
 
     def _generate_representations_from_compounds(self, method):
         """
@@ -1238,9 +1259,9 @@ def _fit(self, x, y, dy, classes):
                 opt, c = self.session.run([optimisation_op, cost], feed_dict=feed_dict)
                 avg_cost += c * batch_x.shape[0] / x_approved.shape[0]
 
-                if self.tensorboard:
+                if self.tensorboard and j == 0:
                     if i % self.tensorboard_logger_training.store_frequency == 0:
-                        self.tensorboard_logger_training.write_summary(self.session, feed_dict, i, j)
+                        self.tensorboard_logger_training.write_summary(self.session, i, feed_dict=feed_dict)
 
             self.training_cost.append(avg_cost)

@@ -1642,20 +1663,16 @@ def _generate_representations_from_data(self, xyz, classes, method):
         representation = None
 
         if self.representation_name == 'slatm':
-            # TODO implement
-            raise InputError("Slatm from data has not been implemented yet. Use Compounds.")
+            self.compounds = self._generate_compounds_from_data(xyz, classes)
+            representation, classes = self._generate_representations_from_compounds('fortran')
 
         elif self.representation_name == 'acsf':
             if method == 'tf':
                 representation = self._generate_acsf_tf(xyz, classes)
             else:
                 representation = self._generate_acsf_fortran(xyz, classes)
 
-        # Hotfix t make sure the representation is single precision
-        single_precision_representation = representation.astype(dtype=np.float32)
-        del representation
-
-        return single_precision_representation, classes
+        return representation, classes
 
     def _generate_acsf_tf(self, xyz, classes):
         """
@@ -1776,7 +1793,11 @@ def _generate_acsf_fortran(self, xyz, classes):
                 padded_g = np.zeros((initial_natoms, g.shape[-1]))
                 padded_g[:g.shape[0], :] = g
 
-                representation.append(padded_g)
+                # Hotfix to make sure the representation is single precision
+                single_precision_g = padded_g.astype(dtype=np.float32)
+                del padded_g
+
+                representation.append(single_precision_g)
 
             else:

@@ -1790,7 +1811,10 @@ def _generate_acsf_fortran(self, xyz, classes):
                                   eta3=self.acsf_parameters['eta'],
                                   zeta=self.acsf_parameters['zeta'])
 
-                representation.append(g)
+                single_precision_g = g.astype(dtype=np.float32)
+                del g
+
+                representation.append(single_precision_g)
 
         return np.asarray(representation)

@@ -2275,7 +2299,8 @@ def _fit_from_scratch(self, x, y, dy, classes):
         init = tf.global_variables_initializer()
         iterator_init = iterator.make_initializer(dataset, name="dataset_init")
 
-        self._build_model_from_xyz(self.n_atoms, element_weights, element_biases)
+        if self.representation_name == "acsf":
+            self._build_model_from_xyz(self.n_atoms, element_weights, element_biases)
 
         self.session = tf.Session()

qml/aglaia/tf_utils.py

+5 -2

@@ -55,10 +55,13 @@ def set_store_frequency(self, freq):
     def set_summary_writer(self, sess):
         self.summary_writer = tf.summary.FileWriter(logdir=self.path, graph=sess.graph)
 
-    def write_summary(self, sess, iteration):
+    def write_summary(self, sess, iteration, feed_dict=None):
 
         self.merged_summary = tf.summary.merge_all()
-        summary = sess.run(self.merged_summary)
+        if not isinstance(feed_dict, type(None)):
+            summary = sess.run(self.merged_summary, feed_dict)
+        else:
+            summary = sess.run(self.merged_summary)
         self.summary_writer.add_summary(summary, iteration)
         self.summary_writer.add_run_metadata(self.run_metadata, 'iteration %d' % (iteration))
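
The reason for the new optional feed_dict argument is that merged summaries which depend on graph placeholders cannot be evaluated without feeding those placeholders. The following is a standalone TensorFlow 1.x sketch of that situation; the placeholder, the cost and the log directory are illustrative only and are not taken from aglaia.

import numpy as np
import tensorflow as tf

# A summary that depends on a placeholder: running the merged summary
# without a feed_dict would fail, hence the optional argument above.
x = tf.placeholder(tf.float32, shape=(None, 3), name="x")
cost = tf.reduce_mean(tf.square(x))
tf.summary.scalar("cost", cost)

merged = tf.summary.merge_all()
with tf.Session() as sess:
    writer = tf.summary.FileWriter(logdir="./tb_sketch", graph=sess.graph)
    summary = sess.run(merged, feed_dict={x: np.random.rand(8, 3)})
    writer.add_summary(summary, 0)
    writer.close()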

qml/qmlearn/models.py

+2 -2

@@ -734,8 +734,8 @@ def _padding(self, representation, nuclear_charges):
             print("Trying to predict on larger molecules than given by the 'size' parameter at initialization")
             raise SystemExit
 
-        padded_rep = np.zeros((len(representation), max_n_atoms, representation[0].shape[1]))
-        padded_zs = np.zeros((len(representation), max_n_atoms))
+        padded_rep = np.zeros((len(representation), self.size, representation[0].shape[1]))
+        padded_zs = np.zeros((len(representation), self.size))
 
         for i in range(len(representation)):
             n_atoms = representation[i].shape[0]
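
The change above pads every representation to the fixed size chosen at initialization instead of to the largest molecule in the current batch, so arrays produced at predict time keep the shapes the network was built with. A small NumPy sketch of that padding, with purely illustrative shapes:

import numpy as np

size = 5  # maximum number of atoms fixed at model initialization
# Two molecules with 3 and 4 atoms, each atom described by 2 features.
representation = [np.ones((3, 2)), np.ones((4, 2))]
nuclear_charges = [np.array([6, 1, 1]), np.array([6, 1, 1, 1])]

padded_rep = np.zeros((len(representation), size, representation[0].shape[1]))
padded_zs = np.zeros((len(representation), size))

for i in range(len(representation)):
    n_atoms = representation[i].shape[0]
    padded_rep[i, :n_atoms] = representation[i]
    padded_zs[i, :n_atoms] = nuclear_charges[i]

print(padded_rep.shape, padded_zs.shape)  # (2, 5, 2) (2, 5)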

qml/qmlearn/representations.py

+5
@@ -710,6 +710,11 @@ def transform(self, X):
                 fgenerate_acsf(xyz, charge, self.elements, Rs, Rs, Ts,
                                eta, eta, zeta, self.cutoff, self.cutoff, n, size)))
 
+        # Check to make sure there are no NANs
+        # if np.any(np.isnan(representations)):
+        #     print("There are NANs in the representations.")
+        #     exit()
+
         data._representations = np.asarray(representations)
 
         return data

qml/utils/compound.py

+19
@@ -373,3 +373,22 @@ def read_xyz(self, filename):
             self.coordinates[i] = np.asarray(tokens[1:4], dtype=float)
 
         self.natypes = dict([(key, len(value)) for key,value in self.atomtype_indices.items()])
+
+    def set_compounds(self, xyz, zs):
+        """
+        Generating the compounds straight from XYZ rather than from the files.
+
+        :param xyz: coordinates
+        :type xyz: np array of shape (n_atoms, 3)
+        :param zs: nuclear charges
+        :type zs: np array of shape (n_atoms,)
+        :return: None
+        """
+
+        self.natoms = xyz.shape[0]
+        self.nuclear_charges = zs
+        self.coordinates = xyz
+        self.atomtypes = np.unique(zs)
+
+        self.name = "Compound"
+
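
A short usage sketch of the new method on a single molecule, again assuming the qml.utils.compound import path; the coordinates describe a rough, illustrative water-like geometry.

import numpy as np

from qml.utils.compound import Compound  # assumed import path

xyz = np.array([[0.00, 0.00, 0.00],
                [0.00, 0.00, 0.96],
                [0.93, 0.00, -0.24]])
zs = np.array([8, 1, 1])

mol = Compound()
mol.set_compounds(xyz=xyz, zs=zs)
print(mol.natoms, mol.atomtypes)  # 3 [1 8]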

test/test_armp.py

+23
@@ -162,6 +162,28 @@ def test_fit_3():
     estimator = ARMP()
     estimator.fit(x=descriptor, y=energies, classes=classes)
 
+def test_fit_4():
+    """
+    This function tests the second way of fitting the descriptor: the data is passed by storing the compounds in the
+    class.
+    """
+    test_dir = os.path.dirname(os.path.realpath(__file__))
+
+    data = np.load(test_dir + "/data/local_slatm_ch4cn_light.npz")
+    descriptor = data["arr_0"]
+    classes = data["arr_1"]
+    energies = data["arr_2"]
+
+    estimator = ARMP(tensorboard=True, tensorboard_subdir="./tb_test_4")
+    estimator.set_representations(representations=descriptor)
+    estimator.set_classes(classes=classes)
+    estimator.set_properties(energies)
+
+    idx = np.arange(0, 100)
+    estimator.fit(idx)
+
+    shutil.rmtree("./tb_test_4")
+
 def test_score_3():
     """
     This function tests that all the scoring functions work.

@@ -303,6 +325,7 @@ def test_retraining():
     test_fit_1()
     test_fit_2()
     test_fit_3()
+    test_fit_4()
     test_score_3()
     test_predict_3()
     test_predict_fromxyz()

test/test_mrmp.py

+17
@@ -169,6 +169,22 @@ def test_fit_3():
     estimator = MRMP()
     estimator.fit(descriptor, energies)
 
+def test_fit_4():
+    """
+    This function tests a third way of fitting the descriptor:
+    The data is passed directly to the fit function.
+    """
+    test_dir = os.path.dirname(os.path.realpath(__file__))
+
+    data = np.load(test_dir + "/data/CN_isopent_light_UCM.npz")
+    descriptor = data["arr_0"]
+    energies = data["arr_1"]
+
+    estimator = MRMP(tensorboard=True, tensorboard_subdir="./tb_test_4")
+    estimator.fit(descriptor, energies)
+
+    shutil.rmtree("./tb_test_4")
+
 def test_score():
     """
     This function tests that all the scoring functions work.

@@ -264,6 +280,7 @@ def test_load_external():
     test_fit_1()
     test_fit_2()
     test_fit_3()
+    test_fit_4()
     test_score()
     test_load_external()
     # test_get_params()
