From d974133cb5d777015a3d83e962ad4ebd323576a7 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 24 Aug 2018 16:18:42 +0100 Subject: [PATCH 01/59] Corrected small bug in predict function --- qml/aglaia/aglaia.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index c174ce286..2f70b0ab4 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2094,8 +2094,6 @@ def _check_predict_input(self, x, classes): raise InputError("No representations or QML compounds have been set yet.") else: self.representation, self.classes = self._generate_representations_from_compounds() - if is_none(self.properties): - raise InputError("The properties need to be set in advance.") approved_x = self.representation[x] approved_classes = self.classes[x] From 8a3fbf3dc69fa0b214b6f07b681b902c237f93ae Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 24 Aug 2018 17:34:11 +0100 Subject: [PATCH 02/59] Started updating so that model can be trained after its been reloaded --- qml/aglaia/aglaia.py | 109 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 9 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 2f70b0ab4..646fc45d6 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -141,6 +141,9 @@ def __init__(self, hidden_layer_sizes, l1_reg, l2_reg, batch_size, learning_rate self.gradients = None self.classes = None + # To enable restart model + self.loaded_model = False + def _set_activation_function(self, activation_function): """ This function sets which activation function will be used in the model. @@ -2161,6 +2164,27 @@ def _find_elements(self, zs): return np.trim_zeros(elements) def _fit(self, x, y, dy, classes): + """ + This function calls either fit_from_scratch or fit_from_loaded depending on if a model has been loaded or not. + + :param x: either the representations or the indices to the data points to use + :type x: either a numpy array of shape (n_samples, n_atoms, n_features) or a numpy array of ints + :param y: either the properties or None + :type y: either a numpy array of shape (n_samples,) or None + :param dy: None + :type dy: None + :param classes: classes to use for the atomic decomposition or None + :type classes: either a numpy array of shape (n_samples, n_atoms) or None + + :return: None + """ + + if not self.loaded_model: + self._fit_from_scratch(x, y, classes, dy) + else: + self._fit_from_loaded(x, y, classes, dy) + + def _fit_from_scratch(self, x, y, dy, classes): """ This function fits an atomic decomposed network to the data. 
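[Editor's note] The restart feature started in this patch, and exercised by test_retraining in PATCH 06, is meant to support a fit / save / reload / keep-fitting workflow. The sketch below is adapted from that test; the toy arrays, the "temp" directory and the iteration count are illustrative only, and the keyword spelling follows the final state of this series (representation_name, renamed in PATCH 10):

    import numpy as np
    from qml.aglaia.aglaia import ARMP

    xyz = np.array([[[0, 1, 0], [0, 1, 1], [1, 0, 1]],
                    [[1, 2, 2], [3, 1, 2], [1, 3, 4]],
                    [[4, 1, 2], [0.5, 5, 6], [-1, 2, 3]]])
    zs = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    energies = np.array([0.5, 0.9, 1.0])
    idx = list(range(xyz.shape[0]))

    estimator = ARMP(iterations=10, representation_name='acsf')
    estimator.set_properties(energies)
    estimator.generate_representation(xyz, zs)
    estimator.fit(idx)                 # loaded_model is False: _fit_from_scratch
    estimator.save_nn(save_dir="temp")

    reloaded = ARMP(iterations=10, representation_name='acsf')
    reloaded.load_nn(save_dir="temp")  # sets loaded_model = True
    reloaded.set_properties(energies)
    reloaded.generate_representation(xyz, zs)
    reloaded.fit(idx)                  # now dispatches to _fit_from_loaded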
@@ -2199,9 +2223,9 @@ def _fit(self, x, y, dy, classes): # Initial set up of the NN with tf.name_scope("Data"): - x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features]) - zs_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms]) - y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1]) + x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features], name="Descriptors") + zs_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms], name="Atomic-numbers") + y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") dataset = tf.data.Dataset.from_tensor_slices((x_ph, zs_ph, y_ph)) dataset = dataset.shuffle(buffer_size=self.n_samples) @@ -2210,9 +2234,9 @@ def _fit(self, x, y, dy, classes): iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) tf_x, tf_zs, tf_y = iterator.get_next() - tf_x = tf.identity(tf_x, name="Descriptors") - tf_zs = tf.identity(tf_zs, name="Atomic-numbers") - tf_y = tf.identity(tf_y, name="Properties") + # tf_x = tf.identity(tf_x, name="Descriptors") + # tf_zs = tf.identity(tf_zs, name="Atomic-numbers") + # tf_y = tf.identity(tf_y, name="Properties") # Creating dictionaries of the weights and biases for each element element_weights = {} @@ -2233,16 +2257,17 @@ def _fit(self, x, y, dy, classes): with tf.name_scope("Cost_func"): cost = self._cost(molecular_energies, tf_y, element_weights) + print(cost) if self.tensorboard: cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) optimiser = self._set_optimiser() - optimisation_op = optimiser.minimize(cost) + optimisation_op = optimiser.minimize(cost, name="optimisation_op") # Initialisation of the variables init = tf.global_variables_initializer() - iterator_init = iterator.make_initializer(dataset) + iterator_init = iterator.make_initializer(dataset, name="dataset_init") self.session = tf.Session() @@ -2276,6 +2301,68 @@ def _fit(self, x, y, dy, classes): self.training_cost.append(avg_cost/n_batches) + def _fit_from_loaded(self, x, y, dy, classes): + """ + This function carries on fitting an atomic decomposed network to the data after it has been loaded. 
+ + :param x: either the representations or the indices to the data points to use + :type x: either a numpy array of shape (n_samples, n_atoms, n_features) or a numpy array of ints + :param y: either the properties or None + :type y: either a numpy array of shape (n_samples,) or None + :param dy: None + :type dy: None + :param classes: classes to use for the atomic decomposition or None + :type classes: either a numpy array of shape (n_samples, n_atoms) or None + + :return: None + """ + + x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) + + self.elements = self._find_elements(classes_approved) + + if self.tensorboard: + self.tensorboard_logger_training.initialise() + self.tensorboard_logger_training.set_summary_writer(self.session) + + self.n_samples = x_approved.shape[0] + self.n_atoms = x_approved.shape[1] + self.n_features = x_approved.shape[2] + + batch_size = self._get_batch_size() + n_batches = ceil(self.n_samples, batch_size) + + graph = tf.get_default_graph() + + with graph.as_default(): + # Reloading all the needed operations and tensors + tf_x = graph.get_tensor_by_name("Data/Descriptors:0") + tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + tf_ene = graph.get_tensor_by_name("Data/Properties:0") + cost = graph.get_tensor_by_name("Cost_func/add_13:0") + + optimisation_op = graph.get_operation_by_name("optimisation_op") + dataset_init_op = graph.get_operation_by_name("dataset_init") + + if self.tensorboard: + cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) + + for i in range(self.iterations): + self.session.run(dataset_init_op, feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved}) + + for j in range(n_batches): + if self.tensorboard: + self.session.run(optimisation_op, options=self.tensorboard_logger_training.options, + run_metadata=self.tensorboard_logger_training.run_metadata) + else: + self.session.run(optimisation_op) + + if self.tensorboard: + if i % self.tensorboard_logger_training.store_frequency == 0: + self.session.run(dataset_init_op, + feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved}) + self.tensorboard_logger_training.write_summary(self.session, i) + def _predict(self, x, classes): """ This function checks whether x contains indices or data. 
If it contains indices, the data is extracted by the @@ -2401,10 +2488,12 @@ def save_nn(self, save_dir="saved_model"): with graph.as_default(): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + true_ene = graph.get_tensor_by_name("Data/Properties:0") model = graph.get_tensor_by_name("Model/output:0") tf.saved_model.simple_save(self.session, export_dir=save_dir, - inputs={"Data/Descriptors:0": tf_x, "Data/Atomic-numbers:0": tf_zs}, + inputs={"Data/Descriptors:0": tf_x, "Data/Atomic-numbers:0": tf_zs, + "Data/Properties:0": true_ene}, outputs={"Model/output:0": model}) def load_nn(self, save_dir="saved_model"): @@ -2418,3 +2507,5 @@ def load_nn(self, save_dir="saved_model"): self.session = tf.Session(graph=tf.get_default_graph()) tf.saved_model.loader.load(self.session, [tf.saved_model.tag_constants.SERVING], save_dir) + + self.loaded_model = True From 7a2dc7ee82e8915af45aed5b3ca710e10ac0d0ca Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Sat, 25 Aug 2018 09:37:59 +0100 Subject: [PATCH 03/59] Minor modifications --- qml/aglaia/aglaia.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 646fc45d6..5dffcb12f 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2228,7 +2228,7 @@ def _fit_from_scratch(self, x, y, dy, classes): y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") dataset = tf.data.Dataset.from_tensor_slices((x_ph, zs_ph, y_ph)) - dataset = dataset.shuffle(buffer_size=self.n_samples) + dataset = dataset.shuffle(buffer_size=3*batch_size) dataset = dataset.batch(batch_size) # batched_dataset = dataset.prefetch(buffer_size=batch_size) @@ -2257,7 +2257,6 @@ def _fit_from_scratch(self, x, y, dy, classes): with tf.name_scope("Cost_func"): cost = self._cost(molecular_energies, tf_y, element_weights) - print(cost) if self.tensorboard: cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) From 5525d3b475ec496a4bcde8b9dc87c76685aa0cee Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Sun, 26 Aug 2018 11:50:50 +0100 Subject: [PATCH 04/59] Updated model so one can predict from xyz and disabled shuffling in training because it leads to a problem with predictions --- qml/aglaia/aglaia.py | 147 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 121 insertions(+), 26 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 5dffcb12f..ae4860d28 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1646,7 +1646,6 @@ def _generate_representations_from_data(self, xyz, classes): return representation, classes - # TODO modify so that it uses the new map function def _generate_acsf_from_data(self, xyz, classes): """ This function generates the acsf from the cartesian coordinates and the classes. 
@@ -1658,23 +1657,8 @@ def _generate_acsf_from_data(self, xyz, classes): :return: representation acsf :rtype: numpy array of shape (n_samples, n_atoms, n_features) """ - mbtypes = qml_rep.get_slatm_mbtypes([classes[i] for i in range(classes.shape[0])]) - elements = [] - element_pairs = [] - - # Splitting the one and two body interactions in mbtypes - for item in mbtypes: - if len(item) == 1: - elements.append(item[0]) - if len(item) == 2: - element_pairs.append(list(item)) - if len(item) == 3: - break - - # Need the element pairs in descending order for TF - for item in element_pairs: - item.reverse() + elements, element_pairs = self._get_elements_and_pairs(classes) if self.tensorboard: run_metadata = tf.RunMetadata() @@ -1958,7 +1942,7 @@ def _model(self, x, zs, element_weights, element_biases): :rtype: tf tensor of shape (n_samples, 1) """ - atomic_energies = tf.zeros_like(zs) + atomic_energies = tf.zeros_like(zs, dtype=tf.float32) for i in range(self.elements.shape[0]): @@ -1971,7 +1955,7 @@ def _model(self, x, zs, element_weights, element_biases): where_element = tf.equal(tf.cast(zs, dtype=tf.int32), current_element) # (n_samples, n_atoms) # Extracting the energies corresponding to the right element - element_energies = tf.where(where_element, atomic_energies_all, tf.zeros_like(zs)) + element_energies = tf.where(where_element, atomic_energies_all, tf.zeros_like(zs, dtype=tf.float32)) # Adding the energies of the current element to the final atomic energies tensor atomic_energies = tf.add(atomic_energies, element_energies) @@ -2147,6 +2131,36 @@ def _check_representation_parameters(self, parameters): except Exception: raise InputError("Unrecognised parameter for acsf representation: %s" % (key)) + def _get_elements_and_pairs(self, classes): + """ + This function generates the atom centred symmetry functions. + :param classes: The different types of atoms present in the system + :type classes: numpy array of shape (n_samples, n_atoms) + :return: elements and element pairs in the system + :rtype: numpy array of shape (n_elements,) and (n_element_pairs) + """ + + # Obtaining the total elements and the element pairs + mbtypes = qml_rep.get_slatm_mbtypes(classes) + + elements = [] + element_pairs = [] + + # Splitting the one and two body interactions in mbtypes + for item in mbtypes: + if len(item) == 1: + elements.append(item[0]) + if len(item) == 2: + element_pairs.append(list(item)) + if len(item) == 3: + break + + # Need the element pairs in descending order for TF + for item in element_pairs: + item.reverse() + + return np.asarray(elements), np.asarray(element_pairs) + def _find_elements(self, zs): """ This function finds the unique atomic numbers in Zs and returns them in a list. 
@@ -2203,7 +2217,7 @@ def _fit_from_scratch(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) # Obtaining the array of unique elements in all samples - self.elements = self._find_elements(classes_approved) + self.elements, self.element_pairs = self._get_elements_and_pairs(classes_approved) if self.tensorboard: self.tensorboard_logger_training.initialise() @@ -2228,15 +2242,12 @@ def _fit_from_scratch(self, x, y, dy, classes): y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") dataset = tf.data.Dataset.from_tensor_slices((x_ph, zs_ph, y_ph)) - dataset = dataset.shuffle(buffer_size=3*batch_size) + # dataset = dataset.shuffle(buffer_size=3*batch_size) dataset = dataset.batch(batch_size) # batched_dataset = dataset.prefetch(buffer_size=batch_size) iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) tf_x, tf_zs, tf_y = iterator.get_next() - # tf_x = tf.identity(tf_x, name="Descriptors") - # tf_zs = tf.identity(tf_zs, name="Atomic-numbers") - # tf_y = tf.identity(tf_y, name="Properties") # Creating dictionaries of the weights and biases for each element element_weights = {} @@ -2300,6 +2311,8 @@ def _fit_from_scratch(self, x, y, dy, classes): self.training_cost.append(avg_cost/n_batches) + self._build_model_from_xyz(self.n_atoms, element_weights, element_biases) + def _fit_from_loaded(self, x, y, dy, classes): """ This function carries on fitting an atomic decomposed network to the data after it has been loaded. @@ -2362,6 +2375,40 @@ def _fit_from_loaded(self, x, y, dy, classes): feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved}) self.tensorboard_logger_training.write_summary(self.session, i) + def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): + """ + This function builds a model that makes it possible to predict energies straight from xyz data. It constructs the + graph needed to do this. + + :param n_atoms: number of atoms + :param element_weights: the dictionary of the trained weights in the model + :param element_biases: the dictionary of trained biases in the model + """ + + with tf.name_scope("Inputs_pred"): + zs_tf = tf.placeholder(shape=[None, n_atoms], dtype=tf.int32, name="Classes") + xyz_tf = tf.placeholder(shape=[None, n_atoms, 3], dtype=tf.float32, name="xyz") + + dataset = tf.data.Dataset.from_tensor_slices((xyz_tf, zs_tf)) + dataset = dataset.batch(2) + iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) + batch_xyz, batch_zs = iterator.get_next() + iterator_init = iterator.make_initializer(dataset, name="dataset_init_pred") + + with tf.name_scope("Descriptor_pred"): + batch_representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=self.elements, + element_pairs=self.element_pairs, + radial_cutoff=self.acsf_parameters['radial_cutoff'], + angular_cutoff=self.acsf_parameters['angular_cutoff'], + radial_rs=self.acsf_parameters['radial_rs'], + angular_rs=self.acsf_parameters['angular_rs'], + theta_s=self.acsf_parameters['theta_s'], + eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) + + with tf.name_scope("Model_pred"): + batch_energies_nn = self._model(batch_representation, batch_zs, element_weights, element_biases) + def _predict(self, x, classes): """ This function checks whether x contains indices or data. 
If it contains indices, the data is extracted by the @@ -2378,6 +2425,7 @@ def _predict(self, x, classes): """ approved_x, approved_classes = self._check_predict_input(x, classes) + empty_ene = np.empty((approved_x.shape[0], 1)) if self.session == None: raise InputError("Model needs to be fit before predictions can be made.") @@ -2387,10 +2435,57 @@ def _predict(self, x, classes): with graph.as_default(): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + tf_true_ene = graph.get_tensor_by_name("Data/Properties:0") model = graph.get_tensor_by_name("Model/output:0") - y_pred = self.session.run(model, feed_dict={tf_x: approved_x, tf_zs:approved_classes}) + dataset_init_op = graph.get_operation_by_name("dataset_init") + self.session.run(dataset_init_op, feed_dict={tf_x: approved_x, tf_zs: approved_classes, tf_true_ene: empty_ene}) + + tot_y_pred = [] + + while True: + try: + y_pred = self.session.run(model) + tot_y_pred.append(y_pred) + except tf.errors.OutOfRangeError: + break + + return np.concatenate(tot_y_pred, axis=0) + + def predict_from_xyz(self, xyz, classes): + """ + This function takes in the cartesian coordinates and the atom types and returns energies. + + :param xyz: cartesian coordinates + :type xyz: numpy array of shape (n_samples, n_atoms, 3) + :param classes: atom types + :type classes: numpy array of shape (n_samples, n_atoms) + :return: energies + :rtype: numpy array of shape (n_samples,) + """ + + if self.session == None: + raise InputError("Model needs to be fit before predictions can be made.") + + graph = tf.get_default_graph() + + with graph.as_default(): + xyz_tf = graph.get_tensor_by_name("Inputs_pred/xyz:0") + classes_tf = graph.get_tensor_by_name("Inputs_pred/Classes:0") + ene_nn = graph.get_tensor_by_name("Model_pred/output:0") + dataset_init_op = graph.get_operation_by_name("Inputs_pred/dataset_init_pred") + self.session.run(dataset_init_op, + feed_dict={xyz_tf: xyz, classes_tf: classes}) + + tot_y_pred = [] + + while True: + try: + y_pred = self.session.run(ene_nn) + tot_y_pred.append(y_pred) + except tf.errors.OutOfRangeError: + break - return y_pred + return np.concatenate(tot_y_pred, axis=0).ravel() def _score_r2(self, x, y=None, dy=None, classes=None): """ From bb28c4a0d4bd7df616c519ae82dee0d757c96908 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 27 Aug 2018 13:56:19 +0100 Subject: [PATCH 05/59] Fix for the problem of shuffling --- qml/aglaia/aglaia.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index ae4860d28..d556d6b60 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2240,9 +2240,10 @@ def _fit_from_scratch(self, x, y, dy, classes): x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features], name="Descriptors") zs_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms], name="Atomic-numbers") y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") + buffer_tf = tf.placeholder(dtype=tf.int64, name="buffer") dataset = tf.data.Dataset.from_tensor_slices((x_ph, zs_ph, y_ph)) - # dataset = dataset.shuffle(buffer_size=3*batch_size) + dataset = dataset.shuffle(buffer_size=buffer_tf) dataset = dataset.batch(batch_size) # batched_dataset = dataset.prefetch(buffer_size=batch_size) @@ -2286,11 +2287,15 @@ def _fit_from_scratch(self, x, y, dy, classes): self.tensorboard_logger_training.set_summary_writer(self.session) 
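# [Editor's note] A standalone sketch of the pattern PATCH 05 introduces in
# this hunk: the shuffle buffer size is a tf.int64 placeholder that is fed
# each time the reinitialisable iterator is initialised, so the same pipeline
# can shuffle during training and run effectively unshuffled (buffer size 1)
# at prediction time. TensorFlow 1.x API; toy shapes and names are
# illustrative only.
#
#     import numpy as np
#     import tensorflow as tf
#
#     x_ph = tf.placeholder(tf.float32, shape=[None, 4])
#     buffer_ph = tf.placeholder(tf.int64, name="buffer")
#
#     dataset = tf.data.Dataset.from_tensor_slices(x_ph)
#     dataset = dataset.shuffle(buffer_size=buffer_ph).batch(2)
#     iterator = tf.data.Iterator.from_structure(dataset.output_types,
#                                                dataset.output_shapes)
#     next_batch = iterator.get_next()
#     init_op = iterator.make_initializer(dataset, name="dataset_init")
#
#     with tf.Session() as sess:
#         data = np.arange(20, dtype=np.float32).reshape(5, 4)
#         sess.run(init_op, feed_dict={x_ph: data, buffer_ph: 5})  # shuffled
#         print(sess.run(next_batch))
#         sess.run(init_op, feed_dict={x_ph: data, buffer_ph: 1})  # in order
#         print(sess.run(next_batch))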
self.session.run(init) - self.session.run(iterator_init, feed_dict={x_ph:x_approved, zs_ph:classes_approved, y_ph:y_approved}) for i in range(self.iterations): - self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved}) + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved, buffer_tf:buff}) avg_cost = 0 for j in range(n_batches): @@ -2304,9 +2309,12 @@ def _fit_from_scratch(self, x, y, dy, classes): # This seems to run the iterator.get_next() op, which gives problems with end of sequence # Hence why I re-initialise the iterator - self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved}) + if self.tensorboard: if i % self.tensorboard_logger_training.store_frequency == 0: + self.session.run(iterator_init, + feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved, + buffer_tf: buff}) self.tensorboard_logger_training.write_summary(self.session, i) self.training_cost.append(avg_cost/n_batches) @@ -2436,9 +2444,10 @@ def _predict(self, x, classes): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") tf_true_ene = graph.get_tensor_by_name("Data/Properties:0") + tf_buffer = graph.get_tensor_by_name("Data/buffer:0") model = graph.get_tensor_by_name("Model/output:0") dataset_init_op = graph.get_operation_by_name("dataset_init") - self.session.run(dataset_init_op, feed_dict={tf_x: approved_x, tf_zs: approved_classes, tf_true_ene: empty_ene}) + self.session.run(dataset_init_op, feed_dict={tf_x: approved_x, tf_zs: approved_classes, tf_true_ene: empty_ene, tf_buffer:1}) tot_y_pred = [] From 4b29b95755572840e85ef57afe9e77975cb2e63d Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 27 Aug 2018 15:16:20 +0100 Subject: [PATCH 06/59] Added some tests to make sure the predictions work --- qml/aglaia/aglaia.py | 13 ++++-- test/test_armp.py | 102 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 4 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index d556d6b60..1414c322d 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1713,11 +1713,9 @@ def _generate_acsf_from_data(self, xyz, classes): representation_slices.append(representation_np) batch_counter += 1 except tf.errors.OutOfRangeError: - print("Generated all the representations.") break representation_conc = np.concatenate(representation_slices, axis=0) - print("The representation has shape %s." 
% (str(representation_conc.shape))) sess.close() @@ -2360,6 +2358,7 @@ def _fit_from_loaded(self, x, y, dy, classes): tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") tf_ene = graph.get_tensor_by_name("Data/Properties:0") cost = graph.get_tensor_by_name("Cost_func/add_13:0") + tf_buffer = graph.get_tensor_by_name("Data/buffer:0") optimisation_op = graph.get_operation_by_name("optimisation_op") dataset_init_op = graph.get_operation_by_name("dataset_init") @@ -2368,7 +2367,13 @@ def _fit_from_loaded(self, x, y, dy, classes): cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) for i in range(self.iterations): - self.session.run(dataset_init_op, feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved}) + + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + self.session.run(dataset_init_op, feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved, tf_buffer: buff}) for j in range(n_batches): if self.tensorboard: @@ -2380,7 +2385,7 @@ def _fit_from_loaded(self, x, y, dy, classes): if self.tensorboard: if i % self.tensorboard_logger_training.store_frequency == 0: self.session.run(dataset_init_op, - feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved}) + feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved, tf_buffer: buff}) self.tensorboard_logger_training.write_summary(self.session, i) def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): diff --git a/test/test_armp.py b/test/test_armp.py index b7bc3ddad..47b12998c 100644 --- a/test/test_armp.py +++ b/test/test_armp.py @@ -30,6 +30,7 @@ from qml.aglaia.utils import InputError import glob import os +import shutil def test_set_representation(): """ @@ -198,6 +199,105 @@ def test_predict_3(): assert energies.shape == energies_pred.shape +def test_predict_fromxyz(): + """ + This test checks that the predictions from the "predict" and the "predict_from_xyz" functions are the same. + It also checks that if the model is saved, when the model is reloaded the predictions are still the same. 
+ """ + + xyz = np.array([[[0, 1, 0], [0, 1, 1], [1, 0, 1]], + [[1, 2, 2], [3, 1, 2], [1, 3, 4]], + [[4, 1, 2], [0.5, 5, 6], [-1, 2, 3]]]) + zs = np.array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + ene_true = np.array([0.5, 0.9, 1.0]) + + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + "theta_s": np.arange(0, 3.14, 3)}) + + estimator.set_properties(ene_true) + estimator.generate_representation(xyz, zs) + + idx = list(range(xyz.shape[0])) + + estimator.fit(idx) + + pred1 = estimator.predict(idx) + pred2 = estimator.predict_from_xyz(xyz, zs) + + assert np.all(pred1 == pred2) + + estimator.save_nn(save_dir="temp") + + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + "theta_s": np.arange(0, 3.14, 3)}) + + new_estimator.load_nn(save_dir="temp") + + new_estimator.set_properties(ene_true) + new_estimator.generate_representation(xyz, zs) + + pred3 = new_estimator.predict(idx) + pred4 = new_estimator.predict_from_xyz(xyz, zs) + + assert np.all(pred3 == pred4) + assert np.all(pred1 == pred3) + + shutil.rmtree("temp") + +def test_retraining(): + xyz = np.array([[[0, 1, 0], [0, 1, 1], [1, 0, 1]], + [[1, 2, 2], [3, 1, 2], [1, 3, 4]], + [[4, 1, 2], [0.5, 5, 6], [-1, 2, 3]]]) + zs = np.array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + ene_true = np.array([0.5, 0.9, 1.0]) + + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + "theta_s": np.arange(0, 3.14, 3)}) + + estimator.set_properties(ene_true) + estimator.generate_representation(xyz, zs) + + idx = list(range(xyz.shape[0])) + + estimator.fit(idx) + estimator.save_nn(save_dir="temp") + + pred1 = estimator.predict(idx) + + estimator.loaded_model = True + + estimator.fit(idx) + + pred2 = estimator.predict(idx) + + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + "theta_s": np.arange(0, 3.14, 3)}) + new_estimator.set_properties(ene_true) + new_estimator.generate_representation(xyz, zs) + + new_estimator.load_nn("temp") + + pred3 = new_estimator.predict(idx) + + new_estimator.fit(idx) + + pred4 = new_estimator.predict(idx) + + assert np.all(pred1 == pred3) + assert np.all(pred2 == pred4) + + shutil.rmtree("temp") + if __name__ == "__main__": test_set_representation() test_set_properties() @@ -207,3 +307,5 @@ def test_predict_3(): test_fit_3() test_score_3() test_predict_3() + test_predict_fromxyz() + test_retraining() \ No newline at end of file From a4dc49604ba662dcfa8da97f8c5f03b65f08dd75 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 27 Aug 2018 15:22:12 +0100 Subject: [PATCH 07/59] Fixed a tensorboard problem --- qml/aglaia/aglaia.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 1414c322d..a55524e5a 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1661,8 +1661,7 @@ def _generate_acsf_from_data(self, xyz, classes): elements, element_pairs = self._get_elements_and_pairs(classes) if self.tensorboard: - run_metadata = tf.RunMetadata() - 
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + self.tensorboard_logger_representation.initialise() n_samples = xyz.shape[0] max_n_atoms = xyz.shape[1] @@ -1692,14 +1691,14 @@ def _generate_acsf_from_data(self, xyz, classes): representation_slices = [] if self.tensorboard: - summary_writer = tf.summary.FileWriter(logdir="tensorboard", graph=sess.graph) + self.tensorboard_logger_representation.set_summary_writer(sess) batch_counter = 0 while True: try: - representation_np = sess.run(representation, options=options, run_metadata=run_metadata) - summary_writer.add_run_metadata(run_metadata=run_metadata, tag="batch %s" % batch_counter, - global_step=None) + representation_np = sess.run(representation, options=self.tensorboard_logger_representation.options, + run_metadata=self.tensorboard_logger_representation.run_metadata) + self.tensorboard_logger_representation.write_metadata(batch_counter) representation_slices.append(representation_np) batch_counter += 1 except tf.errors.OutOfRangeError: From ef5c880c325705e7d1aeba5d3bd70a3451308fb1 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Tue, 28 Aug 2018 07:46:16 +0100 Subject: [PATCH 08/59] The saving of the model doesn't cause an error if the directory already exists --- qml/aglaia/aglaia.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index a55524e5a..aed68d216 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2587,6 +2587,15 @@ def save_nn(self, save_dir="saved_model"): :return: None """ + counter = 0 + dir = save_dir + while True: + if os.path.isdir(save_dir): + counter += 1 + save_dir = dir + "_" + str(counter) + else: + break + if self.session == None: raise InputError("Model needs to be fit before predictions can be made.") From d1e27ad45cce7eaa62ad5936b45b5fc6d9bca960 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Tue, 28 Aug 2018 08:03:17 +0100 Subject: [PATCH 09/59] Fixed a bug that made a test fail --- qml/aglaia/aglaia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index aed68d216..f2cbd2c9f 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2191,9 +2191,9 @@ def _fit(self, x, y, dy, classes): """ if not self.loaded_model: - self._fit_from_scratch(x, y, classes, dy) + self._fit_from_scratch(x, y, dy, classes) else: - self._fit_from_loaded(x, y, classes, dy) + self._fit_from_loaded(x, y, dy, classes) def _fit_from_scratch(self, x, y, dy, classes): """ From f3a3e9c54ea96cb1d96f3828d0d29ac9a1c77d0f Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 29 Aug 2018 17:29:01 +0100 Subject: [PATCH 10/59] Modified the name of a parameter --- examples/ARMP_1.py | 2 +- examples/MRMP_1.py | 2 +- qml/aglaia/aglaia.py | 8 ++++---- test/test_armp.py | 26 +++++++++++++------------- test/test_mrmp.py | 12 ++++++------ 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/examples/ARMP_1.py b/examples/ARMP_1.py index a97bb8872..f368646df 100644 --- a/examples/ARMP_1.py +++ b/examples/ARMP_1.py @@ -39,7 +39,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=10, representation='acsf', representation_params={"radial_rs": np.arange(0, 10, 1), "angular_rs": np.arange(0.5, 10.5, 1), +estimator = ARMP(iterations=10, representation_name='acsf', representation_params={"radial_rs": np.arange(0, 10, 1), "angular_rs": np.arange(0.5, 10.5, 1), "theta_s": np.arange(0, 5, 1)}, tensorboard=False) 
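# [Editor's note] PATCH 10 renames the `representation` keyword argument to
# `representation_name` on both MRMP and ARMP, as the updated call above shows.
# Existing user code needs the corresponding one-word change (sketch only;
# with the new signature the old keyword raises a TypeError as an unexpected
# keyword argument):
#
#     estimator = ARMP(representation_name='acsf')  # was: ARMP(representation='acsf')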
estimator.generate_compounds(filenames) diff --git a/examples/MRMP_1.py b/examples/MRMP_1.py index f91b9daeb..74449e87f 100644 --- a/examples/MRMP_1.py +++ b/examples/MRMP_1.py @@ -41,7 +41,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = MRMP(representation='slatm', representation_params={'slatm_dgrid2': 0.06, 'slatm_dgrid1': 0.06}) +estimator = MRMP(representation_name='slatm', representation_params={'slatm_dgrid2': 0.06, 'slatm_dgrid1': 0.06}) estimator.generate_compounds(filenames[:100]) estimator.set_properties(energies[:100]) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index f2cbd2c9f..721ffe2ee 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1020,7 +1020,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat activation_function = tf.sigmoid, optimiser = tf.train.AdamOptimizer, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, rho = 0.95, initial_accumulator_value = 0.1, initial_gradient_squared_accumulator_value = 0.1, l1_regularization_strength = 0.0, l2_regularization_strength = 0.0, - tensorboard_subdir = os.getcwd() + '/tensorboard', representation='unsorted_coulomb_matrix', representation_params=None): + tensorboard_subdir = os.getcwd() + '/tensorboard', representation_name='unsorted_coulomb_matrix', representation_params=None): """ Descriptors is used as input to a single or multi layered feed-forward neural network with a single output. This class inherits from the _NN class and all inputs not unique to the NN class is passed to the _NN @@ -1034,7 +1034,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat rho, initial_accumulator_value, initial_gradient_squared_accumulator_value, l1_regularization_strength,l2_regularization_strength, tensorboard_subdir) - self._initialise_representation(representation, representation_params) + self._initialise_representation(representation_name, representation_params) def _initialise_representation(self, representation, parameters): """ @@ -1557,7 +1557,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat activation_function = tf.sigmoid, optimiser = tf.train.AdamOptimizer, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, rho = 0.95, initial_accumulator_value = 0.1, initial_gradient_squared_accumulator_value = 0.1, l1_regularization_strength = 0.0, l2_regularization_strength = 0.0, - tensorboard_subdir = os.getcwd() + '/tensorboard', representation='acsf', representation_params=None): + tensorboard_subdir = os.getcwd() + '/tensorboard', representation_name='acsf', representation_params=None): """ To see what parameters are required, look at the description of the _NN class init. This class inherits from the _NN class and all inputs not unique to the ARMP class are passed to the _NN @@ -1570,7 +1570,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat rho, initial_accumulator_value, initial_gradient_squared_accumulator_value, l1_regularization_strength,l2_regularization_strength, tensorboard_subdir) - self._initialise_representation(representation, representation_params) + self._initialise_representation(representation_name, representation_params) def _initialise_representation(self, representation, parameters): """ diff --git a/test/test_armp.py b/test/test_armp.py index 47b12998c..d594ce5fe 100644 --- a/test/test_armp.py +++ b/test/test_armp.py @@ -37,19 +37,19 @@ def test_set_representation(): This function tests the function _set_representation. 
""" try: - ARMP(representation='slatm', representation_params={'slatm_sigma12': 0.05}) + ARMP(representation_name='slatm', representation_params={'slatm_sigma12': 0.05}) raise Exception except InputError: pass try: - ARMP(representation='coulomb_matrix') + ARMP(representation_name='coulomb_matrix') raise Exception except InputError: pass try: - ARMP(representation='slatm', representation_params={'slatm_alchemy': 0.05}) + ARMP(representation_name='slatm', representation_params={'slatm_alchemy': 0.05}) raise Exception except InputError: pass @@ -57,7 +57,7 @@ def test_set_representation(): parameters = {'slatm_sigma1': 0.07, 'slatm_sigma2': 0.04, 'slatm_dgrid1': 0.02, 'slatm_dgrid2': 0.06, 'slatm_rcut': 5.0, 'slatm_rpower': 7, 'slatm_alchemy': True} - estimator = ARMP(representation='slatm', representation_params=parameters) + estimator = ARMP(representation_name='slatm', representation_params=parameters) assert estimator.representation_name == 'slatm' assert estimator.slatm_parameters == parameters @@ -72,7 +72,7 @@ def test_set_properties(): energies = np.loadtxt(test_dir + '/CN_isobutane/prop_kjmol_training.txt', usecols=[1]) - estimator = ARMP(representation='slatm') + estimator = ARMP(representation_name='slatm') assert estimator.properties == None @@ -120,7 +120,7 @@ def test_fit_1(): usecols=[1]) filenames.sort() - estimator = ARMP(representation="acsf") + estimator = ARMP(representation_name="acsf") estimator.generate_compounds(filenames[:50]) estimator.set_properties(energies[:50]) estimator.generate_representation() @@ -214,8 +214,8 @@ def test_predict_fromxyz(): ene_true = np.array([0.5, 0.9, 1.0]) - estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), "theta_s": np.arange(0, 3.14, 3)}) estimator.set_properties(ene_true) @@ -232,8 +232,8 @@ def test_predict_fromxyz(): estimator.save_nn(save_dir="temp") - new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), "theta_s": np.arange(0, 3.14, 3)}) new_estimator.load_nn(save_dir="temp") @@ -259,7 +259,7 @@ def test_retraining(): ene_true = np.array([0.5, 0.9, 1.0]) - estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), "theta_s": np.arange(0, 3.14, 3)}) @@ -279,8 +279,8 @@ def test_retraining(): pred2 = estimator.predict(idx) - new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params={"radial_rs": 
np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), "theta_s": np.arange(0, 3.14, 3)}) new_estimator.set_properties(ene_true) new_estimator.generate_representation(xyz, zs) diff --git a/test/test_mrmp.py b/test/test_mrmp.py index ffe32e585..62dd1742d 100644 --- a/test/test_mrmp.py +++ b/test/test_mrmp.py @@ -37,19 +37,19 @@ def test_set_representation(): This function tests the method MRMP._set_representation. """ try: - MRMP(representation='unsorted_coulomb_matrix', representation_params={'slatm_sigma1': 0.05}) + MRMP(representation_name='unsorted_coulomb_matrix', representation_params={'slatm_sigma1': 0.05}) raise Exception except InputError: pass try: - MRMP(representation='coulomb_matrix') + MRMP(representation_name='coulomb_matrix') raise Exception except InputError: pass try: - MRMP(representation='slatm', representation_params={'slatm_alchemy': 0.05}) + MRMP(representation_name='slatm', representation_params={'slatm_alchemy': 0.05}) raise Exception except InputError: pass @@ -57,7 +57,7 @@ def test_set_representation(): parameters ={'slatm_sigma1': 0.07, 'slatm_sigma2': 0.04, 'slatm_dgrid1': 0.02, 'slatm_dgrid2': 0.06, 'slatm_rcut': 5.0, 'slatm_rpower': 7, 'slatm_alchemy': True} - estimator = MRMP(representation='slatm', representation_params=parameters) + estimator = MRMP(representation_name='slatm', representation_params=parameters) assert estimator.representation_name == 'slatm' assert estimator.slatm_parameters == parameters @@ -71,7 +71,7 @@ def test_set_properties(): energies = np.loadtxt(test_dir + '/CN_isobutane/prop_kjmol_training.txt', usecols=[1]) - estimator = MRMP(representation='unsorted_coulomb_matrix') + estimator = MRMP(representation_name='unsorted_coulomb_matrix') assert estimator.properties == None @@ -123,7 +123,7 @@ def test_fit_1(): available_representations = ['sorted_coulomb_matrix', 'unsorted_coulomb_matrix', 'bag_of_bonds', 'slatm'] for rep in available_representations: - estimator = MRMP(representation=rep) + estimator = MRMP(representation_name=rep) estimator.generate_compounds(filenames[:100]) estimator.set_properties(energies[:100]) estimator.generate_representation() From c77272ec51cb5f7cab1801cb6e423bce552608ff Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 31 Aug 2018 13:15:18 +0100 Subject: [PATCH 11/59] Made modifications to make te symmetry functions more numerically stable --- qml/aglaia/symm_funct.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py index 805b81c03..e01ec78ce 100644 --- a/qml/aglaia/symm_funct.py +++ b/qml/aglaia/symm_funct.py @@ -49,7 +49,7 @@ def acsf_rad(xyzs, Zs, radial_cutoff, radial_rs, eta): # Calculating the distance matrix between the atoms of each sample with tf.name_scope("Distances"): dxyzs = tf.expand_dims(xyzs, axis=2) - tf.expand_dims(xyzs, axis=1) - dist_tensor = tf.cast(tf.norm(dxyzs, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) + dist_tensor = tf.cast(tf.norm(dxyzs+1.e-16, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) # Indices of terms that need to be zero (diagonal elements) mask_0 = tf.zeros(tf.shape(dist_tensor)) @@ -105,7 +105,7 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): # Finding the R_ij + R_ik term with tf.name_scope("Sum_distances"): dxyzs = tf.expand_dims(xyzs, axis=2) - tf.expand_dims(xyzs, axis=1) - dist_tensor = tf.cast(tf.norm(dxyzs, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) + dist_tensor = tf.cast(tf.norm(dxyzs+1.e-16, 
axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) # This is the tensor where element sum_dist_tensor[0,1,2,3] is the R_12 + R_13 in the 0th data sample sum_dist_tensor = tf.expand_dims(dist_tensor, axis=3) + tf.expand_dims(dist_tensor, @@ -163,7 +163,7 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): # Dividing the dot products by the magnitudes to obtain cos theta cos_theta = tf.divide(dots_dxyzs, dist_prod) # Taking care of the values that due numerical error are just above 1.0 or below -1.0 - cut_cos_theta = tf.clip_by_value(cos_theta, tf.constant(-1.0), tf.constant(1.0)) + cut_cos_theta = tf.clip_by_value(cos_theta, tf.constant(-1.0 + 1.0e-7), tf.constant(1.0 - 1.0e-7)) # Applying arc cos to find the theta value theta = tf.acos(cut_cos_theta) # (n_samples, n_atoms, n_atoms, n_atoms) # Removing the NaNs created by dividing by zero @@ -199,16 +199,18 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): # Subtracting them and do the cos cos_theta_term = tf.cos( tf.subtract(expanded_theta, expanded_theta_s)) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) - # Make the whole cos term of the sum - cos_term = tf.pow(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), - zeta) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) + # Make the whole cos term of the sum of shape (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) + cos_term = tf.pow(tf.divide(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), + tf.constant(2, dtype=tf.float32)), zeta) + # cos_term = tf.pow(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), + # zeta) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) # Final product of terms inside the sum time by 2^(1-zeta) expanded_fc = tf.expand_dims(tf.expand_dims(cleaner_fc_term, axis=-1), axis=-1, name="Expanded_fc") expanded_cos = tf.expand_dims(cos_term, axis=-2, name="Expanded_cos") expanded_exp = tf.expand_dims(exp_term, axis=-1, name="Expanded_exp") - const = tf.pow(tf.constant(2.0, dtype=tf.float32), (1.0 - zeta)) + const = tf.constant(2.0, dtype=tf.float32) with tf.name_scope("Ang_term"): prod_of_terms = const * tf.multiply(tf.multiply(expanded_cos, expanded_exp), From 3db69a3212abd8679f79111c755f4b4f44fa839d Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 5 Sep 2018 16:51:53 +0100 Subject: [PATCH 12/59] Added a hack that makes ARMP work with fortran ACSF when there are padded representations. Currently works *ONLY* when there is one molecule for the whole data set. --- qml/aglaia/aglaia.py | 106 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 11 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 721ffe2ee..6de7b5573 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -37,6 +37,7 @@ is_positive_integer_or_zero_array, check_local_representation from qml.aglaia.tf_utils import TensorBoardLogger +from qml.representations import generate_acsf try: from qml.data import Compound @@ -674,7 +675,7 @@ def generate_compounds(self, filenames): for i, filename in enumerate(filenames): self.compounds[i] = Compound(filename) - def generate_representation(self, xyz=None, classes=None): + def generate_representation(self, xyz=None, classes=None, method='fortran'): """ This function can generate representations either from the data contained in the compounds or from xyz data passed through the argument. 
If the Compounds have already being set and xyz data is given, it complains. @@ -695,7 +696,7 @@ def generate_representation(self, xyz=None, classes=None): if is_none(self.compounds): - self.representation, self.classes = self._generate_representations_from_data(xyz, classes) + self.representation, self.classes = self._generate_representations_from_data(xyz, classes, method) elif is_none(xyz): # Make representations from compounds @@ -1080,7 +1081,7 @@ def _set_representation(self, representation): self.representation = representation - def _generate_representations_from_data(self, xyz, classes): + def _generate_representations_from_data(self, xyz, classes, method): """ This function makes the representation from xyz data and nuclear charges. @@ -1088,6 +1089,8 @@ def _generate_representations_from_data(self, xyz, classes): :type xyz: numpy array of shape (n_samples, n_atoms, 3) :param classes: classes for atomic decomposition :type classes: None + :param method: What method to use + :type method: string :return: numpy array of shape (n_samples, n_features) and None """ # TODO implement @@ -1615,7 +1618,7 @@ def _set_representation(self, representation): self.representation = representation - def _generate_representations_from_data(self, xyz, classes): + def _generate_representations_from_data(self, xyz, classes, method): """ This function generates the representations from xyz data @@ -1623,10 +1626,15 @@ def _generate_representations_from_data(self, xyz, classes): :type xyz: numpy array of shape (n_samples, n_atoms, 3) :param classes: classes to use for atomic decomposition :type classes: numpy array of shape (n_samples, n_atoms) + :param method: What method to use + :type method: string :return: representations and classes :rtype: numpy arrays of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) """ + if method not in ['fortran', 'tf']: + raise InputError("The method to generate the acsf can only be 'fortran' or 'tf'. Got %s." % (method)) + if is_none(classes): raise InputError("The classes need to be provided for the ARMP estimator.") else: @@ -1641,12 +1649,14 @@ def _generate_representations_from_data(self, xyz, classes): raise InputError("Slatm from data has not been implemented yet. Use Compounds.") elif self.representation_name == 'acsf': - - representation = self._generate_acsf_from_data(xyz, classes) + if method == 'tf': + representation = self._generate_acsf_from_data_tf(xyz, classes) + else: + representation = self._generate_acsf_from_data_fortran(xyz, classes) return representation, classes - def _generate_acsf_from_data(self, xyz, classes): + def _generate_acsf_from_data_tf(self, xyz, classes): """ This function generates the acsf from the cartesian coordinates and the classes. 
@@ -1658,7 +1668,13 @@ def _generate_acsf_from_data(self, xyz, classes): :rtype: numpy array of shape (n_samples, n_atoms, n_features) """ - elements, element_pairs = self._get_elements_and_pairs(classes) + if 0 in classes: + idx_zeros = np.where(classes == 0)[1] + classes_for_elements = classes[:, :idx_zeros[0]] + else: + classes_for_elements = classes + + elements, element_pairs = self._get_elements_and_pairs(classes_for_elements) if self.tensorboard: self.tensorboard_logger_representation.initialise() @@ -1720,6 +1736,62 @@ def _generate_acsf_from_data(self, xyz, classes): return representation_conc + # TODO Make tensorflow and fortran consistent + def _generate_acsf_from_data_fortran(self, xyz, classes): + """ + This function uses fortran to generate the representation and the derivative of the representation with respect + to the cartesian coordinates. + :param xyz: cartesian coordinates + :type xyz: numpy array of shape (n_samples, n_atoms, 3) + :param classes: the atom types + :type classes: numpy array of shape (n_samples, n_atoms) + :return: representations and their derivatives wrt to xyz + :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms, n_features, n_atoms, 3) + """ + + initial_natoms = xyz.shape[1] + + if 0 in classes: + idx_zeros = np.where(classes == 0)[1] + xyz = xyz[:, :idx_zeros[0], :] + classes = classes[:, :idx_zeros[0]] + + unpadded_natoms = xyz.shape[1] + + elements, element_paris = self._get_elements_and_pairs(classes) + + rcut = self.acsf_parameters['radial_cutoff'] + acut = self.acsf_parameters['angular_cutoff'] + nRs2 = len(self.acsf_parameters['radial_rs']) + nRs3 = len(self.acsf_parameters['angular_rs']) + nTs = len(self.acsf_parameters['theta_s']) + eta2 = self.acsf_parameters['eta'] + eta3 = eta2 + zeta = self.acsf_parameters['zeta'] + + + representation = [] + + for i in range(xyz.shape[0]): + g = generate_acsf(coordinates=xyz[i], elements=elements, gradients=False, nuclear_charges=classes[i], + rcut=rcut, + acut=acut, + nRs2=nRs2, + nRs3=nRs3, + nTs=nTs, + eta2=eta2, + eta3=eta3, + zeta=zeta) + + if initial_natoms - unpadded_natoms != 0: + padded_g = np.zeros((initial_natoms, g.shape[-1])) + padded_g[:unpadded_natoms, :] = g + representation.append(padded_g) + else: + representation.append(g) + + return np.asarray(representation) + def _generate_representations_from_compounds(self): """ This function generates the representations from the compounds. 
@@ -2213,8 +2285,14 @@ def _fit_from_scratch(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) - # Obtaining the array of unique elements in all samples - self.elements, self.element_pairs = self._get_elements_and_pairs(classes_approved) + # Obtaining the array of unique elements in all samples (excluding dummy atoms 0) + if 0 in classes_approved: + idx_zeros = np.where(classes_approved == 0)[1] + classes_for_elements = classes_approved[:, :idx_zeros[0]] + else: + classes_for_elements = classes_approved + + self.elements, self.element_pairs = self._get_elements_and_pairs(classes_for_elements) if self.tensorboard: self.tensorboard_logger_training.initialise() @@ -2336,7 +2414,13 @@ def _fit_from_loaded(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) - self.elements = self._find_elements(classes_approved) + + if 0 in classes: + idx_zeros = np.where(classes == 0)[1] + classes_for_elements = classes_approved[:, :idx_zeros[0]] + else: + classes_for_elements = classes_approved + self.elements = self._find_elements(classes_for_elements) if self.tensorboard: self.tensorboard_logger_training.initialise() From acf0ee7b27ecfc6a437e3d620a2fc94722333c2c Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 5 Sep 2018 17:18:29 +0100 Subject: [PATCH 13/59] corrected bug in score function for padded molecules --- qml/aglaia/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qml/aglaia/utils.py b/qml/aglaia/utils.py index 87beac4d7..4e7b19d11 100644 --- a/qml/aglaia/utils.py +++ b/qml/aglaia/utils.py @@ -233,7 +233,7 @@ def check_classes(classes): if not is_array_like(classes): raise InputError("classes should be array like.") - if not is_positive_integer_array(classes): + if not is_positive_integer_or_zero_array(classes): raise InputError("classes should be an array of ints.") classes = np.asarray(classes) From 49b36c2124e12e65e69f2d0ddbeb8591aa305f58 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Thu, 6 Sep 2018 11:17:55 +0100 Subject: [PATCH 14/59] Changes that make the model work quickly even when there is padding. --- qml/aglaia/aglaia.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 6de7b5573..8b4a06b75 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2011,26 +2011,26 @@ def _model(self, x, zs, element_weights, element_biases): :rtype: tf tensor of shape (n_samples, 1) """ - atomic_energies = tf.zeros_like(zs, dtype=tf.float32) + all_atomic_energies = tf.zeros_like(zs, dtype=tf.float32) - for i in range(self.elements.shape[0]): + for el in self.elements: + # Obtaining the indices of where in Zs there is the current element + current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0) + where_element = tf.cast(tf.where(tf.equal(zs, current_element)), dtype=tf.int32) - # Calculating the output for every atom in all data as if they were all of the same element - atomic_energies_all = self._atomic_model(x, self.hidden_layer_sizes, element_weights[self.elements[i]], - element_biases[self.elements[i]]) # (n_samples, n_atoms) + # Extract the descriptor corresponding to the right element + current_element_in_x = tf.gather_nd(x, where_element) - # Figuring out which atomic energies correspond to the current element. 
- current_element = tf.expand_dims(tf.constant(self.elements[i], dtype=tf.int32), axis=0) - where_element = tf.equal(tf.cast(zs, dtype=tf.int32), current_element) # (n_samples, n_atoms) + # Calculate the atomic energy of all the atoms of type equal to the current element + atomic_ene = self._atomic_model(current_element_in_x, self.hidden_layer_sizes, element_weights[el], + element_biases[el]) - # Extracting the energies corresponding to the right element - element_energies = tf.where(where_element, atomic_energies_all, tf.zeros_like(zs, dtype=tf.float32)) - - # Adding the energies of the current element to the final atomic energies tensor - atomic_energies = tf.add(atomic_energies, element_energies) + # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies + updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(zs)) + all_atomic_energies = tf.add(all_atomic_energies, updates) # Summing the energies of all the atoms - total_energies = tf.reduce_sum(atomic_energies, axis=-1, name="output", keepdims=True) + total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="output", keepdims=True) return total_energies @@ -2313,7 +2313,7 @@ def _fit_from_scratch(self, x, y, dy, classes): # Initial set up of the NN with tf.name_scope("Data"): x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features], name="Descriptors") - zs_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms], name="Atomic-numbers") + zs_ph = tf.placeholder(dtype=tf.int32, shape=[None, self.n_atoms], name="Atomic-numbers") y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") buffer_tf = tf.placeholder(dtype=tf.int64, name="buffer") From 75aac6745b6d3f6d465c9a4021f56c2272d37bde Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Thu, 6 Sep 2018 12:04:20 +0100 Subject: [PATCH 15/59] Fixed discrepancies between fortran and TF acsf --- qml/aglaia/aglaia.py | 143 ++++++++++++++------------------------- qml/aglaia/symm_funct.py | 26 ++++--- test/test_armp.py | 10 +-- 3 files changed, 70 insertions(+), 109 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 8b4a06b75..89f05476c 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -598,8 +598,8 @@ def _set_acsf_parameters(self, params): :return: None """ - self.acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2), - 'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0} + self.acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, + 'zeta': 220.127, 'eta': 30.8065} if not is_none(params): for key, value in params.items(): @@ -824,29 +824,25 @@ def _check_acsf_values(self): :return: None """ - if not is_positive(self.acsf_parameters['radial_cutoff']): - raise InputError("Expected positive float for variable 'radial_cutoff'. Got %s." % str(self.acsf_parameters['radial_cutoff'])) + if not is_positive(self.acsf_parameters['rcut']): + raise InputError( + "Expected positive float for variable 'rcut'. Got %s." % str(self.acsf_parameters['rcut'])) - if not is_positive(self.acsf_parameters['angular_cutoff']): - raise InputError("Expected positive float for variable 'angular_cutoff'. Got %s." % str(self.acsf_parameters['angular_cutoff'])) + if not is_positive(self.acsf_parameters['acut']): + raise InputError( + "Expected positive float for variable 'acut'. Got %s." 
% str(self.acsf_parameters['acut']))
 
-        if not is_numeric_array(self.acsf_parameters['radial_rs']):
-            raise InputError("Expecting an array like radial_rs. Got %s." % (self.acsf_parameters['radial_rs']) )
-        if not len(self.acsf_parameters['radial_rs'])>0:
-            raise InputError("No radial_rs values were given." )
+        if not is_positive_integer(self.acsf_parameters['nRs2']):
+            raise InputError("Expected positive integer for 'nRs2'. Got %s." % (self.acsf_parameters['nRs2']))
 
-        if not is_numeric_array(self.acsf_parameters['angular_rs']):
-            raise InputError("Expecting an array like angular_rs. Got %s." % (self.acsf_parameters['angular_rs']) )
-        if not len(self.acsf_parameters['angular_rs'])>0:
-            raise InputError("No angular_rs values were given." )
+        if not is_positive_integer(self.acsf_parameters['nRs3']):
+            raise InputError("Expected positive integer for 'nRs3'. Got %s." % (self.acsf_parameters['nRs3']))
 
-        if not is_numeric_array(self.acsf_parameters['theta_s']):
-            raise InputError("Expecting an array like theta_s. Got %s." % (self.acsf_parameters['theta_s']) )
-        if not len(self.acsf_parameters['theta_s'])>0:
-            raise InputError("No theta_s values were given. " )
+        if not is_positive_integer(self.acsf_parameters['nTs']):
+            raise InputError("Expected positive integer for 'nTs'. Got %s." % (self.acsf_parameters['nTs']))
 
-        if is_numeric_array(self.acsf_parameters['eta']):
-            raise InputError("Expecting a scalar value for eta. Got %s." % (self.acsf_parameters['eta']))
+        if is_numeric_array(self.acsf_parameters['eta']):
+            raise InputError("Expecting a scalar value for the eta parameter. Got %s." % (self.acsf_parameters['eta']))
 
         if is_numeric_array(self.acsf_parameters['zeta']):
             raise InputError("Expecting a scalar value for zeta. Got %s." % (self.acsf_parameters['zeta']))
@@ -1693,11 +1689,11 @@ def _generate_acsf_from_data_tf(self, xyz, classes):
             batch_xyz, batch_zs = iterator.get_next()
             representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs,
-                                                    radial_cutoff=self.acsf_parameters['radial_cutoff'],
-                                                    angular_cutoff=self.acsf_parameters['angular_cutoff'],
-                                                    radial_rs=self.acsf_parameters['radial_rs'],
-                                                    angular_rs=self.acsf_parameters['angular_rs'],
-                                                    theta_s=self.acsf_parameters['theta_s'], eta=self.acsf_parameters['eta'],
+                                                    rcut=self.acsf_parameters['rcut'],
+                                                    acut=self.acsf_parameters['acut'],
+                                                    nRs2=self.acsf_parameters['nRs2'],
+                                                    nRs3=self.acsf_parameters['nRs3'],
+                                                    nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'],
                                                     zeta=self.acsf_parameters['zeta'])
 
         sess = tf.Session()
@@ -1758,30 +1754,20 @@ def _generate_acsf_from_data_fortran(self, xyz, classes):
 
         unpadded_natoms = xyz.shape[1]
 
-        elements, element_paris = self._get_elements_and_pairs(classes)
-
-        rcut = self.acsf_parameters['radial_cutoff']
-        acut = self.acsf_parameters['angular_cutoff']
-        nRs2 = len(self.acsf_parameters['radial_rs'])
-        nRs3 = len(self.acsf_parameters['angular_rs'])
-        nTs = len(self.acsf_parameters['theta_s'])
-        eta2 = self.acsf_parameters['eta']
-        eta3 = eta2
-        zeta = self.acsf_parameters['zeta']
-
+        elements, _ = self._get_elements_and_pairs(classes)
 
         representation = []
 
         for i in range(xyz.shape[0]):
             g = generate_acsf(coordinates=xyz[i], elements=elements, gradients=False, nuclear_charges=classes[i],
-                              rcut=rcut,
-                              acut=acut,
-                              nRs2=nRs2,
-                              nRs3=nRs3,
-                              nTs=nTs,
-                              eta2=eta2,
-                              eta3=eta3,
-                              zeta=zeta)
+                              rcut=self.acsf_parameters['rcut'],
+                              acut=self.acsf_parameters['acut'],
+                              nRs2=self.acsf_parameters['nRs2'],
+                              nRs3=self.acsf_parameters['nRs3'],
+
nTs=self.acsf_parameters['nTs'], + eta2=self.acsf_parameters['eta'], + eta3=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) if initial_natoms - unpadded_natoms != 0: padded_g = np.zeros((initial_natoms, g.shape[-1])) @@ -1823,25 +1809,6 @@ def _generate_acsf_from_compounds(self): :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) """ - # Obtaining the total elements and the element pairs - mbtypes = qml_rep.get_slatm_mbtypes([mol.nuclear_charges for mol in self.compounds]) - - elements = [] - element_pairs = [] - - # Splitting the one and two body interactions in mbtypes - for item in mbtypes: - if len(item) == 1: - elements.append(item[0]) - if len(item) == 2: - element_pairs.append(list(item)) - if len(item) == 3: - break - - # Need the element pairs in descending order for TF - for item in element_pairs: - item.reverse() - # Obtaining the xyz and the nuclear charges xyzs = [] zs = [] @@ -1853,6 +1820,8 @@ def _generate_acsf_from_compounds(self): if len(compound.nuclear_charges) > max_n_atoms: max_n_atoms = len(compound.nuclear_charges) + elements, element_pairs = self._get_elements_and_pairs(zs) + # Padding so that all the samples have the same shape n_samples = len(zs) for i in range(n_samples): @@ -1882,12 +1851,13 @@ def _generate_acsf_from_compounds(self): batch_xyz, batch_zs = iterator.get_next() representations = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, - radial_cutoff=self.acsf_parameters['radial_cutoff'], - angular_cutoff=self.acsf_parameters['angular_cutoff'], - radial_rs=self.acsf_parameters['radial_rs'], - angular_rs=self.acsf_parameters['angular_rs'], - theta_s=self.acsf_parameters['theta_s'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], + eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -2209,24 +2179,11 @@ def _get_elements_and_pairs(self, classes): :rtype: numpy array of shape (n_elements,) and (n_element_pairs) """ - # Obtaining the total elements and the element pairs - mbtypes = qml_rep.get_slatm_mbtypes(classes) - - elements = [] + elements = np.unique(classes) element_pairs = [] - - # Splitting the one and two body interactions in mbtypes - for item in mbtypes: - if len(item) == 1: - elements.append(item[0]) - if len(item) == 2: - element_pairs.append(list(item)) - if len(item) == 3: - break - - # Need the element pairs in descending order for TF - for item in element_pairs: - item.reverse() + for i, ei in enumerate(elements): + for ej in elements[i:]: + element_pairs.append([ej, ei]) return np.asarray(elements), np.asarray(element_pairs) @@ -2415,8 +2372,8 @@ def _fit_from_loaded(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) - if 0 in classes: - idx_zeros = np.where(classes == 0)[1] + if 0 in classes_approved: + idx_zeros = np.where(classes_approved == 0)[1] classes_for_elements = classes_approved[:, :idx_zeros[0]] else: classes_for_elements = classes_approved @@ -2494,11 +2451,11 @@ def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): with tf.name_scope("Descriptor_pred"): batch_representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=self.elements, 
element_pairs=self.element_pairs,
-                                                          radial_cutoff=self.acsf_parameters['radial_cutoff'],
-                                                          angular_cutoff=self.acsf_parameters['angular_cutoff'],
-                                                          radial_rs=self.acsf_parameters['radial_rs'],
-                                                          angular_rs=self.acsf_parameters['angular_rs'],
-                                                          theta_s=self.acsf_parameters['theta_s'],
+                                                          rcut=self.acsf_parameters['rcut'],
+                                                          acut=self.acsf_parameters['acut'],
+                                                          nRs2=self.acsf_parameters['nRs2'],
+                                                          nRs3=self.acsf_parameters['nRs3'],
+                                                          nTs=self.acsf_parameters['nTs'],
                                                           eta=self.acsf_parameters['eta'],
                                                           zeta=self.acsf_parameters['zeta'])
 
diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py
index e01ec78ce..276eeedca 100644
--- a/qml/aglaia/symm_funct.py
+++ b/qml/aglaia/symm_funct.py
@@ -352,8 +352,8 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s):
 
     return clean_final_term
 
-def generate_parkhill_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cutoff,
-                           radial_rs, angular_rs, theta_s, zeta, eta):
+def generate_parkhill_acsf(xyzs, Zs, elements, element_pairs, rcut, acut,
+                           nRs2, nRs3, nTs, zeta, eta):
     """
     This function generates the atom centred symmetry function as used in the Tensormol paper. Currently only tested for
     single systems with many conformations. It requires the coordinates of all the atoms in each data sample, the atomic
@@ -364,19 +364,23 @@ def generate_parkhill_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, ang
     :param Zs: tensor of shape (n_samples, n_atoms)
     :param elements: np.array of shape (n_elements,)
     :param element_pairs: np.array of shape (n_elementpairs, 2)
-    :param radial_cutoff: scalar float
-    :param angular_cutoff: scalar float
-    :param radial_rs: np.array of shape (n_rad_rs,)
-    :param angular_rs: np.array of shape (n_ang_rs,)
-    :param theta_s: np.array of shape (n_thetas,)
+    :param rcut: scalar float
+    :param acut: scalar float
+    :param nRs2: positive integer
+    :param nRs3: positive integer
+    :param nTs: positive integer
     :param zeta: scalar float
-    :param eta: scalar float
-    :return: a tf tensor of shape (n_samples, n_atoms, n_rad_rs * n_elements + n_ang_rs * n_thetas * n_elementpairs)
+    :param eta: scalar float
+    :return: a tf tensor of shape (n_samples, n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs)
     """
 
+    radial_rs = np.linspace(0, rcut, nRs2)
+    angular_rs = np.linspace(0, acut, nRs3)
+    theta_s = np.linspace(0, np.pi, nTs)
+
     with tf.name_scope("acsf_params"):
-        rad_cutoff = tf.constant(radial_cutoff, dtype=tf.float32)
-        ang_cutoff = tf.constant(angular_cutoff, dtype=tf.float32)
+        rad_cutoff = tf.constant(rcut, dtype=tf.float32)
+        ang_cutoff = tf.constant(acut, dtype=tf.float32)
         rad_rs = tf.constant(radial_rs, dtype=tf.float32)
         ang_rs = tf.constant(angular_rs, dtype=tf.float32)
         theta_s = tf.constant(theta_s, dtype=tf.float32)
diff --git a/test/test_armp.py b/test/test_armp.py
index d594ce5fe..b432ba28a 100644
--- a/test/test_armp.py
+++ b/test/test_armp.py
@@ -228,7 +228,7 @@ def test_predict_fromxyz():
     pred1 = estimator.predict(idx)
     pred2 = estimator.predict_from_xyz(xyz, zs)
 
-    assert np.all(pred1 == pred2)
+    assert np.all(np.isclose(pred1, pred2, rtol=1.e-6))
 
     estimator.save_nn(save_dir="temp")
 
@@ -244,8 +244,8 @@ def test_predict_fromxyz():
     pred3 = new_estimator.predict(idx)
     pred4 = new_estimator.predict_from_xyz(xyz, zs)
 
-    assert np.all(pred3 == pred4)
-    assert np.all(pred1 == pred3)
+    assert np.all(np.isclose(pred3, pred4, rtol=1.e-6))
+    assert np.all(np.isclose(pred1, pred3, rtol=1.e-6))
 
     shutil.rmtree("temp")
 
@@ -293,8 +293,8 @@ def test_retraining():
 
     pred4 = 
new_estimator.predict(idx) - assert np.all(pred1 == pred3) - assert np.all(pred2 == pred4) + assert np.all(np.isclose(pred1, pred3, rtol=1.e-6)) + assert np.all(np.isclose(pred2, pred4, rtol=1.e-6)) shutil.rmtree("temp") From 1f694f89d4db58f6d2b8d8daaf34119ca6e7351a Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 7 Sep 2018 16:12:57 +0100 Subject: [PATCH 16/59] Corrected bug in setting of ACSF parameters --- qml/aglaia/aglaia.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 89f05476c..686cea856 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1747,8 +1747,10 @@ def _generate_acsf_from_data_fortran(self, xyz, classes): initial_natoms = xyz.shape[1] + # Hack to make fortran ACSF deal with dummy atoms (only if all the molecules are the same and the padding is at the end) if 0 in classes: idx_zeros = np.where(classes == 0)[1] + xyz = xyz[:, :idx_zeros[0], :] classes = classes[:, :idx_zeros[0]] @@ -2161,8 +2163,8 @@ def _check_representation_parameters(self, parameters): elif self.representation_name == "acsf": - acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2), - 'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0} + acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, + 'zeta': 220.127, 'eta': 30.8065} for key, value in parameters.items(): try: @@ -2242,12 +2244,8 @@ def _fit_from_scratch(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) - # Obtaining the array of unique elements in all samples (excluding dummy atoms 0) - if 0 in classes_approved: - idx_zeros = np.where(classes_approved == 0)[1] - classes_for_elements = classes_approved[:, :idx_zeros[0]] - else: - classes_for_elements = classes_approved + # Putting a mask on all the 0 values + classes_for_elements = np.ma.masked_equal(classes_approved, 0).compressed() self.elements, self.element_pairs = self._get_elements_and_pairs(classes_for_elements) From e5c478799f3da30c2d0f7712b6a32502ff3e8f44 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Tue, 11 Sep 2018 17:16:39 +0100 Subject: [PATCH 17/59] Attempt at fixing issue #10 --- qml/aglaia/aglaia.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 686cea856..a5c6d107b 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2401,9 +2401,6 @@ def _fit_from_loaded(self, x, y, dy, classes): optimisation_op = graph.get_operation_by_name("optimisation_op") dataset_init_op = graph.get_operation_by_name("dataset_init") - if self.tensorboard: - cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) - for i in range(self.iterations): if i % 2 == 0: From fef6fba8931ea651928cee4bdde04ba7447ff726 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Tue, 11 Sep 2018 17:48:40 +0100 Subject: [PATCH 18/59] another attempt at fixing #10 --- qml/aglaia/aglaia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index a5c6d107b..5488de434 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2310,6 +2310,8 @@ def _fit_from_scratch(self, x, y, dy, classes): init = tf.global_variables_initializer() iterator_init = iterator.make_initializer(dataset, name="dataset_init") + self._build_model_from_xyz(self.n_atoms, element_weights, element_biases) + self.session = tf.Session() # 
Running the graph @@ -2349,8 +2351,6 @@ def _fit_from_scratch(self, x, y, dy, classes): self.training_cost.append(avg_cost/n_batches) - self._build_model_from_xyz(self.n_atoms, element_weights, element_biases) - def _fit_from_loaded(self, x, y, dy, classes): """ This function carries on fitting an atomic decomposed network to the data after it has been loaded. From 61fefce10b972f1bd9059406b8f91fd5f17d9be5 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 12 Sep 2018 14:06:41 +0100 Subject: [PATCH 19/59] Removed a pointless line --- qml/aglaia/aglaia.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 5488de434..8fbdf0248 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -2395,7 +2395,6 @@ def _fit_from_loaded(self, x, y, dy, classes): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") tf_ene = graph.get_tensor_by_name("Data/Properties:0") - cost = graph.get_tensor_by_name("Cost_func/add_13:0") tf_buffer = graph.get_tensor_by_name("Data/buffer:0") optimisation_op = graph.get_operation_by_name("optimisation_op") From 4c050f71f36e8da523afc4ff70d3d1dd3fc27094 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Thu, 13 Sep 2018 13:24:58 +0100 Subject: [PATCH 20/59] set-up --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0f30de11f..b911cf568 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,8 @@ COMPILER_FLAGS = ["-O3", "-fopenmp", "-m64", "-march=native", "-fPIC", "-Wno-maybe-uninitialized", "-Wno-unused-function", "-Wno-cpp"] LINKER_FLAGS = ["-lgomp"] -MATH_LINKER_FLAGS = ["-lblas", "-llapack"] +MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] +# MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # UNCOMMENT TO FORCE LINKING TO MKL with GNU compilers: if mkl_exists(verbose=True): @@ -32,7 +33,8 @@ if sys.platform == "darwin" and all(["gnu" not in arg for arg in sys.argv]): COMPILER_FLAGS = ["-O3", "-m64", "-march=native", "-fPIC"] LINKER_FLAGS = [] - MATH_LINKER_FLAGS = ["-lblas", "-llapack"] + MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] + # MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # Intel From 3bdd92826cdc99002a56b9b6471225af96de95fa Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Thu, 13 Sep 2018 14:09:30 +0100 Subject: [PATCH 21/59] Added the graceful killer --- qml/aglaia/aglaia.py | 18 ++++++++++++++++-- qml/aglaia/graceful_killer.py | 11 +++++++++++ setup.py | 6 ++++-- test/test_armp.py | 14 ++++++-------- 4 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 qml/aglaia/graceful_killer.py diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 14f7f01fc..0d3650168 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -38,6 +38,7 @@ from qml.aglaia.tf_utils import TensorBoardLogger from qml.representations import generate_acsf +from qml.aglaia.graceful_killer import GracefulKiller try: from qml.data import Compound @@ -2323,6 +2324,9 @@ def _fit_from_scratch(self, x, y, dy, classes): self.session.run(init) + # Initialising the object that enables graceful killing of the training + killer = GracefulKiller() + for i in range(self.iterations): if i % 2 == 0: @@ -2342,9 +2346,11 @@ def _fit_from_scratch(self, x, y, dy, classes): avg_cost += c - # This seems to run the iterator.get_next() op, which gives problems with end of sequence - # Hence why I re-initialise the iterator + if killer.kill_now: + 
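+                    # A SIGINT/SIGTERM was caught by the GracefulKiller handler:
+                    # checkpoint the network so training can be resumed later,
+                    # then stop cleanly instead of losing the current weights.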
self.save_nn("emergency_save") + exit() + # This seems to run the iterator.get_next() op, which gives problems with end of sequence, hence why I re-initialise the iterator if self.tensorboard: if i % self.tensorboard_logger_training.store_frequency == 0: self.session.run(iterator_init, @@ -2403,6 +2409,9 @@ def _fit_from_loaded(self, x, y, dy, classes): optimisation_op = graph.get_operation_by_name("optimisation_op") dataset_init_op = graph.get_operation_by_name("dataset_init") + # Initialising the object that enables graceful killing of the training + killer = GracefulKiller() + for i in range(self.iterations): if i % 2 == 0: @@ -2419,6 +2428,11 @@ def _fit_from_loaded(self, x, y, dy, classes): else: self.session.run(optimisation_op) + if killer.kill_now: + self.save_nn("emergency_save") + exit() + + if self.tensorboard: if i % self.tensorboard_logger_training.store_frequency == 0: self.session.run(dataset_init_op, diff --git a/qml/aglaia/graceful_killer.py b/qml/aglaia/graceful_killer.py new file mode 100644 index 000000000..f4d4f5009 --- /dev/null +++ b/qml/aglaia/graceful_killer.py @@ -0,0 +1,11 @@ +import signal + +class GracefulKiller: + kill_now = False + + def __init__(self): + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) + + def exit_gracefully(self,signum, frame): + self.kill_now = True \ No newline at end of file diff --git a/setup.py b/setup.py index 0f30de11f..b911cf568 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,8 @@ COMPILER_FLAGS = ["-O3", "-fopenmp", "-m64", "-march=native", "-fPIC", "-Wno-maybe-uninitialized", "-Wno-unused-function", "-Wno-cpp"] LINKER_FLAGS = ["-lgomp"] -MATH_LINKER_FLAGS = ["-lblas", "-llapack"] +MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] +# MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # UNCOMMENT TO FORCE LINKING TO MKL with GNU compilers: if mkl_exists(verbose=True): @@ -32,7 +33,8 @@ if sys.platform == "darwin" and all(["gnu" not in arg for arg in sys.argv]): COMPILER_FLAGS = ["-O3", "-m64", "-march=native", "-fPIC"] LINKER_FLAGS = [] - MATH_LINKER_FLAGS = ["-lblas", "-llapack"] + MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] + # MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # Intel diff --git a/test/test_armp.py b/test/test_armp.py index 0630393b8..7cb184c0b 100644 --- a/test/test_armp.py +++ b/test/test_armp.py @@ -214,9 +214,9 @@ def test_predict_fromxyz(): ene_true = np.array([0.5, 0.9, 1.0]) + acsf_param = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), - "theta_s": np.arange(0, 3.14, 3)}) + representation_params=acsf_param) estimator.set_properties(ene_true) estimator.generate_representation(xyz, zs) @@ -233,8 +233,7 @@ def test_predict_fromxyz(): estimator.save_nn(save_dir="temp") new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), - "theta_s": np.arange(0, 3.14, 3)}) + representation_params=acsf_param) new_estimator.load_nn(save_dir="temp") @@ -259,9 +258,9 @@ def test_retraining(): ene_true = np.array([0.5, 0.9, 1.0]) + acsf_param = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} estimator = 
ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), - "theta_s": np.arange(0, 3.14, 3)}) + representation_params=acsf_param) estimator.set_properties(ene_true) estimator.generate_representation(xyz, zs) @@ -280,8 +279,7 @@ def test_retraining(): pred2 = estimator.predict(idx) new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', - representation_params={"radial_rs": np.arange(0, 10, 5), "angular_rs": np.arange(0, 10, 5), - "theta_s": np.arange(0, 3.14, 3)}) + representation_params=acsf_param) new_estimator.set_properties(ene_true) new_estimator.generate_representation(xyz, zs) From 81c9824a2f0cfb1ef8b39194faaaddf58ca91ea2 Mon Sep 17 00:00:00 2001 From: SilviaAmAm Date: Thu, 13 Sep 2018 15:01:07 +0100 Subject: [PATCH 22/59] Modifications which prevent installation from breaking on BC4 --- README.rst | 2 +- setup.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index c909e8905..6e88afed3 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ Current list of contributors: - Bing Huang (University of Basel) - David R. Glowacki (University of Bristol) - Alexandre Tkatchenko (University of Luxembourg) -- Klaus-Robert Müller (Technische Universitat Berlin/Korea University) +- Klaus-Robert Muller (Technische Universitat Berlin/Korea University) - \O. Anatole von Lilienfeld (University of Basel) 1) Citing QML: diff --git a/setup.py b/setup.py index b911cf568..c33bbfde6 100755 --- a/setup.py +++ b/setup.py @@ -21,8 +21,8 @@ COMPILER_FLAGS = ["-O3", "-fopenmp", "-m64", "-march=native", "-fPIC", "-Wno-maybe-uninitialized", "-Wno-unused-function", "-Wno-cpp"] LINKER_FLAGS = ["-lgomp"] -MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] -# MATH_LINKER_FLAGS = ["-lblas", "-llapack"] +# MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] +MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # UNCOMMENT TO FORCE LINKING TO MKL with GNU compilers: if mkl_exists(verbose=True): @@ -33,8 +33,8 @@ if sys.platform == "darwin" and all(["gnu" not in arg for arg in sys.argv]): COMPILER_FLAGS = ["-O3", "-m64", "-march=native", "-fPIC"] LINKER_FLAGS = [] - MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] - # MATH_LINKER_FLAGS = ["-lblas", "-llapack"] + # MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] + MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # Intel From 4a41f1412d3fb582409f5349e1fde6e432c8b7a9 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 14 Sep 2018 17:56:24 +0100 Subject: [PATCH 23/59] Modification to add neural networks to qmlearn --- examples/qmlearn.py | 41 +++++-- qml/qmlearn/models.py | 246 +++++++++++++++++++++++++++++++++++++++++- qml/utils/utils.py | 87 +++++++++++++++ 3 files changed, 364 insertions(+), 10 deletions(-) diff --git a/examples/qmlearn.py b/examples/qmlearn.py index 84af9e7be..60faf0893 100644 --- a/examples/qmlearn.py +++ b/examples/qmlearn.py @@ -213,6 +213,32 @@ def pipelines(): print("*** End pipelines examples ***") print() +def pipelines_2(): + + print("\n *** Begin pipelines example with Neural Network ***") + + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + 
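+        # AtomScaler removes the part of the energies that is linear in the
+        # atomic composition, so the network only has to fit the residual.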
qmlearn.representations.CoulombMatrix(), + qmlearn.models.MolecularNeuralNetwork(iterations=100000, batch_size=50, learning_rate=0.005), + ) + + indices = np.arange(1000) + # np.random.shuffle(indices) + + model.fit(indices[:100]) + + scores = model.score(indices[:100]) + print("Negative MAE:", scores) + + print("*** End pipelines example with Neural Network *** \n") + + def cross_validation(): """ Doing cross validation with qmlearn @@ -278,10 +304,11 @@ def cross_validation(): print("*** End CV examples ***") if __name__ == '__main__': - data() - preprocessing() - representations() - kernels() - models() - pipelines() - cross_validation() + # data() + # preprocessing() + # representations() + # kernels() + # models() + # pipelines() + # cross_validation() + pipelines_2() diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index f53c02d1a..d4b8d719f 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -23,10 +23,11 @@ from __future__ import division, absolute_import, print_function import numpy as np +import tensorflow as tf from sklearn.base import BaseEstimator from sklearn.metrics import mean_absolute_error -from ..utils import is_numeric_array +from ..utils import * from .data import Data from ..math import cho_solve @@ -92,8 +93,8 @@ class KernelRidgeRegression(_BaseModel): def __init__(self, l2_reg=1e-10, scoring='neg_mae'): """ - :param llambda: l2 regularization - :type llambda: float + :param l2_reg: l2 regularization + :type l2_reg: float :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae') :type scoring: string """ @@ -159,3 +160,242 @@ def predict(self, X): raise SystemExit return np.dot(K, self.alpha) + +class MolecularNeuralNetwork(_BaseModel): + """ + Feed forward neural network that takes molecular representations and molecular properties. + """ + + def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0.001, iterations=500, l1_reg=0.0, + l2_reg=0.0, scoring="neg_mae"): + """ + + :param hl1: number of nodes in the 1st hidden layer + :param hl2: number of nodes in the 2nd hidden layer + :param hl3: number of nodes in the 3rd hidden layer + :param hl4: number of nodes in the 4th hidden layer + :param batch_size: Number of samples to have in each batch during training + :param learning_rate: step size in optimisation algorithm + :param iterations: number of iterations to do during training + :param l1_reg: L1 regularisation parameter + :param l2_reg: L2 regularisation parameter + :param scoring: What function to use for scoring + """ + + self.hl1, self.hl2, self.hl3, self.hl4 = check_hl(hl1, hl2, hl3, hl4) + self.batch_size = check_batchsize(batch_size) + self.learning_rate = check_learningrate(learning_rate) + self.iterations = check_iterations(iterations) + self.l1_reg, self.l2_reg = check_reg(l1_reg, l2_reg) + self.scoring = check_scoring(scoring) + + def fit(self, X, y=None): + """ + Fit a feedforward neural network. 
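+
+        A minimal sketch of the expected inputs (hypothetical file glob; in a
+        scikit-learn pipeline the preceding representation step fills in
+        X._representations and X._indices before this method runs):
+
+            data = qmlearn.Data("molecules/*.xyz")      # hypothetical path
+            data.set_energies(energies)                 # shape (n_samples,)
+            model = MolecularNeuralNetwork(iterations=500, batch_size=50)
+            model.fit(data)   # uses data._representations and data.energies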
+ + :param X: Data object + :param y: Energies + """ + + # obtaining the representations and the energies from the data object + if isinstance(X, Data): + g, ene = X._representations, X.energies[X._indices] + ene = np.reshape(ene, (ene.shape[0], 1)) + + + # Generating the model + self._generate_model(g.shape[1]) + + # Training the model + self._train(g, ene) + + def predict(self, X): + + if isinstance(X, Data): + g = X._representations + + y_pred = self._predict(g) + + return y_pred + + def _generate_model(self, n_features): + """ + This function generates the tensorflow graph. + """ + + tf.reset_default_graph() + + hidden_layers = [] + for item in [self.hl1, self.hl2, self.hl3, self.hl4]: + if item != 0: + hidden_layers.append(item) + + hidden_layers = tuple(hidden_layers) + + # Initial set up of the NN + with tf.name_scope("Data"): + ph_x = tf.placeholder(tf.float32, [None, n_features], name="Representation") + ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies") + batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size") + buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer") + + dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y)) + dataset = dataset.shuffle(buffer_size=buffer_tf) + dataset = dataset.batch(batch_size_tf) + + iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) + tf_x, tf_y = iterator.get_next() + + with tf.name_scope("Weights"): + weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) + + with tf.name_scope("Model"): + z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0]) + h = tf.sigmoid(z) + + # Calculate the activation of the remaining hidden layers + for i in range(1, len(weights) - 1): + z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i]) + h = tf.sigmoid(z) + + # Calculating the output of the last layer + y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies") + + with tf.name_scope("Cost_func"): + cost = self._cost(y_pred, tf_y, weights) + + with tf.name_scope("Optimiser"): + optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost) + + iter_init_op = iterator.make_initializer(dataset, name="dataset_init") + + def _train(self, representations, energies): + + graph = tf.get_default_graph() + + with graph.as_default(): + tf_x = graph.get_tensor_by_name("Data/Representation:0") + tf_y = graph.get_tensor_by_name("Data/True_energies:0") + batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0") + buffer_tf = graph.get_tensor_by_name("Data/Buffer:0") + + optimisation_op = graph.get_operation_by_name("Optimiser/Adam") + iter_init_op = graph.get_operation_by_name("dataset_init") + + batch_size = get_batch_size(self.batch_size, representations.shape[0]) + + self.session = tf.Session(graph=graph) + + self.session.run(tf.global_variables_initializer()) + + for i in range(self.iterations): + + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff, + batch_size_tf:batch_size}) + + while True: + try: + self.session.run(optimisation_op) + except tf.errors.OutOfRangeError: + break + + def _cost(self, y_pred, y, weights): + """ + Constructs the cost function + + :param y_pred: Predicted molecular properties + :type y_pred: tf.Variable of size (n_samples, 1) + :param y: True molecular properties + :type y: tf.placeholder of shape (n_samples, 1) + :param weights: 
Weights used in the network.
+        :type weights: list of tf.Variables of length hidden_layer_sizes.size + 1
+        :return: Cost
+        :rtype: tf.Variable of size (1,)
+        """
+
+        err = tf.square(tf.subtract(y,y_pred))
+        loss = tf.reduce_mean(err, name="loss")
+        cost = loss
+        if self.l2_reg > 0:
+            l2_loss = self._l2_loss(weights)
+            cost = cost + l2_loss
+        if self.l1_reg > 0:
+            l1_loss = self._l1_loss(weights)
+            cost = cost + l1_loss
+
+        return cost
+
+    def _l2_loss(self, weights):
+        """
+        Creates the expression for L2-regularisation on the weights
+
+        :param weights: tensorflow tensors representing the weights
+        :type weights: list of tf tensors
+        :return: tensorflow scalar representing the regularisation contribution to the cost function
+        :rtype: tf.float32
+        """
+
+        reg_term = tf.zeros([], name="l2_loss")
+
+        for i in range(len(weights)):
+            reg_term += tf.reduce_sum(tf.square(weights[i]))
+
+        return self.l2_reg * reg_term
+
+    def _l1_loss(self, weights):
+        """
+        Creates the expression for L1-regularisation on the weights
+
+        :param weights: tensorflow tensors representing the weights
+        :type weights: list of tf tensors
+        :return: tensorflow scalar representing the regularisation contribution to the cost function
+        :rtype: tf.float32
+        """
+
+        reg_term = tf.zeros([], name="l1_loss")
+
+        for i in range(len(weights)):
+            reg_term += tf.reduce_sum(tf.abs(weights[i]))
+
+        return self.l1_reg * reg_term
+
+    def _predict(self, representation):
+
+        graph = tf.get_default_graph()
+
+        batch_size = get_batch_size(self.batch_size, representation.shape[0])
+
+        with graph.as_default():
+            tf_x = graph.get_tensor_by_name("Data/Representation:0")
+            tf_y = graph.get_tensor_by_name("Data/True_energies:0")
+            batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0")
+            buffer_tf = graph.get_tensor_by_name("Data/Buffer:0")
+            iter_init_op = graph.get_operation_by_name("dataset_init")
+
+            y_pred = graph.get_tensor_by_name("Model/Predicted_energies:0")
+
+        self.session.run(iter_init_op,
+                         feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
+                                    batch_size_tf: batch_size, buffer_tf:1})
+
+        tot_y_pred = []
+
+        while True:
+            try:
+                ene_predictions = self.session.run(y_pred)
+                tot_y_pred.append(ene_predictions)
+            except tf.errors.OutOfRangeError:
+                break
+
+        return np.concatenate(tot_y_pred, axis=0).ravel()
+
+
+
+
+
+
diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index 969f6c44b..e7e4dc493 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -22,6 +22,7 @@
 
 
 import numpy as np
+import tensorflow as tf
 
 def is_positive(x):
     return (not is_array_like(x) and _is_numeric(x) and x > 0)
@@ -259,6 +260,92 @@ def check_classes(classes):
 
     return approved_classes
 
+def check_hl(hl1, hl2, hl3, hl4):
+
+    layers = [hl1, hl2, hl3, hl4]
+    approved_layers = []
+
+    for item in layers:
+        # a size of 0 is allowed, since it marks an unused layer
+        if not is_positive_integer_or_zero(item):
+            raise InputError("Expected positive integer or zero for hidden layer sizes. Got %s" % str(item))
+        approved_layers.append(int(item))
+
+    return approved_layers[0], approved_layers[1], approved_layers[2], approved_layers[3]
+
+def check_batchsize(bs):
+    if not is_positive_integer(bs):
+        raise InputError("Expected positive integer for variable batch_size. Got %s" % str(bs))
+    if bs == 1:
+        raise InputError("Batch size should be larger than 1.")
+    return int(bs)
+
+def check_learningrate(lr):
+    if not is_positive(lr):
+        raise InputError("Expected positive float value for variable learning_rate. Got %s" % str(lr))
+    return float(lr)
+
+def check_iterations(it):
+    if not is_positive_integer(it):
+        raise InputError("Expected positive integer value for variable iterations. 
Got %s" % str(it))
+    return int(it)
+
+def check_reg(l1_reg, l2_reg):
+    if not is_positive_or_zero(l1_reg) or not is_positive_or_zero(l2_reg):
+        raise InputError("Expected positive float value for regularisation variables 'l1_reg' and 'l2_reg'. Got %s and %s" % (str(l1_reg), str(l2_reg)))
+    return float(l1_reg), float(l2_reg)
+
+def check_scoring(scoring):
+    if scoring not in ['mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae']:
+        raise InputError("Unknown scoring function. Got %s" % str(scoring))
+    return scoring
+
+# ------------- ** Neural network utils ** --------------------------
+
+def generate_weights(n_in, n_out, hl):
+
+    weights = []
+    biases = []
+
+    # Weights from input layer to first hidden layer
+    w = tf.Variable(tf.truncated_normal([hl[0], n_in], stddev=1.0 / np.sqrt(hl[0]), dtype=tf.float32),
+                    dtype=tf.float32, name="weights_in")
+    b = tf.Variable(tf.zeros([hl[0]], dtype=tf.float32), name="bias_in", dtype=tf.float32)
+
+    weights.append(w)
+    biases.append(b)
+
+    # Weights from one hidden layer to the next
+    for i in range(1, len(hl)):
+        w = tf.Variable(tf.truncated_normal([hl[i], hl[i-1]], stddev=1.0 / np.sqrt(hl[i-1]), dtype=tf.float32),
+                        dtype=tf.float32, name="weights_hidden_%d" % i)
+        b = tf.Variable(tf.zeros([hl[i]], dtype=tf.float32), name="bias_hidden_%d" % i, dtype=tf.float32)
+
+        weights.append(w)
+        biases.append(b)
+
+    # Weights from last hidden layer to output layer
+    w = tf.Variable(tf.truncated_normal([n_out, hl[-1]],
+                                        stddev=1.0 / np.sqrt(hl[-1]), dtype=tf.float32),
+                    dtype=tf.float32, name="weights_out")
+    b = tf.Variable(tf.zeros([n_out], dtype=tf.float32), name="bias_out", dtype=tf.float32)
+
+    weights.append(w)
+    biases.append(b)
+
+    return weights, biases
+
+def get_batch_size(batch_size, n_samples):
+
+    if batch_size > n_samples:
+        print("Warning: batch_size larger than sample size. 
It is going to be clipped")
+        return min(n_samples, batch_size)
+
+    # see if the batch size can be modified slightly to make sure the last batch is similar in size
+    # to the rest of the batches
+    # This is always less than the requested batch size, so no memory issues should arise
+
+    better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
+    return better_batch_size
+
 #
 #def _is_numeric_array(x):
 #    try:

From f77d04090c324b43fbf41d9f7edad3b1d3cc6c5e Mon Sep 17 00:00:00 2001
From: Silvia Amabilino Date: Mon, 24 Sep 2018 14:07:29 +0100 Subject: [PATCH 25/59] Random comment --- qml/aglaia/aglaia.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 2b19d196f..7c4337695 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -1733,7 +1733,6 @@ def _generate_acsf_from_data_tf(self, xyz, classes): return representation_conc - # TODO Make tensorflow and fortran consistent def _generate_acsf_from_data_fortran(self, xyz, classes): """ This function uses fortran to generate the representation and the derivative of the representation with respect From 2d5988f3cdfa429aedeb285e0c0b9ae24e39f9f7 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 24 Sep 2018 16:46:03 +0100 Subject: [PATCH 26/59] Started including the atomic model --- qml/qmlearn/models.py | 118 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index d4b8d719f..72aada1fd 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -161,7 +161,7 @@ def predict(self, X): return np.dot(K, self.alpha) -class MolecularNeuralNetwork(_BaseModel): +class NeuralNetwork(_BaseModel): """ Feed forward neural network that takes molecular representations and molecular properties. """ @@ -203,8 +203,15 @@ def fit(self, X, y=None): ene = np.reshape(ene, (ene.shape[0], 1)) - # Generating the model - self._generate_model(g.shape[1]) + if X._representation_type == 'molecular': + + self._generate_molecular_model(g.shape[-1]) + + elif X._representation_type == 'atomic': + zs = X.nuclear_charges + # TODO make it deal with padding + elements = np.unique(zs) + self._generate_atomic_model(g.shape[1], g.shape[-1], elements) # Training the model self._train(g, ene) @@ -218,9 +225,9 @@ def predict(self, X): return y_pred - def _generate_model(self, n_features): + def _generate_molecular_model(self, n_features): """ - This function generates the tensorflow graph. + This function generates the tensorflow graph for the molecular feed forward neural net. """ tf.reset_default_graph() @@ -269,6 +276,106 @@ def _generate_model(self, n_features): iter_init_op = iterator.make_initializer(dataset, name="dataset_init") + def _generate_atomic_model(self, n_atoms, n_features, elements): + """ + This function generates the atomic feed forward neural network. 
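+
+        One sub-network (its own weights and biases) is built per element, and
+        atoms are routed to the sub-network of their element with
+        tf.where/tf.gather_nd before the atomic outputs are scattered back and
+        summed into molecular energies. A shape-only sketch of that routing,
+        assuming a single water molecule with H = 1 and O = 8:
+
+            zs = [[8, 1, 1]]                      # (n_samples, n_atoms)
+            where_h = tf.where(tf.equal(zs, 1))   # [[0, 1], [0, 2]]
+            # gather_nd(x, where_h) picks the two H descriptors, the H network
+            # maps them to atomic energies, and scatter_nd(where_h, ...) puts
+            # them back at positions (0, 1) and (0, 2) of a zeros_like(zs) array.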
+ :return: + """ + #TODO actually make these into different graphs + tf.reset_default_graph() + + hidden_layers = [] + for item in [self.hl1, self.hl2, self.hl3, self.hl4]: + if item != 0: + hidden_layers.append(item) + + hidden_layers = tuple(hidden_layers) + + # Initial set up of the NN + with tf.name_scope("Data"): + ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation") + ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies") + ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers") + batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size") + buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer") + + dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y)) + dataset = dataset.shuffle(buffer_size=buffer_tf) + dataset = dataset.batch(batch_size_tf) + + iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) + tf_x, tf_zs, tf_y = iterator.get_next() + + element_weights = {} + element_biases = {} + + with tf.name_scope("Weights"): + for i in range(elements.shape[0]): + weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) + element_weights[elements[i]] = weights + element_biases[elements[i]] = biases + + with tf.name_scope("Model"): + all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32) + + for el in elements: + # Obtaining the indices of where in Zs there is the current element + current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0) + where_element = tf.cast(tf.where(tf.equal(tf_zs, current_element)), dtype=tf.int32) + + # Extract the descriptor corresponding to the right element + current_element_in_x = tf.gather_nd(tf_x, where_element) + + # Calculate the atomic energy of all the atoms of type equal to the current element + atomic_ene = self._atomic_nn(current_element_in_x, hidden_layers, element_weights[el], + element_biases[el]) + + # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies + updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs)) + all_atomic_energies = tf.add(all_atomic_energies, updates) + + # Summing the energies of all the atoms + total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True) + + with tf.name_scope("Cost_func"): + cost = self._cost(total_energies, tf_y, element_weights) + + with tf.name_scope("Optimiser"): + optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost) + + iter_init_op = iterator.make_initializer(dataset, name="dataset_init") + + def _atomic_nn(self, x, hidden_layer_sizes, weights, biases): + """ + Constructs the atomic part of the network. It calculates the output for all atoms as if they all were the same + element. + + :param x: Atomic representation + :type x: tf tensor of shape (n_samples, n_atoms, n_features) + :param weights: Weights used in the network for a particular element. + :type weights: list of tf.Variables of length hidden_layer_sizes.size + 1 + :param biases: Biases used in the network for a particular element. 
+ :type biases: list of tf.Variables of length hidden_layer_sizes.size + 1 + :return: Output + :rtype: tf.Variable of size (n_samples, n_atoms) + """ + + # Calculate the activation of the first hidden layer + z = tf.add(tf.tensordot(x, tf.transpose(weights[0]), axes=1), biases[0]) + h = tf.sigmoid(z) + + # Calculate the activation of the remaining hidden layers + for i in range(len(hidden_layer_sizes) - 1): + z = tf.add(tf.tensordot(h, tf.transpose(weights[i + 1]), axes=1), biases[i + 1]) + h = tf.sigmoid(z) + + # Calculating the output of the last layer + z = tf.add(tf.tensordot(h, tf.transpose(weights[-1]), axes=1), biases[-1]) + + z_squeezed = tf.squeeze(z, axis=[-1]) + + return z_squeezed + def _train(self, representations, energies): graph = tf.get_default_graph() @@ -304,6 +411,7 @@ def _train(self, representations, energies): except tf.errors.OutOfRangeError: break + # TODO modify so that it works with the atomic model as well def _cost(self, y_pred, y, weights): """ Constructs the cost function From da8b524b47f99b01f97384507c95b74638b81bb5 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Tue, 25 Sep 2018 11:24:09 +0100 Subject: [PATCH 27/59] Made the atomic neural network work --- examples/qmlearn.py | 39 +++++++++- qml/qmlearn/data.py | 4 + qml/qmlearn/models.py | 173 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 180 insertions(+), 36 deletions(-) diff --git a/examples/qmlearn.py b/examples/qmlearn.py index 60faf0893..67e7d0617 100644 --- a/examples/qmlearn.py +++ b/examples/qmlearn.py @@ -214,8 +214,11 @@ def pipelines(): print() def pipelines_2(): + """ + Scikit learn pipeline with a molecular neural network + """ - print("\n *** Begin pipelines example with Neural Network ***") + print("\n *** Begin pipelines example with molecular Neural Network ***") data = qmlearn.Data("../test/qm7/*.xyz") energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) @@ -225,19 +228,46 @@ def pipelines_2(): model = sklearn.pipeline.make_pipeline( qmlearn.preprocessing.AtomScaler(data), qmlearn.representations.CoulombMatrix(), - qmlearn.models.MolecularNeuralNetwork(iterations=100000, batch_size=50, learning_rate=0.005), + qmlearn.models.NeuralNetwork(iterations=500, batch_size=50, learning_rate=0.005), ) indices = np.arange(1000) - # np.random.shuffle(indices) + np.random.shuffle(indices) model.fit(indices[:100]) scores = model.score(indices[:100]) print("Negative MAE:", scores) - print("*** End pipelines example with Neural Network *** \n") + print("*** End pipelines example with molecular Neural Network *** \n") + +def pipelines_3(): + """ + Scikit learn pipeline with an atomic neural network + """ + + print("\n *** Begin pipelines example with atomic Neural Network ***") + + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.AtomCenteredSymmetryFunctions(), + qmlearn.models.NeuralNetwork(iterations=500, batch_size=50, learning_rate=0.005), + ) + + indices = np.arange(1000) + np.random.shuffle(indices) + + model.fit(indices[:100]) + + scores = model.score(indices[:100]) + print("Negative MAE:", scores) + print("*** End pipelines example with atomic Neural Network *** \n") def cross_validation(): """ @@ -312,3 +342,4 @@ def cross_validation(): # pipelines() # cross_validation() pipelines_2() + pipelines_3() diff --git a/qml/qmlearn/data.py b/qml/qmlearn/data.py 
index 3890e2641..6348cbb29 100644 --- a/qml/qmlearn/data.py +++ b/qml/qmlearn/data.py @@ -29,6 +29,7 @@ def __init__(self, filenames=None, property_type = "energy"): self.nuclear_charges = None self.natoms = None self.energies = None + self.unique_elements = None if isinstance(filenames, str): filenames = sorted(glob.glob(filenames)) @@ -108,6 +109,7 @@ def _parse_xyz_files(self, filenames): self.coordinates = np.empty(self.ncompounds, dtype=object) self.nuclear_charges = np.empty(self.ncompounds, dtype=object) self.natoms = np.empty(self.ncompounds, dtype = int) + self.unique_elements = [] for i, filename in enumerate(filenames): with open(filename, "r") as f: @@ -126,6 +128,8 @@ def _parse_xyz_files(self, filenames): self.nuclear_charges[i][j] = NUCLEAR_CHARGE[tokens[0]] self.coordinates[i][j] = np.asarray(tokens[1:4], dtype=float) + if not NUCLEAR_CHARGE[tokens[0]] in self.unique_elements: + self.unique_elements.append(NUCLEAR_CHARGE[tokens[0]]) # Try to convert dtype to int/float in cases where you have the # same molecule, just different conformers diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index 72aada1fd..e6c15bac1 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -163,7 +163,7 @@ def predict(self, X): class NeuralNetwork(_BaseModel): """ - Feed forward neural network that takes molecular representations and molecular properties. + Feed forward neural network that takes molecular or atomic representations for predicting molecular properties. """ def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0.001, iterations=500, l1_reg=0.0, @@ -171,15 +171,26 @@ def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0 """ :param hl1: number of nodes in the 1st hidden layer + :type hl1: int :param hl2: number of nodes in the 2nd hidden layer + :type hl2: int :param hl3: number of nodes in the 3rd hidden layer + :type hl3: int :param hl4: number of nodes in the 4th hidden layer + :type hl4: int :param batch_size: Number of samples to have in each batch during training + :type batch_size: int :param learning_rate: step size in optimisation algorithm + :type learning_rate: float :param iterations: number of iterations to do during training + :type iterations: int :param l1_reg: L1 regularisation parameter + :type l1_reg: float :param l2_reg: L2 regularisation parameter - :param scoring: What function to use for scoring + :type l2_reg: float + :param scoring: What function to use for scoring. Available options are "neg_mae", "mae", "rmsd", "neg_rmsd", + "neg_log_mae" + :type scoring: string """ self.hl1, self.hl2, self.hl3, self.hl4 = check_hl(hl1, hl2, hl3, hl4) @@ -194,40 +205,56 @@ def fit(self, X, y=None): Fit a feedforward neural network. 
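+        If the representation stored on the Data object is molecular, a single
+        feed forward network is trained on it; if it is atomic, the
+        representations are zero-padded to a common number of atoms and one
+        sub-network per unique element is trained instead (see _padding and
+        _generate_atomic_model below).
+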
:param X: Data object + :type X: object from the class Data :param y: Energies + :type y: numpy array of shape (n_samples,) """ # obtaining the representations and the energies from the data object if isinstance(X, Data): - g, ene = X._representations, X.energies[X._indices] + g, zs, ene = X._representations, X.nuclear_charges, X.energies[X._indices] ene = np.reshape(ene, (ene.shape[0], 1)) - + else: + raise NotImplementedError if X._representation_type == 'molecular': self._generate_molecular_model(g.shape[-1]) + self._train(g, ene, zs, "molecular") elif X._representation_type == 'atomic': - zs = X.nuclear_charges - # TODO make it deal with padding - elements = np.unique(zs) - self._generate_atomic_model(g.shape[1], g.shape[-1], elements) - # Training the model - self._train(g, ene) + g, zs = self._padding(g, zs) + self._generate_atomic_model(g.shape[1], g.shape[-1], X.unique_elements) + self._train(g, ene, zs, "atomic") def predict(self, X): + """ + Function that predicts the molecular properties from a trained model. + + :param X: Data object + :type X: object from class Data + :return: predicted properties + :rtype: numpy array of shape (n_samples,) + """ if isinstance(X, Data): - g = X._representations + g, zs = X._representations, X.nuclear_charges - y_pred = self._predict(g) + if X._representation_type == "molecular": + y_pred = self._predict(g, zs, "molecular") + elif X._representation_type == "atomic": + g, zs = self._padding(g, zs) + y_pred = self._predict(g, zs, "atomic") return y_pred def _generate_molecular_model(self, n_features): """ This function generates the tensorflow graph for the molecular feed forward neural net. + + :param n_features: number of features in the representation + :type n_features: int """ tf.reset_default_graph() @@ -279,7 +306,13 @@ def _generate_molecular_model(self, n_features): def _generate_atomic_model(self, n_atoms, n_features, elements): """ This function generates the atomic feed forward neural network. - :return: + + :param n_atoms: maximum number of atoms in the molecules + :type n_atoms: int + :param n_features: number of features in the representation + :type n_features: int + :param elements: unique elements in the data set + :type elements: array of ints """ #TODO actually make these into different graphs tf.reset_default_graph() @@ -310,7 +343,7 @@ def _generate_atomic_model(self, n_atoms, n_features, elements): element_biases = {} with tf.name_scope("Weights"): - for i in range(elements.shape[0]): + for i in range(len(elements)): weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) element_weights[elements[i]] = weights element_biases[elements[i]] = biases @@ -347,8 +380,8 @@ def _generate_atomic_model(self, n_atoms, n_features, elements): def _atomic_nn(self, x, hidden_layer_sizes, weights, biases): """ - Constructs the atomic part of the network. It calculates the output for all atoms as if they all were the same - element. + Constructs the atomic part of the network. It calculates the atomic property for all atoms and then sums them + together to obtain the molecular property. :param x: Atomic representation :type x: tf tensor of shape (n_samples, n_atoms, n_features) @@ -376,13 +409,27 @@ def _atomic_nn(self, x, hidden_layer_sizes, weights, biases): return z_squeezed - def _train(self, representations, energies): + def _train(self, representations, energies, zs, representation_type): + """ + This function trains the neural network. 
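+
+        The same training loop serves both model types: it fetches the
+        placeholders by name from the default graph, re-initialises the dataset
+        iterator at every iteration (with a shuffle buffer alternating between
+        3.5 and 4.5 times the batch size), and consumes batches until the
+        iterator raises tf.errors.OutOfRangeError.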
+ + :param representations: the representations of the molecules (either atomic or molecular) + :type representations: numpy array of shape (n_samples, max_n_atoms, n_features) or (n_samples, n_features) + :param energies: the true molecular properties to be learnt + :type energies: numpy array of shape (n_samples, ) + :param zs: the nuclear charges of all the atoms in the molecules + :type zs: numpy array of shape (n_samples, max_n_atoms) + :param representation_type: flag setting whether the representation is "atomic" or "molecular". + :type representation_type: string + """ graph = tf.get_default_graph() with graph.as_default(): tf_x = graph.get_tensor_by_name("Data/Representation:0") tf_y = graph.get_tensor_by_name("Data/True_energies:0") + if representation_type == "atomic": + tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0") buffer_tf = graph.get_tensor_by_name("Data/Buffer:0") @@ -402,8 +449,12 @@ def _train(self, representations, energies): else: buff = int(4.5 * batch_size) - self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff, - batch_size_tf:batch_size}) + if representation_type == "atomic": + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: zs, buffer_tf: buff, + batch_size_tf: batch_size}) + else: + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff, + batch_size_tf:batch_size}) while True: try: @@ -411,7 +462,6 @@ def _train(self, representations, energies): except tf.errors.OutOfRangeError: break - # TODO modify so that it works with the atomic model as well def _cost(self, y_pred, y, weights): """ Constructs the cost function @@ -426,15 +476,29 @@ def _cost(self, y_pred, y, weights): :rtype: tf.Variable of size (1,) """ - err = tf.square(tf.subtract(y,y_pred)) + err = tf.square(tf.subtract(y, y_pred)) loss = tf.reduce_mean(err, name="loss") - cost = loss - if self.l2_reg > 0: - l2_loss = self._l2_loss(weights) - cost = cost + l2_loss - if self.l1_reg > 0: - l1_loss = self._l1_loss(weights) - cost = cost + l1_loss + + if isinstance(weights, dict): + cost = loss + if self.l2_reg > 0: + l2_loss = 0 + for element in weights: + l2_loss += self._l2_loss(weights[element]) + cost += l2_loss + if self.l1_reg > 0: + l1_loss = 0 + for element in weights: + l1_loss += self._l1_loss(weights[element]) + cost += l1_loss + else: + cost = loss + if self.l2_reg > 0: + l2_loss = self._l2_loss(weights) + cost = cost + l2_loss + if self.l1_reg > 0: + l1_loss = self._l1_loss(weights) + cost = cost + l1_loss return cost @@ -472,7 +536,19 @@ def _l1_loss(self, weights): return self.l1_reg * reg_term - def _predict(self, representation): + def _predict(self, representation, zs, representation_type): + """ + This function predicts the molecular properties from the representations. 
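+
+        Prediction re-uses the training iterator, so a dummy array of the right
+        shape is fed for the "True_energies" placeholder and the batched
+        outputs of "Model/Predicted_energies" are concatenated at the end.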
+
+        :param representation: representation of the molecules, can be atomic or molecular
+        :type representation: numpy array of shape (n_samples, n_max_atoms, n_features) or (n_samples, n_features)
+        :param zs: nuclear charges of the molecules
+        :type zs: numpy array of shape (n_samples, n_max_atoms)
+        :param representation_type: flag saying whether the representation is the "molecular" or "atomic"
+        :type representation_type: string
+        :return: molecular properties predictions
+        :rtype: numpy array of shape (n_samples,)
+        """

         graph = tf.get_default_graph()
@@ -481,15 +557,21 @@ def _predict(self, representation):
         with graph.as_default():
             tf_x = graph.get_tensor_by_name("Data/Representation:0")
             tf_y = graph.get_tensor_by_name("Data/True_energies:0")
+            if representation_type == "atomic":
+                tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0")
             batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0")
             buffer_tf = graph.get_tensor_by_name("Data/Buffer:0")
             iter_init_op = graph.get_operation_by_name("dataset_init")

             y_pred = graph.get_tensor_by_name("Model/Predicted_energies:0")

-            self.session.run(iter_init_op,
-                             feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
-                                        batch_size_tf: batch_size, buffer_tf:1})
+            if representation_type == "atomic":
+                self.session.run(iter_init_op, feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
+                                                          tf_zs: zs, buffer_tf: 1, batch_size_tf: batch_size})
+            else:
+                self.session.run(iter_init_op,
+                                 feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
+                                            batch_size_tf: batch_size, buffer_tf:1})

             tot_y_pred = []
@@ -502,6 +584,33 @@
             return np.concatenate(tot_y_pred, axis=0).ravel()

+    def _padding(self, representation, nuclear_charges):
+        """
+        This function takes atomic representations for molecules of different sizes and pads them with 0 so that they
+        all have the same size.
+
+        :param representation: list of numpy arrays of shape (n_atoms, n_features)
+        :param nuclear_charges: list of numpy arrays of shape (n_atoms,)
+        :return: numpy array of shape (n_samples, max_n_atoms, n_features) and (n_samples, max_n_atoms)
+        """
+
+        max_n_atoms = 0
+
+        for i in range(len(representation)):
+            n_atoms = representation[i].shape[0]
+            if n_atoms > max_n_atoms:
+                max_n_atoms = n_atoms
+
+        padded_rep = np.zeros((len(representation), max_n_atoms, representation[0].shape[1]))
+        padded_zs = np.zeros((len(representation), max_n_atoms))
+
+        for i in range(len(representation)):
+            n_atoms = representation[i].shape[0]
+            padded_rep[i, :n_atoms] = representation[i]
+            padded_zs[i, :n_atoms] = nuclear_charges[i]
+
+        return padded_rep, padded_zs
+

From 227e2b8ec24cc66e9b398e6b862dcae2b9efaad2 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 25 Sep 2018 11:43:31 +0100
Subject: [PATCH 28/59] Fixed a bug with the indices

---
 qml/qmlearn/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
index e6c15bac1..b7ded374d 100644
--- a/qml/qmlearn/models.py
+++ b/qml/qmlearn/models.py
@@ -212,7 +212,7 @@ def fit(self, X, y=None):

         # obtaining the representations and the energies from the data object
         if isinstance(X, Data):
-            g, zs, ene = X._representations, X.nuclear_charges, X.energies[X._indices]
+            g, zs, ene = X._representations, X.nuclear_charges[X._indices], X.energies[X._indices]
             ene = np.reshape(ene, (ene.shape[0], 1))
         else:
             raise NotImplementedError
@@ -239,7 +239,7 @@ def predict(self, X):
         """

         if isinstance(X, Data):
-            g, zs = X._representations, X.nuclear_charges
+            g, zs = X._representations, X.nuclear_charges[X._indices]

             if X._representation_type == "molecular":
                 y_pred = self._predict(g, zs, "molecular")

From 250031d846b84136698b94a16c146688e6902541 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 25 Sep 2018 12:00:54 +0100
Subject: [PATCH 29/59] Now training and predictions don't use the default
 graph, to avoid problems

---
 qml/qmlearn/models.py | 211 +++++++++++++++++++++---------------------
 1 file changed, 104 insertions(+), 107 deletions(-)

diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
index b7ded374d..76ff4988e 100644
--- a/qml/qmlearn/models.py
+++ b/qml/qmlearn/models.py
@@ -257,7 +257,7 @@ def _generate_molecular_model(self, n_features):
         :type n_features: int
         """

-        tf.reset_default_graph()
+        self.graph = tf.Graph()

         hidden_layers = []
         for item in [self.hl1, self.hl2, self.hl3, self.hl4]:
@@ -267,41 +267,42 @@
         hidden_layers = tuple(hidden_layers)

         # Initial set up of the NN
-        with tf.name_scope("Data"):
-            ph_x = tf.placeholder(tf.float32, [None, n_features], name="Representation")
-            ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies")
-            batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size")
-            buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer")
+        with self.graph.as_default():
+            with tf.name_scope("Data"):
+                ph_x = tf.placeholder(tf.float32, [None, n_features], name="Representation")
+                ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies")
+                batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size")
+                buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer")

-            dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y))
-            dataset = dataset.shuffle(buffer_size=buffer_tf)
-            dataset = dataset.batch(batch_size_tf)
+                dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y))
+                dataset = dataset.shuffle(buffer_size=buffer_tf)
+                dataset = dataset.batch(batch_size_tf)

-            iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
-            tf_x, tf_y = iterator.get_next()
+                iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
+                tf_x, tf_y = iterator.get_next()

-        with tf.name_scope("Weights"):
-            weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers)
-
-        with tf.name_scope("Model"):
-            z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0])
-            h = tf.sigmoid(z)
+            with tf.name_scope("Weights"):
+                weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers)

-            # Calculate the activation of the remaining hidden layers
-            for i in range(1, len(weights) - 1):
-                z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i])
+            with tf.name_scope("Model"):
+                z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0])
                 h = tf.sigmoid(z)

-            # Calculating the output of the last layer
-            y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies")
+                # Calculate the activation of the remaining hidden layers
+                for i in range(1, len(weights) - 1):
+                    z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i])
+                    h = tf.sigmoid(z)

-        with tf.name_scope("Cost_func"):
-            cost = self._cost(y_pred, tf_y, weights)
+                # Calculating the output of the last layer
+                y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies")

-        with tf.name_scope("Optimiser"):
-            optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
+            with tf.name_scope("Cost_func"):
+                cost = self._cost(y_pred, tf_y, weights)

-        iter_init_op = iterator.make_initializer(dataset, name="dataset_init")
+            with tf.name_scope("Optimiser"):
+                optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
+
+            iter_init_op = iterator.make_initializer(dataset, name="dataset_init")

     def _generate_atomic_model(self, n_atoms, n_features, elements):
         """
@@ -314,8 +315,7 @@
         :param elements: unique elements in the data set
         :type elements: array of ints
         """
-        #TODO actually make these into different graphs
-        tf.reset_default_graph()
+        self.graph = tf.Graph()

         hidden_layers = []
         for item in [self.hl1, self.hl2, self.hl3, self.hl4]:
@@ -325,58 +325,59 @@
         hidden_layers = tuple(hidden_layers)

         # Initial set up of the NN
-        with tf.name_scope("Data"):
-            ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation")
-            ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies")
-            ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers")
-            batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size")
-            buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer")
+        with self.graph.as_default():
+            with tf.name_scope("Data"):
+                ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation")
+                ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies")
+                ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers")
+                batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size")
+                buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer")

-            dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y))
-            dataset = dataset.shuffle(buffer_size=buffer_tf)
-            dataset = dataset.batch(batch_size_tf)
+                dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y))
+                dataset = dataset.shuffle(buffer_size=buffer_tf)
+                dataset = dataset.batch(batch_size_tf)

-            iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
-            tf_x, tf_zs, tf_y = iterator.get_next()
+                iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
+                tf_x, tf_zs, tf_y = iterator.get_next()

-        element_weights = {}
-        element_biases = {}
+            element_weights = {}
+            element_biases = {}

-        with tf.name_scope("Weights"):
-            for i in range(len(elements)):
-                weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers)
-                element_weights[elements[i]] = weights
-                element_biases[elements[i]] = biases
+            with tf.name_scope("Weights"):
+                for i in range(len(elements)):
+                    weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers)
+                    element_weights[elements[i]] = weights
+                    element_biases[elements[i]] = biases

-        with tf.name_scope("Model"):
-            all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32)
+            with tf.name_scope("Model"):
+                all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32)

-            for el in elements:
-                # Obtaining the indices of where in Zs there is the current element
-                current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0)
-                where_element = tf.cast(tf.where(tf.equal(tf_zs, current_element)), dtype=tf.int32)
+                for el in elements:
+                    # Obtaining the indices of where in Zs there is the current element
+                    current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0)
+                    where_element = tf.cast(tf.where(tf.equal(tf_zs, current_element)), dtype=tf.int32)

-                # Extract the descriptor corresponding to the right element
-                current_element_in_x = tf.gather_nd(tf_x, where_element)
+                    # Extract the descriptor corresponding to the right element
+                    current_element_in_x = tf.gather_nd(tf_x, where_element)

-                # Calculate the atomic energy of all the atoms of type equal to the current element
-                atomic_ene = self._atomic_nn(current_element_in_x, hidden_layers, element_weights[el],
-                                             element_biases[el])
+                    # Calculate the atomic energy of all the atoms of type equal to the current element
+                    atomic_ene = self._atomic_nn(current_element_in_x, hidden_layers, element_weights[el],
+                                                 element_biases[el])

-                # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies
-                updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs))
-                all_atomic_energies = tf.add(all_atomic_energies, updates)
+                    # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies
+                    updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs))
+                    all_atomic_energies = tf.add(all_atomic_energies, updates)

-            # Summing the energies of all the atoms
-            total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True)
+                # Summing the energies of all the atoms
+                total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True)

-        with tf.name_scope("Cost_func"):
-            cost = self._cost(total_energies, tf_y, element_weights)
+            with tf.name_scope("Cost_func"):
+                cost = self._cost(total_energies, tf_y, element_weights)

-        with tf.name_scope("Optimiser"):
-            optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
+            with tf.name_scope("Optimiser"):
+                optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)

-        iter_init_op = iterator.make_initializer(dataset, name="dataset_init")
+            iter_init_op = iterator.make_initializer(dataset, name="dataset_init")

     def _atomic_nn(self, x, hidden_layer_sizes, weights, biases):
         """
@@ -423,44 +424,42 @@
         :type representation_type: string
         """

-        graph = tf.get_default_graph()
+        batch_size = get_batch_size(self.batch_size, representations.shape[0])

-        with graph.as_default():
-            tf_x = graph.get_tensor_by_name("Data/Representation:0")
-            tf_y = graph.get_tensor_by_name("Data/True_energies:0")
+        with self.graph.as_default():
+            tf_x = self.graph.get_tensor_by_name("Data/Representation:0")
+            tf_y = self.graph.get_tensor_by_name("Data/True_energies:0")
             if representation_type == "atomic":
-                tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0")
-            batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0")
-            buffer_tf = graph.get_tensor_by_name("Data/Buffer:0")
+                tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0")
+            batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0")
+            buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0")

-            optimisation_op = graph.get_operation_by_name("Optimiser/Adam")
-            iter_init_op = graph.get_operation_by_name("dataset_init")
+            optimisation_op = self.graph.get_operation_by_name("Optimiser/Adam")
+            iter_init_op = self.graph.get_operation_by_name("dataset_init")

-            batch_size = get_batch_size(self.batch_size, representations.shape[0])
-
-            self.session = tf.Session(graph=graph)
+            self.session = tf.Session(graph=self.graph)

-            self.session.run(tf.global_variables_initializer())
+            self.session.run(tf.global_variables_initializer())

-            for i in range(self.iterations):
+            for i in range(self.iterations):

-                if i % 2 == 0:
-                    buff = int(3.5 * batch_size)
-                else:
-                    buff = int(4.5 * batch_size)
+                if i % 2 == 0:
+                    buff = int(3.5 * batch_size)
+                else:
+                    buff = int(4.5 * batch_size)

-                if representation_type == "atomic":
-                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: zs, buffer_tf: buff,
-                                                              batch_size_tf: batch_size})
-                else:
-                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff,
-                                                              batch_size_tf:batch_size})
+                if representation_type == "atomic":
+                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: zs, buffer_tf: buff,
+                                                              batch_size_tf: batch_size})
+                else:
+                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff,
+                                                              batch_size_tf:batch_size})

-                while True:
-                    try:
-                        self.session.run(optimisation_op)
-                    except tf.errors.OutOfRangeError:
-                        break
+                while True:
+                    try:
+                        self.session.run(optimisation_op)
+                    except tf.errors.OutOfRangeError:
+                        break

     def _cost(self, y_pred, y, weights):
         """
@@ -550,20 +549,18 @@
         :rtype: numpy array of shape (n_samples,)
         """

-        graph = tf.get_default_graph()
-
         batch_size = get_batch_size(self.batch_size, representation.shape[0])

-        with graph.as_default():
-            tf_x = graph.get_tensor_by_name("Data/Representation:0")
-            tf_y = graph.get_tensor_by_name("Data/True_energies:0")
+        with self.graph.as_default():
+            tf_x = self.graph.get_tensor_by_name("Data/Representation:0")
+            tf_y = self.graph.get_tensor_by_name("Data/True_energies:0")
             if representation_type == "atomic":
-                tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0")
-            batch_size_tf = graph.get_tensor_by_name("Data/Batch_size:0")
-            buffer_tf = graph.get_tensor_by_name("Data/Buffer:0")
-            iter_init_op = graph.get_operation_by_name("dataset_init")
+                tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0")
+            batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0")
+            buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0")
+            iter_init_op = self.graph.get_operation_by_name("dataset_init")

-            y_pred = graph.get_tensor_by_name("Model/Predicted_energies:0")
+            y_pred = self.graph.get_tensor_by_name("Model/Predicted_energies:0")

             if representation_type == "atomic":
                 self.session.run(iter_init_op, feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),

From 687291bf35c05c2ca7abc4e146c74b3cd3023da7 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 25 Sep 2018 12:48:03 +0100
Subject: [PATCH 30/59] uncommented examples

---
 examples/qmlearn.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/qmlearn.py b/examples/qmlearn.py
index 67e7d0617..46de04e05 100644
--- a/examples/qmlearn.py
+++ b/examples/qmlearn.py
@@ -334,12 +334,12 @@ def cross_validation():
     print("*** End CV examples ***")

 if __name__ == '__main__':
-    # data()
-    # preprocessing()
-    # representations()
-    # kernels()
-    # models()
-    # pipelines()
-    # cross_validation()
+    data()
+    preprocessing()
+    representations()
+    kernels()
+    models()
+    pipelines()
+    cross_validation()
     pipelines_2()
     pipelines_3()

From 439190638fb4d2fec279bc4a6e8ffcbc84a4d476 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Wed, 26 Sep 2018 11:18:20 +0100
Subject: [PATCH 31/59] Removed unique_elements in data class

This can be stored in the NN class, but I might reverse the change later
---
 qml/qmlearn/data.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/qml/qmlearn/data.py b/qml/qmlearn/data.py
index 6348cbb29..3890e2641 100644
--- a/qml/qmlearn/data.py
+++ b/qml/qmlearn/data.py
@@ -29,7 +29,6 @@ def __init__(self, filenames=None, property_type = "energy"):
         self.nuclear_charges = None
         self.natoms = None
         self.energies = None
-        self.unique_elements = None

         if isinstance(filenames, str):
             filenames = sorted(glob.glob(filenames))
@@ -109,7 +108,6 @@ def _parse_xyz_files(self, filenames):
         self.coordinates = np.empty(self.ncompounds, dtype=object)
         self.nuclear_charges = np.empty(self.ncompounds, dtype=object)
         self.natoms = np.empty(self.ncompounds, dtype = int)
-        self.unique_elements = []

         for i, filename in enumerate(filenames):
             with open(filename, "r") as f:
@@ -128,8 +126,6 @@ def _parse_xyz_files(self, filenames):

                 self.nuclear_charges[i][j] = NUCLEAR_CHARGE[tokens[0]]
                 self.coordinates[i][j] = np.asarray(tokens[1:4], dtype=float)
-                if not NUCLEAR_CHARGE[tokens[0]] in self.unique_elements:
-                    self.unique_elements.append(NUCLEAR_CHARGE[tokens[0]])

             # Try to convert dtype to int/float in cases where you have the
             # same molecule, just different conformers

From 2fb8d3941e9ca02024dfa049e9da5d9b7f15b5e1 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Wed, 26 Sep 2018 11:19:44 +0100
Subject: [PATCH 32/59] Made tensorflow an optional dependency

The reason for this approach is that pip would just auto install tensorflow
and you might want the gpu version or your own compiled one.
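A minimal standalone sketch of the guard this patch introduces (the class name is
illustrative, not the patch's exact code; the check itself mirrors the tf_check
added in the diff below):

    try:
        import tensorflow as tf
    except ImportError:
        tf = None

    class SomeTensorFlowModel(object):
        # Importing the package no longer fails when tensorflow is missing;
        # only constructing a model that actually needs it does.
        def __init__(self):
            if tf is None:
                print("Tensorflow not found but is needed for %s" % self.__class__.__name__)
                raise SystemExit

This way the user is free to install the CPU build, the GPU build or a
self-compiled TensorFlow before instantiating the neural network models.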
---
 qml/qmlearn/models.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
index 76ff4988e..e3793e160 100644
--- a/qml/qmlearn/models.py
+++ b/qml/qmlearn/models.py
@@ -23,14 +23,19 @@
 from __future__ import division, absolute_import, print_function

 import numpy as np
-import tensorflow as tf
 from sklearn.base import BaseEstimator
 from sklearn.metrics import mean_absolute_error

-from ..utils import *
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+
+from ..utils import is_numeric_array
 from .data import Data
 from ..math import cho_solve

+
 class _BaseModel(BaseEstimator):
     """
     Base class for all regression models
@@ -169,30 +174,32 @@ class NeuralNetwork(_BaseModel):

     def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0.001, iterations=500, l1_reg=0.0,
                  l2_reg=0.0, scoring="neg_mae"):
         """
-
-        :param hl1: number of nodes in the 1st hidden layer
+        :param hl1: number of hidden nodes in the 1st hidden layer
         :type hl1: int
-        :param hl2: number of nodes in the 2nd hidden layer
+        :param hl2: number of hidden nodes in the 2nd hidden layer
         :type hl2: int
-        :param hl3: number of nodes in the 3rd hidden layer
+        :param hl3: number of hidden nodes in the 3rd hidden layer
         :type hl3: int
-        :param hl4: number of nodes in the 4th hidden layer
+        :param hl4: number of hidden nodes in the 4th hidden layer
         :type hl4: int
         :param batch_size: Number of samples to have in each batch during training
         :type batch_size: int
-        :param learning_rate: step size in optimisation algorithm
+        :param learning_rate: Step size in optimisation algorithm
         :type learning_rate: float
-        :param iterations: number of iterations to do during training
+        :param iterations: Number of iterations to do during training
         :type iterations: int
         :param l1_reg: L1 regularisation parameter
         :type l1_reg: float
         :param l2_reg: L2 regularisation parameter
        :type l2_reg: float
        :param scoring: What function to use for scoring. Available options are "neg_mae", "mae", "rmsd", "neg_rmsd",
-            "neg_log_mae"
+                        "neg_log_mae"
         :type scoring: string
         """

+        # Check if tensorflow is available
+        self.tf_check()
+
         self.hl1, self.hl2, self.hl3, self.hl4 = check_hl(hl1, hl2, hl3, hl4)
         self.batch_size = check_batchsize(batch_size)
         self.learning_rate = check_learningrate(learning_rate)
@@ -608,8 +615,7 @@
 
         return padded_rep, padded_zs

-
-
-
-
-
+    def tf_check(self):
+        if tf is None:
+            print("Tensorflow not found but is needed for %s" % self.__class__.__name__)
+            raise SystemExit

From 819fa71d414ac8a47588d2fc019b543527bf4e93 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Wed, 26 Sep 2018 16:22:19 +0100
Subject: [PATCH 33/59] Made is_numeric non-private and removed legacy code

---
 qml/utils/utils.py | 87 +++------------------------------------------
 1 file changed, 4 insertions(+), 83 deletions(-)

diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index e7e4dc493..bae3adf1a 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -25,10 +25,10 @@
 import tensorflow as tf

 def is_positive(x):
-    return (not is_array_like(x) and _is_numeric(x) and x > 0)
+    return (not is_array_like(x) and is_numeric(x) and x > 0)

 def is_positive_or_zero(x):
-    return (not is_array_like(x) and _is_numeric(x) and x >= 0)
+    return (not is_array_like(x) and is_numeric(x) and x >= 0)

 def is_array_like(x):
     return isinstance(x, (tuple, list, np.ndarray))
@@ -45,7 +45,7 @@ def is_string(x):
 def is_dict(x):
     return isinstance(x, dict)

-def _is_numeric(x):
+def is_numeric(x):
     return isinstance(x, (float, int))

 def is_numeric_array(x):
@@ -58,7 +58,7 @@ def is_numeric_array(x):
     return False

 def _is_integer(x):
-    return (_is_numeric(x) and (float(x) == int(x)))
+    return (is_numeric(x) and (float(x) == int(x)))

 # will intentionally accept 0, 1 as well
 def is_bool(x):
@@ -346,85 +346,6 @@ def get_batch_size(batch_size, n_samples):
     better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
     return better_batch_size

-#
-#def _is_numeric_array(x):
-#    try:
-#        arr = np.asarray(x, dtype = float)
-#        return True
-#    except (ValueError, TypeError):
-#        return False
-#
-#def _is_numeric_scalar(x):
-#    try:
-#        float(x)
-#        return True
-#    except (ValueError, TypeError):
-#        return False
-#
-#def is_positive(x):
-#    if is_array(x) and _is_numeric_array(x):
-#        return _is_positive_scalar(x)
-#
-#def _is_positive_scalar(x):
-#    return float(x) > 0
-#
-#def _is_positive_array(x):
-#    return np.asarray(x, dtype = float) > 0
-#
-#def is_positive_or_zero(x):
-#    if is_numeric(x):
-#        if is_array(x):
-#            return is_positive_or_zero_array(x)
-#        else:
-#            return is_positive_or_zero_scalar(x)
-#    else:
-#        return False
-#
-#def is_positive_or_zero_array(x):
-#
-#
-#def is_positive_or_zero_scalar(x):
-#    return float(x) >= 0
-#
-#def is_integer(x):
-#    if is_array(x)
-#        return is_integer_array(x)
-#    else:
-#        return is_integer_scalar(x)
-#
-## will intentionally accept floats with integer values
-#def is_integer_array(x):
-#    if is_numeric(x):
-#        return (np.asarray(x) == np.asarray(y)).all()
-#    else:
-#        return False
-#
-## will intentionally accept floats with integer values
-#def is_integer_scalar(x):
-#    if is_numeric(x):
-#        return int(float(x)) == float(x)
-#    else:
-#        return False
-#
-#
-#def is_string(x):
-#    return isinstance(x, str)
-#
-#def is_positive_integer(x):
-#    return (is_numeric(x) and is_integer(x) and is_positive(x))
-#
-#def is_positive_integer_or_zero(x):
-#    return (is_numeric(x) and is_integer(x) and is_positive_or_zero(x))
-#
-#def is_negative_integer(x):
-#    if is_integer(x):
-#        return not is_positive(x)
-#    else:
-#        return False
-#
-#def is_non_zero_integer(x):
-#    return (is_positive_integer(x) or is_negative_integer(x))

 # Custom exception to raise when we intentinoally catch an error
 # This way we can test that the right error was raised in test cases
 class InputError(Exception):

From 20ca689c88f3b35365de0d9c8a849890c064ff71 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Thu, 27 Sep 2018 14:38:01 +0100
Subject: [PATCH 34/59] Added 1d array util function

---
 qml/utils/utils.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index bae3adf1a..d8c4ff52b 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -57,6 +57,13 @@ def is_numeric_array(x):
         return False
     return False

+def is_numeric_1d_array(x):
+    return is_numeric_array(x) and is_1d_array(x)
+
+# Accepts 2d arrays of shape (n,1) and (1,n) as well
+def is_1d_array(x):
+    return is_array_like(x) and (np.asarray(x).ndim == 1 or np.asarray(x).ndim == 2 and 1 in np.asarray(x).shape)
+
 def _is_integer(x):
     return (is_numeric(x) and (float(x) == int(x)))

@@ -83,6 +90,9 @@ def _is_integer_array(x):
             return True
     return False

+def is_positive_integer_1d_array(x):
+    return is_positive_integer_array(x) and is_1d_array(x)
+
 def is_positive_integer_array(x):
     return (_is_integer_array(x) and _is_positive_array(x))

From 9ef8c100e52515822cfe8092f6f9a891e404466f Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Thu, 27 Sep 2018 14:41:19 +0100
Subject: [PATCH 35/59] Removed QML check and moved functions from utils to
 tf_utils

---
 qml/aglaia/aglaia.py   | 14 ++++--------
 qml/aglaia/tf_utils.py | 48 ++++++++++++++++++++++++++++++++++++++++
 qml/utils/utils.py     | 50 ------------------------------------------
 3 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py
index 0d3650168..a99497cf6 100644
--- a/qml/aglaia/aglaia.py
+++ b/qml/aglaia/aglaia.py
@@ -40,16 +40,10 @@
 from qml.representations import generate_acsf
 from qml.aglaia.graceful_killer import GracefulKiller

-try:
-    from qml.data import Compound
-    from qml import representations as qml_rep
-except ImportError:
-    raise ImportError("The module qml is required")
-
-try:
-    import tensorflow
-except ImportError:
-    raise ImportError("Tensorflow 1.8 is required to run neural networks.")
+from qml.data import Compound
+from qml import representations as qml_rep
+
+import tensorflow

 class _NN(BaseEstimator):

diff --git a/qml/aglaia/tf_utils.py b/qml/aglaia/tf_utils.py
index 1bc6567d3..ec2dd4a5c 100644
--- a/qml/aglaia/tf_utils.py
+++ b/qml/aglaia/tf_utils.py
@@ -71,3 +71,51 @@ def write_weight_histogram(self, weights):

     def write_cost_summary(self, cost):
         tf.summary.scalar('cost', cost)
+
+def generate_weights(n_in, n_out, hl):
+
+    weights = []
+    biases = []
+
+    # Weights from input layer to first hidden layer
+    w = tf.Variable(tf.truncated_normal([hl[0], n_in], stddev = 1.0 / np.sqrt(hl[0]), dtype = tf.float32),
+                    dtype = tf.float32, name = "weights_in")
+    b = tf.Variable(tf.zeros([hl[0]], dtype = tf.float32), name="bias_in", dtype = tf.float32)
+
+    weights.append(w)
+    biases.append(b)
+
+    # Weights from one hidden layer to the next
+    for i in range(1, len(hl)):
+        w = tf.Variable(tf.truncated_normal([hl[i], hl[i-1]], stddev=1.0 / np.sqrt(hl[i-1]), dtype=tf.float32),
+                        dtype=tf.float32, name="weights_hidden_%d" % i)
+        b = tf.Variable(tf.zeros([hl[i]], dtype=tf.float32), name="bias_hidden_%d" % i, dtype=tf.float32)
+
+        weights.append(w)
+        biases.append(b)
+
+
+    # Weights from last hidden layer to output layer
+    w = tf.Variable(tf.truncated_normal([n_out, hl[-1]],
+                                        stddev=1.0 / np.sqrt(hl[-1]), dtype=tf.float32),
+                    dtype=tf.float32, name="weights_out")
+    b = tf.Variable(tf.zeros([n_out], dtype=tf.float32), name="bias_out", dtype=tf.float32)
+
+    weights.append(w)
+    biases.append(b)
+
+    return weights, biases
+
+def get_batch_size(batch_size, n_samples):
+
+    if batch_size > n_samples:
+        print("Warning: batch_size larger than sample size. It is going to be clipped")
+        return min(n_samples, batch_size)
+
+    # see if the batch size can be modified slightly to make sure the last batch is similar in size
+    # to the rest of the batches
+    # This is always less than the requested batch size, so no memory issues should arise
+
+    better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
+    return better_batch_size
+
diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index d8c4ff52b..b694e4f4b 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -307,56 +307,6 @@ def check_scoring(scoring):
         raise InputError("Unknown scoring function")
     return scoring

-# ------------- ** Neural network utils inputs ** --------------------------
-
-def generate_weights(n_in, n_out, hl):
-
-    weights = []
-    biases = []
-
-    # Weights from input layer to first hidden layer
-    w = tf.Variable(tf.truncated_normal([hl[0], n_in], stddev = 1.0 / np.sqrt(hl[0]), dtype = tf.float32),
-                    dtype = tf.float32, name = "weights_in")
-    b = tf.Variable(tf.zeros([hl[0]], dtype = tf.float32), name="bias_in", dtype = tf.float32)
-
-    weights.append(w)
-    biases.append(b)
-
-    # Weights from one hidden layer to the next
-    for i in range(1, len(hl)):
-        w = tf.Variable(tf.truncated_normal([hl[i], hl[i-1]], stddev=1.0 / np.sqrt(hl[i-1]), dtype=tf.float32),
-                        dtype=tf.float32, name="weights_hidden_%d" % i)
-        b = tf.Variable(tf.zeros([hl[i]], dtype=tf.float32), name="bias_hidden_%d" % i, dtype=tf.float32)
-
-        weights.append(w)
-        biases.append(b)
-
-
-    # Weights from last hidden layer to output layer
-    w = tf.Variable(tf.truncated_normal([n_out, hl[-1]],
-                                        stddev=1.0 / np.sqrt(hl[-1]), dtype=tf.float32),
-                    dtype=tf.float32, name="weights_out")
-    b = tf.Variable(tf.zeros([n_out], dtype=tf.float32), name="bias_out", dtype=tf.float32)
-
-    weights.append(w)
-    biases.append(b)
-
-    return weights, biases
-
-def get_batch_size(batch_size, n_samples):
-
-    if batch_size > n_samples:
-        print("Warning: batch_size larger than sample size. It is going to be clipped")
-        return min(n_samples, batch_size)
-
-    # see if the batch size can be modified slightly to make sure the last batch is similar in size
-    # to the rest of the batches
-    # This is always less that the requested batch size, so no memory issues should arise
-
-    better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
-    return better_batch_size
-
-
 # Custom exception to raise when we intentinoally catch an error
 # This way we can test that the right error was raised in test cases
 class InputError(Exception):

From 00a2a7c7cb318ff2bf75e5b811c67765a03ea976 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Thu, 27 Sep 2018 15:20:49 +0100
Subject: [PATCH 36/59] Support for linear models (no hidden layers)

---
 qml/aglaia/tf_utils.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/qml/aglaia/tf_utils.py b/qml/aglaia/tf_utils.py
index ec2dd4a5c..a4cbadf95 100644
--- a/qml/aglaia/tf_utils.py
+++ b/qml/aglaia/tf_utils.py
@@ -77,6 +77,18 @@ def generate_weights(n_in, n_out, hl):
     weights = []
     biases = []

+    # Support for linear models
+    if len(hl) == 0:
+        # Weights from input layer to first hidden layer
+        w = tf.Variable(tf.truncated_normal([n_out, n_in], stddev = 1.0, dtype = tf.float32),
+                        dtype = tf.float32, name = "weights_in")
+        b = tf.Variable(tf.zeros([n_out], dtype = tf.float32), name="bias_in", dtype = tf.float32)
+
+        weights.append(w)
+        biases.append(b)
+
+        return weights, biases
+
     # Weights from input layer to first hidden layer
     w = tf.Variable(tf.truncated_normal([hl[0], n_in], stddev = 1.0 / np.sqrt(hl[0]), dtype = tf.float32),
                     dtype = tf.float32, name = "weights_in")

From a925ee0edadd1c6ebec60a48179b513bcd9e10f2 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Thu, 27 Sep 2018 16:22:11 +0100
Subject: [PATCH 37/59] fixed import bug in tf_utils

---
 qml/aglaia/tf_utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/qml/aglaia/tf_utils.py b/qml/aglaia/tf_utils.py
index a4cbadf95..f51b929f6 100644
--- a/qml/aglaia/tf_utils.py
+++ b/qml/aglaia/tf_utils.py
@@ -26,7 +26,9 @@
 """

 import os
+import numpy as np
 import tensorflow as tf
+from qml.utils import ceil

 class TensorBoardLogger(object):
     """
@@ -124,9 +126,9 @@ def get_batch_size(batch_size, n_samples):
         print("Warning: batch_size larger than sample size. It is going to be clipped")
         return min(n_samples, batch_size)

-    # see if the batch size can be modified slightly to make sure the last batch is similar in size
-    # to the rest of the batches
-    # This is always less that the requested batch size, so no memory issues should arise
+    # see if the batch size can be modified slightly to make sure the last batch is similar in size
+    # to the rest of the batches
+    # This is always less than the requested batch size, so no memory issues should arise

     better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
     return better_batch_size

From c8cf008537b491fff2d00cce26d505ba8d8d1106 Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Thu, 27 Sep 2018 17:37:33 +0100
Subject: [PATCH 38/59] Added text to explain that you are scoring on training
 set

---
 examples/qmlearn.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/qmlearn.py b/examples/qmlearn.py
index 46de04e05..f503f52c9 100644
--- a/examples/qmlearn.py
+++ b/examples/qmlearn.py
@@ -236,6 +236,7 @@ def pipelines_2():

     model.fit(indices[:100])

+    # Score on the TRAINING set, since you won't get good predictions in 500 iterations
     scores = model.score(indices[:100])
     print("Negative MAE:", scores)

@@ -264,6 +265,7 @@ def pipelines_3():

     model.fit(indices[:100])

+    # Score on the TRAINING set, since you won't get good predictions in 500 iterations
     scores = model.score(indices[:100])
     print("Negative MAE:", scores)

@@ -334,12 +336,12 @@ def cross_validation():
     print("*** End CV examples ***")

 if __name__ == '__main__':
-    data()
-    preprocessing()
-    representations()
-    kernels()
-    models()
-    pipelines()
-    cross_validation()
-    pipelines_2()
+    #data()
+    #preprocessing()
+    #representations()
+    #kernels()
+    #models()
+    #pipelines()
+    #cross_validation()
+    #pipelines_2()
     pipelines_3()

From 96713c0cf4247efec1f8091096138d79612acaad Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Fri, 28 Sep 2018 11:09:16 +0100
Subject: [PATCH 39/59] Restructure. But elements are still not working

Sorted elements
---
 qml/qmlearn/models.py | 390 ++++++++++++++++++++++++------------------
 qml/utils/utils.py    |   2 +-
 2 files changed, 222 insertions(+), 170 deletions(-)

diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
index e3793e160..553774e35 100644
--- a/qml/qmlearn/models.py
+++ b/qml/qmlearn/models.py
@@ -28,13 +28,20 @@

 try:
     import tensorflow as tf
+    from ..aglaia.tf_utils import generate_weights, get_batch_size
 except ImportError:
     tf = None

-from ..utils import is_numeric_array
+from ..utils import is_numeric_array, is_numeric, is_numeric_1d_array, is_positive_integer_1d_array
+from ..utils import get_unique
 from .data import Data
 from ..math import cho_solve

+#TODO remove
+from ..utils import check_hl, check_batchsize, check_learningrate, check_iterations, check_reg, check_scoring
+def check_size(size):
+    return size
+

 class _BaseModel(BaseEstimator):
     """
     Base class for all regression models
@@ -65,7 +72,7 @@ def score(self, X, y=None):
         y_pred = self.predict(X)

         # Get the true values
-        if is_numeric_array(y):
+        if is_numeric_1d_array(y):
             pass

         elif isinstance(X, Data):
@@ -93,16 +100,14 @@

 class KernelRidgeRegression(_BaseModel):
     """
-    Standard Kernel Ridge Regression using a cholesky solver
+    Standard Kernel Ridge Regression using a Cholesky solver
+
+    :param l2_reg: l2 regularization
+    :type l2_reg: float
+    :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae')
+    :type scoring: string
     """

     def __init__(self, l2_reg=1e-10, scoring='neg_mae'):
-        """
-        :param l2_reg: l2 regularization
-        :type l2_reg: float
-        :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae')
-        :type scoring: string
-        """
         self.l2_reg = l2_reg
         self.scoring = scoring
@@ -169,217 +174,256 @@
 class NeuralNetwork(_BaseModel):
     """
     Feed forward neural network that takes molecular or atomic representations for predicting molecular properties.
+
+    :param hl1: number of hidden nodes in the 1st hidden layer
+    :type hl1: int
+    :param hl2: number of hidden nodes in the 2nd hidden layer
+    :type hl2: int
+    :param hl3: number of hidden nodes in the 3rd hidden layer
+    :type hl3: int
+    :param hl4: number of hidden nodes in the 4th hidden layer
+    :type hl4: int
+    :param batch_size: Number of samples to have in each batch during training
+    :type batch_size: int
+    :param learning_rate: Step size in optimisation algorithm
+    :type learning_rate: float
+    :param iterations: Number of iterations to do during training
+    :type iterations: int
+    :param l1_reg: L1 regularisation parameter
+    :type l1_reg: float
+    :param l2_reg: L2 regularisation parameter
+    :type l2_reg: float
+    :param scoring: What function to use for scoring. Available options are "neg_mae", "mae", "rmsd", "neg_rmsd",
+        "neg_log_mae"
+    :type scoring: string
+    :param size: Maximum number of atoms in a molecule to support. 'auto' will determine this automatically.
+        Is ignored when using molecular representations.
+    :type size: int
     """

     def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0.001, iterations=500, l1_reg=0.0,
-                 l2_reg=0.0, scoring="neg_mae"):
-        """
-        :param hl1: number of hidden nodes in the 1st hidden layer
-        :type hl1: int
-        :param hl2: number of hidden nodes in the 2nd hidden layer
-        :type hl2: int
-        :param hl3: number of hidden nodes in the 3rd hidden layer
-        :type hl3: int
-        :param hl4: number of hidden nodes in the 4th hidden layer
-        :type hl4: int
-        :param batch_size: Number of samples to have in each batch during training
-        :type batch_size: int
-        :param learning_rate: Step size in optimisation algorithm
-        :type learning_rate: float
-        :param iterations: Number of iterations to do during training
-        :type iterations: int
-        :param l1_reg: L1 regularisation parameter
-        :type l1_reg: float
-        :param l2_reg: L2 regularisation parameter
-        :type l2_reg: float
-        :param scoring: What function to use for scoring. Available options are "neg_mae", "mae", "rmsd", "neg_rmsd",
-            "neg_log_mae"
-        :type scoring: string
-        """
+                 l2_reg=0.0, scoring="neg_mae", size='auto'):

         # Check if tensorflow is available
         self.tf_check()

+        # TODO this needs to be changed due to scikit-learn reasons
         self.hl1, self.hl2, self.hl3, self.hl4 = check_hl(hl1, hl2, hl3, hl4)
         self.batch_size = check_batchsize(batch_size)
         self.learning_rate = check_learningrate(learning_rate)
         self.iterations = check_iterations(iterations)
         self.l1_reg, self.l2_reg = check_reg(l1_reg, l2_reg)
         self.scoring = check_scoring(scoring)
+        self.size = check_size(size)

+        # Will be overwritten at fit time
+        self.elements = [1,6,7,8,16]
+        self.representation_type = None

+    # TODO add removal of unused features
+    # TODO get elements
     def fit(self, X, y=None, nuclear_charges=None):
         """
-        Fit a feedforward neural network.
+        Fit the neural network. A molecular/atomic network will be fit if
+        molecular/atomic representations are given.

-        :param X: Data object
-        :type X: object from the class Data
-        :param y: Energies
+        :param X: Data object or representations
+        :type X: Data object or array
+        :param y: Energies. Only used if X is representations.
         :type y: numpy array of shape (n_samples,)
+        :param nuclear_charges: Nuclear charges. Only used if X is representations.
+        :type nuclear_charges: array
         """

-        # obtaining the representations and the energies from the data object
+        # Obtaining the representations and the energies from the data object
         if isinstance(X, Data):
-            g, zs, ene = X._representations, X.nuclear_charges[X._indices], X.energies[X._indices]
-            ene = np.reshape(ene, (ene.shape[0], 1))
+            if y is not None or nuclear_charges is not None:
+                print("Parameter 'y' and 'nuclear_charges' are expected to be 'None'",
+                      "when 'X' is a Data object")
+                raise SystemExit
+
+            representations = X._representations
+            nuclear_charges = X.nuclear_charges[X._indices]
+            # 2D for tensorflow
+            energies = np.reshape(X.energies[X._indices], (-1,1))
+            self.representation_type = X._representation_type
         else:
-            raise NotImplementedError
+            # y must be an array
+            if not is_numeric_1d_array(y):
+                print("Expected parameter 'y' to be a numeric 1d-array. Got %s" % type(y))
+                raise SystemExit
+
+            # 2D for tensorflow
+            energies = np.reshape(y, (-1,1))
+            representations = np.asarray(X)
+
+        # Check if the representations are molecular or atomic
+        if len(representations) > 0 and len(representations[0]) > 0:
+            item = representations[0][0]
+            if is_numeric(item):
+                self.representation_type = 'molecular'
+            elif is_numeric_1d_array(item):
+                self.representation_type = 'atomic'
+            else:
+                print("Could not recognise format of representations (parameter 'X')")
+                raise SystemExit
+        else:
+            print("Could not recognise format of representations (parameter 'X')")
+            raise SystemExit

-        if X._representation_type == 'molecular':
+        # Check that nuclear charges are also provided if the representation is atomic
+        if self.representation_type == 'atomic' and nuclear_charges is None:
+            print("Expected parameter 'nuclear_charges' to be a numeric array.")
+            raise SystemExit

-            self._generate_molecular_model(g.shape[-1])
-            self._train(g, ene, zs, "molecular")
+        if self.representation_type == 'atomic':
+            representations, nuclear_charges = self._padding(representations, nuclear_charges)

-        elif X._representation_type == 'atomic':
+        # Get all unique elements
+        self.elements = get_unique(nuclear_charges)

-            g, zs = self._padding(g, zs)
-            self._generate_atomic_model(g.shape[1], g.shape[-1], X.unique_elements)
-            self._train(g, ene, zs, "atomic")
+        # Generate the model
+        self._generate_model(representations.shape)

+        # Train the model
+        self._train(representations, energies, nuclear_charges)

-    def predict(self, X):
+    def predict(self, X, nuclear_charges=None):
         """
-        Function that predicts the molecular properties from a trained model.
+        Predict the molecular properties from a trained model.

         :param X: Data object
         :type X: object from class Data
+        :param nuclear_charges: Nuclear charges. Only used if X is representations.
+        :type nuclear_charges: array
         :return: predicted properties
         :rtype: numpy array of shape (n_samples,)
         """

+        if self.representation_type is None:
+            print("The model has not been fitted")
+            raise SystemExit
+
         if isinstance(X, Data):
-            g, zs = X._representations, X.nuclear_charges[X._indices]
+            if self.representation_type == 'molecular' and nuclear_charges is not None:
+                print("Parameter 'nuclear_charges' is expected to be 'None'",
+                      "when 'X' is a Data object")
+                raise SystemExit
+
+            representations, nuclear_charges = X._representations, X.nuclear_charges[X._indices]
+        else:
+            representations = X

-            if X._representation_type == "molecular":
-                y_pred = self._predict(g, zs, "molecular")
-            elif X._representation_type == "atomic":
-                g, zs = self._padding(g, zs)
-                y_pred = self._predict(g, zs, "atomic")
+            if self.representation_type == 'atomic' and nuclear_charges is None:
+                print("Parameter 'nuclear_charges' is expected to be numeric array")
+                raise SystemExit
+
+        if self.representation_type == "molecular":
+            y_pred = self._predict(representations)
+        else:
+            representations, nuclear_charges = self._padding(representations, nuclear_charges)
+            y_pred = self._predict(representations, nuclear_charges)

         return y_pred

-    def _generate_molecular_model(self, n_features):
+    # TODO support for different activation functions
+    def _generate_model(self, shape):
         """
-        This function generates the tensorflow graph for the molecular feed forward neural net.
+        Generate the Tensorflow graph for the feed forward neural net.
- :param n_features: number of features in the representation - :type n_features: int + :param shape: Shape of representations + :type shape: tuple """ - self.graph = tf.Graph() - - hidden_layers = [] - for item in [self.hl1, self.hl2, self.hl3, self.hl4]: - if item != 0: - hidden_layers.append(item) - - hidden_layers = tuple(hidden_layers) - - # Initial set up of the NN - with self.graph.as_default(): - with tf.name_scope("Data"): - ph_x = tf.placeholder(tf.float32, [None, n_features], name="Representation") - ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies") - batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size") - buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer") - - dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y)) - dataset = dataset.shuffle(buffer_size=buffer_tf) - dataset = dataset.batch(batch_size_tf) - - iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) - tf_x, tf_y = iterator.get_next() - - with tf.name_scope("Weights"): - weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) - - with tf.name_scope("Model"): - z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0]) - h = tf.sigmoid(z) - - # Calculate the activation of the remaining hidden layers - for i in range(1, len(weights) - 1): - z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i]) - h = tf.sigmoid(z) - - # Calculating the output of the last layer - y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies") + n_features = shape[-1] - with tf.name_scope("Cost_func"): - cost = self._cost(y_pred, tf_y, weights) - - with tf.name_scope("Optimiser"): - optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost) - - iter_init_op = iterator.make_initializer(dataset, name="dataset_init") - - def _generate_atomic_model(self, n_atoms, n_features, elements): - """ - This function generates the atomic feed forward neural network. 
-
-        :param n_atoms: maximum number of atoms in the molecules
-        :type n_atoms: int
-        :param n_features: number of features in the representation
-        :type n_features: int
-        :param elements: unique elements in the data set
-        :type elements: array of ints
-        """
-        self.graph = tf.Graph()
-
-        hidden_layers = []
-        for item in [self.hl1, self.hl2, self.hl3, self.hl4]:
-            if item != 0:
-                hidden_layers.append(item)
-
-        hidden_layers = tuple(hidden_layers)
-
-        # Initial set up of the NN
-        with self.graph.as_default():
-            with tf.name_scope("Data"):
-                ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation")
-                ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies")
-                ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers")
-                batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size")
-                buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer")
-
-                dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y))
-                dataset = dataset.shuffle(buffer_size=buffer_tf)
-                dataset = dataset.batch(batch_size_tf)
-
-                iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
-                tf_x, tf_zs, tf_y = iterator.get_next()
-
-            element_weights = {}
-            element_biases = {}
-
-            with tf.name_scope("Weights"):
-                for i in range(len(elements)):
-                    weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers)
-                    element_weights[elements[i]] = weights
-                    element_biases[elements[i]] = biases
-
-            with tf.name_scope("Model"):
-                all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32)
-
-                for el in elements:
-                    # Obtaining the indices of where in Zs there is the current element
-                    current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0)
-                    where_element = tf.cast(tf.where(tf.equal(tf_zs, current_element)), dtype=tf.int32)
-
-                    # Extract the descriptor corresponding to the right element
-                    current_element_in_x = tf.gather_nd(tf_x, where_element)
-
-                    # Calculate the atomic energy of all the atoms of type equal to the current element
-                    atomic_ene = self._atomic_nn(current_element_in_x, hidden_layers, element_weights[el],
-                                                 element_biases[el])
-
-                    # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies
-                    updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs))
-                    all_atomic_energies = tf.add(all_atomic_energies, updates)
+                        # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies
+                        updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs))
+                        all_atomic_energies = tf.add(all_atomic_energies, updates)

-                # Summing the energies of all the atoms
-                total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True)
+                    # Summing the energies of all the atoms
+                    y_pred = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True)

+                else:
+                    if len(hidden_layers) > 0:
+                        z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0])
+                        h = tf.sigmoid(z)
+
+                        # Calculate the activation of the remaining hidden layers
+                        for i in range(1, len(weights) - 1):
+                            z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i])
+                            h = tf.sigmoid(z)

+                    # Calculating the output of the last layer
+                    y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies")

             with tf.name_scope("Cost_func"):
-                cost = self._cost(total_energies, tf_y, element_weights)
+                cost = self._cost(y_pred, tf_y, weights)

             with tf.name_scope("Optimiser"):
                 optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)

             iter_init_op = iterator.make_initializer(dataset, name="dataset_init")

     def _atomic_nn(self, x, hidden_layer_sizes, weights, biases):
         """
@@ -417,16 +461,16 @@

         return z_squeezed

-    def _train(self, representations, energies, zs, representation_type):
+    def _train(self, representations, energies, nuclear_charges):
         """
-        This function trains the neural network.
+        Train the neural network.

         :param representations: the representations of the molecules (either atomic or molecular)
         :type representations: numpy array of shape (n_samples, max_n_atoms, n_features) or (n_samples, n_features)
         :param energies: the true molecular properties to be learnt
         :type energies: numpy array of shape (n_samples, )
-        :param zs: the nuclear charges of all the atoms in the molecules
-        :type zs: numpy array of shape (n_samples, max_n_atoms)
+        :param nuclear_charges: the nuclear charges of all the atoms in the molecules
+        :type nuclear_charges: numpy array of shape (n_samples, max_n_atoms)
         :param representation_type: flag setting whether the representation is "atomic" or "molecular".
         :type representation_type: string
         """

         batch_size = get_batch_size(self.batch_size, representations.shape[0])

         with self.graph.as_default():
             tf_x = self.graph.get_tensor_by_name("Data/Representation:0")
             tf_y = self.graph.get_tensor_by_name("Data/True_energies:0")
-            if representation_type == "atomic":
+            if self.representation_type == "atomic":
                 tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0")
             batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0")
             buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0")
@@ -450,18 +494,20 @@

             for i in range(self.iterations):

+                # Alternate buffer size to improve global shuffling (Not sure that this is needed)
                 if i % 2 == 0:
                     buff = int(3.5 * batch_size)
                 else:
                     buff = int(4.5 * batch_size)

-                if representation_type == "atomic":
-                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: zs, buffer_tf: buff,
+                if self.representation_type == "atomic":
+                    self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: nuclear_charges, buffer_tf: buff,
                                                               batch_size_tf: batch_size})
                 else:
                     self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff,
                                                               batch_size_tf:batch_size})

+                # Iterate to the end of the given data
                 while True:
                     try:
                         self.session.run(optimisation_op)
@@ -485,7 +531,7 @@
         err = tf.square(tf.subtract(y, y_pred))
         loss = tf.reduce_mean(err, name="loss")

-        if isinstance(weights, dict):
+        if self.representation_type == 'atomic':
             cost = loss
             if self.l2_reg > 0:
                 l2_loss = 0
@@ -501,10 +547,10 @@
             cost = loss
             if self.l2_reg > 0:
                 l2_loss = self._l2_loss(weights)
-                cost = cost + l2_loss
+                cost += l2_loss
             if self.l1_reg > 0:
                 l1_loss = self._l1_loss(weights)
-                cost = cost + l1_loss
+                cost += l1_loss

         return cost
@@ -542,26 +588,24 @@

         return self.l1_reg * reg_term

-    def _predict(self, representation, zs, representation_type):
+    def _predict(self, representations, nuclear_charges=None):
         """
-        This function predicts the molecular properties from the representations.
+        Predicts the molecular properties from the representations.

-        :param representation: representation of the molecules, can be atomic or molecular
-        :type representation: numpy array of shape (n_samples, n_max_atoms, n_features) or (n_samples, n_features)
-        :param zs: nuclear charges of the molecules
-        :type zs: numpy array of shape (n_samples, n_max_atoms)
-        :param representation_type: flag saying whether the representation is the "molecular" or "atomic"
-        :type representation_type: string
+        :param representations: representations of the molecules, can be atomic or molecular
+        :type representations: numpy array of shape (n_samples, n_max_atoms, n_features) or (n_samples, n_features)
+        :param nuclear_charges: nuclear charges of the molecules. Only required for atomic representations.
+        :type nuclear_charges: numpy array of shape (n_samples, n_max_atoms)
         :return: molecular properties predictions
         :rtype: numpy array of shape (n_samples,)
         """

-        batch_size = get_batch_size(self.batch_size, representation.shape[0])
+        batch_size = get_batch_size(self.batch_size, representations.shape[0])

         with self.graph.as_default():
             tf_x = self.graph.get_tensor_by_name("Data/Representation:0")
             tf_y = self.graph.get_tensor_by_name("Data/True_energies:0")
-            if representation_type == "atomic":
+            if self.representation_type == "atomic":
                 tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0")
             batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0")
             buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0")
             iter_init_op = self.graph.get_operation_by_name("dataset_init")

             y_pred = self.graph.get_tensor_by_name("Model/Predicted_energies:0")

-            if representation_type == "atomic":
-                self.session.run(iter_init_op, feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
-                                                          tf_zs: zs, buffer_tf: 1, batch_size_tf: batch_size})
+            if self.representation_type == "atomic":
+                self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: np.empty((representations.shape[0], 1)),
+                                                          tf_zs: nuclear_charges, buffer_tf: 1, batch_size_tf: batch_size})
             else:
                 self.session.run(iter_init_op,
-                                 feed_dict={tf_x: representation, tf_y: np.empty((representation.shape[0], 1)),
+                                 feed_dict={tf_x: representations, tf_y: np.empty((representations.shape[0], 1)),
                                             batch_size_tf: batch_size, buffer_tf:1})

             tot_y_pred = []

+            # Predict until reaching the end of the iterator
             while True:
                 try:
                     ene_predictions = self.session.run(y_pred)
@@ -605,6 +650,13 @@
             if n_atoms > max_n_atoms:
                 max_n_atoms = n_atoms

+        if self.size == 'auto':
+            self.size = max_n_atoms
+
+        elif max_n_atoms > self.size:
+            print("Trying to predict on larger molecules than given by the 'size' parameter at initialization")
+            raise SystemExit
+
         padded_rep = np.zeros((len(representation), max_n_atoms, representation[0].shape[1]))
         padded_zs = np.zeros((len(representation), max_n_atoms))

diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index b694e4f4b..31638c55f 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -104,7 +104,7 @@ def get_unique(x):
     Gets all unique elements in lists of lists
     """
     elements = list(set(item for l in x for item in l))
-    return elements
+    return sorted(elements)

 def get_pairs(x):
     """

From f313effffcc6d6a25b6b4ff1c4803eab6498918b Mon Sep 17 00:00:00 2001
From: larsbratholm
Date: Fri, 28 Sep 2018 13:24:19 +0100
Subject: [PATCH 40/59] Moved documentation from init to class

---
 qml/qmlearn/representations.py | 94 ++++++++++++++++------------------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/qml/qmlearn/representations.py b/qml/qmlearn/representations.py
index 5cece8918..fb9c13bde 100644
--- a/qml/qmlearn/representations.py
+++ b/qml/qmlearn/representations.py
@@ -153,48 +153,44 @@ class _AtomicRepresentation(_BaseRepresentation):
 class CoulombMatrix(_MolecularRepresentation):
     """
     Coulomb Matrix representation as described in 10.1103/PhysRevLett.108.058301
+    Sorting of the elements can either be done by ``sorting="row-norm"`` or ``sorting="unsorted"``.
+    A matrix :math:`M` is constructed with elements

-    """
-
-    _representation_short_name = "cm"
+    .. math::

-    def __init__(self, data=None, size='auto', sorting="row-norm"):
-        """
-        Coulomb Matrix representation of a molecule.
- Sorting of the elements can either be done by ``sorting="row-norm"`` or ``sorting="unsorted"``. - A matrix :math:`M` is constructed with elements + M_{ij} = + \\begin{cases} + \\tfrac{1}{2} Z_{i}^{2.4} & \\text{if } i = j \\\\ + \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} & \\text{if } i \\neq j + \\end{cases}, - .. math:: + where :math:`i` and :math:`j` are atom indices, :math:`Z` is nuclear charge and + :math:`\\bf R` is the coordinate in euclidean space. + If ``sorting = 'row-norm'``, the atom indices are reordered such that - M_{ij} = - \\begin{cases} - \\tfrac{1}{2} Z_{i}^{2.4} & \\text{if } i = j \\\\ - \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} & \\text{if } i \\neq j - \\end{cases}, + :math:`\\sum_j M_{1j}^2 \\geq \\sum_j M_{2j}^2 \\geq ... \\geq \\sum_j M_{nj}^2` - where :math:`i` and :math:`j` are atom indices, :math:`Z` is nuclear charge and - :math:`\\bf R` is the coordinate in euclidean space. - If ``sorting = 'row-norm'``, the atom indices are reordered such that + The upper triangular of M, including the diagonal, is concatenated to a 1D + vector representation. - :math:`\\sum_j M_{1j}^2 \\geq \\sum_j M_{2j}^2 \\geq ... \\geq \\sum_j M_{nj}^2` + If ``sorting = 'unsorted``, the elements are sorted in the same order as the input coordinates + and nuclear charges. - The upper triangular of M, including the diagonal, is concatenated to a 1D - vector representation. + The representation is calculated using an OpenMP parallel Fortran routine. - If ``sorting = 'unsorted``, the elements are sorted in the same order as the input coordinates - and nuclear charges. + :param size: The size of the largest molecule supported by the representation. + `size='auto'` will try to determine this automatically. + :type size: integer + :param sorting: How the atom indices are sorted ('row-norm', 'unsorted') + :type sorting: string + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + """ - The representation is calculated using an OpenMP parallel Fortran routine. + _representation_short_name = "cm" - :param size: The size of the largest molecule supported by the representation. - `size='auto'` will try to determine this automatically. - :type size: integer - :param sorting: How the atom indices are sorted ('row-norm', 'unsorted') - :type sorting: string - :param data: Optional Data object containing all molecules used in training \ - and/or prediction - :type data: Data object - """ + def __init__(self, data=None, size='auto', sorting="row-norm"): self.size = size self.sorting = sorting @@ -632,28 +628,26 @@ def transform(self, X): class AtomCenteredSymmetryFunctions(_AtomicRepresentation): """ The variant of Atom-Centered Symmetry Functions used in 10.1039/C7SC04934J + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param nbasis: Number of basis functions to use + :type nbasis: integer + :param cutoff: Cutoff radius + :type cutoff: float + :param precision: Precision in the basis functions. A value of 2 corresponds to the \ + basis functions intersecting at half maximum. Higher values makes \ + the basis functions narrower. + :type precision: float + :param elements: Atomnumber of elements that the representation should support. + `elements='auto'` will try to determine this automatically. 
+ :type elements: list """ _representation_short_name = "acsf" alchemy = False def __init__(self, data=None, nbasis=3, precision=2, cutoff=5.0, elements='auto'): - """ - :param data: Optional Data object containing all molecules used in training \ - and/or prediction - :type data: Data object - :param nbasis: Number of basis functions to use - :type nbasis: integer - :param cutoff: Cutoff radius - :type cutoff: float - :param precision: Precision in the basis functions. A value of 2 corresponds to the \ - basis functions intersecting at half maximum. Higher values makes \ - the basis functions narrower. - :type precision: float - :param elements: Atomnumber of elements that the representation should support. - `elements='auto'` will try to determine this automatically. - :type elements: list - """ self._set_data(data) self.nbasis = nbasis @@ -712,9 +706,9 @@ def transform(self, X): representations = [] for charge, xyz, n in zip(nuclear_charges, coordinates, natoms): - representations.append( + representations.append(np.asarray( fgenerate_acsf(xyz, charge, self.elements, Rs, Rs, Ts, - eta, eta, zeta, self.cutoff, self.cutoff, n, size)) + eta, eta, zeta, self.cutoff, self.cutoff, n, size))) data._representations = np.asarray(representations) From 20b78b8a9e116a0cded016994b57e4bb64c19885 Mon Sep 17 00:00:00 2001 From: larsbratholm Date: Fri, 28 Sep 2018 13:25:06 +0100 Subject: [PATCH 41/59] Constant features will now be removed at fit/predict time --- qml/qmlearn/models.py | 81 +++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index 553774e35..8b948f798 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -217,11 +217,10 @@ def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0 self.size = check_size(size) # Will be overwritten at fit time - self.elements = [1,6,7,8,16] - self.representation_type = None + self.elements = None + self._representation_type = None + self._constant_features = None - # TODO add removal of unused features - # TODO get elements def fit(self, X, y=None, nuclear_charges=None): """ Fit the neural network. 
A molecular/atomic network will be fit if
@@ -246,7 +245,7 @@ def fit(self, X, y=None, nuclear_charges=None):
             nuclear_charges = X.nuclear_charges[X._indices]
             # 2D for tensorflow
             energies = np.reshape(X.energies[X._indices], (-1,1))
-            self.representation_type = X._representation_type
+            self._representation_type = X._representation_type
         else:
             # y must be an array
             if not is_numeric_1d_array(y):
@@ -261,9 +260,9 @@ def fit(self, X, y=None, nuclear_charges=None):
             if len(representations) > 0 and len(representations[0]) > 0:
                 item = representations[0][0]
                 if is_numeric(item):
-                    self.representation_type = 'molecular'
+                    self._representation_type = 'molecular'
                 elif is_numeric_1d_array(item):
-                    self.representation_type = 'atomic'
+                    self._representation_type = 'atomic'
                 else:
                     print("Could not recognise format of representations (parameter 'X')")
                     raise SystemExit
@@ -272,21 +271,49 @@ def fit(self, X, y=None, nuclear_charges=None):
                 raise SystemExit
 
         # Check that nuclear charges are also provided if the representation is atomic
-        if self.representation_type == 'atomic' and nuclear_charges is None:
+        if self._representation_type == 'atomic' and nuclear_charges is None:
             print("Expected parameter 'nuclear_charges' to be a numeric array.")
             raise SystemExit
 
-        if self.representation_type == 'atomic':
+        if self._representation_type == 'atomic':
             representations, nuclear_charges = self._padding(representations, nuclear_charges)
 
         # Get all unique elements
         self.elements = get_unique(nuclear_charges)
 
+        # Remove constant features
+        representations = self._remove_constant_features(representations)
+
         # Generate the model
         self._generate_model(representations.shape)
 
         # Train the model
         self._train(representations, energies, nuclear_charges)
 
+    def _remove_constant_features(self, representations):
+        """
+        Removes constant features of the representations, and stores
+        which features should be removed at predict time.
+        """
+
+        if self._representation_type == 'atomic':
+            # Due to how the atomic neural network is constructed,
+            # this cannot be done elementwise
+            rep = representations.reshape(-1, representations.shape[-1])
+            if self._constant_features is None:
+                self._constant_features = np.all(rep == rep[0], axis=0)
+
+            rep = rep[:,~self._constant_features]
+            rep = rep.reshape(representations.shape[0:2] + (-1,))
+
+            return rep
+
+        else:
+            if self._constant_features is None:
+                self._constant_features = np.all(representations == representations[0], axis=0)
+
+            return representations[:,~self._constant_features]
+
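A minimal numpy sketch of the constant-feature masking added above (illustrative
only; the array name, shapes and values are made up):

    import numpy as np

    reps = np.random.rand(10, 5, 4)            # (n_samples, n_atoms, n_features)
    reps[:, :, 2] = 1.0                        # make one feature constant
    flat = reps.reshape(-1, reps.shape[-1])    # merge the sample and atom axes
    mask = np.all(flat == flat[0], axis=0)     # True where a column never varies
    trimmed = flat[:, ~mask].reshape(reps.shape[0:2] + (-1,))
    assert trimmed.shape == (10, 5, 3)         # the constant column is gone

The mask is computed once, on the first call at fit time, and reused at predict
time, so training and prediction representations keep matching columns.

     def predict(self, X, nuclear_charges=None):
         """
         Predict the molecular properties from a trained model. 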
@@ -299,12 +326,12 @@ def predict(self, X, nuclear_charges=None): :rtype: numpy array of shape (n_samples,) """ - if self.representation_type is None: + if self._representation_type is None: print("The model have not been fitted") raise SystemExit if isinstance(X, Data): - if self.representation_type == 'molecular' and nuclear_charges is not None: + if self._representation_type == 'molecular' and nuclear_charges is not None: print("Parameter 'nuclear_charges' is expected to be 'None'", "when 'X' is a Data object") raise SystemExit @@ -313,15 +340,17 @@ def predict(self, X, nuclear_charges=None): else: representations = X - if self.representation_type == 'atomic' and nuclear_charges is None: + if self._representation_type == 'atomic' and nuclear_charges is None: print("Parameter 'nuclear_charges' is expected to be numeric array") raise SystemExit - if self.representation_type == "molecular": - y_pred = self._predict(representations) - else: + if self._representation_type == "atomic": representations, nuclear_charges = self._padding(representations, nuclear_charges) - y_pred = self._predict(representations, nuclear_charges) + + # Remove constant features + representations = self._remove_constant_features(representations) + + y_pred = self._predict(representations, nuclear_charges) return y_pred @@ -349,7 +378,7 @@ def _generate_model(self, shape): with self.graph.as_default(): with tf.name_scope("Data"): ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies") - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': n_atoms = shape[1] ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers") ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation") @@ -359,7 +388,7 @@ def _generate_model(self, shape): batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size") buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer") - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y)) else: dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y)) @@ -369,13 +398,13 @@ def _generate_model(self, shape): iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': tf_x, tf_zs, tf_y = iterator.get_next() else: tf_x, tf_y = iterator.get_next() with tf.name_scope("Weights"): - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': weights = {} biases = {} @@ -387,7 +416,7 @@ def _generate_model(self, shape): weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) with tf.name_scope("Model"): - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32) for el in self.elements: @@ -480,7 +509,7 @@ def _train(self, representations, energies, nuclear_charges): with self.graph.as_default(): tf_x = self.graph.get_tensor_by_name("Data/Representation:0") tf_y = self.graph.get_tensor_by_name("Data/True_energies:0") - if self.representation_type == "atomic": + if self._representation_type == "atomic": tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0") batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0") buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0") @@ -500,7 +529,7 @@ def _train(self, representations, energies, nuclear_charges): else: 
buff = int(4.5 * batch_size) - if self.representation_type == "atomic": + if self._representation_type == "atomic": self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: nuclear_charges, buffer_tf: buff, batch_size_tf: batch_size}) else: @@ -531,7 +560,7 @@ def _cost(self, y_pred, y, weights): err = tf.square(tf.subtract(y, y_pred)) loss = tf.reduce_mean(err, name="loss") - if self.representation_type == 'atomic': + if self._representation_type == 'atomic': cost = loss if self.l2_reg > 0: l2_loss = 0 @@ -605,7 +634,7 @@ def _predict(self, representations, nuclear_charges=None): with self.graph.as_default(): tf_x = self.graph.get_tensor_by_name("Data/Representation:0") tf_y = self.graph.get_tensor_by_name("Data/True_energies:0") - if self.representation_type == "atomic": + if self._representation_type == "atomic": tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0") batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0") buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0") @@ -613,7 +642,7 @@ def _predict(self, representations, nuclear_charges=None): y_pred = self.graph.get_tensor_by_name("Model/Predicted_energies:0") - if self.representation_type == "atomic": + if self._representation_type == "atomic": self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: np.empty((representations.shape[0], 1)), tf_zs: nuclear_charges, buffer_tf: 1, batch_size_tf: batch_size}) else: From 4b785f18422d38cc8dd416c9e779c72092040b8c Mon Sep 17 00:00:00 2001 From: larsbratholm Date: Fri, 28 Sep 2018 15:07:43 +0100 Subject: [PATCH 42/59] Moved get_batch_size back into utils, since it doesn't depend on tf --- qml/aglaia/tf_utils.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/qml/aglaia/tf_utils.py b/qml/aglaia/tf_utils.py index f51b929f6..5dc73aea5 100644 --- a/qml/aglaia/tf_utils.py +++ b/qml/aglaia/tf_utils.py @@ -120,16 +120,3 @@ def generate_weights(n_in, n_out, hl): return weights, biases -def get_batch_size(batch_size, n_samples): - - if batch_size > n_samples: - print("Warning: batch_size larger than sample size. 
It is going to be clipped") - return min(n_samples, batch_size) - - # see if the batch size can be modified slightly to make sure the last batch is similar in size - # to the rest of the batches - # This is always less that the requested batch size, so no memory issues should arise - - better_batch_size = ceil(n_samples, ceil(n_samples, batch_size)) - return better_batch_size - From 8d620840ccb284dc7a1b5be050e50a8c2752d964 Mon Sep 17 00:00:00 2001 From: larsbratholm Date: Fri, 28 Sep 2018 15:09:48 +0100 Subject: [PATCH 43/59] Made the NeuralNetwork class compliant with sklearn Cannot be any transforms of the input data --- examples/qmlearn.py | 16 ++++---- qml/qmlearn/models.py | 81 +++++++++++++++++++++++++++++++--------- qml/utils/utils.py | 87 +++++++++++++++++-------------------------- 3 files changed, 106 insertions(+), 78 deletions(-) diff --git a/examples/qmlearn.py b/examples/qmlearn.py index f503f52c9..135ea4628 100644 --- a/examples/qmlearn.py +++ b/examples/qmlearn.py @@ -336,12 +336,12 @@ def cross_validation(): print("*** End CV examples ***") if __name__ == '__main__': - #data() - #preprocessing() - #representations() - #kernels() - #models() - #pipelines() - #cross_validation() - #pipelines_2() + data() + preprocessing() + representations() + kernels() + models() + pipelines() + cross_validation() + pipelines_2() pipelines_3() diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index 8b948f798..f4152eeb1 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -28,20 +28,16 @@ try: import tensorflow as tf - from ..aglaia.tf_utils import generate_weights, get_batch_size + from ..aglaia.tf_utils import generate_weights except ImportError: tf = None from ..utils import is_numeric_array, is_numeric, is_numeric_1d_array, is_positive_integer_1d_array -from ..utils import get_unique +from ..utils import get_unique, is_positive_integer_or_zero, get_batch_size, is_positive_integer +from ..utils import is_positive, is_positive_or_zero from .data import Data from ..math import cho_solve -#TODO remove -from ..utils import check_hl, check_batchsize, check_learningrate, check_iterations, check_reg, check_scoring -def check_size(size): - return size - class _BaseModel(BaseEstimator): """ @@ -98,6 +94,12 @@ def score(self, X, y=None): elif self.scoring == 'neg_log_mae': return - np.log(mean_absolute_error(y, y_pred)) + def _set_scoring(self, scoring): + if not scoring in ['mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae']: + raise InputError("Unknown scoring function") + + self.scoring = scoring + class KernelRidgeRegression(_BaseModel): """ Standard Kernel Ridge Regression using a Cholesky solver @@ -207,20 +209,66 @@ def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0 # Check if tensorflow is available self.tf_check() - # TODO this needs to be changed due to scikit-learn reasons - self.hl1, self.hl2, self.hl3, self.hl4 = check_hl(hl1, hl2, hl3, hl4) - self.batch_size = check_batchsize(batch_size) - self.learning_rate = check_learningrate(learning_rate) - self.iterations = check_iterations(iterations) - self.l1_reg, self.l2_reg = check_reg(l1_reg, l2_reg) - self.scoring = check_scoring(scoring) - self.size = check_size(size) + self._set_hl(hl1, hl2, hl3, hl4) + self._set_batch_size(batch_size) + self._set_learning_rate(learning_rate) + self._set_iterations(iterations) + self._set_reg(l1_reg, l2_reg) + self._set_scoring(scoring) + self._set_size(size) # Will be overwritten at fit time self.elements = None self._representation_type = None 
self._constant_features = None
 
+    def _set_hl(self, hl1, hl2, hl3, hl4):
+
+        for hl in hl1, hl2, hl3, hl4:
+            if not is_positive_integer_or_zero(hl):
+                print("Error: Expected the number of hidden nodes in a layer to be a positive integer or zero. Got %s" % str(hl))
+                raise SystemExit
+
+        self.hl1 = hl1
+        self.hl2 = hl2
+        self.hl3 = hl3
+        self.hl4 = hl4
+
+    def _set_batch_size(self, bs):
+        if not is_positive_integer(bs) or int(bs) == 1:
+            print("'batch_size' should be larger than 1.")
+            raise SystemExit
+
+        self.batch_size = bs
+
+    def _set_size(self, size):
+        if not is_positive_integer(size) and size != 'auto':
+            print("Variable 'size' should be a positive integer. Got %s" % str(size))
+            raise SystemExit
+
+        self.size = size
+
+    def _set_learning_rate(self, lr):
+        if not is_positive(lr):
+            print("Expected positive float value for variable 'learning_rate'. Got %s" % str(lr))
+            raise SystemExit
+
+        self.learning_rate = lr
+
+    def _set_iterations(self, it):
+        if not is_positive_integer(it):
+            print("Expected positive integer value for variable 'iterations'. Got %s" % str(it))
+            raise SystemExit
+
+        self.iterations = it
+
+    def _set_reg(self, l1_reg, l2_reg):
+        if not is_positive_or_zero(l1_reg) or not is_positive_or_zero(l2_reg):
+            raise InputError("Expected positive float value for regularisation variables 'l1_reg' and 'l2_reg'. Got %s and %s" % (str(l1_reg), str(l2_reg)))
+
+        self.l1_reg = l1_reg
+        self.l2_reg = l2_reg
+
     def fit(self, X, y=None, nuclear_charges=None):
         """
         Fit the neural network. A molecular/atomic network will be fit if
@@ -370,7 +417,7 @@ def _generate_model(self, shape):
             hidden_layers = []
             for item in [self.hl1, self.hl2, self.hl3, self.hl4]:
                 if item != 0:
-                    hidden_layers.append(item)
+                    hidden_layers.append(int(item))
                 else:
                     break
 
diff --git a/qml/utils/utils.py b/qml/utils/utils.py
index 31638c55f..10197d328 100644
--- a/qml/utils/utils.py
+++ b/qml/utils/utils.py
@@ -64,8 +64,10 @@ def is_numeric_1d_array(x):
 def is_1d_array(x):
     return is_array_like(x) and (np.asarray(x).ndim == 1 or np.asarray(x).ndim == 2 and 1 in np.asarray(x).shape)
 
+# Doesn't accept floats e.g. 1.0
 def _is_integer(x):
-    return (is_numeric(x) and (float(x) == int(x)))
+    return isinstance(x, int)
+    #return (is_numeric(x) and (float(x) == int(x)))
 
 # will intentionally accept 0, 1 as well
 def is_bool(x):
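A quick sketch of the stricter check's behaviour (illustrative; note that
whole-valued floats no longer pass, and numpy integer types would also be
rejected by isinstance(x, int)):

    _is_integer(2)      # True
    _is_integer(2.0)    # False (previously True)
    _is_integer("2")    # False

@@ -99,25 +101,6 @@ def is_positive_integer_array(x):
 def is_positive_integer_or_zero_array(x):
     return (_is_integer_array(x) and _is_positive_or_zero_array(x))
 
-def get_unique(x):
-    """
-    Gets all unique elements in lists of lists
-    """
-    elements = list(set(item for l in x for item in l))
-    return sorted(elements)
-
-def get_pairs(x):
-    """
-    Get all unique pairs. E.g. 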
x = [1,2,3] will return
-    [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]]
-    """
-    pairs = []
-    for i,v in enumerate(x):
-        for w in x[i:]:
-            pairs.append([v,w])
-    return pairs
-
-
 # ------------- ** Checking inputs ** --------------------------
 
 def check_global_representation(x):
@@ -270,42 +253,26 @@ def check_classes(classes):
 
     return approved_classes
 
-def check_hl(hl1, hl2, hl3, hl4):
-
-    layers = [hl1, hl2, hl3, hl4]
-    approved_layers = []
-
-    for item in layers:
-        is_positive_integer(item)
-        approved_layers.append(int(item))
-
-    return approved_layers[0], approved_layers[1], approved_layers[2], approved_layers[3]
-
-def check_batchsize(bs):
-    is_positive_integer(bs)
-    if bs == 1:
-        raise InputError("Batch size should be larger than 1.")
-    return int(bs)
+# ------------ ** Utility functions ** ----------------
 
-def check_learningrate(lr):
-    if not is_positive(lr):
-        raise InputError("Expected positive float value for variable learning_rate. Got %s" % str(lr))
-    return float(lr)
-
-def check_iterations(it):
-    if not is_positive_integer(it):
-        raise InputError("Expected positive integer value for variable iterations. Got %s" % str(it))
-    return int(it)
+def get_unique(x):
+    """
+    Gets all unique elements in lists of lists
+    """
+    elements = list(set(item for l in x for item in l))
+    return sorted(elements)
 
-def check_reg(l1_reg, l2_reg):
-    if not is_positive_or_zero(l1_reg) or not is_positive_or_zero(l2_reg):
-        raise InputError("Expected positive float value for regularisation variables 'l1_reg' and 'l2_reg. Got %s and %s" % (str(l1_reg), str(l2_reg)))
-    return float(l1_reg), float(l2_reg)
+def get_pairs(x):
+    """
+    Get all unique pairs. E.g. x = [1,2,3] will return
+    [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]]
+    """
+    pairs = []
+    for i,v in enumerate(x):
+        for w in x[i:]:
+            pairs.append([v,w])
+    return pairs
 
-def check_scoring(scoring):
-    if not scoring in ['mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae']:
-        raise InputError("Unknown scoring function")
-    return scoring
 
 # Custom exception to raise when we intentionally catch an error
 # This way we can test that the right error was raised in test cases
@@ -323,3 +290,17 @@ def ceil(a, b):
     """
 
     return -(-a//b)
+
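+# Illustrative worked example of the logic below: with n_samples=100 and
+# batch_size=32, ceil(100, 32) = 4 batches, and ceil(100, 4) = 25, so four
+# even batches of 25 are used instead of 32 + 32 + 32 + 4.
+def get_batch_size(batch_size, n_samples):
+
+    if batch_size > n_samples:
+        print("Warning: batch_size larger than sample size. 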
It is going to be clipped") + return min(n_samples, batch_size) + + # see if the batch size can be modified slightly to make sure the last batch is similar in size + # to the rest of the batches + # This is always less that the requested batch size, so no memory issues should arise + + better_batch_size = ceil(n_samples, ceil(n_samples, batch_size)) + return better_batch_size + From d8447e7f9d9e69229053e240acee7f43c050ab0e Mon Sep 17 00:00:00 2001 From: larsbratholm Date: Fri, 28 Sep 2018 15:23:36 +0100 Subject: [PATCH 44/59] Fixed tests that didn't pass --- test/test_neural_network.py | 10 +++++----- test/test_symm_funct.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_neural_network.py b/test/test_neural_network.py index 21ec5318b..798b076ac 100644 --- a/test/test_neural_network.py +++ b/test/test_neural_network.py @@ -107,7 +107,6 @@ def catch(s): # This should not raise an exception C(batch_size = 2) - C(batch_size = 2.0) C(batch_size = "auto") # This should be caught @@ -116,6 +115,7 @@ def catch(s): catch("x") catch(4.2) catch(None) + catch(2.0) def learning_rate(C): # Exceptions that are supposed to be caught @@ -146,13 +146,13 @@ def catch(s): # This should not raise an exception C(iterations = 1) - C(iterations = 1.0) # This should be caught catch(-2) catch("x") catch(4.2) catch(None) + catch(1.0) def tf_dtype(C): # Exceptions that are supposed to be caught @@ -193,7 +193,6 @@ def catch(s): # This should not raise an exception C(hl1 = 1) - C(hl1 = 1.0) # This should be caught catch(0) @@ -201,6 +200,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def hl2(C): # Exceptions that are supposed to be caught @@ -213,7 +213,6 @@ def catch(s): # This should not raise an exception C(hl2 = 1) - C(hl2 = 1.0) C(hl2 = 0) # This should be caught @@ -221,6 +220,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def hl3(C): # Exceptions that are supposed to be caught @@ -233,7 +233,6 @@ def catch(s): # This should not raise an exception C(hl2 = 2, hl3 = 1) - C(hl2 = 2, hl3 = 1.0) C(hl2 = 2, hl3 = 0) # This should be caught @@ -241,6 +240,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def representation(C): # Exceptions that are supposed to be caught diff --git a/test/test_symm_funct.py b/test/test_symm_funct.py index 6fa92c7ce..7a941d3ed 100644 --- a/test/test_symm_funct.py +++ b/test/test_symm_funct.py @@ -106,9 +106,9 @@ def fort_acsf(mols, path, elements): def tf_acsf(mols, path, elements): radial_cutoff = 5 angular_cutoff = 5 - radial_rs = np.linspace(0, radial_cutoff, 3) - angular_rs = np.linspace(0, angular_cutoff, 3) - theta_s = np.linspace(0, np.pi, 3) + n_radial_rs = 3 + n_angular_rs = 3 + n_theta_s = 3 zeta = 1.0 eta = 1.0 @@ -128,7 +128,7 @@ def tf_acsf(mols, path, elements): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, radial_rs, angular_rs, theta_s, zeta, eta) + acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta) sess = tf.Session() sess.run(tf.global_variables_initializer()) From a8f6062216913877c2abdcab03d1b75e8679da12 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Fri, 28 Sep 2018 17:13:05 +0100 Subject: [PATCH 45/59] Fixed mistake in 
checks of set_classes() in ARMP --- qml/aglaia/aglaia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 7c4337695..8429e7e0e 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -770,10 +770,10 @@ def set_classes(self, classes): if classes is None: raise InputError("Classes cannot be set to none.") else: - if is_positive_integer_array(classes): + if is_positive_integer_or_zero_array(classes): self.classes = np.asarray(classes) else: - raise InputError('Variable "gradients" expected to be array like of positive integers.') + raise InputError('Variable "classes" expected to be array like of positive integers.') def fit(self, x, y=None, dy=None, classes=None): """ From d4d98b6405d8ccb14ffde86236f507956defe51b Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 1 Oct 2018 14:10:47 +0100 Subject: [PATCH 46/59] started fixing ARMP bugs for QM7 --- qml/aglaia/aglaia.py | 112 +++++++++++-------------------------------- 1 file changed, 27 insertions(+), 85 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 8429e7e0e..dbbdf263b 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -702,7 +702,7 @@ def generate_representation(self, xyz=None, classes=None, method='fortran'): elif xyz is None: # Make representations from compounds - self.representation, self.classes = self._generate_representations_from_compounds() + self.representation, self.classes = self._generate_representations_from_compounds(method) else: raise InputError("Compounds have already been set but new xyz data is being passed.") @@ -1647,13 +1647,13 @@ def _generate_representations_from_data(self, xyz, classes, method): elif self.representation_name == 'acsf': if method == 'tf': - representation = self._generate_acsf_from_data_tf(xyz, classes) + representation = self._generate_acsf_tf(xyz, classes) else: - representation = self._generate_acsf_from_data_fortran(xyz, classes) + representation = self._generate_acsf_fortran(xyz, classes) return representation, classes - def _generate_acsf_from_data_tf(self, xyz, classes): + def _generate_acsf_tf(self, xyz, classes): """ This function generates the acsf from the cartesian coordinates and the classes. @@ -1733,7 +1733,7 @@ def _generate_acsf_from_data_tf(self, xyz, classes): return representation_conc - def _generate_acsf_from_data_fortran(self, xyz, classes): + def _generate_acsf_fortran(self, xyz, classes): """ This function uses fortran to generate the representation and the derivative of the representation with respect to the cartesian coordinates. @@ -1788,7 +1788,7 @@ def _generate_acsf_from_data_fortran(self, xyz, classes): return np.asarray(representation) - def _generate_representations_from_compounds(self): + def _generate_representations_from_compounds(self, method): """ This function generates the representations from the compounds. :return: the representations and the classes @@ -1804,22 +1804,26 @@ def _generate_representations_from_compounds(self): elif self.representation_name == 'acsf': - representations, classes = self._generate_acsf_from_compounds() + xyz, zs = self._extract_and_pad() + if method == "tf": + representations, classes = self._generate_acsf_tf(xyz, zs) + elif method == "fortran": + representations, classes = self._generate_acsf_fortran(xyz, zs) + else: + raise InputError("Method not recognised.") else: raise InputError("This should never happen, unrecognised representation %s." 
% (self.representation_name)) return representations, classes - def _generate_acsf_from_compounds(self): + def _extract_and_pad(self): """ - This function generates the atom centred symmetry functions. - - :return: representation acsf and classes - :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) + This function extracts cartesian coordinates and nuclear charges for the compounds and pads them. + :return: xyz and zs padded + :rtype: numpy arrays of shape (n_samples, max_n_atoms, 3) and (n_samples, max_n_atoms) """ - - # Obtaining the xyz and the nuclear charges + # Obtaining the xyz and the nuclear charges from the compounds xyzs = [] zs = [] max_n_atoms = 0 @@ -1830,80 +1834,18 @@ def _generate_acsf_from_compounds(self): if len(compound.nuclear_charges) > max_n_atoms: max_n_atoms = len(compound.nuclear_charges) - elements, element_pairs = self._get_elements_and_pairs(zs) + n_samples = len(xyzs) - # Padding so that all the samples have the same shape - n_samples = len(zs) - for i in range(n_samples): - current_n_atoms = len(zs[i]) - missing_n_atoms = max_n_atoms - current_n_atoms - zs_padding = np.zeros(missing_n_atoms) - zs[i] = np.concatenate((zs[i], zs_padding)) - xyz_padding = np.zeros((missing_n_atoms, 3)) - xyzs[i] = np.concatenate((xyzs[i], xyz_padding)) + # Padding the xyz and the nuclear charges + padded_xyz = np.zeros((n_samples, max_n_atoms, 3)) + padded_zs = np.zeros((n_samples, max_n_atoms)) - zs = np.asarray(zs, dtype=np.int32) - xyzs = np.asarray(xyzs, dtype=np.float32) - - if self.tensorboard: - self.tensorboard_logger_representation.initialise() - # run_metadata = tf.RunMetadata() - # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - - # Turning the quantities into tensors - with tf.name_scope("Inputs"): - zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") - xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - - dataset = tf.data.Dataset.from_tensor_slices((xyz_tf, zs_tf)) - dataset = dataset.batch(20) - iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) - batch_xyz, batch_zs = iterator.get_next() - - representations = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, - rcut=self.acsf_parameters['rcut'], - acut=self.acsf_parameters['acut'], - nRs2=self.acsf_parameters['nRs2'], - nRs3=self.acsf_parameters['nRs3'], - nTs=self.acsf_parameters['nTs'], - eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) - - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - sess.run(iterator.make_initializer(dataset), feed_dict={xyz_tf: xyzs, zs_tf: zs}) - - representations_slices = [] - - if self.tensorboard: - self.tensorboard_logger_representation.set_summary_writer(sess) - - batch_counter = 0 - while True: - try: - representations_np = sess.run(representations, options=self.tensorboard_logger_representation.options, - run_metadata=self.tensorboard_logger_representation.run_metadata) - self.tensorboard_logger_representation.write_metadata(batch_counter) - - representations_slices.append(representations_np) - batch_counter += 1 - except tf.errors.OutOfRangeError: - break - else: - batch_counter = 0 - while True: - try: - representations_np = sess.run(representations) - representations_slices.append(representations_np) - batch_counter += 1 - except tf.errors.OutOfRangeError: - break - - representation_conc = np.concatenate(representations_slices, 
axis=0) - - sess.close() + for i in range(n_samples): + current_n_atoms = xyzs[i].shape[0] + padded_xyz[i, :current_n_atoms, :] = xyzs[i] + padded_zs[i, :current_n_atoms] = zs[i] - return representation_conc, zs + return padded_xyz, padded_zs def _generate_slatm_from_compounds(self): """ From 2b22aa34c8882aded86fed2fe4729127668fdce6 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Mon, 1 Oct 2018 16:19:06 +0100 Subject: [PATCH 47/59] Fixed bug in padding and added examples that give low errors --- docs/source/qml_examples/examples.ipynb | 110 ++++++++++++++++-------- examples/ARMP_1.py | 5 +- examples/ARMP_2.py | 2 +- examples/ARMP_3.py | 2 +- examples/ARMP_qm7.py | 41 +++++++++ qml/aglaia/aglaia.py | 10 +-- 6 files changed, 127 insertions(+), 43 deletions(-) create mode 100644 examples/ARMP_qm7.py diff --git a/docs/source/qml_examples/examples.ipynb b/docs/source/qml_examples/examples.ipynb index caad1725f..48181bb14 100644 --- a/docs/source/qml_examples/examples.ipynb +++ b/docs/source/qml_examples/examples.ipynb @@ -458,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -471,6 +471,7 @@ ], "source": [ "from qml.aglaia.aglaia import ARMP\n", + "import matplotlib.pyplot as plt\n", "import glob\n", "import numpy as np\n", "import os\n", @@ -488,17 +489,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we can create the estimator and specify all the hyper-parameters needed. For example, below we define an estimator that will do 100 training epochs, will use the Atom Centered Symmetry Functions (ACSF) as the representation, and will have a L1 regularisation parameter on the weights of 0.001." + "Then, we can create the estimator and specify all the hyper-parameters needed. For example, below we define an estimator that will do 5000 training epochs, will use the Atom Centered Symmetry Functions (ACSF) as the representation, and will have a L1 and L2 regularisation parameters on the weights of 0.0 and a learning rate of 0.075." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "estimator = ARMP(iterations=100, representation='acsf', representation_params={\"radial_rs\": np.arange(0,10, 1), \"angular_rs\": np.arange(0.5, 10.5, 1),\n", - "\"theta_s\": np.arange(0, 3.14, 1)}, l1_reg=0.001, scoring_function=\"rmse\")" + "acsf_params = {\"nRs2\": 5, \"nRs3\": 5, \"nTs\": 5, \"rcut\": 5, \"acut\": 5, \"zeta\": 220.127, \"eta\": 30.8065}\n", + "estimator = ARMP(iterations=5000, representation_name='acsf', representation_params=acsf_params, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")" ] }, { @@ -510,20 +511,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(100, 19, 270)\n" + "(100, 19, 165)\n" ] } ], "source": [ "estimator.generate_compounds(filenames)\n", - "estimator.generate_representation()\n", + "estimator.generate_representation(method=\"fortran\")\n", "print(estimator.representation.shape)" ] }, @@ -538,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -549,16 +550,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "At this point, we have stored 100 data samples in the class. Let's say that we want to train on 80% of the data and test on 20%. 
To do this we need to pass a list of indices to the fit method specifying on which samples we would like to train. This can be done as follows:" + "At this point, we have stored 100 data samples in the class. To train the model, we need to specify the indices of the samples we want to train on. Here, since we have only loaded 100 samples, we are going to use all of them." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "idx_train = np.arange(0,80)\n", + "idx_train = np.arange(0,100)\n", "\n", "estimator.fit(idx_train)" ] @@ -572,24 +573,37 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 74 kJ/mol\n" + "The RMSE is 0.05667379826437678 kJ/mol\n" ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAEKCAYAAAA8QgPpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XucXVV99/HPl0mA4aIDEmiuTcA0NohcnEIQ64OiJKFIoi9pQ/UxYmoeKxSRNkAKT/GCF4wVpLXaKGhURCjGECgSI0JpbROZkEC4pYSAMAmSKCQoTCGXX//Y6yRnJmfOnDlzrjPf9+t1Xmfvtdc565cNM7/Za+29liICMzOz/tqn3gGYmVlzcgIxM7OyOIGYmVlZnEDMzKwsTiBmZlYWJxAzMyuLE4iZmZXFCcTMzMriBGJmZmUZVu8Aqumwww6L8ePH1zsMM7OmsmrVql9HxIi+6g3qBDJ+/Hg6OjrqHYaZWVOR9MtS6rkLy8zMyuIEYmZmZXECMTOzsjiBmJlZWZxAzMysLIP6Liwzs6FmyeqNLFi2jk1buxjV1sq8qZOYefzoqrTlBGJmNkgsWb2R+YvX0rV9JwAbt3Yxf/FagKokkap3YUm6XtJmSQ8VOPY3kkLSYWlfkq6VtF7Sg5JOyKs7W9Lj6TW72nGbmTWbBcvW7U4eOV3bd7Jg2bqqtFeLMZBvA9N6FkoaC7wLeDqveDowMb3mAl9LdQ8FrgBOAk4ErpB0SFWjNjNrMpu2dvWrfKCqnkAi4l7g+QKHrgYuBiKvbAbwncisANokjQSmAssj4vmIeAFYToGkZGY2lI1qa+1X+UDV5S4sSWcBGyPigR6HRgPP5O13prLeys3MLJk3dRKtw1u6lbUOb2He1ElVaa/mg+iSDgAuA04vdLhAWRQpL/T9c8m6vxg3blyZUZqZNZ/cQPlgvgvrKGAC8IAkgDHA/ZJOJLuyGJtXdwywKZWf2qP8nkJfHhELgYUA7e3tBZOMmdlgNfP40VVLGD3VvAsrItZGxOERMT4ixpMlhxMi4lfAUuCD6W6sKcC2iHgWWAacLumQNHh+eiozM7M6qcVtvDcC/wVMktQpaU6R6ncAG4D1wDeAjwFExPPAZ4D70uvTqczMzOpEEYO3l6e9vT28HoiZWf9IWhUR7X3V81xYZmZWFicQMzMrixOImZmVxQnEzMzK4gRiZmZlcQIxM7OyOIGYmVlZnEDMzKwsTiBmZlYWJxAzMyuLE4iZmZXFCcTMzMriBGJmZmVxAjEzs7I4gZiZWVmcQMzMrCy1WJHwekmbJT2UV7ZA0mOSHpT0I0ltecfmS1ovaZ2kqXnl01LZekmXVjtuMzMrrhZXIN8GpvUoWw68MSLeBPw3MB9A0mRgFnB0+sw/SWqR1AJ8FZgOTAbOSXXNzKxOqp5AIuJe4PkeZT+JiB1pdwUwJm3PAH4QEa9ExJNka6OfmF7rI2JDRLwK/CDVNTOzOmmEMZAPAz9O26OBZ/KOdaay3srNzKxO6ppAJF0G7ABuyBUVqBZFygt951xJHZI6tmzZUplAzcxsL3VLIJJmA2cC74+IXDLoBMbmVRsDbCpSvpeIWBgR7RHRPmLEiMoHbmZmQJ0SiKRpwCXAWRHxct6hpcAsSftJmgBMBH4B3AdMlDRB0r5kA+1Lax23mZntMazaDUi6ETgVOExSJ3AF2V1X+wHLJQGsiIiPRsTDkm4GHiHr2jovInam7zkfWAa0ANdHxMPVjt3MzHqnPb1Hg097e3t0dHTUOwwzs6YiaVVEtPdVr88rEEntwB8Do4Au4CHgpxHxfNEPmpnZoNbrGIikD0m6n6y7qRVYB2wG3krW9bRI0rjahGlmZo2m2BXIgcApEdFV6KCk48gGuZ+uRmBmZtbYek0gEfHVYh+MiDWVD8fMzJpFrwlE0rXFPhgRF1Q+HDMzaxbFurBW1SwKMzNrOsW6sBbl70s6OCuO31U9KjMza3h9Poku6Y2SVpPdvvuIpFWSjq5+aGZm1shKmcpkIXBRRPx+RIwD/hr4RnXDMjOzRldKAjkwIu7O7UTEPWS3+JqZ2RBWylxYGyT9f+C7af8DwJPVC8nMzJpBKVcgHwZGAIuBH6Xtc6sZlJmZNb4+r0Ai4gXAz3yYmVk3pU6m+LfA+Pz6EfGm6oVlZmaNrpQxkBuAecBaYFd1wzEzs2ZRSgLZEhFe/c/MzLopZRD9CknflHSOpPfmXqU2IOl6SZslPZRXdqik5ZIeT++HpHJJulbSekkPSjoh7zOzU/3H03rqZmZWR6UkkHOB44BpwLvT68x+tPHt9Nl8lwJ3RcRE4K60DzCdbIr4icBc4GuQJRyypXBPAk4kS2qH9CMGMzOrsFK6sI6NiGPKbSAi7pU0vkfxDLJ10gEWAfcAl6Ty70S2zu4KSW2SRqa6y3OrIEpaTpaUbiw3LjMzG5hSrkBWSJpc4XaPiIhnAdL74al8NPBMXr3OVNZbuZmZ1UkpVyBvBWZLehJ4BRDZrLzVuI1XBcqiSPneXyDNJev+Ytw4r7hrZlYtpSSQnuMXlfCcpJER8WzqotqcyjuBsXn1xgCbUvmpPcrvKfTFEbGQbAJI2tvbCyYZMzMbuF67sCR1SPoK8IfAcxHxy/zXANtdCuTupJoN3JpX/
sF0N9YUYFvq4loGnC7pkDR4fnoqMzOzOil2BTKFrPtqGvApSb8h+6X944j471IbkHQj2dXDYZI6ye6m+gJws6Q5wNPA2an6HcAZwHrgZdKcWxHxvKTPAPelep/ODaibmVl9KLvhqYSKWVfTdLKEMhH4r4j4WBVjG7D29vbo6OiodxhmZk1F0qqIaO+rXiljIMDuu6WuB66XtA9w8gDiMzOzJtdrApF0G73c6UR2N9YTkp6OiGd6qWNmZoNYsSuQL/XxuaOBm/GViJnZkNRrAomIfwOQ9OaIWJV/TNK7I+JaSZ7S3cxsiCrlSfRvSNo9lYmkc4DLASLiL6oVmJmZNbZSBtHfB9wi6f1kt/V+kOw5DDMzG8JKWdJ2g6RZwBKy+ahOj4iuqkdmZmYNrdhdWGvpfhfWoUALsFKSl7Q1Mxviil2B9GfNDzMzG2KKJZDfRMTvin1Y0kF91TEzs8Gp2F1Yt0r6e0lvk3RgrlDSkZLmSFpGdWbqNTOzJlDsOZDTJJ0B/D/glDQL7g5gHfCvwOyI+FVtwjQzs0ZT9C6siLiDbIZcMzOzbkp5kNDMzGwvTiBmZlYWJxAzMytLnwlE0lGS9kvbp0q6QFJbJRqX9AlJD0t6SNKNkvaXNEHSSkmPS7pJ0r6p7n5pf306Pr4SMZiZWXlKuQL5IbBT0uuB64AJwPcH2rCk0cAFQHtEvJHsKfdZwFXA1RExEXgBmJM+Mgd4ISJeD1yd6pmZWZ2UkkB2RcQO4D3ANRHxCWBkhdofBrRKGgYcADwLvAO4JR1fBMxM2zPSPun4aZJUoTjMzKyfSkkg29MU7rOB21PZ8IE2HBEbyRateposcWwDVgFbU8IC6ARGp+3RZJM5ko5vA1430DjMzKw8pSSQc8lWHfxsRDwpaQLwvYE2nB5MnEHWJTYKOBCYXqBqbkLHQlcbey25K2mupA5JHVu2bBlomGZm1os+E0hEPAJcAtyf9p+MiC9UoO13Ak9GxJaI2A4sBt4CtKUuLYAxwKa03QmMBUjHXws8XyDehRHRHhHtI0aMqECYZmZWSCl3Yb0bWAPcmfaPk7S0Am0/DUyRdEAayzgNeAS4m2wRK8i6zW5N20vTPun4zyJirysQMzOrjVK6sD4JnAhsBYiINWTdTgMSESvJBsPvB9amWBaSXe1cJGk92RjHdekj1wGvS+UXAZcONAYzMytfKUva7oiIbT1ueKrIX/4RcQVwRY/iDWQJq2fd/wHOrkS7ZmY2cKUkkIck/TnQImki2bMb/1ndsMzMrNGV0oX1V8DRwCvAjcCLwIXVDMrMzBpfn1cgEfEycFl6mZmZAUUSiKRrIuJCSbdRYMwjIs6qamRmVtCS1RtZsGwdm7Z2MaqtlXlTJzHz+NF9f9CswopdgXw3vX+pFoGYWd+WrN7I/MVr6dq+E4CNW7uYv3gtgJOI1VyxJW1Xpfd/q104ZlbMgmXrdiePnK7tO1mwbJ0TiNVcn2MgktaydxfWNqADuDIiflONwMxsb5u2dvWr3KyaSrmN98fATvZM4T6LbF6qbcC3gXdXJTIz28uotlY2FkgWo9pa6xCNDXWl3MZ7SkTMj4i16XUZ8H8i4ipgfHXDM7N886ZOonV4S7ey1uEtzJs6qU4R2VBWSgI5SNJJuR1JJwIHpd0dhT9iZtUw8/jRfP69xzC6rRUBo9ta+fx7j/H4h9VFKV1Yc4BvScoljd8CcyQdCHy+apGZWUEzjx/thGENoWgCkbQPcGREHCPptYAiYmtelZurGp2ZmTWsol1YEbELOD9tb+uRPMzMbAgrZQxkuaS/kTRW0qG5V9UjMzOzhlbKGMiH0/t5eWUBHFn5cMzMrFmUMpnigBePMjOzwaeUJW0PkHS5pIVpf6KkMyvRuKQ2SbdIekzSo5JOTl1kyyU9nt4PSXUl6VpJ6yU9KOmESsRgZmblKWUM5FvAq8Bb0n4ncGWF2v8KcGdEvAE4FniUbKnauyJiInAXe5aunQ5MTK+5wNcqFIOZmZWhlARyVER8EdgOEBFdZFOZDIik1wBvI615HhGvpru8ZgCLUrVFwMy0PQP4TmRWAG2SRg40DjMzK08pCeRVSa2kCRUlHUW2OuFAHQlsIXtIcbWkb6aHE4+IiGcB0vvhqf5o4Jm8z3emMjMzq4NSEsgVwJ3AWEk3kHUrXVyBtocBJwBfi4jjgZfY011VSKGrnr0WupI0V1KHpI4tW7ZUIEwzMyukzwQSEcuB9wIfIlsTvT0i7qlA251AZ0SsTPu3kCWU53JdU+l9c179sXmfHwNsKhDvwohoj4j2ESNGVCBMMzMrpJQrEID9gReAF4HJkt420IYj4lfAM5Jy04ieBjwCLAVmp7LZwK1peynwwXQ31hRgW66ry8zMaq+UBaWuAv4MeBjYlYoDuLcC7f8VcIOkfYENwLlkSe1mSXOAp4GzU907gDOA9cDLqa6ZmdVJKU+izwQmRUQlBs67iYg1QHuBQ6cVqBt0fxrezMzqqJQurA3A8GoHYmZmzaWUK5CXgTWS7iLv9t2IuKBqUZmZWcMrJYEsTS8zM7Pdek0gkl4TES9GxKICx8ZVNywzM2t0xcZA7sltpO6rfEuqEo2ZmTWNYgkk/8nvngtIDXguLDMza27FEkj0sl1o38zMhphig+iHS7qI7Gojt03a9xwhZmZDXLEE8g3g4ALbAN+sWkRmZtYUek0gEfGpWgZiZmbNpdTJFM3MzLpxAjEzs7I4gZiZWVmKPYl+UW/HACLiy5UPx8zMmkWxu7Byd11NAv6IPfNhvZvKrAViZmZNrM+7sCT9BDghIn6b9j8J/EtNojMzs4ZVyhjIOODVvP1XgfGVCkBSi6TVkm5P+xMkrZT0uKSb0mqFSNov7a9PxysWg5mZ9V8pCeS7wC8kfVLSFcBK4DsVjOHjwKN5+1cBV0fERLJ12Oek8jnACxHxeuDqVM/MzOqkzwQSEZ8lW3/8BWArcG5EfK4SjUsaA/wJ6cl2SQLeAdySqiwiW1IXYEbaJx0/LdU3M7M6KPU23gOAFyPiK0CnpAkVav8a4GJgV9p/HbA1Inak/U5gdNoeDTwDkI5vS/XNzKwO+kwgqdvqEmB+KhoOfG+gDUs6E9gcEavyiwtUjRKO5X/vXEkdkjq2bNky0DDNzKwXpVyBvAc4C3gJICI20X1ixXKdApwl6SngB2RdV9cAbZJyd4eNATal7U5gLEA6/lrg+Z5fGhELI6I9ItpHjPCkwWZm1VJKAnk1IoL0176kAyvRcETMj4gxETEemAX8LCLeD9wNvC9Vmw3cmraXpn3S8Z+luMzMrA5KSSA3S/pnsiuDjwA/pbrTuV8CXCRpPdkYx3Wp/Drgdan8IuDSKsZgZmZ9UCl/xEt6F3A62TjEsohYXu3AKqG9vT06OjrqHYaZWVORtCoi2vuqV2wqk9wXXRURlwDLC5SZmdkQ1WcCAd5F1q2Ub3qBMrMBuXzJWm5c+Qw7I2iROOeksVw585h6h2VmvSg2G+9fAh8DjpL0YN6hg4H/rHZgNrRcvmQt31vx9O79nRG7951EzBpTsUH075PNvHtr
es+93pzuljKrmBtXPtOvcjOrv2Kz8W4Dtkn6CvB83my8B0s6KSJW1ipIG3yWrN7IgmXr2LS1i1Ftrezs5WaO3srNrP5KuY33a8Dv8vZfSmVmZVmyeiPzF69l49YuAti4tavXui2e7sysYZWSQJT/wF5E7KK0wXezghYsW0fX9p0l1T3npLFVjsbMylVKAtkg6QJJw9Pr48CGagdmg9emEq44WiQ+MGWcB9DNGlgpVxIfBa4FLiebzuQuYG41g7LBbVRba8Fuq9Ftrfz80nfUISIzK0cp64FsjohZEXF4RBwREX8eEZtrEZwNTvOmTqJ1eEu3stbhLcybOqlOEZlZOYo9B3JxRHxR0j9QYNr0iLigqpHZoDXz+GyJl/y7sOZNnbS73MyaQ7EurNwys55Myipu5vGjnTDMmlyx50BuS++LeqtjZmZDV7EurNso0HWVExFnVSUiMzNrCsW6sL6U3t8L/B57lrE9B3iqijGZmVkTKNaF9W8Akj4TEW/LO3SbpHurHpmZmTW0Uh4kHCHpyNyOpAnAgBcblzRW0t2SHpX0cHpAEUmHSlou6fH0fkgql6RrJa2X9KCkEwYag5mZla+UBPIJ4B5J90i6h2zN8gsr0PYO4K8j4g+BKcB5kiaTLVV7V0RMJHtoMbd07XRgYnrNxfNxmZnVVZ9PokfEnZImAm9IRY9FxCsDbTgingWeTdu/lfQoMBqYAZyaqi0C7iFbvGoG8J00L9cKSW2SRqbvMTOzGuvzCkTSAcA84PyIeAAYJ+nMSgYhaTxwPLASOCKXFNL74anaaCB/cYjOVGZmZnVQShfWt4BXgZPTfidwZaUCkHQQ8EPgwoh4sVjVAmV73WYsaa6kDkkdW7ZsqVSYZmbWQykJ5KiI+CKwHSAiuij8y7zfJA0nSx43RMTiVPycpJHp+EggN+9WJ5A/t/cYYFPP74yIhRHRHhHtI0YMeKzfzMx6UUoCeVVSK+mvfUlHAQMeA5Ek4Drg0Yj4ct6hpcDstD2bbEndXPkH091YU4BtHv8wM6ufUqZzvwK4Exgr6QbgFOBDFWj7FOD/AmslrUllfwt8AbhZ0hzgaeDsdOwO4AxgPfAycG4FYjAzszIVTSDpKuExsqfRp5B1XX08In490IYj4j/ovSvstAL1AzhvoO3aHj3XJfeMuGbWH0UTSESEpCUR8WbgX2sUk9VAbl3y3NKyG7d2MX/xWgAnETMrSSljICsk/VHVI7GaKrQuedf2nSxYtq5OEZlZsyllDOTtwEclPQW8RNbtFBHxpmoGZtXV27rkxdYrNzPLV0oCmV71KKzmeluXfFRbax2iMbNm1GsXlqT9JV1I9hT6NGBjRPwy96pZhFYVXpfczAaq2BXIIrKHB/+d7CpkMvDxWgRl1ed1yc1soIolkMkRcQyApOuAX9QmJKsVr0tuZgNRLIFsz21ExI7skRBrZH6uw8xqqVgCOVZSbnJDAa1pP3cX1muqHp2VZMnqjXzqtod54eXdOd/PdZhZ1RVb0ralt2PWON715Xt4fPNLBY/lnutwAjGzaijlNl5rQJcvWcv3VjzdZz0/12Fm1eIE0mSWrN7IRTetYVeJ9f1ch5lVixNIEyn1qiPHz3WYWTU5gTSBy5es5YaVTxN7rb/Yu7bW4XzyrKM9/mFmVeME0sCWrN7IZT9ay0uv7uy7cp6Jhx/I8otOrU5QZmaJE0iD6m93Vc4pRx3KDR85ue+KZmYD5ATSgN7/jf/i508836/P+KrDzGqt6RKIpGnAV4AW4JsR8YU6h1QxxZ7p6I2A908Zx5Uzj6lOUGZmvWiqBCKpBfgq8C6gE7hP0tKIeKS+kQ3MktUbufCmNX1XzHPA8H343Hvf5EFyM6ubpkogwInA+ojYACDpB8AMoGkTSDlXHR/wFYeZNYBmSyCjgWfy9juBk/IrSJoLzAUYN25c7SLrp3KuOvYRfPlPj/NVh5k1hGZLIIWmBO72dERELAQWArS3t/fjyYnaedMVd/LiK/27NXffFvHF9x3r5GFmDaPZEkgnMDZvfwywqU6x9Fs5d1eBb801s8bUbAnkPmCipAnARmAW8Of1Dak0J312Oc/99tV+fUbA1X/mLisza0xNlUDSwlbnA8vIbuO9PiIernNYRZV71eGBcjNrdE2VQAAi4g7gjnrHUYpyxjr2bxGPffaMKkVkZlY5TZdAmkG5Vx3DhJOHmTUNJ5AKKjdxgKciMbPm4wRSIeV0VwG8Zr8WHvzUtCpEZGZWXU4gFfCGy+7gf3b2/5ETX3WYWTNzAhmAcqdcd+Iws8HACaRM5cxhBXCNn+sws0HCCaQMly9Z2+/k4asOMxtsnEDKcOPKZ/qulAzbR3zpbM9hZWaDjxNICZas3siCZevYtLWLUW2t7IzSBsw9h5WZDWZOIH1Ysnoj8xevpWt7dovuxq1dfX7Gt+aa2VCwT70DaHQLlq3bnTxK8YEp45w8zGxI8BVIAZcvWcuNK5/ps6uqRWJnBC0S55w01pMfmtmQ4gTSQ6nPdoxua+Xnl76jBhGZmTUmd2H1UModVq3DW5g3dVINojEza1y+AumhWLeVgFFtrcybOsm35ZrZkOcE0kNuXKNQ+ROf91TrZmY5denCkrRA0mOSHpT0I0ltecfmS1ovaZ2kqXnl01LZekmXViu2c04a269yM7Ohql5jIMuBN0bEm4D/BuYDSJpMts750cA04J8ktUhqAb4KTAcmA+ekuhV35cxj+MCUcbRIQHbl4eVlzcz2VpcurIj4Sd7uCuB9aXsG8IOIeAV4UtJ64MR0bH1EbACQ9INU95FqxHflzGOcMMzM+tAId2F9GPhx2h4N5N8G1ZnKeivfi6S5kjokdWzZsqUK4ZqZGVTxCkTST4HfK3Dosoi4NdW5DNgB3JD7WIH6QeFEV/B2qYhYCCwEaG9v7/8qT2ZmVpKqJZCIeGex45JmA2cCp0Xsvu2pE8gfrR4DbErbvZWbmVkd1OsurGnAJcBZEfFy3qGlwCxJ+0maAEwEfgHcB0yUNEHSvmQD7UtrHbeZme1Rr+dA/hHYD1iu7G6nFRHx0Yh4WNLNZIPjO4DzImIngKTzgWVAC3B9RDxcn9DNzAxAUeLaFs1I0hbgl0WqHAb8ukbh9EejxgWNG1ujxgWNG1ujxgWOrRyVjOv3I2JEX5UGdQLpi6SOiGivdxw9NWpc0LixNWpc0LixNWpc4NjKUY+4GuE2XjMza0JOIGZmVpahnkAW1juAXjRqXNC4sTVqXNC4sTVqXODYylHzuIb0GIiZmZVvqF+BmJlZmYZEAmnk6eMLxFqXdlPbYyXdLelRSQ9L+ngqP1TSckmPp/dDUrkkXZtifVDSCVWOr0XSakm3p/0JklamuG5KD5mSHkS9KcW1UtL4KsfVJumW9P/Yo5JObqBz9on03/IhSTdK2r9e503S9ZI2S3oor6zf50nS7FT/8TSjRTXiaojfGYViyzv2N5JC0mFpv2bnbLeIGPQv4HRgWNq+CrgqbU8GHiB7qHEC8ATZg4otaftIYN9UZ3IN4qxLu3ntjwROSNsHk021Pxn4InBpKr8
07/ydQTYRpoApwMoqx3cR8H3g9rR/MzArbX8d+Mu0/THg62l7FnBTleNaBPxF2t4XaGuEc0Y24eiTQGve+fpQvc4b8DbgBOChvLJ+nSfgUGBDej8kbR9Shbga4ndGodhS+ViyB6t/CRxW63O2O45q/c/bqC/gPcANaXs+MD/v2DLg5PRallferV4VY6tLu0XiuRV4F7AOGJnKRgLr0vY/A+fk1d9drwqxjAHuAt4B3J5+SH6d90O++9zl/jum7WGpnqoU12vIfkmrR3kjnLPcLNaHpvNwOzC1nucNGE/3X9T9Ok/AOcA/55V3q1epuHocq+vvjEKxAbcAxwJPsSeB1PScRcTQ6MLqoaLTx1dYvdrdS+q+OB5YCRwREc8CpPfDU7VaxnsNcDGwK+2/DtgaETsKtL07rnR8W6pfDUcCW4Bvpe61b0o6kAY4ZxGxEfgS8DTwLNl5WEVjnLec/p6nevyMNNTvDElnARsj4oEeh2oe26BJIJJ+mvp5e75m5NUpdfr43sqrrV7tdg9COgj4IXBhRLxYrGqBsorHK+lMYHNErCqx7Vqex2FkXQxfi4jjgZfIumJ6U7PY0njCDLKullHAgWSrevbWfkP8/5c0xM9mo/3OkHQAcBnwd4UO9xJD1WKr12SKFReDY/r4YvHUhKThZMnjhohYnIqfkzQyIp6VNBLYnMprFe8pwFmSzgD2J+s2ugZokzQs/bWc33Yurk5Jw4DXAs9XIa5cW50RsTLt30KWQOp9zgDeCTwZEVsAJC0G3kJjnLec/p6nTuDUHuX3VCOwBv2dcRTZHwQPKJuIdgxwv6QTi8RWvXNWyf7NRn2Rra/+CDCiR/nRdB8Q20A2GDYsbU9gz4DY0TWIsy7t5rUv4DvANT3KF9B9oPOLaftP6D5o94saxHgqewbR/4Xug8EfS9vn0X0w+OYqx/TvwKS0/cl0vup+zoCTgIeBA1J7i4C/qud5Y+8xkH6dJ7LxnCfJBoMPSduHViGuhvmd0TO2HseeYs8YSE3PWcQQGUQH1pP1Aa5Jr6/nHbuM7O6JdcD0vPIzyO5CeoJsFcVaxVqXdlPbbyW7tH0w71ydQdYPfhfweHo/NNUX8NUU61qgvQYxnsqeBHIk2Xox69Mvxf1S+f5pf306fmSVYzoO6EjnbUn6IW2IcwZ8CngMeAj4bvrFV5fzBtxINhazneyv4jnlnCeyMYn16XVuleJqiN8ZhWLrcfwp9iSQmp2z3MtPopuZWVkGzSC6mZnVlhOImZmVxQnEzMzK4gRiZmZlcQIxM7OyOIFYw5L0Oklr0utXkjbm7e9bwXbeKWlb3nevkfT2Sn1/L222SPr3Cn3XP0qDQ68zAAAEUUlEQVR6S9ruzJ85NpUNk7S1R9lySb83wHZfL2lN2j5O0jcH8n3WfAbNk+g2+ETEb8iesUDSJ4HfRcSX8usoexxXEbFr72/ol7sjYuYAv6ObvKe99xIRO4E/rkAbI4DjI+L8fnzmQODgiPjVQNvPiYg1ko6SNDqyObhsCPAViDWd9JfvQ5K+DtwPjM3/C1vSrNxfw5KOkLRYUoekX0iaUkY71ylbU+PHkvZPxyZKWiZplaR7Jf1BKv+epL+XdDfwOUmHS7pL0v2S/ildRbX1vCqQdGmK70FJf5fKDk5tPpDieF+BMM9mz0R/+bEfIOknks4t8Jl3AD9L9TolfVbSCkn3STohfe4JSR9JdfaR9OUUw9pe4oBstt8/K/X8WvNzArFmNRm4LrIJDIv9xXst2fQY7cCfAr11s7y9RxfW+FQ+iWxql6OBLiB3lbKQbAqQN5NN3f2Ped91FNn8SRcDnwbujIgTgDvIJjXsJs3xNY5s6pHjgLekLqkzgKci4tiIeCOwvEDcp5DNsJvvYLJf5osi4lsFPjMduDNv/6mImAKsAK4jm778LcBn0vGzyc73sWTT+18t6XD21kEFrqqsebgLy5rVExFxXwn13glMShPPARwiqTUiunrU26sLS9LrgfURsTYVrQLGpzGGKcAP8743/2fpX/K61N4KfBYgIm6X9NsCMZ5O9kt9ddo/CPgDsqn0vyDpC8BtEfHzAp8dSTadfL7bgc9FxE0F6pNivyBvf2l6X0u2TshLwEuSdimbmfmtwPdTt9uvJP0H0E42bUe+zRRIkDZ4OYFYs3opb3sX3aes3j9vW8CJEfFqme28kre9k+xnRsCvI+K4EmIrNJV2TwKujIjr9jogtZNdiSyQdHtEfK5HlS66/3sBfg5Ml3Rz9JirSNIkshl688dmcv/GXXT/9+5iz7+3FPuneGyIcBeWNb301/4LaVxiH7IumJyfks0yC2R3C1WgvReAZyW9J33nPpKO7aX6f5B1neW6qg4uUGcZMCcNbiNpjKTDJI0mu3Hgu8CXydYd6elR4PU9yv6WLIldW6D+NAqMmfThXmBWunPsCLJus44C9f6AbNJGGyKcQGywuISsX/8usllLc84DTkmD048AH+nl8z3HQN7TS72cWcBHJT1ANmX6mb3UuwL4E0n3kw1eP0f3KxQi4g6ydURWSFpLtmb5QWRjDvelW2UvBnpefQD8K93Xesg5H3itpM+RXUXkriym0X38oxS3kM3o+wBZQr4oIjYXqPf2FI8NEZ6N16yK0l1bOyJih6S3kg3It1fw+0V2lTM9elk9UtKbgX8gS2D3RsSJlWo/r41W4G7glDRWYkOAE4hZFUl6A9maDi1kVwEfje5L81aijZOB30bEXt1Hks4juwq7ICJ+Wsl2e7QziWx983ur1YY1HicQMzMri8dAzMysLE4gZmZWFicQMzMrixOImZmVxQnEzMzK4gRiZmZl+V8IEgkW813TcQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "idx_test = np.arange(80, 100)\n", + "score = estimator.score(idx_train)\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", "\n", - "score = estimator.score(idx_test)\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "energies_predict = estimator.predict(idx_train)\n", "\n", - "energies_predict = estimator.predict(idx_test)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -603,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +630,7 @@ "zs = data[\"arr_1\"]\n", "energies = data[\"arr_2\"]\n", "\n", - "estimator = ARMP(iterations=100, l2_reg=0.0, scoring_function=\"rmse\")\n", + "estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")\n", "\n", "estimator.set_representations(representations=representation)\n", "estimator.set_classes(zs)\n", @@ -627,34 +641,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the same way as we did before, now we can use indices to specify on which samples to train, predict and score the model." + "In the same way as we did before, now we can use indices to specify on which samples to train, predict and score the model. Again, since we only have loaded 100 samples, we will train and predict on the same sets." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 50 kJ/mol\n" + "The RMSE is 0.30888228285160746 kJ/mol\n" ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XuUXXV99/H3J8MgE0AnaKBkIE8gxriklESnEI3tEpFrBYKPFyhUij4iT6FIsREitMQKQomote1jGwqWakS5hDFQIAIVaalBEiZhCJiWm5AJQrgEkExhMvk+f+x9wpnJOXv2ZM515vNaa9ac/Tv7nP3NWTP5zv5dvj9FBGZmZuVMqHcAZmbW2JwozMwskxOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmXaqdwCV8I53vCOmTZtW7zDMzJrKqlWrno+IycOdNyYSxbRp01i5cmW9wzAzayqSfpXnPHc9mZlZJicKMzPL5ERhZmaZnCjMzCyTE4WZmWUaE7OezMzGm67uXhYtX8eGTX1MaW9j/pEzmTe7oyrXcqIwM2syXd29LFjaQ1//AAC9m/pYsLQHoCrJwl1PZmZNZtHydduSREFf/wCLlq+ryvWcKMzMmsyGTX0jah8tJwozsyYzpb1tRO2j5URhZtZk5h85k7bWlkFtba0tzD9yZlWu58FsM7MmUxiw9qwnMzMra97sjqolhqHc9WRmZpmcKMzMLJMThZmZZXKiMDOzTE4UZmaWyYnCzMwyOVGYmVkmJwozM8tUk0Qh6WpJz0l6qKhtD0l3SPrv9PuktF2Svi3pUUkPSnpvLWI0M7PSanVH8c/AUUPazgfuiogZwF3pMcDRwIz063TgOzWK0czMSqhJooiIe4AXhzQfD1yTPr4GmFfU/i+RWAG0S9q7FnGamdn26jlGsVdEPAOQft8zbe8Ani46b33aNoik0yWtlLRy48aNVQ/WzGy8asTBbJVoi+0aIhZHRGdEdE6ePLkGYZmZjU/1TBTPFrqU0u/Ppe3rgX2LztsH2FDj2MzMLFXPRLEMODV9fCrw46L2T6ezn+YALxe6qMzMrPZqsh+FpGuBDwHvkLQeuAi4DLhO0meBp4BPpKffChwDPApsBk6rRYxmZlZaTRJFRJxU5qnDSpwbwJnVjcjMzPJqxMFsMzNrIE4UZmaWyYnCzMwyOVGYmVkmJwozM8vkRGFmZpmcKMzMLJMThZmZZXKiMDOzTE4UZmaWyYnCzMwyDVvrSVIn8HvAFKAPeAi4MyKG7lhnZtZQurp7WbR8HRs29TGlvY35R85k3uzt9kGzYZRNFJL+GDgbeAJYBawDdgE+CJwn6SHgLyLiqRrEaWY2Ihd29bBkxVPbdj3r3dTHgqU9AE4WI5R1R7ErMDci+ko9KWkWMIOkRLiZWcPo6u4dlCQK+voHWLR8nRPFCJVNFBHx91kvjIjVlQ/HzGz0Fi1ft/3+yakNm0r+7WsZsrqevp31wog4u/LhmJmNTKlxiKxkMKW9rYbRjQ1ZXU+rahaFmdkO6OruZcHSHvr6B4A3xyHaJ7by0ub+7c4XMP/ImTWOsvlldT1dU3wsafekOX5T9ajMzIbR1d3LF69bw0AM7mTq6x/gLTtNoK21ZVsCgSRJnDxnqscndsCw6ygk/bakbpJpsQ9LWiXpgNFeWNJMSauLvl6RdI6khZJ6i9qPGe21zGxsKdxJDE0SBS/39XPpxw6ko70NAR3tbXzzU7O4eN6BtQ10jMizZ/Zi4NyI+CmApA8BVwIfGM2FI2IdMCt9zxagF7gJOA34ZkR8fTTvb2ZjT2E8oneYAekp7W3Mm93hu4cKyZModi0kCYCIuFvSrhWO4zDgsYj4laQKv7WZjQVDxyPKaWtt8ThEheVJFI9L+gvge+nxKSSL8CrpRODaouOzJH0aWAl8MSJeqvD1zKwJdHX3snDZWjb1bT8wXUqLxKUfO9B3EhWWp9bTZ4DJwFKSrqHJJN1DFSFpZ+A44Pq06TvAdJJuqWeAK8q87nRJKyWt3LhxY6XCMbMG0dXdy/zr1+ROEm2tLVzxyYOcJKpg2DuK9K/5aq6ZOBp4ICKeTa/3bOEJSVcCt5SJazHJ+AmdnZ3l1taYWZNatHwd/Vvz/Wp3uI5TVeUtCvhlYFrx+RHxOxWK4SSKup0k7R0Rz6SHJ5DMtjKzceDCrh5+cN9T5MwPtLW2uKupBvKMUSwB5gM9wNZKXlzSROBw4PNFzZendaQCeHLIc2Y2BnV19/KlG9bwxkD+zgHfRdROnkSxMSKWVePiEbEZePuQtj+qxrXMrDF1dfcy/4Y19OdMEq0tYtHHPRZRS3kSxUWS/gm4C3i90BgRS6sWlZmNaclU1wfp6x9ZJ8Wkia1cdOwBThI1lidRnAa8G2jlza6nIJkFZWY2Ihd29fD9FSPbnaCjvY17z/9wlSKy4eRJFAdFhNe9m9modXX3jjhJTMCF/OotzzqKFZLeU/VIzGzMW7R83YjOb50A3/jULHc11VmeO4oPAqdKeoJkjEIkVWQrNT3WzMaJvJsGtbe1svA4j0U0ijyJ4qiqR2FmY9LQTYXK7RNRcMqcqa7w2oCydrhbCdwL3AbcHRH/U7OozKzpldpUqHWCaJkgBkqsqJs7fQ8niQaVdUcxh6Tb6SjgK5JeAJYDt0XEf9UiODNrLsV3EBOk7faL6N8atLe1IrHtzsLdTI0va4e7LcDd6ReS9iapy3SxpBnAzyPiT2oQo5k1gQu7eliy4ikKqSFrU6EnLvuD2gVmo5ZnjAKAtP7S1cDVkiYA769aVGbWVE6+8ufc+9iLuc6d0t5W5Wis0rLGKG4Gyq2pfx14TNJTEfF0VSIzs4bW1d3LeTc+yOtb8q+u9qZCzSnrjiJrK9KdgAOA6/Cdhdm409Xdy7nXrc5V5bVFYmsEU1zEr2lljVH8DEDS+yJiVfFzko6NiG9L8loKs3FmJN1MAm8mNAbkGaO4UtKpEdEDIOkk4Bzg5oj4P1WNzswaxkgSxLbXzJnqJDEG5EkUHwdukHQyyXTZTwNHVDUqM2soO5IkvC5i7MizFerjkk4EuoCngSMiIt86fDNrahd29XDtfU+XnepajldYjy1Zs556GDzraQ+gBbhPUiW3QjWzBvTuC27lf0aw4xwkdxFLPuf5LWNN1h3FR2sWhZk1jK7uXv7sR6vLzo0vxQlibMtKFC9ExG+yXixpt+HOMbPmUajPNJIkMWPPXZ0kxrisRPFjSauBHwOrIuI1AEn7A4cCnwSuBG4YTQCSngReBQaALRHRKWkP4EfANOBJ4JMR8dJormNmpXV197Jw2Vo29ZWv6lrKzi3icu9dPS5kraM4TNIxwOeBuZImAVuAdcC/AqdGxK8rFMehEfF80fH5wF0RcZmk89Pj8yp0LTNLdXX3Mv/6NfTnWTmX8kD1+JM56ykibgVurVEsxY4HPpQ+voakMKEThVkF7cje1bu0yEliHMpdFLCKAviJpAD+MSIWA3ulRQiJiGck7VnXCM3GkB1ZEwGw1+
[... base64 PNG data elided: notebook display_data output, scatter plot of predicted vs. true energies (kJ/mol) ...]",
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "idx_train = np.arange(0,80)\n", + "idx_train = np.arange(0,100)\n", "\n", "estimator.fit(idx_train)\n", "\n", - "idx_test = np.arange(80, 100)\n", + "score = estimator.score(idx_train)\n", "\n", - "score = estimator.score(idx_test)\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", "\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "energies_predict = estimator.predict(idx_train)\n", "\n", - "energies_predict = estimator.predict(idx_test)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -668,27 +695,42 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 43 kJ/mol\n" + "The RMSE is 0.06417642021013359 kJ/mol\n" ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X2UXXV97/H3J8MgE4gOaKQkkBuIMS4xJdEppEa7RIQAVQheUShUil6RWygiNkKEFqyglIhY215bKFgqiPIQRqBADFSkpQaZMCHDg2l5iJAJQkACCFOYTL73j71PODM5Z8+emfM483mtNWvO/p19zv7OWZN8Z/8evj9FBGZmZuVMqncAZmbW2JwozMwskxOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmXaodwCV8La3vS1mzpxZ7zDMzJrK6tWrn4uIqcOdNy4SxcyZM+nq6qp3GGZmTUXSr/Kc564nMzPL5ERhZmaZnCjMzCyTE4WZmWVyojAzs0zjYtaTmdlE09ndy7IV69i4uY9p7W0sWTSHxfOnV+VaThRmZk2ms7uXpct76OsfAKB3cx9Ll/cAVCVZuOvJzKzJLFuxbluSKOjrH2DZinVVuZ4ThZlZk9m4uW9E7WPlRGFm1mSmtbeNqH2snCjMzJrMkkVzaGttGdTW1trCkkVzqnI9D2abmTWZwoC1Zz2ZmVlZi+dPr1piGMpdT2ZmlsmJwszMMjlRmJlZJicKMzPL5ERhZmaZnCjMzCyTE4WZmWVyojAzs0w1SRSSrpD0rKQHi9p2k7RS0n+n33dN2yXpO5IelbRW0ntrEaOZmZVWqzuKfwYOHdJ2FnBnRMwG7kyPAQ4DZqdfJwHfrVGMZmZWQk0SRUTcDfxmSPORwJXp4yuBxUXt/xKJVUC7pD1qEaeZmW2vnmMUu0fE0wDp97en7dOBp4rO25C2DSLpJEldkro2bdpU9WDNzCaqRhzMVom22K4h4tKI6IiIjqlTp9YgLDOziameieKZQpdS+v3ZtH0DsFfReXsCG2scm5mZpeqZKG4CTkgfnwD8uKj90+nspwXAi4UuKjMzq72a7Ech6RrgQ8DbJG0AzgUuBK6V9FngSeDo9PRbgcOBR4FXgRNrEaOZmZVWk0QREceWeeqgEucGcEp1IzIzs7wacTDbzMwaiBOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmZwozMwskxOFmZllcqIwM7NMw9Z6ktQBfBCYBvQBDwJ3RMTQHevMzBpKZ3cvy1asY+PmPqa1t7Fk0RwWz99uHzQbRtlEIelPgNOAJ4DVwDpgJ+ADwJmSHgT+IiKerEGcZmYjck5nD1evenLbrme9m/tYurwHwMlihLLuKHYGFkZEX6knJc0DZpOUCDczaxid3b2DkkRBX/8Ay1asc6IYobKJIiL+PuuFEbGm8uGYmY3dshXrtt8/ObVxc8m/fS1DVtfTd7JeGBGnVT4cM7ORKTUOkZUMprW31TC68SGr62l1zaIwMxuFzu5eli7voa9/AHhjHKJ9cisvvNq/3fkCliyaU+Mom19W19OVxceSpiTN8duqR2VmNozO7l6+dO0DDMTgTqa+/gHetMMk2lpbtiUQSJLEcQtmeHxiFIZdRyHpPZK6SabFPixptaR9x3phSXMkrSn6eknS6ZLOk9Rb1H74WK9lZuNL4U5iaJIoeLGvn298fC7T29sQML29jUs+NY/zF8+tbaDjRJ49sy8FzoiInwJI+hBwGfD+sVw4ItYB89L3bAF6gRuBE4FLIuKbY3l/Mxt/CuMRvcMMSE9rb2Px/Om+e6iQPIli50KSAIiIuyTtXOE4DgIei4hfSarwW5vZeDB0PKKcttYWj0NUWJ5E8bikvwC+nx4fT7IIr5KOAa4pOj5V0qeBLuBLEfFCha9nZk0i710EQIvENz4+13cSFZan1tNngKnAcpKuoakk3UMVIWlH4AjgurTpu8Askm6pp4GLy7zuJEldkro2bdpUqXDMrIGc09nDF3+0JleSaGtt4eJP7uckUQWKMoNBNQtAOhI4JSIOKfHcTOCWiHhP1nt0dHREV1dXdQI0s5oayR1EwXTXcRoVSasjomO48/IWBfwKMLP4/Ij43bEEWORYirqdJO0REU+nh0eRzLYyswnguMt+zj2P5a832tba4q6mGsgzRnE1sAToAbZW8uKSJgMHA58var4orSMVwPohz5nZONTZ3cuXr3+A1wfy93D4LqJ28iSKTRFxUzUuHhGvAm8d0vbH1biWmTWmzu5ellz/AP05k4SASz41zwmihvIkinMl/RNwJ/BaoTEillctKjMb15Kprmvp6x9ZJ4VXV9dHnkRxIvAuoJU3up6CZBaUmdmInNPZw1WrRr47gbua6idPotgvIrzu3czGrLO7d8RJYuGs3bj6c79fpYgsjzzrKFZJenfVIzGzcW/ZinUjOt9JojHkuaP4AHCCpCdIxihEUkW2UtNjzWyCyLtpUHtbK+cdsa+7m
RpEnkRxaNWjMLNxaeimQuX2iSg4fsEMV3htQFk73HUB9wC3AXdFxP/ULCoza3qlNhVqnSRaJomBrdtPhV04azcniQaVdUexgKTb6VDgq5KeB1YAt0XEf9UiODNrLsV3EJOk7faL6N8atLe1IrHtzsLdTI0va4e7LcBd6ReS9gAOA86XNBv4eUT8aQ1iNLMG19ndy1eWr+XVonURWZsKPXHhH9YqNKuAPGMUAKT1l64ArpA0CfBUBDOjs7uXM65dQ4nepJKmtbdVNyCruKwxiptJFtaV8hrwmKQnI+KpqkRmZg2ts7uXM29Yy2tb8q+u9qZCzSnrjiJrK9IdgH2Ba/GdhdmEM5K7iBaJrRFM88rqppU1RvEzAEnvi4jVxc9J+lhEfEeS11KYTSAjrfIq8GZC40CeMYrLJJ0QET0Ako4FTgdujoj/U9XozKxhHPytu/jvZ18Z0WtcwG98yJMoPgFcL+k4kumynwa2243OzMav4y77+YiThBfPjR/DJoqIeFzSMUAn8BRwSETk36PQzJrWOZ09XHPvU2WnupayY4u46BPubhpPsmY99TB41tNuQAtwr6RKboVqZg3oXWffyv+MYMc5cBG/8SrrjuKjNYvCzBpGZ3cvX/zRmrJz40txghjfshLF8xHx26wXS9pluHPMrHkU6jM5SVixrETxY0lrgB8DqyPiFQBJ+wAHAp8ELgOuH0sAktYDLwMDwJaI6JC0G/AjYCawHvhkRLwwluuYWWmd3b2cd9NDbO4rX9W1FCeIiSNrHcVBkg4HPg8slLQrsAVYB/wrcEJE/LpCcRwYEc8VHZ8F3BkRF0o6Kz0+s0LXMrNUZ3cvS657gP689TfwbKaJKHPWU0TcCtxao1iKHQl8KH18JUlhQicKswoazd7VO7XISWICyl0UsIoC+ImkAP4xIi4Fdk+LEBIRT0t6e10jNBtHRrNwDmD3KTty79kHVyEia3SNkCgWRsTGNBmslPTLPC+SdBJwEsCMGTOqGZ/ZuHHABSt55uXXc58/3fWZjAZIFBGxMf3+rKQbgf2BZyTtkd5N7AE8W+J1lwKXAnR0dIxssrfZBDPSGk2tLWKZF81ZatJwJ0iaJelN6eMPSTpNUnslLi5pZ0lTCo9JSoM8CNwEnJCedgLJzCszG4VzOns4/UdrcieJXSe3OknYIHnuKG4AOiS9A7ic5D/xHwCHV+D6uwM3SirE8oOIuF3SfcC1kj4LPAkcXYFrmU04Ix2w9owmKyVPotgaEVskHQV8OyL+VlJ3JS4eEY8D+5Vofx44qBLXMJuoOrt7udpJwiogT6LoT0uLnwB8LG1rrV5IZjZaoyni59lMNpw8ieJE4GTggoh4QtLewFXVDcvMRmo00169utryyFNm/GFJZwIz0uMngAurHZiZ5TfS/SJEsqmQu5osj2EThaSPkeyfvSOwt6R5wF9FxBHVDs7MyhtNjSYnCBuNPF1P55GsbbgLICLWpN1PZlYnI63RJGCaF8/ZKOVJFFsi4sV0CmuBF7iZ1UFndy/LVqyjd3P+TSY9m8nGKk+ieFDSHwEtkmYDpwH/Wd2wzGyoczp7uHrVkyP6K23223d2krAxG3ZlNvBnwL7Aa8A1wEvA6dUMyswGK6yJGOmGQivP+FC1QrIJJM+sp1eBs9MvM6uDZSvW5U4S7W2tnHfEvh6LsIopmygkfTsiTpd0MyXGJDzryax6RtPNtOvkVs79mBOEVV7WHcX30+/frEUgZpYYSX0mAZd8ap6Tg1VV1laoq9PvP6tdOGZ2zb1P5TqvsCbCScKqLc+Cux6273p6EegCzk8L+JlZhQxXp8lrIqzW8kyPvQ0YICktDnAMye/qi8A/80ahQDOrgBapbLKY3t7GPWd9uMYR2USXJ1EsjIiFRcc9ku6JiIWSjq9WYGbjWWHh3MbNfdvdHRx7wF4lxyhaJokli+bUOlSzXOsodpF0QOFA0v7ALunhlqpEZTaOdXb3snR5D72b+wigd3MfS5f30NndC8D5i+dy/IIZFNdC2HnHFi4+2rvOWX0ohusPlTqA7/FGcngZ+CzwMPCHEXFtVSPMoaOjI7q6uuodhlkuCy/8t5IlONytZLUmaXVEdAx3XmbXk6RJwD4RMVfSW0gSy+aiU+qeJMyazcYydZrKtZvVW2bXU0RsBU5NH784JEmY2ShMa28bUbtZveUZo1gp6c8l7SVpt8LXWC+cvt9PJT0i6SFJX0jbz5PUK2lN+nX4WK9l1kiWLJpDW2vLoLa21hYPVFvDyjPr6TPp91OK2gLYZ4zX3gJ8KSLulzQFWC1pZfrcJRHhFeE2LhUGpMvNejJrNHmKAlZlk6KIeBp4On38sqRHAP9LsaaVNeV1qMXzpzsxWNMYtutJ0mRJ50i6ND2eLemjlQxC0kxgPnBv2nSqpLWSrpC0ayWvZVYNw015NWtmecYovge8Drw/Pd4AnF+pACTtAtwAnB4RLwHfBWYB80juOC4u87qTJHVJ6tq0aVOlwjEblWUr1tHXPzCora9/gGUr1tUpIrPKyZMoZkXERUA/QET0waC1QKMmqZUkSVwdEcvT938mIgbSGVeXkezXvZ2IuDQiOiKiY+rUqZUIx2zUPOXVxrM8g9mvS2ojLQwoaRbJbndjomQT7suBRyLiW0Xte6TjFwBHAQ+O9VpmldTZ3ct5Nz3E5r5+INkH4i1trduOi3nKq40HeRLFucDtwF6SrgYWAn9SgWsvBP6YpHbUmrTtK8CxkuaRJKb1wOcrcC2ziujs7mXJdQ/Qv/WNigYvvNpPyyTROkmD2j3l1caLPLOeVkq6H1hA0uX0hYh4bqwXjoj/oHQX1q1jfW+zalm2Yt2gZFAwsDV48+RWJu+4g6e82riT544CYCfghfT8d0siIu6uXlhmjSlrzGHzq/10/+UhNYzGrDbybFz018CngIeArWlzAE4UNuFMa28rWdCv8JzZeJTnjmIxMCcixjyAbdbsliyas90YBUBri/eKsPErz/TYx4HWagdi1gwWz5/OsqP3o73tjX8Su05uZdknvFeEjV957iheBdZIupOiabERcVrVojJrYC6/YRNNnkRxU/plZmYTUNlEIenNEfFSRFxZ4rkZ1Q3LzMwaRdYYxV2FB2m3U7HOqkRjZmYNJytRFC+GG7pRUUVqPZmZWePLShRR5nGpYzMzG6eyBrPfLukMkruHwmPSY5drNTObILISxWXAlBKPAf6pahGZmVlDKZsoIuKrtQzEzMwaU56V2WZmNoE5UZiZWSYnCjMzy5S1MvuMcs8BFG9famZm41fWrKfCLKc5wO/xRr2nj+G9KKwBnNPZwzX3PsVABC0Sxx6wF+cvnlvvsMzGnWFnPUn6CfDeiHg5PT4PuK4m0ZmVcU5nD1etenLb8UDEtmMnC7PKyjNGMQN4vej4dWBmVaIpIulQSeskPSrprGpfz5rLNfc+NaJ2Mxu9PGXGvw/8QtKNJKU7jgL+pZpBSWoB/h44GNgA3Cfppoh4uJrXteYxEKWryJRrN7PRGzZRRMQFkm4DPpg2nRgR3dUNi/2BRyPicQBJPwSOBJwo
DIAWqWRSaJHrVZpVWt7psZOBlyLib4ANkvauYkwA04HiPoQNaZsZAMcesNeI2s1s9IZNFJLOBc4ElqZNrcBV1QyK0mXMB/35KOkkSV2SujZt2lTlcKzRnL94LscvmLHtDqJF4vgFMzyQbVYFecYojgLmA/cDRMRGSVOyXzJmG4DiPw33BDYWnxARlwKXAnR0dLhjepzo7O5l2Yp1bNzcx7T2NpYsmlN2f+rzF891YjCrgTyJ4vWICEkBIGnnKscEcB8wO+3i6gWOAf6oBte1Ours7mXp8h76+gcA6N3cx9LlPQBlk4WZVV+eMYprJf0j0C7pc8AdVLnMeERsAU4FVgCPANdGxEPVvKbV37IV67YliYK+/gGWrVhXp4jMDPLNevqmpIOBl0hWaf9lRKysdmARcStwa7WvY41j4+a+EbWbWW0Mmygk/XVEnAmsLNFmVjHT2tvoLZEUprW31SEaMyvI0/V0cIm2wyodiNmSRXNoa20Z1NbW2sKSRXPqFJGZQXb12P8L/CkwS9LaoqemAP9Z7cBsfMkzm6lwnHfWk5nVhqJMyQNJbwF2Bb4BFNdaejkiflOD2HLr6OiIrq6ueodhZZzT2cPVq54ctBCmrbWFb3x8rpOAWR1JWh0RHcOdV7brKSJejIj1wN8Av4mIX0XEr4B+SQdULlQbzzq7e7dLEuDZTGbNJM86iu8C7y06fqVEm9kgha6mUoPTBZ7NZNYc8iQKRVH/VERslZTndTZBDV04V45nM5k1hzyznh6XdJqk1vTrC8Dj1Q7MmlephXNDCTybyaxJ5EkUJwPvJymlsQE4ADipmkFZcxuuS0nAcQtmeCDbrEnkWZn9LEmtJbNcyi2cA5juKa9mTSdrHcWXI+IiSX8L201aISJOq2pk1rSWLJqz3RiFp8OaNa+sO4pH0u9eoGCDDLd4zgvnzMaXsgvumokX3NVOqRlNvlswa055F9xldT3dTIkup4KIOGKUsVkTyyoF7kRhNj5ldT19M/3+ceB3eGP702OB9VWMyRrI0G6mcoPUXjxnNn6VTRQR8TMASV+LiD8oeupmSXdXPTKru1I7zonSt5lePGc2fuVZYT1V0j4R8ThAuj3p1OqGZfWUVX4jYLtk4VLgZuNbnkTxReAuSYXV2DOBz1ctIqurPOU3gmQ9hGc0mU0MeRbc3S5pNvCutOmXEfFadcOyeslTfmN6exv3nPXhGkVkZvU2bAkPSZOBJcCpEfEAMEPSR8dyUUnLJP1S0lpJN0pqT9tnSuqTtCb9+oexXMdGbrhBaXczmU08eWo9fQ94Hfj99HgDcP4Yr7sSeE9E/C7wX8DSoucei4h56dfJY7yOjVDWoPT09javlzCbgPIkilkRcRHQDxARfSTjmaMWET+JiC3p4Spgz7G8n1VOuX2rv/2pedxz1oedJMwmoDyJ4nVJbaQTXSTNAio5RvEZ4Lai470ldUv6maQPVvA6lsPi+dP5xsfnMr29DeG7CDPLN+vpXOB2YC9JVwMLgT8Z7kWS7iBZqDfU2RHx4/Scs4EtwNXpc08DMyLieUnvAzol7RsRL5V4/5NIy53PmDEjx49hw9VoKlg8f7oTg5ltk1nrSZJIuoVeBRaQdDmtiojnxnxh6QSSvS4OiohXy5xzF/DnEZFZyMm1nobnGk1mNlTeWk+ZXU/pFqidEfF8RPxrRNxSoSRxKHAmcERxkpA0VVJL+ngfYDbeTa8ismo0mZllyTNGsUrS71X4un8HTAFWDpkG+wfAWkkPANcDJ0fEbyp87Qmp3LRX12gys+HkGaM4EDhZ0nrgFdIKDunU1lGJiHeUab8BuGG072vllSvo5xpNZjacPInisKpHYRVxTmcP19z7FAMRtEgce8BenL94LlB+1zkvnjOz4WTtR7ETyWDzO4Ae4PKitQ/WYI677Ofc89gbvXQDEVy16kkAzl8817vOmdmolZ31JOlHJIvs/p3kruJXEfGFGsaW20Se9dTZ3ct5Nz3E5r7+ks+3SDz2jcNrHJWZNYMx73AHvDsi5qZvdjnwi0oFZ5WRp9LrwDjY6tbM6itr1tO2P1Hd5dSY8lR6bdGYqq2YmWXeUewnqbAiWkBbelyY9fTmqkdng+TdlrTYsQfsVYPIzGw8y9oKtaXcc1Z7I9mWtGDhrN22zXoyMxutPNNjrY5Gui0pwK6TWzn3Y/t6RpOZVYQTRQPztqRm1gicKBqYtyU1s0aQp9aT1Ym3JTWzRuA7igZQbp+IrJlN093NZGY14kRRZ0NLb/Ru7uNL1z0AlK/P5D0kzKyW3PVUR+d09gxKEgUDW4Ozb+zxtqRm1hB8R1FH19z7VNnnXnk9uYvwtqRmVm++o6gj12Eys2bgO4oaKTVg3SKVTRau0GRmjcJ3FDVQWDjXu7mPIBmwXrq8hwX77Fr2NcctmFG7AM3MMjhR1ECphXN9/QOsf76P4xfM2O7u4fgFM1yjycwaRl26niSdB3wO2JQ2fSUibk2fWwp8FhgATouIFfWIsZLKLZzbuLmP8xfPdVIws4ZWzzGKSyLim8UNkt4NHAPsC0wD7pD0zojIrmPRQErtW11u4dy09rY6RGhmNjKN1vV0JPDDiHgtIp4AHgX2r3NMuR132c+5atWT2waoC/tWz3xrG22tg6u2u/yGmTWLeiaKUyWtlXSFpMKo7nSgeHHBhrRtO5JOktQlqWvTpk2lTqmpzu7ekovnAFY9/oIXzplZ06pa15OkO4DfKfHU2cB3ga+RVMn+GnAx8BlKzwotOX80Ii4FLgXo6Oio+4KEZSvWlX1uIMIL58ysaVUtUUTER/KcJ+ky4Jb0cANQvHfnnsDGCodWFVmVXr1vtZk1s7p0PUnao+jwKODB9PFNwDGS3iRpb2A28ItaxzcaWQPT3rfazJpZvcYoLpLUI2ktcCDwRYCIeAi4FngYuB04pVlmPC1ZNGe7AWvwvtVm1vzqMj02Iv4447kLgAtqGE5FFMYfSu0rYWbWzFzrqYI8YG1m45ETRRnldp0zM5tonCiG6Ozu5as3P8QLr/ZvaysU8QOcLMxswmm0ldl1VajyWpwkCvr6BzLXSpiZjVdOFEVKVXktlrVWwsxsvHKiKDJcInARPzObiCb0GMXQAev2ya0lu53ARfzMbOKasImiMB5R6Grq3dxH6yTR2iL6BwaXjmpva+W8I/b1QLaZTUgTNlGUGo/o3xq0t7Wy85t28LRYM7PUhE0U5cYjXuzrZ825h9Q4GjOzxjVhB7PLDUx7wNrMbLAJmyhKFfHzgLWZ2fYmbNeTi/iZmeUzYRMFuIifmVkeE7bryczM8nGiMDOzTE4UZmaWyYnCzMwyOVGYmVkmRcTwZzU4SZuAX9U7jhLeBjxX7yBycqzV0SyxNkuc4Fgr6X9FxNThThoXiaJRSeqKiI56x5GHY62OZom1WeIEx1oP7noyM7NMThRmZpbJiaK6Lq13ACPgWKujWWJtljjBsdacxyjMzCyT7yjMzCyTE0WFSTpPUq+kNenX4UXPLZX0qKR1khbVM840nmWSfilpraQbJbWn7TMl9RX9DP9Q71gBJB2afna
PSjqr3vEUk7SXpJ9KekTSQ5K+kLaX/X2oJ0nrJfWkMXWlbbtJWinpv9PvuzZAnHOKPrs1kl6SdHqjfK6SrpD0rKQHi9pKfo5KfCf9/V0r6b31iHk03PVUYZLOA34bEd8c0v5u4Bpgf2AacAfwzogY2O5NakTSIcC/RcQWSX8NEBFnSpoJ3BIR76lXbENJagH+CzgY2ADcBxwbEQ/XNbCUpD2APSLifklTgNXAYuCTlPh9qDdJ64GOiHiuqO0i4DcRcWGaiHeNiDPrFeNQ6e9AL3AAcCIN8LlK+gPgt8C/FP69lPsc02T2Z8DhJD/D30TEAfWKfSR8R1E7RwI/jIjXIuIJ4FGSpFE3EfGTiNiSHq4C9qxnPMPYH3g0Ih6PiNeBH5J8pg0hIp6OiPvTxy8DjwDNVsP+SODK9PGVJImukRwEPBYRDbO4NiLuBn4zpLnc53gkSUKJiFgFtKd/YDQ8J4rqODW9tbyi6PZ9OvBU0TkbaKz/SD4D3FZ0vLekbkk/k/TBegVVpNE/v23SO7L5wL1pU6nfh3oL4CeSVks6KW3bPSKehiTxAW+vW3SlHUNyV17QiJ8rlP8cm+Z3eCgnilGQdIekB0t8HQl8F5gFzAOeBi4uvKzEW1W932+YWAvnnA1sAa5Om54GZkTEfOAM4AeS3lztWIdRl89vpCTtAtwAnB4RL1H+96HeFkbEe4HDgFPSLpSGJWlH4AjgurSpUT/XLE3xO1zKhN7hbrQi4iN5zpN0GXBLergB2Kvo6T2BjRUObTvDxSrpBOCjwEGRDlhFxGvAa+nj1ZIeA94JdFU53Cx1+fxGQlIrSZK4OiKWA0TEM0XPF/8+1FVEbEy/PyvpRpKuvWck7RERT6ddIs/WNcjBDgPuL3yejfq5psp9jg3/O1yO7ygqbEif41FAYTbETcAxkt4kaW9gNvCLWsdXTNKhwJnAERHxalH71HTgEEn7kMT6eH2i3OY+YLakvdO/Lo8h+UwbgiQBlwOPRMS3itrL/T7UjaSd0wF3JO0MHEIS103ACelpJwA/rk+EJR1LUbdTI36uRcp9jjcBn05nPy0AXix0UTU6z3qqMEnfJ7kdDmA98PnCL0PaxfMZkm6e0yPitnLvUwuSHgXeBDyfNq2KiJMl/W/gr0jiHADOjYib6xTmNumskW8DLcAVEXFBnUPaRtIHgH8HeoCtafNXSP6DK/n7UC9p8r8xPdwB+EFEXCDprcC1wAzgSeDoiBg6UFtzkiaT9O3vExEvpm1l/53VOLZrgA+RVIl9BjgX6KTE55j+MfF3wKHAq8CJEVHPu/TcnCjMzCyTu57MzCyTE4WZmWVyojAzs0xOFGZmlsmJwszMMjlRWN1JemtRFdBfD6kKumMFr/MRSS9qcDXSAyv1/mWu2SLp3yv0Xn8n6f3p4w1Kq/0WPb+DpM1D2lZK+p0xXvcdktakj+dJ+qexvJ+oUg23AAAEKklEQVQ1H6/MtrqLiOdJ5sRnVd8VyXTurdu/w4j8NCIqWuxO0g5FxRUHSasDj7lWlqSpwPyIOHUEr9kZmBIRvx7r9QsiYo2kWZKmR0Rvpd7XGpvvKKxhpX/JPqhkP4z7gb2K/2KWdEzhr1tJu0taLqlL0i/Sla8jvc7lSvaSuE3STulzsyWtSIvn3S3pnWn7VZIulvRT4OuS3i7pTkn3S/p/6V1R+9C/8iWdlca3VtJfpm1T0ms+kMbxiRJhHs3goo2F95ss6SeSTizxmg8D/5aet0HSBZJWSbpP0nvT1z0m6XPpOZMkfSuNoadMHJCUy/hU3s/Xmp8ThTW6dwOXpwUKs/6C/Q5wUUR0kOwBUa575MAhXU8z0/Y5wLcjYl+gjzdKQ18K/GlEvA9YSrKytmAWSY2sL5OsZL89LbR3K8meI4OkK8tnkOxFMA94f9qVdDiwPiL2S/c0WFki7oUke1wUm0Lyn/aVEfG9Eq85DLi96Hh9RCwgKSl/OUnpi/cDX0ufP5rk896PZN+PSySVqiDbRQXukqx5uOvJGt1jEXFfjvM+AsxJeqgA2FVSW0T0DTlvu64nSe8g2euiJ21aDcxMxwAWADcUvW/xv5nrirrCPgBcABARt0h6uUSMh5D8592dHu9CUmzxXuBCSRcCN0fEPSVeuwewaUjbLcDXI+JHJc4njf20ouNCbaweYIeIeAV4RdJWJVVvP0BSzmMA+LWk/wA6SDaMKvYsJRKhjV9OFNboXil6vJXBpZp3KnosYP90U6PReK3o8QDJvw0Bz0XEvByxlSohPZSA8yPi8u2ekDpI7iyWSbolIr4+5JQ+Bv+8APcAh0m6tlD5t+j95gBPDBk7KfyMWxn8827ljZ83j53SeGyCcNeTNY30r/cX0nGDSSRdJwV3AKcUDiSV+899JNd7AXha0lHpe06StF+Z0/+DpMur0MU0pcQ5K4DPpoPMSNpT0tskTScZwP8+8C2g1F7KjwDvGNL2FZJk9Z0S5x9KiTGNYdxNUuG4RdLuJN1dpYrWvZPGqtZqVeZEYc3mTJJ+9ztJ6vsXnAIsTAeJHwY+V+b1Q8cojipzXsExwMmSHgAeItm7o5RzgT+UdD/JIPIzDL7jICJuBa4HVknqIakwugvJmMB96RTULwND7yYA/pWkSulQpwJvkfR1kruCwp3CoQwen8jjeuCXwAMkifeMiCi1J8WBaTw2Qbh6rFkFpLOktkTEFiUlx7+dDqxX6v1FctdyWLpzXqlz3gf8LUmiujsiKr4nu6Q24KckO+QNVPr9rTE5UZhVgKR3kWys00LyV/3JETF0ltJYr/H7wMsRsV23j6RTSO6qTouIOyp53SHXmUOyJ/Td1bqGNR4nCjMzy+QxCjMzy+REYWZmmZwozMwskxOFmZllcqIwM7NMThRmZpbp/wO17cH4QILV3wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "estimator = ARMP(hidden_layer_sizes=(40, 20, 10), scoring_function=\"rmse\")\n", + "estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")\n", "\n", "estimator.fit(x=representation, y=energies, classes=zs)\n", "\n", "score = estimator.score(x=representation, y=energies, classes=zs)\n", "\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", + "\n", + "energies_predict = estimator.predict(x=representation, classes=zs)\n", "\n", - "energies_predict = estimator.predict(x=representation, classes=zs)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -738,7 +780,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/examples/ARMP_1.py b/examples/ARMP_1.py index f368646df..5c47fb58e 100644 --- a/examples/ARMP_1.py +++ b/examples/ARMP_1.py @@ -39,8 +39,9 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=10, representation_name='acsf', representation_params={"radial_rs": np.arange(0, 10, 1), "angular_rs": np.arange(0.5, 10.5, 1), -"theta_s": np.arange(0, 5, 1)}, tensorboard=False) +acsf_params = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} +estimator = ARMP(iterations=5000, representation_name='acsf', representation_params=acsf_params, tensorboard=False, + learning_rate=0.075, l1_reg=0.0, l2_reg=0.0) estimator.generate_compounds(filenames) estimator.set_properties(energies) diff --git a/examples/ARMP_2.py b/examples/ARMP_2.py index 34162b7e1..8d9e9b342 100644 --- a/examples/ARMP_2.py +++ b/examples/ARMP_2.py @@ -39,7 +39,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=100, l2_reg=0.0) +estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, tensorboard=True, store_frequency=50) estimator.set_representations(representations=descriptor) estimator.set_classes(zs) diff --git a/examples/ARMP_3.py b/examples/ARMP_3.py index 2745cc562..c326b93cd 100644 --- a/examples/ARMP_3.py +++ b/examples/ARMP_3.py @@ -39,7 +39,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=150, l2_reg=0.0, learning_rate=0.005, hidden_layer_sizes=(40, 20, 10)) +estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, tensorboard=True, store_frequency=50) ## ------------- ** Fitting to the data ** --------------- diff --git a/examples/ARMP_qm7.py b/examples/ARMP_qm7.py new file mode 100644 index 000000000..43fc9afb2 --- /dev/null +++ b/examples/ARMP_qm7.py @@ -0,0 +1,41 @@ +""" +This example shows how to use ARMP to overfit 100 data-points for the QM7 data set. It uses the Atom Centred Symmetry +functions as the representation. + +This example takes about 3.5 min to run. +""" + +from qml.aglaia.aglaia import ARMP +import glob +import numpy as np +import matplotlib.pyplot as plt +from sklearn import model_selection as modsel + +filenames = sorted(glob.glob("../test/qm7/*.xyz")) +energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=[1]) +n_samples = len(filenames) +print("%i files were loaded." 
diff --git a/examples/ARMP_qm7.py b/examples/ARMP_qm7.py
new file mode 100644
index 000000000..43fc9afb2
--- /dev/null
+++ b/examples/ARMP_qm7.py
@@ -0,0 +1,41 @@
+"""
+This example shows how to use ARMP to overfit 100 data-points for the QM7 data set. It uses the Atom Centred Symmetry
+functions as the representation.
+
+This example takes about 3.5 min to run.
+"""
+
+from qml.aglaia.aglaia import ARMP
+import glob
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn import model_selection as modsel
+
+filenames = sorted(glob.glob("../test/qm7/*.xyz"))
+energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=[1])
+n_samples = len(filenames)
+print("%i files were loaded." % (n_samples))
+
+acsf_params = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065}
+estimator = ARMP(iterations=6000, representation_name='acsf', representation_params=acsf_params, l1_reg=0.0, l2_reg=0.0,
+                 scoring_function="rmse", tensorboard=False, store_frequency=10, learning_rate=0.075)
+
+estimator.set_properties(energies[:100])
+estimator.generate_compounds(filenames[:100])
+estimator.generate_representation(method="fortran")
+print("The shape of the representation is: %s" % (str(estimator.representation.shape)))
+
+idx = list(range(100))
+
+idx_train, idx_test = modsel.train_test_split(idx, test_size=0, random_state=42, shuffle=True)
+
+estimator.fit(idx_train)
+
+score = estimator.score(idx_train)
+print("The RMSE is %s kcal/mol." % (str(score)))
+
+ene_pred = estimator.predict(idx_train)
+
+# Plotting the predictions against the true values
+plt.scatter(energies[idx_train], ene_pred)
+plt.show()
\ No newline at end of file
diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py
index dbbdf263b..daefc7e69 100644
--- a/qml/aglaia/aglaia.py
+++ b/qml/aglaia/aglaia.py
@@ -1586,7 +1586,7 @@ def _initialise_representation(self, representation, parameters):
         if not is_string(representation):
             raise InputError("Expected string for variable 'representation'. Got %s" % str(representation))
         if representation.lower() not in ['slatm', 'acsf']:
-            raise InputError("Unknown representation %s" % representation)
+            raise InputError("Representation %s not implemented." % representation)
 
         self.representation_name = representation.lower()
 
         if parameters is not None:
@@ -1753,7 +1753,7 @@ def _generate_acsf_fortran(self, xyz, classes):
 
         for i in range(xyz.shape[0]):
 
             if 0 in classes[i]:
-                idx_zeros = np.where(classes == 0)[1]
+                idx_zeros = np.where(classes[i] == 0)[0]
 
                 mol_xyz = xyz[i, :idx_zeros[0], :]
                 mol_classes = classes[i, :idx_zeros[0]]
@@ -1804,11 +1804,11 @@ def _generate_representations_from_compounds(self, method):
 
         elif self.representation_name == 'acsf':
 
-            xyz, zs = self._extract_and_pad()
+            xyz, classes = self._extract_and_pad()
 
             if method == "tf":
-                representations, classes = self._generate_acsf_tf(xyz, zs)
+                representations = self._generate_acsf_tf(xyz, classes)
             elif method == "fortran":
-                representations, classes = self._generate_acsf_fortran(xyz, zs)
+                representations = self._generate_acsf_fortran(xyz, classes)
             else:
                 raise InputError("Method not recognised.")
 
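[Editor's note: illustration, not part of the patch series. The next commit down-casts the generated representations to single precision. A minimal sketch of the idea, with a made-up array standing in for a real representation: float32 halves the memory footprint and matches the dtype of the tf.float32 placeholders used in the network.]

import numpy as np

representation = np.random.rand(100, 23, 400)          # hypothetical (n_samples, n_atoms, n_features), float64
single_precision = representation.astype(np.float32)   # down-cast copy in single precision
del representation                                      # free the double precision copy
assert single_precision.dtype == np.float32

[End of editor's note]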
From 8f082c5847377beb7acf0c8dd6dbca59ed91dca9 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 2 Oct 2018 12:44:01 +0100
Subject: [PATCH 48/59] Attempted fix to make representations single precision

---
 qml/aglaia/aglaia.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py
index daefc7e69..bff395b5a 100644
--- a/qml/aglaia/aglaia.py
+++ b/qml/aglaia/aglaia.py
@@ -1651,7 +1651,11 @@ def _generate_representations_from_data(self, xyz, classes, method):
         else:
             representation = self._generate_acsf_fortran(xyz, classes)
 
-        return representation, classes
+        # Hotfix to make sure the representation is single precision
+        single_precision_representation = representation.astype(dtype=np.float32)
+        del representation
+
+        return single_precision_representation, classes
 
     def _generate_acsf_tf(self, xyz, classes):
         """
@@ -1815,7 +1819,11 @@ def _generate_representations_from_compounds(self, method):
         else:
             raise InputError("This should never happen, unrecognised representation %s."
                              % (self.representation_name))
 
-        return representations, classes
+        # Hotfix to make sure the representation is single precision
+        single_precision_representation = representations.astype(dtype=np.float32)
+        del representations
+
+        return single_precision_representation, classes
 
     def _extract_and_pad(self):
         """
 
From 21fd2fdcc7ebf27e63302fda14ae80df5485a745 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 2 Oct 2018 14:55:59 +0100
Subject: [PATCH 49/59] Hot fix for AtomScaler

---
 qml/qmlearn/preprocessing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/qml/qmlearn/preprocessing.py b/qml/qmlearn/preprocessing.py
index cab329075..62ce5765e 100644
--- a/qml/qmlearn/preprocessing.py
+++ b/qml/qmlearn/preprocessing.py
@@ -27,6 +27,7 @@
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import Ridge
 
 from .data import Data
 from ..utils import is_numeric_array, get_unique, is_positive_integer_or_zero_array
@@ -50,7 +51,8 @@ def __init__(self, data=None, elements='auto'):
         self.elements = elements
 
         # Initialize model
-        self.model = LinearRegression()
+        # self.model = LinearRegression()
+        self.model = Ridge(alpha=1e-6)
 
     def _preprocess_input(self, X):
         """
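[Editor's note: illustration, not part of the patch series. Patch 49 above swaps AtomScaler's LinearRegression for Ridge with a tiny alpha, presumably because the per-element atom-count matrix can be nearly rank-deficient, in which case the small Tikhonov term keeps the fit stable while leaving the solution essentially unchanged. A toy sketch with made-up counts and energies:]

import numpy as np
from sklearn.linear_model import Ridge

# rows = molecules, columns = counts of each element per molecule (hypothetical H, C, O)
X = np.array([[4., 1., 0.],
              [6., 2., 0.],
              [4., 1., 1.]])
y = np.array([-1100.0, -2100.0, -1500.0])  # made-up total energies

model = Ridge(alpha=1e-6)                  # behaves like LinearRegression, but better conditioned
model.fit(X, y)
residual = y - model.predict(X)            # atomised baseline removed from the learning target

[End of editor's note]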
From 9ce72c6eaf4d26def4d8a5b94cb0c797611e7159 Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 2 Oct 2018 16:14:12 +0100
Subject: [PATCH 50/59] Minor bug fixes

---
 qml/aglaia/aglaia.py  | 4 +++-
 qml/qmlearn/models.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py
index 893fd98b7..7167bf64b 100644
--- a/qml/aglaia/aglaia.py
+++ b/qml/aglaia/aglaia.py
@@ -1087,10 +1087,12 @@ def _generate_representations_from_data(self, xyz, classes, method):
         # TODO implement
         raise InputError("Not implemented yet. Use compounds.")
 
-    def _generate_representations_from_compounds(self):
+    def _generate_representations_from_compounds(self, method):
         """
         This function generates the representations from the compounds.
 
+        :param method: flag that tells whether to use the fortran or tensorflow implementation when there is a choice (here there is only fortran)
+        :type method: string
         :return: the representation and None (in the ARMP class this would be the classes for atomic decomposition)
         :rtype: numpy array of shape (n_samples, n_features) and None
         """
diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
index f4152eeb1..cc6dc7ca8 100644
--- a/qml/qmlearn/models.py
+++ b/qml/qmlearn/models.py
@@ -35,6 +35,7 @@
 from ..utils import is_numeric_array, is_numeric, is_numeric_1d_array, is_positive_integer_1d_array
 from ..utils import get_unique, is_positive_integer_or_zero, get_batch_size, is_positive_integer
 from ..utils import is_positive, is_positive_or_zero
+from ..utils import InputError
 
 from .data import Data
 from ..math import cho_solve

From 70e862285681aa800134d3819da09b294d4a4b1a Mon Sep 17 00:00:00 2001
From: Silvia Amabilino
Date: Tue, 2 Oct 2018 16:34:45 +0100
Subject: [PATCH 51/59] More bug fixes to make sure tests run

---
 qml/aglaia/np_symm_funct.py |  23 ++++---
 test/test_acsf.py           | 131 ++++++++++++++++++++++++++++++++++++
 test/test_mrmp.py           |   6 ++
 3 files changed, 149 insertions(+), 11 deletions(-)
 create mode 100644 test/test_acsf.py

diff --git a/qml/aglaia/np_symm_funct.py b/qml/aglaia/np_symm_funct.py
index 2a095302e..b82c1ea84 100644
--- a/qml/aglaia/np_symm_funct.py
+++ b/qml/aglaia/np_symm_funct.py
@@ -171,10 +171,10 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta,
                     cos_theta_ijk = get_costheta(xyzs[sample, i, :], xyzs[sample, j, :], xyzs[sample, k, :])
                     theta_ijk = np.arccos(cos_theta_ijk)
 
-                    term_1 = np.power((1.0 + np.cos(theta_ijk - theta_value)), zeta)
+                    term_1 = np.power((1.0 + np.cos(theta_ijk - theta_value))/2.0, zeta)
                     term_2 = np.exp(- eta * np.power(0.5*(r_ij + r_ik) - rs_value, 2))
                     term_3 = fc(r_ij, angular_cutoff) * fc(r_ik, angular_cutoff)
-                    g_term = term_1 * term_2 * term_3 * np.power(2.0, 1.0 - zeta)
+                    g_term = term_1 * term_2 * term_3 * 2.0
                     # Compare the pair of neighbours to all the possible element pairs, then summ accordingly
                     current_pair = np.flip(np.sort([Zs[sample][j], Zs[sample][k]]), axis=0)    # Sorting the pair in descending order
                     for m, pair in enumerate(element_pairs):
@@ -187,8 +187,8 @@
 
     return np.asarray(total_descriptor)
 
-def generate_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cutoff, radial_rs,
-                  angular_rs, theta_s, zeta, eta):
+def generate_acsf(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2,
+                  nRs3, nTs, zeta, eta):
     """
     This function calculates the symmetry functions used in the tensormol paper.
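[Editor's note: illustration, not part of the patch series. The change to acsf_ang above rewrites the angular prefactor: 2**(1 - zeta) * (1 + cos(dtheta))**zeta becomes 2 * ((1 + cos(dtheta)) / 2)**zeta. The two forms are algebraically identical, but the second keeps the intermediate base at or below 1, which avoids overflow in single precision when zeta is large (these patches use zeta around 220). A quick numerical check of the identity:]

import numpy as np

zeta = 30.0                  # modest zeta so the naive form stays finite
dtheta = 0.3                 # stand-in for theta_ijk - theta_s
old = 2.0 ** (1.0 - zeta) * (1.0 + np.cos(dtheta)) ** zeta
new = 2.0 * ((1.0 + np.cos(dtheta)) / 2.0) ** zeta
assert np.isclose(old, new)  # identical up to rounding

# With zeta = 220, (1 + cos(dtheta))**zeta can exceed the float32 maximum (~3.4e38),
# while ((1 + cos(dtheta))/2)**zeta <= 1 always stays representable.

[End of editor's note]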
@@ -206,8 +206,12 @@ def generate_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cuto :return: numpy array of shape (n_samples, n_atoms, n_rad_rs*n_elements + n_ang_rs*n_thetas*n_element_pairs) """ - rad_term = acsf_rad(xyzs, Zs, elements, radial_cutoff, radial_rs, eta) - ang_term = acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, eta) + radial_rs = np.linspace(0, rcut, nRs2) + angular_rs = np.linspace(0, acut, nRs3) + theta_s = np.linspace(0, np.pi, nTs) + + rad_term = acsf_rad(xyzs, Zs, elements, rcut, radial_rs, eta) + ang_term = acsf_ang(xyzs, Zs, element_pairs, acut, angular_rs, theta_s, zeta, eta) acsf = np.concatenate((rad_term, ang_term), axis=-1) @@ -238,12 +242,9 @@ def generate_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cuto radial_rs = [0.0, 1.0] angular_rs = [0.0, 1.0] theta_s = [np.pi, np.pi * 0.5] - # radial_rs = [0.0] - # angular_rs = [0.0] - # theta_s = [3.0] zeta = 0.0 eta = 1.0 - rad_term = acsf_rad(xyzs, zs, elements, radial_cutoff, radial_rs, eta) - ang_term = acsf_ang(xyzs, zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, eta) + rad_term = acsf_rad(xyzs, zs, elements, radial_cutoff, nRs2, eta) + ang_term = acsf_ang(xyzs, zs, element_pairs, angular_cutoff, nRs3, nTs, zeta, eta) print(rad_term) diff --git a/test/test_acsf.py b/test/test_acsf.py new file mode 100644 index 000000000..10c3bdc6a --- /dev/null +++ b/test/test_acsf.py @@ -0,0 +1,131 @@ +# MIT License +# +# Copyright (c) 2018 Silvia Amabilino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +This file contains tests for the tensorflow atom centred symmetry function module. It uses the numpy implementation +as a comparison. +""" + +import tensorflow as tf +import numpy as np + +from qml.aglaia import symm_funct +from qml.aglaia import np_symm_funct + + +def test_acsf_1(): + """ + This test compares the atom centred symmetry functions generated with tensorflow and numpy. + The test system consists of 5 configurations of CH4 + CN radical. 
+ :return: None + """ + + nRs2 = 3 + nRs3 = 3 + nTs = 3 + rcut = 5 + acut = 5 + zeta = 220.127 + eta = 30.8065 + + input_data = "/Volumes/Transcend/repositories/my_qml_fork/qml/test/data/data_test_acsf.npz" + data = np.load(input_data) + + xyzs = data["arr_0"] + zs = data["arr_1"] + elements = data["arr_2"] + element_pairs = data["arr_3"] + + n_samples = xyzs.shape[0] + n_atoms = zs.shape[1] + + with tf.name_scope("Inputs"): + zs_tf = tf.placeholder(shape=[n_samples, n_atoms], dtype=tf.int32, name="zs") + xyz_tf = tf.placeholder(shape=[n_samples, n_atoms, 3], dtype=tf.float32, name="xyz") + + acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) + + acsf_np = np_symm_funct.generate_acsf(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + n_samples = xyzs.shape[0] + n_atoms = xyzs.shape[1] + + for i in range(n_samples): + for j in range(n_atoms): + acsf_np_sort = np.sort(acsf_np[i][j]) + acsf_tf_sort = np.sort(acsf_tf[i][j]) + np.testing.assert_array_almost_equal(acsf_np_sort, acsf_tf_sort, decimal=4) + +def test_acsf_2(): + """ + This test compares the atom centred symmetry functions generated with tensorflow and numpy. + The test system consists of 10 molecules from the QM7 data set. + :return: None + """ + nRs2 = 3 + nRs3 = 3 + nTs = 3 + rcut = 5 + acut = 5 + zeta = 220.127 + eta = 30.8065 + + input_data = "/Volumes/Transcend/repositories/my_qml_fork/qml/test/data/qm7_testdata.npz" + data = np.load(input_data) + + xyzs = data["arr_0"] + zs = data["arr_1"] + elements = data["arr_2"] + element_pairs = data["arr_3"] + + n_samples = xyzs.shape[0] + max_n_atoms = zs.shape[1] + + with tf.name_scope("Inputs"): + zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") + xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") + + acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) + + acsf_np = np_symm_funct.generate_acsf(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + for i in range(n_samples): + for j in range(max_n_atoms): + if zs[i][j] == 0: + continue + else: + acsf_np_sort = np.sort(acsf_np[i][j]) + acsf_tf_sort = np.sort(acsf_tf[i][j]) + np.testing.assert_array_almost_equal(acsf_np_sort, acsf_tf_sort, decimal=4) + + +if __name__ == "__main__": + test_acsf_1() + test_acsf_2() \ No newline at end of file diff --git a/test/test_mrmp.py b/test/test_mrmp.py index 6ee426acd..b31869467 100644 --- a/test/test_mrmp.py +++ b/test/test_mrmp.py @@ -31,6 +31,11 @@ import glob import os import shutil +try: + import tensorflow as tf +except ImportError: + print("Tensorflow not found but is needed for mrmp class.") + raise SystemExit def test_set_representation(): """ @@ -214,6 +219,7 @@ def test_load_external(): This function tests if a model that has been trained on a different computer can be loaded and used on a different computer. 
""" + tf.reset_default_graph() x = np.linspace(-10.0, 10.0, 2000) y = x ** 2 From 89dfda993d8420c7fd9f6d54dee708b9bb0a6d86 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 3 Oct 2018 11:33:45 +0100 Subject: [PATCH 52/59] Fixed some tests that had failures --- test/test_fchl_electric_field.py | 5 +++-- test/test_fchl_force.py | 5 ++++- test/test_mrmp.py | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/test/test_fchl_electric_field.py b/test/test_fchl_electric_field.py index 1d896aa4e..504fa0f97 100644 --- a/test/test_fchl_electric_field.py +++ b/test/test_fchl_electric_field.py @@ -235,14 +235,15 @@ def test_multiple_operators(): def test_generate_representation(): - + test_dir = os.path.dirname(os.path.realpath(__file__)) + coords = np.array([[1.464, 0.707, 1.056], [0.878, 1.218, 0.498], [2.319, 1.126, 0.952]]) nuclear_charges = np.array([8, 1, 1], dtype=np.int32) - rep_ref = np.loadtxt("data/fchl_ef_rep.txt").reshape((3, 6, 3)) + rep_ref = np.loadtxt(test_dir + "/data/fchl_ef_rep.txt").reshape((3, 6, 3)) # Test with fictitious charges from a numpy array fic_charges1 = np.array([-0.41046649, 0.20523324, 0.20523324]) diff --git a/test/test_fchl_force.py b/test/test_fchl_force.py index 0fb74b1b2..5782f4f99 100644 --- a/test/test_fchl_force.py +++ b/test/test_fchl_force.py @@ -2,6 +2,7 @@ import ast import time +import os import scipy import scipy.stats @@ -31,9 +32,11 @@ from qml.fchl import get_atomic_local_gradient_5point_kernels from qml.fchl import get_atomic_local_kernels +test_dir = os.path.dirname(os.path.realpath(__file__)) + FORCE_KEY = "forces" ENERGY_KEY = "om2_energy" -CSV_FILE = "data/amons_small.csv" +CSV_FILE = test_dir + "/data/amons_small.csv" SIGMAS = [0.64] TRAINING = 13 diff --git a/test/test_mrmp.py b/test/test_mrmp.py index b31869467..2a54c3aec 100644 --- a/test/test_mrmp.py +++ b/test/test_mrmp.py @@ -221,12 +221,14 @@ def test_load_external(): """ tf.reset_default_graph() + test_dir = os.path.dirname(os.path.realpath(__file__)) + x = np.linspace(-10.0, 10.0, 2000) y = x ** 2 x = np.reshape(x, (x.shape[0], 1)) estimator = MRMP() - estimator.load_nn("saved_model") + estimator.load_nn(test_dir + "/saved_model") score_after_loading = estimator.score(x, y) score_on_other_machine = -24.101043 From 908da823e01667e87b2ff22c6125d5126b5de65a Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 3 Oct 2018 14:03:43 +0100 Subject: [PATCH 53/59] Reverted the fchl tests to original --- test/test_fchl_electric_field.py | 5 ++--- test/test_fchl_force.py | 5 +---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/test/test_fchl_electric_field.py b/test/test_fchl_electric_field.py index 504fa0f97..1d896aa4e 100644 --- a/test/test_fchl_electric_field.py +++ b/test/test_fchl_electric_field.py @@ -235,15 +235,14 @@ def test_multiple_operators(): def test_generate_representation(): - test_dir = os.path.dirname(os.path.realpath(__file__)) - + coords = np.array([[1.464, 0.707, 1.056], [0.878, 1.218, 0.498], [2.319, 1.126, 0.952]]) nuclear_charges = np.array([8, 1, 1], dtype=np.int32) - rep_ref = np.loadtxt(test_dir + "/data/fchl_ef_rep.txt").reshape((3, 6, 3)) + rep_ref = np.loadtxt("data/fchl_ef_rep.txt").reshape((3, 6, 3)) # Test with fictitious charges from a numpy array fic_charges1 = np.array([-0.41046649, 0.20523324, 0.20523324]) diff --git a/test/test_fchl_force.py b/test/test_fchl_force.py index 5782f4f99..0fb74b1b2 100644 --- a/test/test_fchl_force.py +++ b/test/test_fchl_force.py @@ -2,7 +2,6 @@ import ast import time -import os 
import scipy import scipy.stats @@ -32,11 +31,9 @@ from qml.fchl import get_atomic_local_gradient_5point_kernels from qml.fchl import get_atomic_local_kernels -test_dir = os.path.dirname(os.path.realpath(__file__)) - FORCE_KEY = "forces" ENERGY_KEY = "om2_energy" -CSV_FILE = test_dir + "/data/amons_small.csv" +CSV_FILE = "data/amons_small.csv" SIGMAS = [0.64] TRAINING = 13 From c47ad62beeb51412eb91154cf8d74703a72efa71 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 3 Oct 2018 14:15:23 +0100 Subject: [PATCH 54/59] Fixed path in acsf test --- test/test_acsf.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_acsf.py b/test/test_acsf.py index 10c3bdc6a..9e6f90c15 100644 --- a/test/test_acsf.py +++ b/test/test_acsf.py @@ -30,6 +30,7 @@ from qml.aglaia import symm_funct from qml.aglaia import np_symm_funct +import os def test_acsf_1(): @@ -39,6 +40,8 @@ def test_acsf_1(): :return: None """ + test_dir = os.path.dirname(os.path.realpath(__file__)) + nRs2 = 3 nRs3 = 3 nTs = 3 @@ -47,7 +50,7 @@ def test_acsf_1(): zeta = 220.127 eta = 30.8065 - input_data = "/Volumes/Transcend/repositories/my_qml_fork/qml/test/data/data_test_acsf.npz" + input_data = test_dir + "/data/data_test_acsf.npz" data = np.load(input_data) xyzs = data["arr_0"] @@ -85,6 +88,8 @@ def test_acsf_2(): The test system consists of 10 molecules from the QM7 data set. :return: None """ + test_dir = os.path.dirname(os.path.realpath(__file__)) + nRs2 = 3 nRs3 = 3 nTs = 3 @@ -93,7 +98,7 @@ def test_acsf_2(): zeta = 220.127 eta = 30.8065 - input_data = "/Volumes/Transcend/repositories/my_qml_fork/qml/test/data/qm7_testdata.npz" + input_data = test_dir + "/data/qm7_testdata.npz" data = np.load(input_data) xyzs = data["arr_0"] From fdc485e03c3d613339d930f25a5923ed8fba6b2f Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 3 Oct 2018 14:34:38 +0100 Subject: [PATCH 55/59] Readded changes to tests --- test/test_fchl_electric_field.py | 5 +++-- test/test_fchl_force.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_fchl_electric_field.py b/test/test_fchl_electric_field.py index 1d896aa4e..504fa0f97 100644 --- a/test/test_fchl_electric_field.py +++ b/test/test_fchl_electric_field.py @@ -235,14 +235,15 @@ def test_multiple_operators(): def test_generate_representation(): - + test_dir = os.path.dirname(os.path.realpath(__file__)) + coords = np.array([[1.464, 0.707, 1.056], [0.878, 1.218, 0.498], [2.319, 1.126, 0.952]]) nuclear_charges = np.array([8, 1, 1], dtype=np.int32) - rep_ref = np.loadtxt("data/fchl_ef_rep.txt").reshape((3, 6, 3)) + rep_ref = np.loadtxt(test_dir + "/data/fchl_ef_rep.txt").reshape((3, 6, 3)) # Test with fictitious charges from a numpy array fic_charges1 = np.array([-0.41046649, 0.20523324, 0.20523324]) diff --git a/test/test_fchl_force.py b/test/test_fchl_force.py index 0fb74b1b2..5782f4f99 100644 --- a/test/test_fchl_force.py +++ b/test/test_fchl_force.py @@ -2,6 +2,7 @@ import ast import time +import os import scipy import scipy.stats @@ -31,9 +32,11 @@ from qml.fchl import get_atomic_local_gradient_5point_kernels from qml.fchl import get_atomic_local_kernels +test_dir = os.path.dirname(os.path.realpath(__file__)) + FORCE_KEY = "forces" ENERGY_KEY = "om2_energy" -CSV_FILE = "data/amons_small.csv" +CSV_FILE = test_dir + "/data/amons_small.csv" SIGMAS = [0.64] TRAINING = 13 From 255ea74f34782f28a5cea774a1ecf59cb5aea9c3 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 3 Oct 2018 16:34:26 +0100 Subject: [PATCH 56/59] Modifications 
after code review --- examples/ARMP_qm7.py | 2 +- qml/aglaia/aglaia.py | 40 +++++------ qml/aglaia/graceful_killer.py | 20 ++++-- qml/aglaia/np_symm_funct.py | 36 +--------- qml/aglaia/symm_funct.py | 121 +++++++++++++++++++++++----------- qml/qmlearn/preprocessing.py | 4 +- qml/utils/utils.py | 1 - test/test_acsf.py | 8 +-- test/test_symm_funct.py | 2 +- 9 files changed, 123 insertions(+), 111 deletions(-) diff --git a/examples/ARMP_qm7.py b/examples/ARMP_qm7.py index 43fc9afb2..ba9a2cea9 100644 --- a/examples/ARMP_qm7.py +++ b/examples/ARMP_qm7.py @@ -2,7 +2,7 @@ This example shows how to use ARMP to overfit 100 data-points for the QM7 data set. It uses the Atom Centred Symmetry functions as the representation. -This example takes about 3.5 min to run. +This example takes about 3.5 min to run on a mac. """ from qml.aglaia.aglaia import ARMP diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 7167bf64b..a7d22c386 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -30,7 +30,7 @@ import tensorflow as tf from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error from sklearn.base import BaseEstimator -from .symm_funct import generate_parkhill_acsf +from .symm_funct import generate_acsf_tf from ..utils.utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \ is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, \ check_global_representation, check_y, check_sizes, check_dy, check_classes, is_numeric_array, is_non_zero_integer, \ @@ -38,7 +38,7 @@ from qml.aglaia.tf_utils import TensorBoardLogger from qml.representations import generate_acsf -from qml.aglaia.graceful_killer import GracefulKiller +from qml.aglaia.graceful_killer import _GracefulKiller from qml.data import Compound from qml import representations as qml_rep @@ -1689,13 +1689,13 @@ def _generate_acsf_tf(self, xyz, classes): iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) batch_xyz, batch_zs = iterator.get_next() - representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, - rcut=self.acsf_parameters['rcut'], - acut=self.acsf_parameters['acut'], - nRs2=self.acsf_parameters['nRs2'], - nRs3=self.acsf_parameters['nRs3'], - nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + representation = generate_acsf_tf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -2280,7 +2280,7 @@ def _fit_from_scratch(self, x, y, dy, classes): self.session.run(init) # Initialising the object that enables graceful killing of the training - killer = GracefulKiller() + killer = _GracefulKiller() for i in range(self.iterations): @@ -2365,7 +2365,7 @@ def _fit_from_loaded(self, x, y, dy, classes): dataset_init_op = graph.get_operation_by_name("dataset_init") # Initialising the object that enables graceful killing of the training - killer = GracefulKiller() + killer = _GracefulKiller() for i in range(self.iterations): @@ -2415,15 +2415,15 @@ def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): iterator_init = iterator.make_initializer(dataset, 
name="dataset_init_pred") with tf.name_scope("Descriptor_pred"): - batch_representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=self.elements, - element_pairs=self.element_pairs, - rcut=self.acsf_parameters['rcut'], - acut=self.acsf_parameters['acut'], - nRs2=self.acsf_parameters['nRs2'], - nRs3=self.acsf_parameters['nRs3'], - nTs=self.acsf_parameters['nTs'], - eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + batch_representation = generate_acsf_tf(xyzs=batch_xyz, Zs=batch_zs, elements=self.elements, + element_pairs=self.element_pairs, + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], + eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) with tf.name_scope("Model_pred"): batch_energies_nn = self._model(batch_representation, batch_zs, element_weights, element_biases) diff --git a/qml/aglaia/graceful_killer.py b/qml/aglaia/graceful_killer.py index f4d4f5009..d140008e2 100644 --- a/qml/aglaia/graceful_killer.py +++ b/qml/aglaia/graceful_killer.py @@ -1,11 +1,17 @@ +""" +Code adapted from Stack Overflow answer https://stackoverflow.com/a/31464349/7146757 + +This class is used to process SIGTERM signal gracefully from within the neural network training. +""" + import signal -class GracefulKiller: - kill_now = False +class _GracefulKiller: + kill_now = False - def __init__(self): - signal.signal(signal.SIGINT, self.exit_gracefully) - signal.signal(signal.SIGTERM, self.exit_gracefully) + def __init__(self): + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) - def exit_gracefully(self,signum, frame): - self.kill_now = True \ No newline at end of file + def exit_gracefully(self,signum, frame): + self.kill_now = True \ No newline at end of file diff --git a/qml/aglaia/np_symm_funct.py b/qml/aglaia/np_symm_funct.py index b82c1ea84..5de18bd98 100644 --- a/qml/aglaia/np_symm_funct.py +++ b/qml/aglaia/np_symm_funct.py @@ -187,8 +187,8 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, return np.asarray(total_descriptor) -def generate_acsf(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, - nRs3, nTs, zeta, eta): +def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, + nRs3, nTs, zeta, eta): """ This function calculates the symmetry functions used in the tensormol paper. 
@@ -216,35 +216,3 @@ def generate_acsf(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, acsf = np.concatenate((rad_term, ang_term), axis=-1) return acsf - -if __name__ == "__main__": - xyzs = np.array([[[0.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0]]]) - - zs = [[7, 2, 1, 1]] - - elements = [1, 2, 7] - element_pairs = [[1, 1], [2, 1], [7, 1], [7, 2]] - - # # input_data = "/Volumes/Transcend/repositories/Aglaia/aglaia/tests/data_test_acsf.npz" - # input_data = "/Volumes/Transcend/repositories/Aglaia/aglaia/tests/qm7_testdata.npz" - # data = np.load(input_data) - # - # xyzs = data["arr_0"] - # zs = data["arr_1"] - # elements = data["arr_2"] - # element_pairs = data["arr_3"] - - radial_cutoff = 1000.0 - angular_cutoff = 1000.0 - radial_rs = [0.0, 1.0] - angular_rs = [0.0, 1.0] - theta_s = [np.pi, np.pi * 0.5] - zeta = 0.0 - eta = 1.0 - - rad_term = acsf_rad(xyzs, zs, elements, radial_cutoff, nRs2, eta) - ang_term = acsf_ang(xyzs, zs, element_pairs, angular_cutoff, nRs3, nTs, zeta, eta) - print(rad_term) diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py index 276eeedca..84c1eaaad 100644 --- a/qml/aglaia/symm_funct.py +++ b/qml/aglaia/symm_funct.py @@ -37,13 +37,19 @@ def acsf_rad(xyzs, Zs, radial_cutoff, radial_rs, eta): This does the radial part of the symmetry function (G2 function in Behler's papers). It works only for datasets where all samples are the same molecule but in different configurations. - :param xyzs: tf tensor of shape (n_samples, n_atoms, 3) contaning the coordinates of each atom in each data sample - :param Zs: tf tensor of shape (n_samples, n_atoms) containing the atomic number of each atom in each data sample - :param radial_cutoff: scalar tensor - :param radial_rs: tf tensor of shape (n_rs,) with the R_s values - :param eta: tf scalar - - :return: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tf tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param radial_cutoff: cut off used in the radial term + :type radial_cutoff: scalar tensor + :param radial_rs: R_s values + :type radial_rs: tf tensor of shape (n_rs,) + :param eta: parameter in the radial term + :type eta: tf scalar + + :return: pre-sum radial term + :rtype: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) """ # Calculating the distance matrix between the atoms of each sample @@ -92,14 +98,23 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): This does the angular part of the symmetry function as mentioned here: https://arxiv.org/pdf/1711.06385.pdf It only works for systems where all the samples are the same molecule but in different configurations. 
- :param xyzs: tf tensor of shape (n_samples, n_atoms, 3) contaning the coordinates of each atom in each data sample - :param Zs: tf tensor of shape (n_samples, n_atoms) containing the atomic number of each atom in each data sample - :param angular_cutoff: scalar tensor - :param angular_rs: tf tensor of shape (n_ang_rs,) with the equivalent of the R_s values from the G2 - :param theta_s: tf tensor of shape (n_thetas,) - :param zeta: tf tensor of shape (1,) - :param eta: tf tensor of shape (1,) - :return: tf tensor of shape (n_samples, n_atoms, n_atoms, n_atoms, n_ang_rs * n_thetas) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tf tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param angular_cutoff: cut off used in the angular term + :type angular_cutoff: scalar tensor + :param angular_rs: angular R_s values + :type angular_rs: tf tensor of shape (n_ang_rs,) + :param theta_s: angular theta_s values + :type theta_s: tf tensor of shape (n_thetas,) + :param zeta: parameter in the angular term + :type zeta: tf tensor of shape (1,) + :param eta: parameter in the angular term + :type eta: tf tensor of shape (1,) + + :return: pre-sum angular term + :rtype: tf tensor of shape (n_samples, n_atoms, n_atoms, n_atoms, n_ang_rs * n_thetas) """ # Finding the R_ij + R_ik term @@ -228,11 +243,17 @@ def sum_rad(pre_sum, Zs, elements_list, radial_rs): Sum of the terms in the radial part of the symmetry function. The terms corresponding to the same neighbour identity are summed together. - :param pre_sum: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) - :param Zs: tf tensor of shape (n_samples, n_atoms) - :param elements_list: np.array of shape (n_elements,) - :param radial_rs: tf tensor of shape (n_rad_rs,) - :return: tf tensor of shape (n_samples, n_atoms, n_rad_rd * n_elements) + :param pre_sum: pre-sum radial term + :type pre_sum: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param elements_list: unique atoms in all of the samples + :type elements_list: np.array of shape (n_elements,) + :param radial_rs: radial R_s values + :type radial_rs: tf tensor of shape (n_rad_rs,) + + :return: the radial term + :rtype: tf tensor of shape (n_samples, n_atoms, n_rad_rd * n_elements) """ n_atoms = Zs.get_shape().as_list()[1] n_elements = len(elements_list) @@ -270,12 +291,19 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): This function does the sum of the terms in the radial part of the symmetry function. Three body interactions where the two neighbours are the same elements are summed together. 
- :param pre_sumterm: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas) - :param Zs: tf tensor of shape (n_samples, n_atoms) - :param element_pairs_list: np array of shape (n_elementpairs, 2) - :param angular_rs: tf tensor of shape (n_ang_rs,) - :param theta_s: tf tensor of shape (n_thetas,) - :return: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas * n_elementpairs) + :param pre_sumterm: pre-sum angular term + :type pre_sumterm: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param element_pairs_list: list of all the atom element pairs + :type element_pairs_list: np array of shape (n_elementpairs, 2) + :param angular_rs: angular R_s values + :type angular_rs: tf tensor of shape (n_ang_rs,) + :param theta_s: angular theta_s values + :type theta_s: tf tensor of shape (n_thetas,) + + :return: the angular term + :rtype: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas * n_elementpairs) """ n_atoms = Zs.get_shape().as_list()[1] @@ -352,26 +380,39 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): return clean_final_term -def generate_parkhill_acsf(xyzs, Zs, elements, element_pairs, rcut, acut, - nRs2, nRs3, nTs, zeta, eta): +def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut, + nRs2, nRs3, nTs, zeta, eta): """ This function generates the atom centred symmetry function as used in the Tensormol paper. Currently only tested for single systems with many conformations. It requires the coordinates of all the atoms in each data sample, the atomic charges for each atom (in the same order as the xyz), the overall elements and overall element pairs. Then it requires the parameters for the ACSF that are used in the Tensormol paper: https://arxiv.org/pdf/1711.06385.pdf - :param xyzs: tensor of shape (n_samples, n_atoms, 3) - :param Zs: tensor of shape (n_samples, n_atoms) - :param elements: np.array of shape (n_elements,) - :param element_pairs: np.array of shape (n_elementpairs, 2) - :param rcut: scalar float - :param acut: scalar float - :param nRs2: positive integer - :param nRs3: positive integer - :param nTs: positive integer - :param zeta: scalar float - :param eta2: scalar float - :return: a tf tensor of shape a tf tensor of shape (n_samples, n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tensor of shape (n_samples, n_atoms) + :param elements: unique atoms in all of the samples + :type elements: np.array of shape (n_elements,) + :param element_pairs: list of all the atom element pairs + :type element_pairs: np.array of shape (n_elementpairs, 2) + :param rcut: radial cut off length + :type rcut: scalar float + :param acut: angular cut off length + :type acut: scalar float + :param nRs2: number of distance bins to use in the radial term + :type nRs2: positive integer + :param nRs3: number of distance bins to use in the angular term + :type nRs3: positive integer + :param nTs: number of angle bins to use in the angular term + :type nTs: positive integer + :param zeta: parameter in the cosine term + :type zeta: scalar float + :param eta: parameter in the exponential terms + :type eta: scalar float + + :return: the atom centred symmetry functions + :rtype: a tf tensor of shape a tf tensor of shape (n_samples, 
n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs) """ radial_rs = np.linspace(0, rcut, nRs2) diff --git a/qml/qmlearn/preprocessing.py b/qml/qmlearn/preprocessing.py index 62ce5765e..cab329075 100644 --- a/qml/qmlearn/preprocessing.py +++ b/qml/qmlearn/preprocessing.py @@ -27,7 +27,6 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression -from sklearn.linear_model import Ridge from .data import Data from ..utils import is_numeric_array, get_unique, is_positive_integer_or_zero_array @@ -51,8 +50,7 @@ def __init__(self, data=None, elements='auto'): self.elements = elements # Initialize model - # self.model = LinearRegression() - self.model = Ridge(alpha=1e-6) + self.model = LinearRegression() def _preprocess_input(self, X): """ diff --git a/qml/utils/utils.py b/qml/utils/utils.py index 10197d328..ce510c74c 100644 --- a/qml/utils/utils.py +++ b/qml/utils/utils.py @@ -22,7 +22,6 @@ import numpy as np -import tensorflow as tf def is_positive(x): return (not is_array_like(x) and is_numeric(x) and x > 0) diff --git a/test/test_acsf.py b/test/test_acsf.py index 9e6f90c15..fb29dd006 100644 --- a/test/test_acsf.py +++ b/test/test_acsf.py @@ -65,13 +65,13 @@ def test_acsf_1(): zs_tf = tf.placeholder(shape=[n_samples, n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) sess = tf.Session() sess.run(tf.global_variables_initializer()) acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) - acsf_np = np_symm_funct.generate_acsf(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) n_samples = xyzs.shape[0] n_atoms = xyzs.shape[1] @@ -113,13 +113,13 @@ def test_acsf_2(): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) sess = tf.Session() sess.run(tf.global_variables_initializer()) acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) - acsf_np = np_symm_funct.generate_acsf(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) for i in range(n_samples): for j in range(max_n_atoms): diff --git a/test/test_symm_funct.py b/test/test_symm_funct.py index 7a941d3ed..5f13bb201 100644 --- a/test/test_symm_funct.py +++ b/test/test_symm_funct.py @@ -128,7 +128,7 @@ def tf_acsf(mols, path, elements): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta) + acsf_tf_t = 
symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta) sess = tf.Session() sess.run(tf.global_variables_initializer()) From e2feecfb559073c79647fec9039c97fd2e2232e9 Mon Sep 17 00:00:00 2001 From: SilviaAmAm Date: Tue, 11 Dec 2018 15:23:21 +0000 Subject: [PATCH 57/59] Version with the ACSF basis functions starting at 0.8 A --- qml/representations/representations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qml/representations/representations.py b/qml/representations/representations.py index d3c83ee86..70f9f4cd9 100644 --- a/qml/representations/representations.py +++ b/qml/representations/representations.py @@ -582,8 +582,8 @@ def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 = :rtype: numpy array """ - Rs2 = np.linspace(0, rcut, nRs2) - Rs3 = np.linspace(0, acut, nRs3) + Rs2 = np.linspace(0.8, rcut, nRs2) + Rs3 = np.linspace(0.8, acut, nRs3) Ts = np.linspace(0, np.pi, nTs) n_elements = len(elements) natoms = len(coordinates) From 9584817ec843d2f32dc620b3786b646bdef8cb23 Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 12 Dec 2018 09:48:21 +0000 Subject: [PATCH 58/59] Updated ACSF representations so that the minimum distance at which to start the binning can be set by the user --- qml/aglaia/aglaia.py | 17 ++++++++++++----- qml/aglaia/np_symm_funct.py | 7 ++++--- qml/aglaia/symm_funct.py | 8 +++++--- qml/data/compound.py | 8 +++++--- qml/representations/representations.py | 8 +++++--- test/test_acsf.py | 11 ++++++----- test/test_symm_funct.py | 13 +++++++------ 7 files changed, 44 insertions(+), 28 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index a7d22c386..4f3b7c9b0 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -594,7 +594,7 @@ def _set_acsf_parameters(self, params): """ self.acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, - 'zeta': 220.127, 'eta': 30.8065} + 'zeta': 220.127, 'eta': 30.8065, 'min': 0.8} if params is not None: for key, value in params.items(): @@ -842,6 +842,10 @@ def _check_acsf_values(self): if is_numeric_array(self.acsf_parameters['zeta']): raise InputError("Expecting a scalar value for zeta. Got %s." % (self.acsf_parameters['zeta'])) + if not is_positive_or_zero(self.acsf_parameters['min']): + raise InputError( + "Expected positive or zero float for variable 'min'. Got %s." % str(self.acsf_parameters['min'])) + def _get_msize(self, pad = 0): """ Gets the maximum number of atoms in a single molecule. 
To support larger molecules @@ -1695,7 +1699,8 @@ def _generate_acsf_tf(self, xyz, classes): nRs2=self.acsf_parameters['nRs2'], nRs3=self.acsf_parameters['nRs3'], nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + zeta=self.acsf_parameters['zeta'], + min=self.acsf_parameters['min']) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -1765,7 +1770,8 @@ def _generate_acsf_fortran(self, xyz, classes): nTs=self.acsf_parameters['nTs'], eta2=self.acsf_parameters['eta'], eta3=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + zeta=self.acsf_parameters['zeta'], + min=self.acsf_parameters['min']) padded_g = np.zeros((initial_natoms, g.shape[-1])) padded_g[:g.shape[0], :] = g @@ -2121,7 +2127,7 @@ def _check_representation_parameters(self, parameters): elif self.representation_name == "acsf": acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, - 'zeta': 220.127, 'eta': 30.8065} + 'zeta': 220.127, 'eta': 30.8065, 'min':0.8} for key, value in parameters.items(): try: @@ -2423,7 +2429,8 @@ def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): nRs3=self.acsf_parameters['nRs3'], nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + zeta=self.acsf_parameters['zeta'], + min=self.acsf_parameters['min']) with tf.name_scope("Model_pred"): batch_energies_nn = self._model(batch_representation, batch_zs, element_weights, element_biases) diff --git a/qml/aglaia/np_symm_funct.py b/qml/aglaia/np_symm_funct.py index 5de18bd98..49af895e4 100644 --- a/qml/aglaia/np_symm_funct.py +++ b/qml/aglaia/np_symm_funct.py @@ -188,7 +188,7 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, return np.asarray(total_descriptor) def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, - nRs3, nTs, zeta, eta): + nRs3, nTs, zeta, eta, min): """ This function calculates the symmetry functions used in the tensormol paper. @@ -203,11 +203,12 @@ def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, :param theta_s: list of all the thetas parameters. Numpy array of shape (n_thetas,) :param zeta: parameter. scalar. :param eta: parameter. scalar. + :param min: value at which to start the binning of the distances :return: numpy array of shape (n_samples, n_atoms, n_rad_rs*n_elements + n_ang_rs*n_thetas*n_element_pairs) """ - radial_rs = np.linspace(0, rcut, nRs2) - angular_rs = np.linspace(0, acut, nRs3) + radial_rs = np.linspace(min, rcut, nRs2) + angular_rs = np.linspace(min, acut, nRs3) theta_s = np.linspace(0, np.pi, nTs) rad_term = acsf_rad(xyzs, Zs, elements, rcut, radial_rs, eta) diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py index 84c1eaaad..bc9158702 100644 --- a/qml/aglaia/symm_funct.py +++ b/qml/aglaia/symm_funct.py @@ -381,7 +381,7 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): return clean_final_term def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut, - nRs2, nRs3, nTs, zeta, eta): + nRs2, nRs3, nTs, zeta, eta, min): """ This function generates the atom centred symmetry function as used in the Tensormol paper. Currently only tested for single systems with many conformations. 
It requires the coordinates of all the atoms in each data sample, the atomic
@@ -410,13 +410,15 @@ def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut,
 :type zeta: scalar float
 :param eta: parameter in the exponential terms
 :type eta: scalar float
+ :param min: the value at which to start binning the distances
+ :type min: positive float
 :return: the atom centred symmetry functions
 :rtype: a tf tensor of shape (n_samples,
 n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs)
 """
- radial_rs = np.linspace(0, rcut, nRs2)
- angular_rs = np.linspace(0, acut, nRs3)
+ radial_rs = np.linspace(min, rcut, nRs2)
+ angular_rs = np.linspace(min, acut, nRs3)
 theta_s = np.linspace(0, np.pi, nTs)
 with tf.name_scope("acsf_params"):
diff --git a/qml/data/compound.py b/qml/data/compound.py
index 2c95f5348..3f223597a 100644
--- a/qml/data/compound.py
+++ b/qml/data/compound.py
@@ -295,7 +295,7 @@ def generate_slatm(self, mbtypes,
 self.representation = slatm
 def generate_acsf(self, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, eta2 = 1,
- eta3 = 1, zeta = 1, rcut = 5, acut = 5, gradients = False):
+ eta3 = 1, zeta = 1, rcut = 5, acut = 5, min=0.8, gradients = False):
 """
 Generate the variant of atom-centered symmetry functions used in https://doi.org/10.1039/C7SC04934J
@@ -317,12 +317,14 @@ def generate_acsf(self, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, et
 :type rcut: float
 :param acut: Cut-off radius of the three-body terms
 :type acut: float
+ :param min: the value at which to start binning the distances
+ :type min: positive float
 :param gradients: To return gradients or not
 :type gradients: boolean
 """
- Rs2 = np.linspace(0, rcut, nRs2)
- Rs3 = np.linspace(0, acut, nRs3)
+ Rs2 = np.linspace(min, rcut, nRs2)
+ Rs3 = np.linspace(min, acut, nRs3)
 Ts = np.linspace(0, np.pi, nTs)
 n_elements = len(elements)
 natoms = len(self.coordinates)
diff --git a/qml/representations/representations.py b/qml/representations/representations.py
index 70f9f4cd9..92432831b 100644
--- a/qml/representations/representations.py
+++ b/qml/representations/representations.py
@@ -550,7 +550,7 @@ def generate_slatm(coordinates, nuclear_charges, mbtypes,
 return mbs
 def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, eta2 = 1,
- eta3 = 1, zeta = 1, rcut = 5, acut = 5, gradients = False):
+ eta3 = 1, zeta = 1, rcut = 5, acut = 5, min=0.8, gradients = False):
 """
 Generate the variant of atom-centered symmetry functions used in https://doi.org/10.1039/C7SC04934J
@@ -576,14 +576,16 @@ def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 =
 :type rcut: float
 :param acut: Cut-off radius of the three-body terms
 :type acut: float
+ :param min: the value at which to start binning the distances
+ :type min: positive float
 :param gradients: To return gradients or not
 :type gradients: boolean
 :return: Atom-centered symmetry functions representation
 :rtype: numpy array
 """
- Rs2 = np.linspace(0.8, rcut, nRs2)
- Rs3 = np.linspace(0.8, acut, nRs3)
+ Rs2 = np.linspace(min, rcut, nRs2)
+ Rs3 = np.linspace(min, acut, nRs3)
 Ts = np.linspace(0, np.pi, nTs)
 n_elements = len(elements)
 natoms = len(coordinates)
diff --git a/test/test_acsf.py b/test/test_acsf.py
index fb29dd006..d74f1c761 100644
--- a/test/test_acsf.py
+++ b/test/test_acsf.py
@@ -49,6 +49,7 @@ def test_acsf_1():
 acut = 5
 zeta = 220.127
 eta = 30.8065
+ min = 0.0
 input_data = test_dir + "/data/data_test_acsf.npz"
 data = np.load(input_data)
@@ -65,13 +66,13 @@ def
test_acsf_1(): zs_tf = tf.placeholder(shape=[n_samples, n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min) sess = tf.Session() sess.run(tf.global_variables_initializer()) acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) - acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min) n_samples = xyzs.shape[0] n_atoms = xyzs.shape[1] @@ -97,6 +98,7 @@ def test_acsf_2(): acut = 5 zeta = 220.127 eta = 30.8065 + min = 0.0 input_data = test_dir + "/data/qm7_testdata.npz" data = np.load(input_data) @@ -113,13 +115,13 @@ def test_acsf_2(): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min) sess = tf.Session() sess.run(tf.global_variables_initializer()) acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) - acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min) for i in range(n_samples): for j in range(max_n_atoms): @@ -130,7 +132,6 @@ def test_acsf_2(): acsf_tf_sort = np.sort(acsf_tf[i][j]) np.testing.assert_array_almost_equal(acsf_np_sort, acsf_tf_sort, decimal=4) - if __name__ == "__main__": test_acsf_1() test_acsf_2() \ No newline at end of file diff --git a/test/test_symm_funct.py b/test/test_symm_funct.py index 5f13bb201..ef3781cb8 100644 --- a/test/test_symm_funct.py +++ b/test/test_symm_funct.py @@ -63,7 +63,7 @@ def test_acsf(): "qm7/0110.xyz"] - path = test_dir = os.path.dirname(os.path.realpath(__file__)) + path = os.path.dirname(os.path.realpath(__file__)) mols = [] for xyz_file in files: @@ -85,7 +85,7 @@ def fort_acsf(mols, path, elements): # Generate atom centered symmetry functions representation # using the Compound class for i, mol in enumerate(mols): - mol.generate_acsf(elements = elements) + mol.generate_acsf(elements = elements, min=0.0) X_test = np.concatenate([mol.representation for mol in mols]) X_ref = np.loadtxt(path + "/data/acsf_representation.txt") @@ -97,7 +97,7 @@ def fort_acsf(mols, path, elements): for i, mol in enumerate(mols): rep.append(generate_acsf(coordinates = mol.coordinates, nuclear_charges = mol.nuclear_charges, - elements = elements)) + elements = elements, min=0.0)) X_test = np.concatenate(rep) X_ref = np.loadtxt(path + "/data/acsf_representation.txt") @@ -111,6 +111,7 @@ def tf_acsf(mols, path, elements): n_theta_s = 3 zeta = 1.0 eta = 1.0 + min=0.0 element_pairs = [] for i, ei in enumerate(elements): @@ -128,7 +129,7 @@ def tf_acsf(mols, path, elements): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 
3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta, min) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -142,7 +143,7 @@ def fort_acsf_gradients(mols, path, elements): # Generate atom centered symmetry functions representation # and gradients using the Compound class for i, mol in enumerate(mols): - mol.generate_acsf(elements = elements, gradients = True) + mol.generate_acsf(elements = elements, gradients = True, min=0.0) X_test = np.concatenate([mol.representation for mol in mols]) X_ref = np.loadtxt(path + "/data/acsf_representation.txt") @@ -159,7 +160,7 @@ def fort_acsf_gradients(mols, path, elements): grad = [] for i, mol in enumerate(mols): r, g = generate_acsf(coordinates = mol.coordinates, nuclear_charges = mol.nuclear_charges, - elements = elements, gradients = True) + elements = elements, gradients = True, min=0.0) rep.append(r) grad.append(g) From 0639a32a87ad6ec5ff913a47abc708a2111fe77f Mon Sep 17 00:00:00 2001 From: Silvia Amabilino Date: Wed, 12 Dec 2018 10:23:57 +0000 Subject: [PATCH 59/59] Modified the name of the new parameter (minimum distance of the binning in ACSF) --- qml/aglaia/aglaia.py | 14 +++++++------- qml/aglaia/np_symm_funct.py | 8 ++++---- qml/aglaia/symm_funct.py | 10 +++++----- qml/data/compound.py | 10 +++++----- qml/representations/representations.py | 12 ++++++------ test/test_acsf.py | 12 ++++++------ test/test_symm_funct.py | 14 +++++++------- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 4f3b7c9b0..c345821c7 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -594,7 +594,7 @@ def _set_acsf_parameters(self, params): """ self.acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, - 'zeta': 220.127, 'eta': 30.8065, 'min': 0.8} + 'zeta': 220.127, 'eta': 30.8065, 'bin_min': 0.8} if params is not None: for key, value in params.items(): @@ -842,9 +842,9 @@ def _check_acsf_values(self): if is_numeric_array(self.acsf_parameters['zeta']): raise InputError("Expecting a scalar value for zeta. Got %s." % (self.acsf_parameters['zeta'])) - if not is_positive_or_zero(self.acsf_parameters['min']): + if not is_positive_or_zero(self.acsf_parameters['bin_min']): raise InputError( - "Expected positive or zero float for variable 'min'. Got %s." % str(self.acsf_parameters['min'])) + "Expected positive or zero float for variable 'bin_min'. Got %s." 
% str(self.acsf_parameters['bin_min'])) def _get_msize(self, pad = 0): """ @@ -1700,7 +1700,7 @@ def _generate_acsf_tf(self, xyz, classes): nRs3=self.acsf_parameters['nRs3'], nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], zeta=self.acsf_parameters['zeta'], - min=self.acsf_parameters['min']) + bin_min=self.acsf_parameters['bin_min']) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -1771,7 +1771,7 @@ def _generate_acsf_fortran(self, xyz, classes): eta2=self.acsf_parameters['eta'], eta3=self.acsf_parameters['eta'], zeta=self.acsf_parameters['zeta'], - min=self.acsf_parameters['min']) + bin_min=self.acsf_parameters['bin_min']) padded_g = np.zeros((initial_natoms, g.shape[-1])) padded_g[:g.shape[0], :] = g @@ -2127,7 +2127,7 @@ def _check_representation_parameters(self, parameters): elif self.representation_name == "acsf": acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, - 'zeta': 220.127, 'eta': 30.8065, 'min':0.8} + 'zeta': 220.127, 'eta': 30.8065, 'bin_min':0.8} for key, value in parameters.items(): try: @@ -2430,7 +2430,7 @@ def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], zeta=self.acsf_parameters['zeta'], - min=self.acsf_parameters['min']) + bin_min=self.acsf_parameters['bin_min']) with tf.name_scope("Model_pred"): batch_energies_nn = self._model(batch_representation, batch_zs, element_weights, element_biases) diff --git a/qml/aglaia/np_symm_funct.py b/qml/aglaia/np_symm_funct.py index 49af895e4..83030b6db 100644 --- a/qml/aglaia/np_symm_funct.py +++ b/qml/aglaia/np_symm_funct.py @@ -188,7 +188,7 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, return np.asarray(total_descriptor) def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, - nRs3, nTs, zeta, eta, min): + nRs3, nTs, zeta, eta, bin_min): """ This function calculates the symmetry functions used in the tensormol paper. @@ -203,12 +203,12 @@ def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, :param theta_s: list of all the thetas parameters. Numpy array of shape (n_thetas,) :param zeta: parameter. scalar. :param eta: parameter. scalar. - :param min: value at which to start the binning of the distances + :param bin_min: value at which to start the binning of the distances :return: numpy array of shape (n_samples, n_atoms, n_rad_rs*n_elements + n_ang_rs*n_thetas*n_element_pairs) """ - radial_rs = np.linspace(min, rcut, nRs2) - angular_rs = np.linspace(min, acut, nRs3) + radial_rs = np.linspace(bin_min, rcut, nRs2) + angular_rs = np.linspace(bin_min, acut, nRs3) theta_s = np.linspace(0, np.pi, nTs) rad_term = acsf_rad(xyzs, Zs, elements, rcut, radial_rs, eta) diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py index bc9158702..9917e1eb5 100644 --- a/qml/aglaia/symm_funct.py +++ b/qml/aglaia/symm_funct.py @@ -381,7 +381,7 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): return clean_final_term def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut, - nRs2, nRs3, nTs, zeta, eta, min): + nRs2, nRs3, nTs, zeta, eta, bin_min): """ This function generates the atom centred symmetry function as used in the Tensormol paper. Currently only tested for single systems with many conformations. 
It requires the coordinates of all the atoms in each data sample, the atomic
@@ -410,15 +410,15 @@ def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut,
 :type zeta: scalar float
 :param eta: parameter in the exponential terms
 :type eta: scalar float
- :param min: the value at which to start binning the distances
- :type min: positive float
+ :param bin_min: the value at which to start binning the distances
+ :type bin_min: positive float
 :return: the atom centred symmetry functions
 :rtype: a tf tensor of shape (n_samples,
 n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs)
 """
- radial_rs = np.linspace(min, rcut, nRs2)
- angular_rs = np.linspace(min, acut, nRs3)
+ radial_rs = np.linspace(bin_min, rcut, nRs2)
+ angular_rs = np.linspace(bin_min, acut, nRs3)
 theta_s = np.linspace(0, np.pi, nTs)
 with tf.name_scope("acsf_params"):
diff --git a/qml/data/compound.py b/qml/data/compound.py
index 3f223597a..558ab6d84 100644
--- a/qml/data/compound.py
+++ b/qml/data/compound.py
@@ -295,7 +295,7 @@ def generate_slatm(self, mbtypes,
 self.representation = slatm
 def generate_acsf(self, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, eta2 = 1,
- eta3 = 1, zeta = 1, rcut = 5, acut = 5, min=0.8, gradients = False):
+ eta3 = 1, zeta = 1, rcut = 5, acut = 5, bin_min=0.8, gradients = False):
 """
 Generate the variant of atom-centered symmetry functions used in https://doi.org/10.1039/C7SC04934J
@@ -317,14 +317,14 @@ def generate_acsf(self, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, et
 :type rcut: float
 :param acut: Cut-off radius of the three-body terms
 :type acut: float
- :param min: the value at which to start binning the distances
- :type min: positive float
+ :param bin_min: the value at which to start binning the distances
+ :type bin_min: positive float
 :param gradients: To return gradients or not
 :type gradients: boolean
 """
- Rs2 = np.linspace(min, rcut, nRs2)
- Rs3 = np.linspace(min, acut, nRs3)
+ Rs2 = np.linspace(bin_min, rcut, nRs2)
+ Rs3 = np.linspace(bin_min, acut, nRs3)
 Ts = np.linspace(0, np.pi, nTs)
 n_elements = len(elements)
 natoms = len(self.coordinates)
diff --git a/qml/representations/representations.py b/qml/representations/representations.py
index 92432831b..a44ca7a88 100644
--- a/qml/representations/representations.py
+++ b/qml/representations/representations.py
@@ -549,8 +549,8 @@ def generate_slatm(coordinates, nuclear_charges, mbtypes,
 return mbs
-def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, eta2 = 1,
- eta3 = 1, zeta = 1, rcut = 5, acut = 5, min=0.8, gradients = False):
+def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 = 3, nRs3 = 3, nTs = 3, eta2 = 1,
+ eta3 = 1, zeta = 1, rcut = 5, acut = 5, bin_min=0.8, gradients = False):
 """
 Generate the variant of atom-centered symmetry functions used in https://doi.org/10.1039/C7SC04934J
@@ -576,16 +576,16 @@ def generate_acsf(nuclear_charges, coordinates, elements = [1,6,7,8,16], nRs2 =
 :type rcut: float
 :param acut: Cut-off radius of the three-body terms
 :type acut: float
- :param min: the value at which to start binning the distances
- :type min: positive float
+ :param bin_min: the value at which to start binning the distances
+ :type bin_min: positive float
 :param gradients: To return gradients or not
 :type gradients: boolean
 :return: Atom-centered symmetry functions representation
 :rtype: numpy array
 """
- Rs2 = np.linspace(min, rcut, nRs2)
- Rs3 = np.linspace(min, acut, nRs3)
+ Rs2 = np.linspace(bin_min, rcut, nRs2)
+ Rs3 = np.linspace(bin_min, acut, nRs3)
 Ts = np.linspace(0, np.pi, nTs)
 n_elements = len(elements)
 natoms = len(coordinates)
diff --git a/test/test_acsf.py b/test/test_acsf.py
index d74f1c761..9046c83f8 100644
--- a/test/test_acsf.py
+++ b/test/test_acsf.py
@@ -49,7 +49,7 @@ def test_acsf_1():
 acut = 5
 zeta = 220.127
 eta = 30.8065
- min = 0.0
+ bin_min = 0.0
 input_data = test_dir + "/data/data_test_acsf.npz"
 data = np.load(input_data)
@@ -66,13 +66,13 @@ def test_acsf_1():
 zs_tf = tf.placeholder(shape=[n_samples, n_atoms], dtype=tf.int32, name="zs")
 xyz_tf = tf.placeholder(shape=[n_samples, n_atoms, 3], dtype=tf.float32, name="xyz")
- acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min)
+ acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, bin_min)
 sess = tf.Session()
 sess.run(tf.global_variables_initializer())
 acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs})
- acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min)
+ acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, bin_min)
 n_samples = xyzs.shape[0]
 n_atoms = xyzs.shape[1]
@@ -98,7 +98,7 @@ def test_acsf_2():
 acut = 5
 zeta = 220.127
 eta = 30.8065
- min = 0.0
+ bin_min = 0.0
 input_data = test_dir + "/data/qm7_testdata.npz"
 data = np.load(input_data)
@@ -115,13 +115,13 @@ def test_acsf_2():
 zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs")
 xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz")
- acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min)
+ acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, bin_min)
 sess = tf.Session()
 sess.run(tf.global_variables_initializer())
 acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs})
- acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, min)
+ acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta, bin_min)
 for i in range(n_samples):
 for j in range(max_n_atoms):
diff --git a/test/test_symm_funct.py b/test/test_symm_funct.py
index ef3781cb8..962919560 100644
--- a/test/test_symm_funct.py
+++ b/test/test_symm_funct.py
@@ -85,7 +85,7 @@ def fort_acsf(mols, path, elements):
 # Generate atom centered symmetry functions representation
 # using the Compound class
 for i, mol in enumerate(mols):
- mol.generate_acsf(elements = elements, min=0.0)
+ mol.generate_acsf(elements = elements, bin_min=0.0)
 X_test = np.concatenate([mol.representation for mol in mols])
 X_ref = np.loadtxt(path + "/data/acsf_representation.txt")
@@ -96,8 +96,8 @@ def fort_acsf(mols, path, elements):
 rep = []
 for i, mol in enumerate(mols):
 rep.append(generate_acsf(coordinates = mol.coordinates,
- nuclear_charges = mol.nuclear_charges,
- elements = elements, min=0.0))
+ nuclear_charges = mol.nuclear_charges,
+ elements = elements, bin_min=0.0))
 X_test = np.concatenate(rep)
 X_ref = np.loadtxt(path + "/data/acsf_representation.txt")
@@ -111,7 +111,7 @@ def tf_acsf(mols, path, elements):
 n_theta_s = 3
 zeta = 1.0
 eta = 1.0
- min=0.0
+ bin_min=0.0
 element_pairs = []
 for i, ei in
enumerate(elements): @@ -129,7 +129,7 @@ def tf_acsf(mols, path, elements): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta, min) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta, bin_min) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -143,7 +143,7 @@ def fort_acsf_gradients(mols, path, elements): # Generate atom centered symmetry functions representation # and gradients using the Compound class for i, mol in enumerate(mols): - mol.generate_acsf(elements = elements, gradients = True, min=0.0) + mol.generate_acsf(elements = elements, gradients = True, bin_min=0.0) X_test = np.concatenate([mol.representation for mol in mols]) X_ref = np.loadtxt(path + "/data/acsf_representation.txt") @@ -160,7 +160,7 @@ def fort_acsf_gradients(mols, path, elements): grad = [] for i, mol in enumerate(mols): r, g = generate_acsf(coordinates = mol.coordinates, nuclear_charges = mol.nuclear_charges, - elements = elements, gradients = True, min=0.0) + elements = elements, gradients = True, bin_min=0.0) rep.append(r) grad.append(g)
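
For readers skimming the series: the substance of patches 57-59 is simply where the ACSF distance grids start. The sketch below is plain NumPy, independent of QML and TensorFlow; it reproduces the np.linspace calls from the diffs above using the default values set in _set_acsf_parameters, and the commented arrays are what NumPy actually returns.

import numpy as np

# Defaults from _set_acsf_parameters after patch 59
rcut, acut = 5.0, 5.0
nRs2, nRs3, nTs = 5, 5, 5
bin_min = 0.8

# Grid used before patch 57: the first radial basis function sits at
# r = 0, a separation no bonded atom pair ever reaches.
Rs2_old = np.linspace(0.0, rcut, nRs2)    # [0.   1.25 2.5  3.75 5.  ]

# Grid after patches 57-59: the start of the binning is user-settable,
# so all nRs2 basis functions cover the physically relevant range.
Rs2 = np.linspace(bin_min, rcut, nRs2)    # [0.8  1.85 2.9  3.95 5.  ]
Rs3 = np.linspace(bin_min, acut, nRs3)    # same shift for the angular term
Ts = np.linspace(0.0, np.pi, nTs)         # the angle grid is unchanged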
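A hedged usage sketch of the user-facing call after the renames: the keyword arguments follow the generate_acsf diff in qml/representations/representations.py above, while the water-like geometry and the qml.representations import path are illustrative assumptions rather than part of the patches.

import numpy as np
from qml.representations import generate_acsf  # assumes the usual package re-export

# Invented three-atom geometry (roughly water), coordinates in angstrom.
nuclear_charges = np.array([8, 1, 1])
coordinates = np.array([[ 0.00, 0.00, 0.00],
                        [ 0.96, 0.00, 0.00],
                        [-0.24, 0.93, 0.00]])

# bin_min=0.0 reproduces the pre-patch-57 binning; the new default is 0.8.
rep = generate_acsf(nuclear_charges, coordinates, elements=[1, 8],
                    nRs2=3, nRs3=3, nTs=3, eta2=1, eta3=1, zeta=1,
                    rcut=5, acut=5, bin_min=0.0)
print(rep.shape)  # (natoms, nRs2 * n_elements + nRs3 * nTs * n_element_pairs)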