
Commit 44b6c3e

Silviaandersx and Silvia authored and committed
MRMP changes (#112)
* Corrected small bug in predict function
* Started updating so that the model can be trained after it has been reloaded
* Minor modifications
* Updated model so one can predict from xyz, and disabled shuffling in training because it leads to a problem with predictions
* Fix for the problem of shuffling
* Added some tests to make sure the predictions work
* Fixed a tensorboard problem
* The saving of the model doesn't cause an error if the directory already exists
* Fixed a bug that made a test fail
* Modified the name of a parameter
* Made modifications to make the symmetry functions more numerically stable
* Added a hack that makes ARMP work with fortran ACSF when there are padded representations. Currently works *ONLY* when there is one molecule for the whole data set.
* Corrected bug in score function for padded molecules
* Changes that make the model work quickly even when there is padding.
* Fixed discrepancies between fortran and TF acsf
* Corrected bug in setting of ACSF parameters
* Attempt at fixing issue #10
* Another attempt at fixing #10
* Removed a pointless line
* Set-up
* Added the graceful killer
* Modifications which prevent installation from breaking on BC4
* Modification to add neural networks to qmlearn
* Fix for issue #8
* Random comment
* Started including the atomic model
* Made the atomic neural network work
* Fixed a bug with the indices
* Now training and predictions don't use the default graph, to avoid problems
* Uncommented examples
* Removed unique_elements in data class. This can be stored in the NN class, but I might reverse the change later.
* Made tensorflow an optional dependency. The reason for this approach is that pip would just auto-install tensorflow and you might want the GPU version or your own compiled one.
* Made is_numeric non-private and removed legacy code
* Added 1d array util function
* Removed QML check and moved functions from utils to tf_utils
* Support for linear models (no hidden layers)
* Fixed import bug in tf_utils
* Added text to explain that you are scoring on the training set
* Restructure, but elements are still not working. Sorted elements.
* Moved documentation from init to class
* Constant features will now be removed at fit/predict time
* Moved get_batch_size back into utils, since it doesn't depend on tf
* Made the NeuralNetwork class compliant with sklearn. Cannot be any transforms of the input data.
* Fixed tests that didn't pass
* Fixed mistake in checks of set_classes() in ARMP
* Started fixing ARMP bugs for QM7
* Fixed bug in padding and added examples that give low errors
* Attempted fix to make representations single precision
* Hot fix for AtomScaler
* Minor bug fixes
* More bug fixes to make sure tests run
* Fixed some tests that had failures
* Reverted the fchl tests to original
* Fixed path in acsf test
* Re-added changes to tests
* Modifications after code review
* Version with the ACSF basis functions starting at 0.8 A
* Updated ACSF representations so that the minimum distance at which to start the binning can be set by the user
* Modified the name of the new parameter (minimum distance of the binning in ACSF)
* Added a function to the AtomScaler that enables reverting back
* Relaxed tolerance in tests
* Fixed bug in the padding of the representation in the ARMP network used in the pipeline
* Made a modification to how the Fortran ACSF are generated that helps with how much memory is used. Currently only float32 ACSF are available.
* Added a check to make sure there are no NaNs in the representations
* Small mistake corrected in aglaia
* Fixed extra space before -lpthread flag
* Removed what I added
* Implemented MRMP representations from xyz
* Generate atomic slatm from data
* Fixed typo
* Fixed problem with slatm and ARMP
* Fixed bug for MRMP tensorboard logger
* Actually fixed the tensorboard bug for MRMP and added tests to catch future errors
* Fixed another tensorboard bug
* Changed the behaviour of logging to tensorboard in MRMP
1 parent 3bd256b commit 44b6c3e
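
For orientation, here is a minimal usage sketch of the behaviour these changes target: fitting an MRMP model with tensorboard logging enabled, in the spirit of the new test_fit_4 added to test/test_mrmp.py below. The import path and the toy data are assumptions made for illustration, not something documented by this commit.

import shutil

import numpy as np

from qml.aglaia.aglaia import MRMP  # assumed import path, based on the file list below

# Toy data just to exercise the call: 20 "molecules" with a 10-feature
# representation each, and one energy per molecule.
descriptor = np.random.rand(20, 10)
energies = np.random.rand(20)

# Fit with tensorboard logging enabled, then clean up the log directory,
# mirroring the new test_fit_4 in test/test_mrmp.py.
estimator = MRMP(tensorboard=True, tensorboard_subdir="./tb_example")
estimator.fit(descriptor, energies)
shutil.rmtree("./tb_example")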

7 files changed: +110 -18 lines


qml/aglaia/aglaia.py

+39 -14

@@ -982,6 +982,22 @@ def _get_classes(self, indices):
 
         return np.asarray(zs, dtype=np.float32)
 
+    def _generate_compounds_from_data(self, xyz, classes):
+        """
+        This function generates the compounds from xyz data and nuclear charges.
+
+        :param xyz: cartesian coordinates
+        :type xyz: numpy array of shape (n_samples, n_atoms, 3)
+        :param classes: classes for atomic decomposition
+        :type classes: numpy array of shape (n_samples, n_atoms)
+        :return: array of compound objects
+        """
+        compounds = np.empty(xyz.shape[0], dtype=object)
+        for i in range(xyz.shape[0]):
+            compounds[i] = Compound()
+            compounds[i].set_compounds(xyz=xyz[i], zs=classes[i])
+        return compounds
+
     def predict(self, x, classes=None):
         """
         This function calls the predict function for either ARMP or MRMP.
@@ -1088,8 +1104,13 @@ def _generate_representations_from_data(self, xyz, classes, method):
         :type method: string
         :return: numpy array of shape (n_samples, n_features) and None
         """
-        # TODO implement
-        raise InputError("Not implemented yet. Use compounds.")
+
+        if method != "fortran":
+            raise NotImplementedError
+
+        self.compounds = self._generate_compounds_from_data(xyz, classes)
+
+        return self._generate_representations_from_compounds('fortran')
 
     def _generate_representations_from_compounds(self, method):
         """
@@ -1238,9 +1259,9 @@ def _fit(self, x, y, dy, classes):
                 opt, c = self.session.run([optimisation_op, cost], feed_dict=feed_dict)
                 avg_cost += c * batch_x.shape[0] / x_approved.shape[0]
 
-                if self.tensorboard:
+                if self.tensorboard and j == 0:
                     if i % self.tensorboard_logger_training.store_frequency == 0:
-                        self.tensorboard_logger_training.write_summary(self.session, feed_dict, i, j)
+                        self.tensorboard_logger_training.write_summary(self.session, i, feed_dict=feed_dict)
 
             self.training_cost.append(avg_cost)

@@ -1642,20 +1663,16 @@ def _generate_representations_from_data(self, xyz, classes, method):
         representation = None
 
         if self.representation_name == 'slatm':
-            # TODO implement
-            raise InputError("Slatm from data has not been implemented yet. Use Compounds.")
+            self.compounds = self._generate_compounds_from_data(xyz, classes)
+            representation, classes = self._generate_representations_from_compounds('fortran')
 
         elif self.representation_name == 'acsf':
             if method == 'tf':
                 representation = self._generate_acsf_tf(xyz, classes)
             else:
                 representation = self._generate_acsf_fortran(xyz, classes)
 
-        # Hotfix t make sure the representation is single precision
-        single_precision_representation = representation.astype(dtype=np.float32)
-        del representation
-
-        return single_precision_representation, classes
+        return representation, classes
 
     def _generate_acsf_tf(self, xyz, classes):
         """
@@ -1776,7 +1793,11 @@ def _generate_acsf_fortran(self, xyz, classes):
                 padded_g = np.zeros((initial_natoms, g.shape[-1]))
                 padded_g[:g.shape[0], :] = g
 
-                representation.append(padded_g)
+                # Hotfix to make sure the representation is single precision
+                single_precision_g = padded_g.astype(dtype=np.float32)
+                del padded_g
+
+                representation.append(single_precision_g)
 
             else:

@@ -1790,7 +1811,10 @@ def _generate_acsf_fortran(self, xyz, classes):
                                   eta3=self.acsf_parameters['eta'],
                                   zeta=self.acsf_parameters['zeta'])
 
-                representation.append(g)
+                single_precision_g = g.astype(dtype=np.float32)
+                del g
+
+                representation.append(single_precision_g)
 
         return np.asarray(representation)

@@ -2275,7 +2299,8 @@ def _fit_from_scratch(self, x, y, dy, classes):
         init = tf.global_variables_initializer()
         iterator_init = iterator.make_initializer(dataset, name="dataset_init")
 
-        self._build_model_from_xyz(self.n_atoms, element_weights, element_biases)
+        if self.representation_name == "acsf":
+            self._build_model_from_xyz(self.n_atoms, element_weights, element_biases)
 
         self.session = tf.Session()

qml/aglaia/tf_utils.py

+5 -2

@@ -55,10 +55,13 @@ def set_store_frequency(self, freq):
     def set_summary_writer(self, sess):
         self.summary_writer = tf.summary.FileWriter(logdir=self.path, graph=sess.graph)
 
-    def write_summary(self, sess, iteration):
+    def write_summary(self, sess, iteration, feed_dict=None):
 
         self.merged_summary = tf.summary.merge_all()
-        summary = sess.run(self.merged_summary)
+        if not isinstance(feed_dict, type(None)):
+            summary = sess.run(self.merged_summary, feed_dict)
+        else:
+            summary = sess.run(self.merged_summary)
         self.summary_writer.add_summary(summary, iteration)
         self.summary_writer.add_run_metadata(self.run_metadata, 'iteration %d' % (iteration))
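
The reason for the new optional feed_dict argument is that merged summaries which depend on graph placeholders cannot be evaluated without feeding those placeholders. The following is a standalone TensorFlow 1.x sketch of that situation; the placeholder, the cost and the log directory are illustrative only and are not taken from aglaia.

import numpy as np
import tensorflow as tf

# A summary that depends on a placeholder: running the merged summary
# without a feed_dict would fail, hence the optional argument above.
x = tf.placeholder(tf.float32, shape=(None, 3), name="x")
cost = tf.reduce_mean(tf.square(x))
tf.summary.scalar("cost", cost)

merged = tf.summary.merge_all()
with tf.Session() as sess:
    writer = tf.summary.FileWriter(logdir="./tb_sketch", graph=sess.graph)
    summary = sess.run(merged, feed_dict={x: np.random.rand(8, 3)})
    writer.add_summary(summary, 0)
    writer.close()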

qml/qmlearn/models.py

+2 -2

@@ -734,8 +734,8 @@ def _padding(self, representation, nuclear_charges):
             print("Trying to predict on larger molecules than given by the 'size' parameter at initialization")
             raise SystemExit
 
-        padded_rep = np.zeros((len(representation), max_n_atoms, representation[0].shape[1]))
-        padded_zs = np.zeros((len(representation), max_n_atoms))
+        padded_rep = np.zeros((len(representation), self.size, representation[0].shape[1]))
+        padded_zs = np.zeros((len(representation), self.size))
 
         for i in range(len(representation)):
             n_atoms = representation[i].shape[0]
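
The change above pads every representation to the fixed size chosen at initialization instead of to the largest molecule in the current batch, so arrays produced at predict time keep the shapes the network was built with. A small NumPy sketch of that padding, with purely illustrative shapes:

import numpy as np

size = 5  # maximum number of atoms fixed at model initialization
# Two molecules with 3 and 4 atoms, each atom described by 2 features.
representation = [np.ones((3, 2)), np.ones((4, 2))]
nuclear_charges = [np.array([6, 1, 1]), np.array([6, 1, 1, 1])]

padded_rep = np.zeros((len(representation), size, representation[0].shape[1]))
padded_zs = np.zeros((len(representation), size))

for i in range(len(representation)):
    n_atoms = representation[i].shape[0]
    padded_rep[i, :n_atoms] = representation[i]
    padded_zs[i, :n_atoms] = nuclear_charges[i]

print(padded_rep.shape, padded_zs.shape)  # (2, 5, 2) (2, 5)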

qml/qmlearn/representations.py

+5
@@ -710,6 +710,11 @@ def transform(self, X):
                 fgenerate_acsf(xyz, charge, self.elements, Rs, Rs, Ts,
                                eta, eta, zeta, self.cutoff, self.cutoff, n, size)))
 
+        # Check to make sure there are no NANs
+        # if np.any(np.isnan(representations)):
+        #     print("There are NANs in the representations.")
+        #     exit()
+
         data._representations = np.asarray(representations)
 
         return data

qml/utils/compound.py

+19
@@ -373,3 +373,22 @@ def read_xyz(self, filename):
             self.coordinates[i] = np.asarray(tokens[1:4], dtype=float)
 
         self.natypes = dict([(key, len(value)) for key,value in self.atomtype_indices.items()])
+
+    def set_compounds(self, xyz, zs):
+        """
+        Generating the compounds straight from XYZ rather than from the files.
+
+        :param xyz: coordinates
+        :type xyz: np array of shape (n_atoms, 3)
+        :param zs: nuclear charges
+        :type zs: np array of shape (n_atoms,)
+        :return: None
+        """
+
+        self.natoms = xyz.shape[0]
+        self.nuclear_charges = zs
+        self.coordinates = xyz
+        self.atomtypes = np.unique(zs)
+
+        self.name = "Compound"
+
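
A short usage sketch of the new method on a single molecule, again assuming the qml.utils.compound import path; the coordinates describe a rough, illustrative water-like geometry.

import numpy as np

from qml.utils.compound import Compound  # assumed import path

xyz = np.array([[0.00, 0.00, 0.00],
                [0.00, 0.00, 0.96],
                [0.93, 0.00, -0.24]])
zs = np.array([8, 1, 1])

mol = Compound()
mol.set_compounds(xyz=xyz, zs=zs)
print(mol.natoms, mol.atomtypes)  # 3 [1 8]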

test/test_armp.py

+23
@@ -162,6 +162,28 @@ def test_fit_3():
     estimator = ARMP()
     estimator.fit(x=descriptor, y=energies, classes=classes)
 
+def test_fit_4():
+    """
+    This function tests the second way of fitting the descriptor: the data is passed by storing the compounds in the
+    class.
+    """
+    test_dir = os.path.dirname(os.path.realpath(__file__))
+
+    data = np.load(test_dir + "/data/local_slatm_ch4cn_light.npz")
+    descriptor = data["arr_0"]
+    classes = data["arr_1"]
+    energies = data["arr_2"]
+
+    estimator = ARMP(tensorboard=True, tensorboard_subdir="./tb_test_4")
+    estimator.set_representations(representations=descriptor)
+    estimator.set_classes(classes=classes)
+    estimator.set_properties(energies)
+
+    idx = np.arange(0, 100)
+    estimator.fit(idx)
+
+    shutil.rmtree("./tb_test_4")
+
 def test_score_3():
     """
     This function tests that all the scoring functions work.

@@ -303,6 +325,7 @@ def test_retraining():
     test_fit_1()
     test_fit_2()
     test_fit_3()
+    test_fit_4()
     test_score_3()
     test_predict_3()
     test_predict_fromxyz()

test/test_mrmp.py

+17
@@ -169,6 +169,22 @@ def test_fit_3():
     estimator = MRMP()
     estimator.fit(descriptor, energies)
 
+def test_fit_4():
+    """
+    This function tests a third way of fitting the descriptor:
+    The data is passed directly to the fit function.
+    """
+    test_dir = os.path.dirname(os.path.realpath(__file__))
+
+    data = np.load(test_dir + "/data/CN_isopent_light_UCM.npz")
+    descriptor = data["arr_0"]
+    energies = data["arr_1"]
+
+    estimator = MRMP(tensorboard=True, tensorboard_subdir="./tb_test_4")
+    estimator.fit(descriptor, energies)
+
+    shutil.rmtree("./tb_test_4")
+
 def test_score():
     """
     This function tests that all the scoring functions work.

@@ -264,6 +280,7 @@ def test_load_external():
     test_fit_1()
     test_fit_2()
     test_fit_3()
+    test_fit_4()
     test_score()
     test_load_external()
     # test_get_params()
