diff --git a/README.rst b/README.rst index 6e88afed3..aede0c9b8 100644 --- a/README.rst +++ b/README.rst @@ -19,6 +19,8 @@ Current list of contributors: - Alexandre Tkatchenko (University of Luxembourg) - Klaus-Robert Muller (Technische Universitat Berlin/Korea University) - \O. Anatole von Lilienfeld (University of Basel) +- Peter Zaspel (University of Basel) +- Helmut Harbrecht (University of Basel) 1) Citing QML: -------------- diff --git a/examples/README.cqml b/examples/README.cqml new file mode 100644 index 000000000..cc0803876 --- /dev/null +++ b/examples/README.cqml @@ -0,0 +1,8 @@ +To be able to use the examples for the CQML approach, please first download +the file "DataSet_cqml.tgz" from + +https://doi.org/10.6084/m9.figshare.9130973 + +and decompress it in the "examples" folder by + +tar xvzf DataSet_cqml.tgz diff --git a/examples/cqml2d_CI9.py b/examples/cqml2d_CI9.py new file mode 100644 index 000000000..bc18ab06d --- /dev/null +++ b/examples/cqml2d_CI9.py @@ -0,0 +1,273 @@ +# MIT License +# +# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +from math import * +import os +import numpy as np +import sys +#import matplotlib.pyplot as plt + +import qml +from qml.models.cqml2d import * + + +trials = 20 + + +def write_data(filename, costs, errors): + outfile = open(filename,'w') + outfile.write("N error\n") + for i in range(len(errors)): + outfile.write("%d %e\n" % (costs[i],errors[i])) + outfile.close() + +# this test compares cqml2d with different numbers of level (PM7 + DFT + G4MP2 vs. DFT + G4MP2 vs. G4MP2) for the CI9 data set +# with Coulomb matrix representation, cf. 
Figure 6 in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml2d_coulomb():
+
+    #########################
+    # loading multilevel data
+    #########################
+
+    # getting structure for config of multilevel data
+    ml_data_cfg = multilevel_data_config()
+
+    # setting data files per level for the data which is to be learned
+    ml_data_cfg.data_files = ["DataSet_cqml2d_CI9/Y1.dat","DataSet_cqml2d_CI9/Y2.dat","DataSet_cqml2d_CI9/Y3.dat"]
+
+    # setting directories per level for the xyz files
+    ml_data_cfg.xyz_directories = ["DataSet_cqml2d_CI9/db/ci6k_PM7","DataSet_cqml2d_CI9/db/ci6k_B3LYP631G2DFP","DataSet_cqml2d_CI9/db/ci6k_G4MP2"]
+
+    # choose the representation
+    ml_data_cfg.representation_type = "coulomb_matrix"
+
+    # choose the number of compounds that shall be loaded
+    ml_data_cfg.N_load = 6095
+
+    # choose the number of levels to load
+    ml_data_cfg.level_count_load = 3
+
+    # load the multilevel data
+    ml_data = load_multilevel_data(ml_data_cfg)
+
+    # artificially impose costs for each level (in the future, these shall be delivered with the data)
+    ml_data.level_costs = [0,0,1]
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # getting structure for the configuration of the combination technique test
+    ct_cfg = cqml2d_config()
+
+    # choose the maximum resolution level (i.e. number of learning samples = 2^max_resolution_level) for which the convergence shall be checked
+    ct_cfg.max_resolution_level = 12 # 9
+
+    # set the scaling parameters for each level (this is the global level as in the ml_data structure)
+    ct_cfg.scalings = [400,400,400]
+
+    # set the regularization parameter for each level (this is the global level as in the ml_data structure)
+    ct_cfg.regularizations = [10**(-10),10**(-10),10**(-10)]
+
+    # set the jump size between each individual level
+    ct_cfg.level_jump_size = [1]
+
+    # set the (global!) level on which the error shall be evaluated (starting from 0!)
+    ct_cfg.error_level = 2
+
+    # set the current (global!) base level from which the combination technique shall be started (starting from 0)
+    # here: start on level 0
+    ct_cfg.base_level = 0
+    # set the current (local!) number of levels for which the combination technique shall be computed
+    # here: use all levels
+    ct_cfg.level_count = 3
+
+    # => this is the full combination technique on all levels
+
+    # create a figure
+#    f = plt.figure()
+
+    # do the computation, averaging over "trials" trials
+    (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials)
+    # plot the result
+#    plt.loglog(costs,errors, label="PM7+DFT+G4MP2")
+    write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_PM7_DFT_G4MP2_coulomb.csv',costs,errors)
+
+    # here: start on level 1
+    ct_cfg.base_level = 1
+    # here: use only DFT+G4MP2
+    ct_cfg.level_count = 2
+
+    # => this corresponds to a modified version of Delta-ML
+
+    # compute
+    (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials)
+    # plot
+#    plt.loglog(costs,errors, label="DFT+G4MP2")
+    write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_DFT_G4MP2_coulomb.csv',costs,errors)
+
+    # here: start on level 2
+    ct_cfg.base_level = 2
+    # here: use only one level
+    ct_cfg.level_count = 1
+
+    # => this corresponds to standard learning on level 2
+
+    # compute
+    (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials)
+    # plot
+#    plt.loglog(costs,errors, label="G4MP2")
+    write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_G4MP2_coulomb.csv',costs,errors)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("number of most expensive molecules")
+#    plt.ylabel("MAE [kcal/mol] wrt. G4MP2 solution")
+#    plt.title("CQML: Plot wrt. number of most expensive molecules")
+
+#    plt.show()
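+
+# Hedged aside, not part of the original example: all matplotlib calls above
+# are commented out, so the learning curves only end up in the CSV files
+# written by write_data() (header "N error"). The helper below sketches one
+# way to plot them afterwards; the function name is hypothetical, while the
+# file names match those written by test_cqml2d_coulomb().
+def plot_coulomb_learning_curves():
+    import matplotlib.pyplot as plt
+    for tag, label in [("PM7_DFT_G4MP2", "PM7+DFT+G4MP2"),
+                       ("DFT_G4MP2", "DFT+G4MP2"),
+                       ("G4MP2", "G4MP2")]:
+        # skip the header line; column 0 = cost, column 1 = MAE
+        data = np.loadtxt("DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_%s_coulomb.csv" % tag,
+                          skiprows=1, ndmin=2)
+        plt.loglog(data[:, 0], data[:, 1], label=label)
+    plt.xlabel("number of most expensive molecules")
+    plt.ylabel("MAE [kcal/mol] wrt. G4MP2 solution")
+    plt.legend()
+    plt.show()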
+
+
+# this test compares cqml2d with different numbers of levels (PM7 + DFT + G4MP2 vs. DFT + G4MP2 vs. G4MP2) for the CI9 data set
+# with SLATM representation, cf. Figure 6 in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml2d_slatm():
+
+    #########################
+    # loading multilevel data
+    #########################
+
+    # getting structure for config of multilevel data
+    ml_data_cfg = multilevel_data_config()
+
+    # setting data files per level for the data which is to be learned
+    ml_data_cfg.data_files = ["DataSet_cqml2d_CI9/Y1.dat","DataSet_cqml2d_CI9/Y2.dat","DataSet_cqml2d_CI9/Y3.dat"]
+
+    # setting directories per level for the xyz files
+    ml_data_cfg.xyz_directories = ["DataSet_cqml2d_CI9/db/ci6k_PM7","DataSet_cqml2d_CI9/db/ci6k_B3LYP631G2DFP","DataSet_cqml2d_CI9/db/ci6k_G4MP2"]
+
+    # choose the representation
+    ml_data_cfg.representation_type = "slatm"
+
+    # choose the number of compounds that shall be loaded
+    ml_data_cfg.N_load = 6095
+
+    # choose the number of levels to load
+    ml_data_cfg.level_count_load = 3
+
+    # load the multilevel data
+    ml_data = load_multilevel_data(ml_data_cfg)
+
+    # artificially impose costs for each level (in the future, these shall be delivered with the data)
+    ml_data.level_costs = [0,0,1]
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # getting structure for the configuration of the combination technique test
+    ct_cfg = cqml2d_config()
+
+    # choose the maximum resolution level (i.e.
number of learning samples = 2^max_resolution_level) for which the convergence shall be checked + ct_cfg.max_resolution_level = 12 # 9 + + # set the scaling parameters for each level (this is the global level as in the ml_data structure) + ct_cfg.scalings = [400,400,400] + + # set the regularization parameter for each level (this is the clobal level as in the ml_data structure) + ct_cfg.regularizations = [10**(-10),10**(-10),10**(-10)]; + + # set the jump size between each individual level + ct_cfg.level_jump_size = [1] + + # set the (global!) level on which the error shall be evaluated (starting from 0!) + ct_cfg.error_level = 2 + + + # set the current (global!) base level from which the combination technique shall be started (starting from 0) + # here: start on level 0 + ct_cfg.base_level = 0 + # set the current (local!) number of levels for which the combination technique shall be computed + # here: use all levels + ct_cfg.level_count = 3 + + # => this is the full combination technique on all levels + + # create a figure +# f = plt.figure() + + # do the computation with averaging over "trials" trials + (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials); + # plot the result +# plt.loglog(costs,errors, label="PM7+DFT+G4MP2") + write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_PM7_DFT_G4MP2_slatm.csv',costs,errors) + + # here: start on level 1 + ct_cfg.base_level = 1 + # here: use only DFT+G4MP2 + ct_cfg.level_count = 2 + + # => this corresponds to a modified version of Delta-ML + + # compute + (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials); + # plot +# plt.loglog(costs,errors, label="DFT+G4MP2") + write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_DFT_G4MP2_slatm.csv',costs,errors) + + # here: start on level 2 + ct_cfg.base_level = 2 + # here: use only one level + ct_cfg.level_count = 1 + + # => this corresponds to standard learning on level 2 + + # compute + (errors, costs) = compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials); + # plot +# plt.loglog(costs,errors, label="G4MP2") + write_data('DataSet_cqml2d_CI9/DataSet_cqml2d_CI9_G4MP2_slatm.csv',costs,errors) + + # generate a legend for the plot +# plt.legend() + # add axis labels and a title +# plt.xlabel("number of most expensive molecules") +# plt.ylabel("MAE [kcal/mol] wrt. G4MP2 solution") +# plt.title("CQML: Plot wrt. number of most expensive molecules") + +# plt.show() + + + +def main(): + + test_cqml2d_coulomb() + test_cqml2d_slatm() + +if __name__ == '__main__': + main() diff --git a/examples/cqml_QM7b.py b/examples/cqml_QM7b.py new file mode 100644 index 000000000..2f9e1d9a0 --- /dev/null +++ b/examples/cqml_QM7b.py @@ -0,0 +1,668 @@ +# MIT License +# +# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import print_function
+
+from math import *
+import os
+import numpy as np
+import sys
+# import matplotlib.pyplot as plt
+
+import qml
+import qml.models.cqml as cq
+
+
+trials = 20
+
+
+def write_data(filename, costs, errors):
+    outfile = open(filename,'w')
+    outfile.write("N error\n")
+    for i in range(len(errors)):
+        outfile.write("%d %e\n" % (costs[i],errors[i]))
+    outfile.close()
+
+# this test compares 3D cqml, 2D cqml and conventional QML for the QM7b data set "with shift" and "s=1", cf. Figure 5 in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_with_shift(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 2
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 400
+
+    # create a figure
+#    f = plt.figure(figsize=(12,9))
+
+    # HF + MP2 + CCSD(T) (3D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 9
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_shift(0,scalings,regularizations,training_offset, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the contributions for all subspaces
+#    plt.xscale("log")
+#    plt.yscale("log")
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="3d CQML: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_3DCT_jump1_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 13
+    # define training offset
+    training_offset = (2,2,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.yscale('log')
+#    plt.xscale('log')
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="standard ML: CCSD(T) + cc-pVDZ")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_standard_QML.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # fixing ccpvdz basis set, variation in theory (2D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 11
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 1, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. theories: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCT_jump1_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # fixing CCSD(T) theory, variation in basis set size (2D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 11
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 0, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. basis set size: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCTbasis_jump1_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("# most expensive training data (in terms of molecules)")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) + cc-pVDZ solution")
+#    plt.title("cqml comparison (s=1)")
+
+#    plt.show()
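+
+# Hedged aside, not part of the original example: the "get subspaces /
+# construct dictionary / fill dictionary" block above is repeated verbatim
+# after every check_nested_convergence() call in this file. A hypothetical
+# helper like the following could replace those repetitions; it merely
+# reshapes the returned list of {subspace: count} dictionaries into
+# {subspace: [count per convergence step]}.
+def collect_sample_counts(sample_counts_list):
+    sample_count_list_per_subspace = dict()
+    for s in sample_counts_list[0]:
+        sample_count_list_per_subspace[s] = []
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+    return sample_count_list_per_subspace
+# usage would then be, e.g., counts = collect_sample_counts(sample_counts_list)
+# followed by write_data(..., counts[(2,2)], average_error_list)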
+
+
+# this test compares 3D cqml, 2D cqml and conventional QML for the QM7b data set "with shift" and "s=2", cf. Figure 5 in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_with_shift_jump4(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 4
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 400
+
+    # create a figure
+#    f = plt.figure(figsize=(12,9))
+
+    # HF + MP2 + CCSD(T) (3D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 5
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_shift(0,scalings,regularizations,training_offset, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the contributions for all subspaces
+#    plt.xscale("log")
+#    plt.yscale("log")
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="3d CQML: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_3DCT_jump2_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 13
+    # define training offset
+    training_offset = (2,2,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.yscale('log')
+#    plt.xscale('log')
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="standard ML: CCSD(T) + cc-pVDZ")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_jump2_standard_QML.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # fixing ccpvdz basis set, variation in theory (2D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 9
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 1, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. theories: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCT_jump2_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # fixing CCSD(T) theory, variation in basis set size (2D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 9
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 0, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. basis set size: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCTbasis_jump2_all_levels.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("# most expensive training data (in terms of molecules)")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) + cc-pVDZ solution")
+#    plt.title("3d CQML")
+
+#    plt.show()
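+
+# Hedged aside, not part of the original example: every subspace model that
+# qml.models.cqml trains is plain kernel ridge regression with a Laplacian
+# kernel (cf. generate_full_kernel_matrices() and train() in qml/models/cqml.py).
+# The helper below sketches that building block in isolation; the function name
+# and the default hyperparameters (taken from main() below) are illustrative.
+def krr_subspace_sketch(X_train, y_train, X_test, sigma=400.0, reg=10**(-10)):
+    from qml.kernels import laplacian_kernel
+    from qml.math import cho_solve
+    # kernel matrix on the training representations, with diagonal regularization
+    K = laplacian_kernel(X_train, X_train, sigma)
+    K[np.diag_indices_from(K)] += reg
+    # solve for the regression coefficients
+    alpha = cho_solve(K, y_train)
+    # evaluate the model on the test representations
+    return laplacian_kernel(X_test, X_train, sigma).dot(alpha)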
+
+
+# this test compares 3D cqml, 2D cqml and conventional QML for the QM7b data set "with shift" and "s=2" with respect to the root mean
+# square error. This data is used in Figure 5 of the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_with_shift_jump4_rmse(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 4
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 400
+
+    # create a figure
+#    f = plt.figure(figsize=(12,9))
+
+    # HF + MP2 + CCSD(T) (3D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 5
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_shift(0,scalings,regularizations,training_offset, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence_rmse(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the contributions for all subspaces
+#    plt.xscale("log")
+#    plt.yscale("log")
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="3d CQML: CCSD(T) + cc-pVDZ contribution")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_3DCT_jump2_all_levels_rmse.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 13
+    # define training offset
+    training_offset = (2,2,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence_rmse(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # get subspaces
+    subspaces = []
+    for s in sample_counts_list[0]:
+        subspaces.append(s)
+
+    # construct dictionary which will associate the subspaces to a sample count list for that subspace
+    sample_count_list_per_subspace = dict()
+    for s in subspaces:
+        sample_count_list_per_subspace[s] = []
+
+    # fill dictionary
+    for sl in sample_counts_list:
+        for s in sl:
+            sample_count_list_per_subspace[s].append(sl[s])
+
+    # plot the result
+#    plt.yscale('log')
+#    plt.xscale('log')
+#    plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="standard ML: CCSD(T) + cc-pVDZ")
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_jump2_standard_QML_rmse.csv',sample_count_list_per_subspace[(2,2)],average_error_list)
+
+    # fixing ccpvdz basis set, variation in theory (2D CT)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 9
+    # define training offset
+    training_offset = (0,0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 1, 2)
+
+    # compute convergence
+    (average_error_list, sample_counts_list) =
cqml_object.check_nested_convergence_rmse(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials) + + # get subspaces + subspaces = [] + for s in sample_counts_list[0]: + subspaces.append(s) + + # construct dictionary which will associate the subspaces to a sample count list for that subspace + sample_count_list_per_subspace = dict() + for s in subspaces: + sample_count_list_per_subspace[s] = [] + + # fill dictionary + for sl in sample_counts_list: + for s in sl: + sample_count_list_per_subspace[s].append(sl[s]) + + + + # plot the result +# plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. theories: CCSD(T) + cc-pVDZ contribution") + write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCT_jump2_all_levels_rmse.csv',sample_count_list_per_subspace[(2,2)],average_error_list) + + + + # fixing CCSD(T) theory, variation in basis set size (2D CT) + + # define maximum learning level (for smallest sampling level) + maximum_level = 9 + # define training offset + training_offset = (0,0,0) + # generate training subspaces + cqml_object.generate_uniform_ct_training_subspaces_with_one_fixed_dimension(1, scalings, regularizations, training_offset, 0, 2) + + # combpute convergence + (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence_rmse(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials) + + # get subspaces + subspaces = [] + for s in sample_counts_list[0]: + subspaces.append(s) + + # construct dictionary which will associate the subspaces to a sample count list for that subspace + sample_count_list_per_subspace = dict() + for s in subspaces: + sample_count_list_per_subspace[s] = [] + + # fill dictionary + for sl in sample_counts_list: + for s in sl: + sample_count_list_per_subspace[s].append(sl[s]) + + + # plot the result +# plt.plot(sample_count_list_per_subspace[(2,2)],average_error_list, label="2d CQML wrt. basis set size: CCSD(T) + cc-pVDZ contribution") + write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2DCTbasis_jump2_all_levels_rmse.csv',sample_count_list_per_subspace[(2,2)],average_error_list) + + + # generate a legend for the plot +# plt.legend() + # add axis labels and a title +# plt.xlabel("# most expensive training data (in terms of molecules)") +# plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) + cc-pVDZ solution") +# plt.title("3d CQML") + +# plt.show() + + + + +def main(): + + # select whether to use already precomputed data + use_precomputed_data = False + + # initialize combination technique object + cqml_object = cq.cqml() + + # define energy subspaces to learn from + cqml_object.subspaces = [(0,0),(0,1),(0,2),(1,0),(1,1),(1,2),(2,0),(2,1),(2,2)] + + if (not use_precomputed_data): + # set training data files + cqml_object.data_files[(0,0)] = "DataSet_cqml_QM7b/HF_sto3g.txt" + cqml_object.data_files[(0,1)] = "DataSet_cqml_QM7b/HF_631g.txt" + cqml_object.data_files[(0,2)] = "DataSet_cqml_QM7b/HF_ccpvdz.txt" + cqml_object.data_files[(1,0)] = "DataSet_cqml_QM7b/MP2_sto3g.txt" + cqml_object.data_files[(1,1)] = "DataSet_cqml_QM7b/MP2_631g.txt" + cqml_object.data_files[(1,2)] = "DataSet_cqml_QM7b/MP2_ccpvdz.txt" + cqml_object.data_files[(2,0)] = "DataSet_cqml_QM7b/CCSDT_sto3g.txt" + cqml_object.data_files[(2,1)] = "DataSet_cqml_QM7b/CCSDT_631g.txt" + cqml_object.data_files[(2,2)] = "DataSet_cqml_QM7b/CCSDT_ccpvdz.txt" + + # set molecule data directories + # WARNING: "cqml" (not "cqml2d") currently only supports one single geometry for all subspaces ! 
+ cqml_object.xyz_directories[(0,0)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(0,1)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(0,2)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(1,0)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(1,1)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(1,2)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(2,0)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(2,1)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(2,2)] = "DataSet_cqml_QM7b/geometry" + + # set number of molecules (to be loaded) per energy subspace + N = 7211 + cqml_object.N_load[(0,0)] = N + cqml_object.N_load[(0,1)] = N + cqml_object.N_load[(0,2)] = N + cqml_object.N_load[(1,0)] = N + cqml_object.N_load[(1,1)] = N + cqml_object.N_load[(1,2)] = N + cqml_object.N_load[(2,0)] = N + cqml_object.N_load[(2,1)] = N + cqml_object.N_load[(2,2)] = N + + # fix costs per energy subspace + cqml_object.subspace_costs[(0,0)] = 1 + cqml_object.subspace_costs[(0,1)] = 1 + cqml_object.subspace_costs[(0,2)] = 1 + cqml_object.subspace_costs[(1,0)] = 1 + cqml_object.subspace_costs[(1,1)] = 1 + cqml_object.subspace_costs[(1,2)] = 1 + cqml_object.subspace_costs[(2,0)] = 1 + cqml_object.subspace_costs[(2,1)] = 1 + cqml_object.subspace_costs[(2,2)] = 1 + + # load the data + cqml_object.load_data() + + + # define scalings for kernel ridge regression per energy subpspace + scalings = dict(); + scalings[(0,0)] = 400 + scalings[(0,1)] = 400 + scalings[(0,2)] = 400 + scalings[(1,0)] = 400 + scalings[(1,1)] = 400 + scalings[(1,2)] = 400 + scalings[(2,0)] = 400 + scalings[(2,1)] = 400 + scalings[(2,2)] = 400 + + + # define regularizations per energy subspace + regularizations = dict() + regularizations[(0,0)] = 10**(-10) + regularizations[(0,1)] = 10**(-10) + regularizations[(0,2)] = 10**(-10) + regularizations[(1,0)] = 10**(-10) + regularizations[(1,1)] = 10**(-10) + regularizations[(1,2)] = 10**(-10) + regularizations[(2,0)] = 10**(-10) + regularizations[(2,1)] = 10**(-10) + regularizations[(2,2)] = 10**(-10) + + + if (not use_precomputed_data): + + # construct all training kernel matrices + cqml_object.generate_full_kernel_matrices(scalings, regularizations) + + # save CT data + cqml_object.save_binary("DataSet_cqml_QM7b/cqml_QM7b_data.dat") + + if (use_precomputed_data): + # load precomputed CT object + print("Loading data ...") + cqml_object = cq.cqml.load("DataSet_cqml_QM7b/cqml_QM7b_data.dat") + print("Done!") + + + # set number of dimensions of CT. 
Here (theory_level,basis_set_size,sampling_level) -> 3 + cqml_object.ct_dim=3 + + + test_cqml_with_shift(cqml_object, scalings, regularizations) + test_cqml_with_shift_jump4(cqml_object, scalings, regularizations) + test_cqml_with_shift_jump4_rmse(cqml_object, scalings, regularizations) + + + +if __name__ == '__main__': + main() diff --git a/examples/cqml_in_2d_QM7b.py b/examples/cqml_in_2d_QM7b.py new file mode 100644 index 000000000..74a3bbd17 --- /dev/null +++ b/examples/cqml_in_2d_QM7b.py @@ -0,0 +1,283 @@ +# MIT License +# +# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +from math import * +import os +import numpy as np +import sys +# import matplotlib.pyplot as plt + + +import qml +import qml.models.cqml as cq + + +trials = 20 + +def write_data(filename, costs, errors): + outfile = open(filename,'w') + outfile.write("N error\n") + for i in range(len(errors)): + outfile.write("%d %e\n" % (costs[i],errors[i])) + outfile.close() + +# this test compares cqml with different numbers of level (HF + MP2 + CCSD(T) vs. MP2 + CCSD(T) vs. CCSD(T)) for the QM7b data set +# in two dimensions (i.e. fixed basis set size) and "s=1", cf. 
Figure 5 (left) in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_in_2d_jump_factor_2(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 2
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 200
+
+    # create a figure
+#    f = plt.figure()
+
+    # HF + MP2 + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 11
+    # define training offset
+    training_offset = (0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="HF+MP2+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump1_HF_MP2_CCSDT.csv',costs,average_error_list)
+
+    # MP2 + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 12
+    # define training offset
+    training_offset = (1,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="MP2+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump1_MP2_CCSDT.csv',costs,average_error_list)
+
+    # CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 13
+    # define training offset
+    training_offset = (2,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump1_CCSDT.csv',costs,average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("number of most expensive molecules")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) solution")
+#    plt.title("Combination technique: Plot wrt. number of most expensive molecules")
+
+#    plt.show()
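+
+# Hedged aside, not part of the original example: a two-level run such as
+# "MP2+CCSD(T)" above is conceptually close to Delta-ML -- a baseline model on
+# the cheap level plus a correction model trained on the level difference for
+# a smaller subset. The following is a schematic sketch only; the function
+# name and data splits are hypothetical, and this is not the code path used
+# by qml.models.cqml.
+def delta_ml_sketch(X, y_cheap, y_expensive, n_expensive, X_query, sigma=400.0, reg=10**(-10)):
+    from qml.kernels import laplacian_kernel
+    from qml.math import cho_solve
+    def krr(X_train, y_train):
+        K = laplacian_kernel(X_train, X_train, sigma)
+        K[np.diag_indices_from(K)] += reg
+        a = cho_solve(K, y_train)
+        return lambda Xq: laplacian_kernel(Xq, X_train, sigma).dot(a)
+    # baseline: all cheap-level labels; correction: level difference on a small subset
+    baseline = krr(X, y_cheap)
+    correction = krr(X[:n_expensive], y_expensive[:n_expensive] - y_cheap[:n_expensive])
+    return baseline(X_query) + correction(X_query)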
+
+
+# this test compares cqml with different numbers of levels (HF + MP2 + CCSD(T) vs. MP2 + CCSD(T) vs. CCSD(T)) for the QM7b data set
+# in two dimensions (i.e. fixed basis set size) and "s=2", cf. Figure 5 (left) in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_in_2d_jump_factor_4(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 4
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 200
+
+    # create a figure
+#    f = plt.figure()
+
+    # HF + MP2 + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 9
+    # define training offset
+    training_offset = (0,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="HF+MP2+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump2_HF_MP2_CCSDT.csv',costs,average_error_list)
+
+    # MP2 + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 11
+    # define training offset
+    training_offset = (1,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="MP2+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump2_MP2_CCSDT.csv',costs,average_error_list)
+
+    # CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 13
+    # define training offset
+    training_offset = (2,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump2_CCSDT.csv',costs,average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("number of most expensive molecules")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) solution")
+#    plt.title("Combination technique: Plot wrt.
number of most expensive molecules") + +# plt.show() + + + +def main(): + + # select whether to use already precomputed data + use_precomputed_data = False + + # initialize combination technique object + cqml_object = cq.cqml() + + + # define energy subspaces to learn from + cqml_object.subspaces = [(0,),(1,),(2,)] + + if (not use_precomputed_data): + # set training data files + cqml_object.data_files[(0,)] = "DataSet_cqml_QM7b/HF_ccpvdz.txt" + cqml_object.data_files[(1,)] = "DataSet_cqml_QM7b/MP2_ccpvdz.txt" + cqml_object.data_files[(2,)] = "DataSet_cqml_QM7b/CCSDT_ccpvdz.txt" + + # set molecule data directories + # WARNING: "cqml" (not "cqml2d") currently only supports one single geometry for all subspaces ! + cqml_object.xyz_directories[(0,)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(1,)] = "DataSet_cqml_QM7b/geometry" + cqml_object.xyz_directories[(2,)] = "DataSet_cqml_QM7b/geometry" + + # set number of data (to be loaded) per energy subspace + cqml_object.N_load[(0,)] = 7211 + cqml_object.N_load[(1,)] = 7211 + cqml_object.N_load[(2,)] = 7211 + + # fix costs per energy subspace + cqml_object.subspace_costs[(0,)] = 1 + cqml_object.subspace_costs[(1,)] = 1 + cqml_object.subspace_costs[(2,)] = 1 + + # load the data + cqml_object.load_data() + + + # define scalings for kernel ridge regression per energy subpspace + scalings = dict(); + scalings[(0,)] = 400 + scalings[(1,)] = 400 + scalings[(2,)] = 400 + # define regularizations per energy subspace + regularizations = dict() + regularizations[(0,)] = 10**(-10) + regularizations[(1,)] = 10**(-10) + regularizations[(2,)] = 10**(-10) + + if (not use_precomputed_data): + # construct all training kernel matrices + cqml_object.generate_full_kernel_matrices(scalings, regularizations) + + # save CT data + cqml_object.save_binary("DataSet_cqml_QM7b/cqml_QM7b_data_2d.dat") + + if (use_precomputed_data): + # load precomputed CT object + print("Loading data ...") + cqml_object = cq.cqml.load("DataSet_cqml_QM7b/cqml_QM7b_data_2d.dat") + print("Done!") + + # set number of dimensions of CT. Here (theory_level,sampling_level) -> 2 + cqml_object.ct_dim=2 + + test_cqml_in_2d_jump_factor_2(cqml_object, scalings, regularizations) + test_cqml_in_2d_jump_factor_4(cqml_object, scalings, regularizations) + +if __name__ == '__main__': + main() diff --git a/examples/cqml_in_2d_QM7b_HF_CCSDT.py b/examples/cqml_in_2d_QM7b_HF_CCSDT.py new file mode 100644 index 000000000..ec35f0f4b --- /dev/null +++ b/examples/cqml_in_2d_QM7b_HF_CCSDT.py @@ -0,0 +1,201 @@ +# MIT License +# +# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import print_function
+
+from math import *
+import os
+import numpy as np
+import sys
+# import matplotlib.pyplot as plt
+
+import qml
+import qml.models.cqml as cq
+
+
+trials = 20
+
+def write_data(filename, costs, errors):
+    outfile = open(filename,'w')
+    outfile.write("N error\n")
+    for i in range(len(errors)):
+        outfile.write("%d %e\n" % (costs[i],errors[i]))
+    outfile.close()
+
+# this test computes the learning curve for HF + CCSD(T) for the QM7b data set
+# in two dimensions (i.e. fixed basis set size) and "s=1", cf. Figure 5 (left) in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_in_2d_only_HF_CCSDT_jump_factor_2(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 2
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 200
+
+    # create a figure
+#    f = plt.figure()
+
+    # HF + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 12
+    # define training offset
+    training_offset = (1,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="HF+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump1_HF_CCSDT.csv',costs,average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("number of most expensive molecules")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) solution")
+#    plt.title("Combination technique: Plot wrt. number of most expensive molecules")
+
+#    plt.show()
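+
+# Hedged aside, not part of the original example: unlike cqml_QM7b.py and
+# cqml_in_2d_QM7b.py, this script always recomputes the kernel matrices. The
+# same save_binary()/load() pattern used in those scripts could cache them;
+# a sketch of that pattern is below, with a hypothetical cache file name.
+def cache_kernels_sketch(cqml_object, scalings, regularizations, use_precomputed_data=False):
+    if not use_precomputed_data:
+        # construct all training kernel matrices and store them for later runs
+        cqml_object.generate_full_kernel_matrices(scalings, regularizations)
+        cqml_object.save_binary("DataSet_cqml_QM7b/cqml_QM7b_HF_CCSDT_data.dat")
+        return cqml_object
+    # reload the precomputed CT object instead of recomputing the kernels
+    print("Loading data ...")
+    cqml_object = cq.cqml.load("DataSet_cqml_QM7b/cqml_QM7b_HF_CCSDT_data.dat")
+    print("Done!")
+    return cqml_object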
+
+
+# this test computes the learning curve for HF + CCSD(T) for the QM7b data set
+# in two dimensions (i.e. fixed basis set size) and "s=2", cf. Figure 5 (left) in the paper
+# "Boosting quantum machine learning models with multi-level combination technique: Pople diagrams revisited"
+def test_cqml_in_2d_only_HF_CCSDT_jump_factor_4(cqml_object, scalings, regularizations):
+
+    ###############################
+    # testing combination technique
+    ###############################
+
+    # set level jump size
+    level_jump_size = 4
+
+    # set evaluation subspace
+    evaluation_subspace = (2,2)
+
+    # set number of evaluations to do for a single trial
+    evaluation_count = 200
+
+    # create a figure
+#    f = plt.figure()
+
+    # HF + CCSD(T)
+
+    # define maximum learning level (for smallest sampling level)
+    maximum_level = 11
+    # define training offset
+    training_offset = (1,0)
+    # generate training subspaces
+    cqml_object.generate_uniform_ct_training_subspaces(2,scalings,regularizations,training_offset)
+    # compute convergence
+    (average_error_list, sample_counts_list) = cqml_object.check_nested_convergence(maximum_level, level_jump_size, evaluation_subspace, evaluation_count, trials)
+
+    # plot the result
+#    plt.loglog([sample_counts_list[l][(2,)] for l in range(len(average_error_list))],average_error_list, label="HF+CCSD(T)")
+    costs = [sample_counts_list[l][(2,)] for l in range(len(average_error_list))]
+    write_data('DataSet_cqml_QM7b/DataSet_cqml_QM7b_2d_jump2_HF_CCSDT.csv',costs,average_error_list)
+
+    # generate a legend for the plot
+#    plt.legend()
+    # add axis labels and a title
+#    plt.xlabel("number of most expensive molecules")
+#    plt.ylabel("MAE [kcal/mol] wrt. CCSD(T) solution")
+#    plt.title("Combination technique: Plot wrt. number of most expensive molecules")
+
+#    plt.show()
+
+
+def main():
+
+    # initialize combination technique object
+    cqml_object = cq.cqml()
+
+    # define energy subspaces to learn from
+    cqml_object.subspaces = [(0,),(1,),(2,)]
+
+    # set training data files
+    cqml_object.data_files[(0,)] = "DataSet_cqml_QM7b/HF_ccpvdz.txt"
+    cqml_object.data_files[(1,)] = "DataSet_cqml_QM7b/HF_ccpvdz.txt"
+    cqml_object.data_files[(2,)] = "DataSet_cqml_QM7b/CCSDT_ccpvdz.txt"
+
+    # set molecule data directories
+    # WARNING: "cqml" (not "cqml2d") currently only supports one single geometry for all subspaces !
+    cqml_object.xyz_directories[(0,)] = "DataSet_cqml_QM7b/geometry"
+    cqml_object.xyz_directories[(1,)] = "DataSet_cqml_QM7b/geometry"
+    cqml_object.xyz_directories[(2,)] = "DataSet_cqml_QM7b/geometry"
+
+    # set number of data (to be loaded) per energy subspace
+    cqml_object.N_load[(0,)] = 7211
+    cqml_object.N_load[(1,)] = 7211
+    cqml_object.N_load[(2,)] = 7211
+
+    # fix costs per energy subspace
+    cqml_object.subspace_costs[(0,)] = 1
+    cqml_object.subspace_costs[(1,)] = 1
+    cqml_object.subspace_costs[(2,)] = 1
+
+    # load the data
+    cqml_object.load_data()
+
+    # define scalings for kernel ridge regression per energy subspace
+    scalings = dict()
+    scalings[(0,)] = 400
+    scalings[(1,)] = 400
+    scalings[(2,)] = 400
+    # define regularizations per energy subspace
+    regularizations = dict()
+    regularizations[(0,)] = 10**(-10)
+    regularizations[(1,)] = 10**(-10)
+    regularizations[(2,)] = 10**(-10)
+
+    # construct all training kernel matrices
+    cqml_object.generate_full_kernel_matrices(scalings, regularizations)
+
+    # set number of dimensions of CT.
Here (theory_level,sampling_level) -> 2 + cqml_object.ct_dim=2 + + test_cqml_in_2d_only_HF_CCSDT_jump_factor_2(cqml_object, scalings, regularizations) + test_cqml_in_2d_only_HF_CCSDT_jump_factor_4(cqml_object, scalings, regularizations) + + +if __name__ == '__main__': + main() diff --git a/qml/models/cqml.py b/qml/models/cqml.py new file mode 100644 index 000000000..7bcf90dd4 --- /dev/null +++ b/qml/models/cqml.py @@ -0,0 +1,702 @@ +# MIT License +# +# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from math import * +import os +import numpy as np +import qml +from scipy import special +from qml.kernels import laplacian_kernel,gaussian_kernel +from qml.math import cho_solve +import scipy.spatial.distance as ssd +import pickle + +from qml.representations import get_slatm_mbtypes + + + +# struct / class containing all configuration parameters for the combination technique +class cqml: + + # space dimensionality + space_dims = [0,0] + # list of subspaces + subspaces = []; + # dictionary of learning data files per subspace + data_files = dict() + # dictionary of directories with xyz files per subspace + # WARNING: cqml currently only supports one single type of geometry for all subspaces ! + xyz_directories = dict() + # dictionary of number of compounds to load for training and testing per subspace + N_load = dict() + # representation type ("slatm" / "coulomb_matrix") + representation_type = "slatm" + + # dictionary of representations arrays for each subspace + # WARNING: cqml currently only supports one single type of geometry for all subspaces ! 
+ X = dict() + # dictionary of energy arrays for each subspace + Y = dict() + # dictionary of cost arrays for each level + c = dict() + # dictionary of full kernel matrices (for all input data) for each subspace + A_full = dict() + # dictionary of subspace-wise cost of data + subspace_costs = dict() + + X_eval = [] + + # ct + ct_dim = 3; + # dictionary of regularization parameters for each subspace (l1,l2) + regularizations = dict() + # dictionary of scaling parameters for each subspace (l1,l2) + scalings = dict() + # (l1,l2,l3) + subspace_coefficients = dict() + # (l1,l2,l3) + alphas = dict() + # dictionary of (l1,l2,l3)s associating for each subspace s=(l1,l2) and a given sampling level l3 a list of indices of representations used on that sampling level + samplings = dict() + # training subspaces with all multi-indices (l1,l2,l3) for current training task + training_subspaces = dict() + +# evaluation_subspace = None + evaluation_sampling = None +# evaluation_sampling_size = 0 + + # function to store precomputed class data to file + def save(self, file_name): + f = open(file_name, "wb") + pickle.dump(self.space_dims, f) + pickle.dump(self.subspaces, f) + pickle.dump(self.data_files, f) + pickle.dump(self.xyz_directories, f) + pickle.dump(self.N_load, f) + pickle.dump(self.representation_type, f) + pickle.dump(self.X, f) + pickle.dump(self.Y, f) + pickle.dump(self.c, f) + pickle.dump(self.A_full, f) + pickle.dump(self.subspace_costs, f) + pickle.dump(self.ct_dim, f) + pickle.dump(self.regularizations, f) + pickle.dump(self.scalings, f) + pickle.dump(self.subspace_coefficients, f) + pickle.dump(self.alphas, f) + pickle.dump(self.samplings, f) + pickle.dump(self.training_subspaces, f) + pickle.dump(self.X_eval, f) + pickle.dump(self.evaluation_sampling, f) + f.close() + + # function to store precomputed class data to binary file + def save_binary(self, file_name): + f = open(file_name, "wb") + pickle.dump(self.space_dims, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.subspaces, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.data_files, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.xyz_directories, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.N_load, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.representation_type, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.X, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.Y, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.c, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.A_full, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.subspace_costs, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.ct_dim, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.regularizations, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.scalings, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.subspace_coefficients, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.alphas, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.samplings, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.training_subspaces, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.X_eval, f, pickle.HIGHEST_PROTOCOL) + pickle.dump(self.evaluation_sampling, f, pickle.HIGHEST_PROTOCOL) + f.close() + + + + # function to load precomputed class data + @staticmethod + def load(file_name): + combi = cqml() + f = open(file_name, "rb") + combi.space_dims = pickle.load(f); + combi.subspaces = pickle.load(f); + combi.data_files = pickle.load(f); + combi.xyz_directories = pickle.load(f); + combi.N_load = pickle.load(f); + combi.representation_type = pickle.load(f); + combi.X = pickle.load(f); + combi.Y = pickle.load(f); + combi.c = 
+
+    # function to load precomputed class data
+    @staticmethod
+    def load(file_name):
+        combi = cqml()
+        f = open(file_name, "rb")
+        combi.space_dims = pickle.load(f)
+        combi.subspaces = pickle.load(f)
+        combi.data_files = pickle.load(f)
+        combi.xyz_directories = pickle.load(f)
+        combi.N_load = pickle.load(f)
+        combi.representation_type = pickle.load(f)
+        combi.X = pickle.load(f)
+        combi.Y = pickle.load(f)
+        combi.c = pickle.load(f)
+        combi.A_full = pickle.load(f)
+        combi.subspace_costs = pickle.load(f)
+        combi.ct_dim = pickle.load(f)
+        combi.regularizations = pickle.load(f)
+        combi.scalings = pickle.load(f)
+        combi.subspace_coefficients = pickle.load(f)
+        combi.alphas = pickle.load(f)
+        combi.samplings = pickle.load(f)
+        combi.training_subspaces = pickle.load(f)
+        combi.X_eval = pickle.load(f)
+        combi.evaluation_sampling = pickle.load(f)
+        f.close()
+        return combi
+
+    # function to load the energies / the information to be learned
+    def load_energies(self):
+
+        # go over all subspaces that shall be loaded
+        for s in self.subspaces:
+            # take the file name of the data file
+            filename = self.data_files[s]
+
+            # read in all lines of the data file
+            f = open(filename, "r")
+            lines = f.readlines()
+            f.close()
+
+            # this list will contain the learning data for the current subspace
+            current_energies = []
+
+            # read in the learning data from the file contents
+            for line in lines[:self.N_load[s]]:
+                current_energies.append(float(line))
+
+            # set the learning data for the current subspace
+            self.Y[s] = np.array(current_energies)
+
+    # load data for all subspaces,
+    # i.e. load training data and xyz files and generate
+    # the selected representations;
+    # for now, only coulomb matrices and slatm are supported
+    def load_data(self):
+
+        # get all energies / information to be learned
+        self.load_energies()
+
+        # go over all subspaces
+        for s in self.subspaces:
+
+            # information
+            print("Loading data for subspace ", s)
+
+            # generate a list of qml.Compound() objects
+            mols = []
+
+            # loop over all samples / molecules which shall be loaded
+            for i in range(self.N_load[s]):
+
+                # get the file name of the current xyz file which is to be used
+                filename = ("%s/frag_%04d.xyz" % (self.xyz_directories[s], i + 1))
+
+                # initialize the qml.Compound() object, loading the xyz data file
+                mol = qml.Compound(xyz=filename)
+
+                # attach the current molecule to the list of all molecules / samples
+                mols.append(mol)
+
+            # in case of the slatm representation, precompute the necessary data
+            if self.representation_type == "slatm":
+                mbtypes = get_slatm_mbtypes(np.array([mol.nuclear_charges for mol in mols]))
+
+            # loop over all samples / molecules
+            for mol in mols:
+                # generate coulomb matrix, if this type was chosen
+                if self.representation_type == "coulomb_matrix":
+                    # this is a molecular Coulomb matrix, sorted by row norm
+                    mol.generate_coulomb_matrix(size=23, sorting="row-norm")
+                # generate slatm representation, if this type was chosen
+                elif self.representation_type == "slatm":
+                    mol.generate_slatm(mbtypes)
+                else:
+                    print("Error: Representation type not supported")
+
+            # add all representations to the representation list for the current subspace
+            self.X[s] = np.array([mol.representation for mol in mols])
+
+    # generate kernel matrices using the laplacian kernel
+    def generate_full_kernel_matrices(self, scalings, regularizations):
+
+        for s in self.subspaces:
+            # setting hyper parameter
+            sigma = scalings[s]
+            # generate kernel matrix
+            current_A_full = laplacian_kernel(self.X[s], self.X[s], sigma)
+            # adding regularization
+            current_A_full[np.diag_indices_from(current_A_full)] += regularizations[s]
+            # store to global array
+            self.A_full[s] = current_A_full
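+
+    # --------------------------------------------------------------------------
+    # Editorial sketch (not part of the original module): train() below solves
+    # one kernel ridge regression problem per training subspace; this minimal,
+    # self-contained analogue shows the same two steps (regularized laplacian
+    # kernel, Cholesky solve) on synthetic data with placeholder hyperparameters.
+    @staticmethod
+    def _krr_single_subspace_demo():
+        # synthetic "representations" (10 samples, 5 features) and "energies"
+        X_demo = np.random.rand(10, 5)
+        y_demo = np.random.rand(10)
+        # regularized kernel matrix, as built in generate_full_kernel_matrices()
+        K = laplacian_kernel(X_demo, X_demo, 400.0)
+        K[np.diag_indices_from(K)] += 1e-10
+        # solve K * alpha = y via Cholesky factorization, as done in train()
+        alpha = cho_solve(K, y_demo)
+        # predicting at the training points recovers y_demo up to solver accuracy
+        return np.dot(K, alpha)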
+
+    # do training on the previously defined training_subspaces
+    def train(self):
+
+        # loop over all training subspaces
+        for t in self.training_subspaces:
+            # get the theory subspace multi-index (without the sampling level)
+            s = t[:self.ct_dim - 1]
+            # get subset of the kernel matrix
+            Asmall = self.A_full[s][np.ix_(self.samplings[t], self.samplings[t])]
+
+            # do the learning
+            self.alphas[t] = cho_solve(Asmall, self.Y[s][self.samplings[t]])
+
+    # evaluate the trained ML model on the given evaluation sampling
+    def evaluate(self):
+
+        # set result vectors to zero
+        result_size = len(self.evaluation_sampling)
+        result_pos = np.zeros(result_size)
+        result_neg = np.zeros(result_size)
+        result = np.zeros(result_size)
+
+        # loop over all training subspaces with positive coefficients
+        for t in self.training_subspaces:
+            if self.subspace_coefficients[t] > 0:
+                # get training subspace index without sampling level
+                s_train = t[:self.ct_dim - 1]
+                # get evaluation matrix for the current subspace
+                A_eval = self.A_full[s_train][np.ix_(self.evaluation_sampling, self.samplings[t])]
+                # add up the subspace contribution
+                result_pos = result_pos + self.subspace_coefficients[t] * A_eval.dot(self.alphas[t])
+
+        # loop over all training subspaces with negative coefficients
+        for t in self.training_subspaces:
+            if self.subspace_coefficients[t] < 0:
+                # get training subspace index without sampling level
+                s_train = t[:self.ct_dim - 1]
+                # get evaluation matrix for the current subspace
+                A_eval = self.A_full[s_train][np.ix_(self.evaluation_sampling, self.samplings[t])]
+                # add up the subspace contribution
+                result_neg = result_neg + self.subspace_coefficients[t] * A_eval.dot(self.alphas[t])
+
+        # finally combine positive and negative contributions (avoiding cancellation issues)
+        result = result_pos + result_neg
+
+        return result
+
+    # helper function to set up training subspaces and standard parameters for the standard uniform CT
+    # warning: for now, only 2d and 3d are implemented
+    def generate_uniform_ct_training_subspaces(self, level, scaling, regularization, offsets):
+        # empty training subspace list
+        self.training_subspaces = []
+        # empty subspace coefficients dictionary
+        self.subspace_coefficients = dict()
+
+        self.scalings = dict()
+
+        self.regularizations = dict()
+
+        print(level)
+
+        L = level
+        d = self.ct_dim
+
+        # case of 3d CT
+        if (self.ct_dim == 3):
+            # loop over multi-index (l1,l2,l3)
+            for l1 in range(offsets[0], L + d - 1 + 1):
+                for l2 in range(offsets[1], L + d - 1 + 1):
+                    for l3 in range(offsets[2], L + d - 1 + 1):
+                        # loop over q = 0 .. ct_dim-1
+                        for q in range(d - 1 + 1):
+                            # identify valid subspace
+                            if (l1 + l2 + l3 == L + (d - 1) - q):
+                                # define multi-index
+                                s = (l1, l2, l3)
+                                print(s)
+                                # add the current subspace to the training subspaces
+                                self.training_subspaces.append(s)
+                                # compute the coefficient for the current subspace
+                                self.subspace_coefficients[s] = pow(-1, q) * special.binom(d - 1, q)
+                                # set default regularization
+                                self.regularizations[s] = regularization[(l1, l2)]
+                                # set default scaling
+                                self.scalings[s] = scaling[(l1, l2)]
+
+            # reset combination coefficients following eqs. (15),(16) in the paper
+            for l in self.training_subspaces:
+                coeff = 0
+                for l1 in range(2):
+                    for l2 in range(2):
+                        for l3 in range(2):
+                            norm1 = l1 + l2 + l3
+                            l_plus_z = (l[0] + l1, l[1] + l2, l[2] + l3)
+                            if l_plus_z in self.training_subspaces:
+                                coeff = coeff + (-1) ** norm1
+                self.subspace_coefficients[l] = coeff
+
+        # case of 2d CT
+        if (self.ct_dim == 2):
+            # loop over multi-index (l1,l2)
+            for l1 in range(offsets[0], level + 1):
+                for l2 in range(offsets[1], level + 1):
+                    # loop over q = 0 .. ct_dim-1
+                    for q in range(2):
+                        # identify valid subspace
+                        if (l1 + l2 == level - q):
+                            # define multi-index
+                            s = (l1, l2)
+                            # add the current subspace to the training subspaces
+                            self.training_subspaces.append(s)
+                            # compute the coefficient for the current subspace
+                            self.subspace_coefficients[s] = pow(-1, q) * special.binom(1, q)
+                            # set default regularization
+                            self.regularizations[s] = regularization[(l1,)]
+                            # set default scaling
+                            self.scalings[s] = scaling[(l1,)]
+
+    # helper function to set up training subspaces and standard parameters for the CT with "shift"
+    # warning: this is currently only supported in 3d
+    def generate_uniform_ct_training_subspaces_with_shift(self, level, scaling, regularization, offsets, shift):
+        # empty training subspace list
+        self.training_subspaces = []
+        # empty subspace coefficients dictionary
+        self.subspace_coefficients = dict()
+
+        self.scalings = dict()
+
+        self.regularizations = dict()
+
+        print(level)
+
+        L = level
+        d = self.ct_dim
+
+        # case of 3d CT
+        if (self.ct_dim == 3):
+            # loop over multi-index (l1,l2,l3)
+            for l1 in range(offsets[0], L + d - 1 + 1):
+                for l2 in range(offsets[1], L + d - 1 + 1):
+                    for l3 in range(offsets[2] - shift, L + d - 1 + 1):
+                        # loop over q = 0 .. ct_dim-1
+                        for q in range(d - 1 + 1):
+                            # identify valid subspace
+                            if (l1 + l2 + l3 == L + (d - 1) - q):
+                                # define multi-index (shifted in the sampling dimension)
+                                s = (l1, l2, l3 + shift)
+                                print(s)
+                                # add the current subspace to the training subspaces
+                                self.training_subspaces.append(s)
+                                # compute the coefficient for the current subspace
+                                self.subspace_coefficients[s] = pow(-1, q) * special.binom(d - 1, q)
+                                # set default regularization
+                                self.regularizations[s] = regularization[(l1, l2)]
+                                # set default scaling
+                                self.scalings[s] = scaling[(l1, l2)]
+
+            # reset combination coefficients following eqs. (15),(16) in the paper
+            for l in self.training_subspaces:
+                coeff = 0
+                for l1 in range(2):
+                    for l2 in range(2):
+                        for l3 in range(2):
+                            norm1 = l1 + l2 + l3
+                            l_plus_z = (l[0] + l1, l[1] + l2, l[2] + l3)
+                            if l_plus_z in self.training_subspaces:
+                                coeff = coeff + (-1) ** norm1
+                self.subspace_coefficients[l] = coeff
+
+    # helper function to set up training subspaces and standard parameters for 3d CQML where one dimension is fixed
+    # -> 2d CQML for 3d data
+    # warning: for now, this is only supported in 3d
+    def generate_uniform_ct_training_subspaces_with_one_fixed_dimension(self, level, scaling, regularization, offsets, fixed_dimension, fixed_level):
+        # empty training subspace list
+        self.training_subspaces = []
+        # empty subspace coefficients dictionary
+        self.subspace_coefficients = dict()
+
+        self.scalings = dict()
+
+        self.regularizations = dict()
+
+        print(level)
+
+        if (self.ct_dim == 3):
+            L = level
+            d = self.ct_dim - 1
+
+            if fixed_dimension == 0:
+                # loop over multi-index (.,l2,l3)
+                l1 = fixed_level
+                for l2 in range(offsets[1], L + d - 1 + 1):
+                    for l3 in range(offsets[2], L + d - 1 + 1):
+                        # loop over q = 0 .. ct_dim-1
+                        for q in range(d - 1 + 1):
+                            # identify valid subspace
+                            if (l2 + l3 == L + (d - 1) - q):
+                                # define multi-index
+                                s = (l1, l2, l3)
+                                print(s)
+                                # add the current subspace to the training subspaces
+                                self.training_subspaces.append(s)
+                                # compute the coefficient for the current subspace
+                                self.subspace_coefficients[s] = pow(-1, q) * special.binom(d - 1, q)
+                                # set default regularization
+                                self.regularizations[s] = regularization[(l1, l2)]
+                                # set default scaling
+                                self.scalings[s] = scaling[(l1, l2)]
+
+            if fixed_dimension == 1:
+                # loop over multi-index (l1,.,l3)
+                l2 = fixed_level
+                for l1 in range(offsets[0], L + d - 1 + 1):
+                    for l3 in range(offsets[2], L + d - 1 + 1):
+                        # loop over q = 0 .. ct_dim-1
+                        for q in range(d - 1 + 1):
+                            # identify valid subspace
+                            if (l1 + l3 == L + (d - 1) - q):
+                                # define multi-index
+                                s = (l1, l2, l3)
+                                print(s)
+                                # add the current subspace to the training subspaces
+                                self.training_subspaces.append(s)
+                                # compute the coefficient for the current subspace
+                                self.subspace_coefficients[s] = pow(-1, q) * special.binom(d - 1, q)
+                                # set default regularization
+                                self.regularizations[s] = regularization[(l1, l2)]
+                                # set default scaling
+                                self.scalings[s] = scaling[(l1, l2)]
+
+    # helper function to generate the different levels of sampling given an already defined training_subspaces list
+    def generate_samplings(self, jump_factor, base_count):
+
+        # empty indices dictionary which will contain, per energy subspace, the global indexing of all samples for that subspace
+        indices = dict()
+
+        # go over all energy subspaces (without the sampling dimension)
+        for s in self.subspaces:
+            # generate for each energy subspace an indexing of all samples
+            indices[s] = np.arange(len(self.Y[s]))
+            # shuffle the indexing
+            np.random.shuffle(indices[s])
+
+        # iterate over all training subspaces
+        for t in self.training_subspaces:
+            # compute the number of samples for the current level
+            current_sampling_size = int(base_count * (pow(jump_factor, t[self.ct_dim - 1])))
+            # take the first "current_sampling_size" many indices as sampling indices
+            self.samplings[t] = indices[t[:self.ct_dim - 1]][:current_sampling_size]
+
+    # helper function to generate the different levels of nested (!) sampling given an already defined training_subspaces list
+    # ATTENTION: this function assumes that the data for all subspaces is of the same size and sorted identically in all data sets!
+    def generate_nested_samplings(self, jump_factor, base_count):
+
+        # empty indices dictionary which will contain, per energy subspace, the global indexing of all samples for that subspace
+        indices = dict()
+
+        subspace_size = 0
+        # find the size of the largest subspace <- any one size would do (since all are assumed to be identical)
+        for s in self.subspaces:
+            if len(self.Y[s]) > subspace_size:
+                subspace_size = len(self.Y[s])
+
+        # generate ONE randomly shuffled indexing
+        indexing = np.arange(subspace_size)
+        np.random.shuffle(indexing)
+
+        # go over all energy subspaces (without the sampling dimension)
+        for s in self.subspaces:
+            # use the same indexing for the samples of every energy subspace
+            indices[s] = indexing
+
+        # iterate over all training subspaces
+        for t in self.training_subspaces:
+            # compute the number of samples for the current level
+            current_sampling_size = int(base_count * (pow(jump_factor, t[self.ct_dim - 1])))
+            # take the first "current_sampling_size" many indices as sampling indices
+            self.samplings[t] = indices[t[:self.ct_dim - 1]][:current_sampling_size]
+
+    # helper function to randomly select molecules that were not part of the training set
+    def generate_out_of_sample_evaluation_from_training_data(self, evaluation_subspace, evaluation_sampling_size):
+
+        # iterate over all training subspaces to find the union of all molecules used for training
+        used_molecules = set()
+        for t in self.training_subspaces:
+            for i in self.samplings[t]:
+                used_molecules.add(i)
+
+        # get the evaluation subspace index without the sampling level
+        s_eval = evaluation_subspace[:self.ct_dim - 1]
+
+        # copy the evaluation subspace data
+        self.X_eval = self.X[s_eval]
+
+        # find molecules in the evaluation subspace that were not used before
+        self.evaluation_sampling = range(len(self.X_eval))
+        self.evaluation_sampling = list(set(self.evaluation_sampling) - used_molecules)
+
+        # generate a (randomized) subset of the out-of-sample index list
+        np.random.shuffle(self.evaluation_sampling)
+        self.evaluation_sampling = self.evaluation_sampling[:evaluation_sampling_size]
+
+    # helper function to compute the number of samples that are used for training from a given energy subspace
+    def get_sample_counts(self):
+
+        # dictionary for the sample counts
+        sample_counts = dict()
+
+        # initialize the dictionary
+        for t in self.training_subspaces:
+            s = t[:self.ct_dim - 1]
+            sample_counts[s] = 0
+
+        # add up the samples from the corresponding subspaces
+        for t in self.training_subspaces:
+            s = t[:self.ct_dim - 1]
+            sample_counts[s] = sample_counts[s] + len(self.samplings[t])
+
+        # output
+        return sample_counts
+
+    # helper function to compute the number of samples that are used for training from a given energy subspace in case of nested subspaces
+    def get_sample_counts_for_nested_subspaces(self):
+
+        # dictionary for the sample counts
+        sample_counts = dict()
+
+        # dictionary for the per-subspace samplings
+        samplings_per_subspace = dict()
+
+        # initialize samplings_per_subspace
+        for t in self.training_subspaces:
+            s = t[:self.ct_dim - 1]
+            samplings_per_subspace[s] = []
+
+        # append all samplings done on a given subspace to that subspace (including multiple nested samplings)
+        for t in self.training_subspaces:
+            s = t[:self.ct_dim - 1]
+            samplings_per_subspace[s].extend(self.samplings[t])
+
+        # remove identical samplings on all subspaces
+        for s in samplings_per_subspace:
+            samplings_per_subspace[s] = list(set(samplings_per_subspace[s]))
+
+        # store the sample counts
+        for s in samplings_per_subspace:
+            sample_counts[s] = len(samplings_per_subspace[s])
+
+        # output
+        return sample_counts
+
+    # function to compute convergence results / training curves
+    def check_convergence(self, max_level, level_jump_size, evaluation_subspace, evaluation_sampling_size, trial_count):
+        average_error_list = []
+        sample_counts_list = []
+
+        for i in range(self.ct_dim + 1, max_level):
+            average_error = 0
+            for t in range(trial_count):
+                self.generate_samplings(level_jump_size, 2 ** i)
+                self.generate_out_of_sample_evaluation_from_training_data(evaluation_subspace, evaluation_sampling_size)
+                self.train()
+                res = self.evaluate()
+                data = self.Y[evaluation_subspace[:self.ct_dim - 1]][self.evaluation_sampling]
+                error = np.mean(np.abs(res - data))
+                average_error = average_error + error
+            average_error = average_error / trial_count
+            average_error_list.append(average_error)
+            sample_counts = self.get_sample_counts()
+            sample_counts_list.append(sample_counts)
+            print(average_error, sample_counts)
+
+        return (average_error_list, sample_counts_list)
+
+    # function to compute convergence results / training curves in case of nested molecule subspaces (computing MAE errors by default)
+    def check_nested_convergence(self, max_level, level_jump_size, evaluation_subspace, evaluation_sampling_size, trial_count):
+        average_error_list = []
+        sample_counts_list = []
+
+        for i in range(self.ct_dim - 1, max_level):
+            average_error = 0
+            for t in range(trial_count):
+                self.generate_nested_samplings(level_jump_size, 2 ** i)
+                self.generate_out_of_sample_evaluation_from_training_data(evaluation_subspace, evaluation_sampling_size)
+                self.train()
+                res = self.evaluate()
+                data = self.Y[evaluation_subspace[:self.ct_dim - 1]][self.evaluation_sampling]
+                error = np.mean(np.abs(res - data))
+                average_error = average_error + error
+            average_error = average_error / trial_count
+            average_error_list.append(average_error)
+            sample_counts = self.get_sample_counts_for_nested_subspaces()
+            sample_counts_list.append(sample_counts)
+            print(average_error, sample_counts)
+
+        return (average_error_list, sample_counts_list)
+
+    # function to compute convergence results / training curves in case of nested molecule subspaces (with RMSE errors)
+    def check_nested_convergence_rmse(self, max_level, level_jump_size, evaluation_subspace, evaluation_sampling_size, trial_count):
+        average_error_list = []
+        sample_counts_list = []
+
+        for i in range(self.ct_dim - 1, max_level):
+            average_error = 0
+            for t in range(trial_count):
+                self.generate_nested_samplings(level_jump_size, 2 ** i)
+                self.generate_out_of_sample_evaluation_from_training_data(evaluation_subspace, evaluation_sampling_size)
+                self.train()
+                res = self.evaluate()
+                data = self.Y[evaluation_subspace[:self.ct_dim - 1]][self.evaluation_sampling]
+                error = np.sqrt(((res - data) ** 2).mean())
+                average_error = average_error + error
+            average_error = average_error / trial_count
+            average_error_list.append(average_error)
+            sample_counts = self.get_sample_counts_for_nested_subspaces()
+            sample_counts_list.append(sample_counts)
+            print(average_error, sample_counts)
+
+        return (average_error_list, sample_counts_list)
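+
+
+# --------------------------------------------------------------------------
+# Editorial sketch (not part of the original module): a standalone replay of
+# the combination coefficient rule used in the
+# generate_uniform_ct_training_subspaces* helpers above (eqs. (15),(16) in
+# the paper): for each multi-index l in the index set I,
+#     c_l = sum over z in {0,1}^3 of (-1)^(z1+z2+z3) whenever l+z lies in I.
+# The index set below is the small simplex {l : l1+l2+l3 <= 2}, hand-picked
+# for this demo; for it, the rule recovers the classic CT coefficients
+# +1, -2, +1 on the layers |l| = 2, 1, 0.
+def _combination_coefficients_demo():
+    index_set = [(2, 0, 0), (0, 2, 0), (0, 0, 2), (1, 1, 0), (1, 0, 1), (0, 1, 1),
+                 (1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 0, 0)]
+    coefficients = dict()
+    for l in index_set:
+        coeff = 0
+        for z1 in range(2):
+            for z2 in range(2):
+                for z3 in range(2):
+                    if (l[0] + z1, l[1] + z2, l[2] + z3) in index_set:
+                        coeff += (-1) ** (z1 + z2 + z3)
+        coefficients[l] = coeff
+    return coefficients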
diff --git a/qml/models/cqml2d.py b/qml/models/cqml2d.py new file mode 100644 index 000000000..cfa003e1d --- /dev/null +++ b/qml/models/cqml2d.py @@ -0,0 +1,407 @@ +# MIT License
+#
+# Copyright (c) 2018 Peter Zaspel, Helmut Harbrecht
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from math import *
+import os
+import numpy as np
+import qml
+from qml.kernels import laplacian_kernel, gaussian_kernel
+from qml.math import cho_solve
+import scipy.spatial.distance as ssd
+import pickle
+
+from qml.representations import get_slatm_mbtypes
+
+# struct / class containing parameters for the multilevel data
+class multilevel_data_config:
+    # list of learning data files per level
+    data_files = None
+    # list of directories with xyz files per level
+    xyz_directories = None
+    # number of compounds to load for training and testing
+    N_load = 0
+    # representation type ("slatm" / "coulomb_matrix")
+    representation_type = "slatm"
+    # number of levels which shall actually be loaded
+    level_count_load = 0
+
+# struct / class containing the multilevel data
+class multilevel_data:
+    # list of representation arrays for each level
+    X = None
+    # list of learning data arrays for each level
+    Y = None
+    # list of cost arrays for each level
+    c = None
+    # level-wise cost of the data
+    level_costs = None
+    # function to store data
+    def save(self, file_name):
+        with open(file_name, 'wb') as f:
+            pickle.dump(self, f)
+    # function to load data
+    @staticmethod
+    def load(file_name):
+        with open(file_name, 'rb') as f:
+            return pickle.load(f)
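+
+# --------------------------------------------------------------------------
+# Editorial sketch (not part of the original module): a minimal round trip
+# through multilevel_data.save() / multilevel_data.load(), using synthetic
+# learning data and a throwaway file name invented for this demo.
+def _multilevel_data_pickle_demo():
+    md = multilevel_data()
+    # two artificial levels with three "energies" each
+    md.Y = [np.array([1.0, 2.0, 3.0]), np.array([1.1, 2.1, 3.1])]
+    md.level_costs = [0, 1]
+    md.save("ml_data_demo.pkl")
+    md_reloaded = multilevel_data.load("ml_data_demo.pkl")
+    os.remove("ml_data_demo.pkl")
+    return md_reloaded.Y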
+
+# function to load the energies / the information to be learned
+def get_multilevel_energies(ml_data_cfg):
+
+    # list which will contain the learning data per level
+    energies = []
+
+    # go over all levels that shall be loaded
+    for l in range(ml_data_cfg.level_count_load):
+        # take the file name of the data file
+        filename = ml_data_cfg.data_files[l]
+
+        # read in all lines of the data file
+        f = open(filename, "r")
+        lines = f.readlines()
+        f.close()
+
+        # this list will contain the learning data for the current level
+        current_energies = []
+
+        # read in the learning data from the file contents
+        for line in lines[:ml_data_cfg.N_load]:
+            current_energies.append(float(line))
+
+        # attach the learning data of the current level to the per-level list
+        energies.append(np.array(current_energies))
+
+    # return the multilevel learning data
+    return energies
+
+# function to load the computational costs per level
+def get_multilevel_costs(ml_data_cfg):
+
+    # list which will contain the cost data per level
+    costs = []
+
+    # go over all levels that shall be loaded
+    for l in range(ml_data_cfg.level_count_load):
+        # take the file name of the cost file
+        filename = ml_data_cfg.cost_files[l]
+
+        # read in all lines of the data file
+        f = open(filename, "r")
+        lines = f.readlines()
+        f.close()
+
+        # this list will contain the cost data for the current level
+        current_costs = []
+
+        # read in the cost data from the file contents
+        for line in lines[:ml_data_cfg.N_load]:
+            current_costs.append(float(line))
+
+        # attach the cost data of the current level to the per-level list
+        costs.append(np.array(current_costs))
+
+    # return the multilevel cost data
+    return costs
+
+# load multilevel data, i.e. representations and learning data
+def load_multilevel_data(ml_data_cfg):
+
+    # retrieve an instance of the multilevel_data structure for this configuration
+    ml_data = multilevel_data()
+
+    # get all levels of energies / information to be learned
+    ml_data.Y = get_multilevel_energies(ml_data_cfg)
+
+    # list which will contain the representation lists per level
+    ml_data.X = []
+
+    # go over all levels
+    for l in range(ml_data_cfg.level_count_load):
+
+        # information
+        print("Loading data for level %d ..." % (l))
+
+        # generate a list of qml.Compound() objects
+        mols = []
+
+        # loop over all samples / molecules which shall be loaded
+        for i in range(ml_data_cfg.N_load):
+
+            # get the file name of the current xyz file which is to be used
+            filename = ("%s/frag_%04d.xyz" % (ml_data_cfg.xyz_directories[l], i + 1))
+
+            # initialize the qml.Compound() object, loading the xyz data file
+            mol = qml.Compound(xyz=filename)
+
+            # attach the current molecule to the list of all molecules / samples
+            mols.append(mol)
+
+        # in case of the slatm representation, precompute the necessary data
+        if ml_data_cfg.representation_type == "slatm":
+            mbtypes = get_slatm_mbtypes(np.array([mol.nuclear_charges for mol in mols]))
+
+        # loop over all samples / molecules
+        for mol in mols:
+            # generate coulomb matrix, if this type was chosen
+            if ml_data_cfg.representation_type == "coulomb_matrix":
+                # this is a molecular Coulomb matrix, sorted by row norm
+                mol.generate_coulomb_matrix(size=23, sorting="row-norm")
+            # generate slatm representation, if this type was chosen
+            elif ml_data_cfg.representation_type == "slatm":
+                mol.generate_slatm(mbtypes)
+            else:
+                print("Error: Representation type not supported")
+
+        # add all representations to the representation list for the current level
+        ml_data.X.append(np.array([mol.representation for mol in mols]))
+
+    # return the loaded multilevel data
+    return ml_data
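+
+# --------------------------------------------------------------------------
+# Editorial sketch (not part of the original module): the loader above relies
+# on qml.Compound and generate_coulomb_matrix(); this snippet shows the same
+# calls on a single, hand-written water geometry. The xyz file name and its
+# contents are placeholders made up for this demo.
+def _coulomb_matrix_demo():
+    # write a minimal xyz file for a single water molecule
+    with open("water_demo.xyz", "w") as f:
+        f.write("3\nwater\nO 0.000 0.000 0.000\nH 0.757 0.586 0.000\nH -0.757 0.586 0.000\n")
+    # load it and generate its row-norm sorted Coulomb matrix, as in load_multilevel_data()
+    mol = qml.Compound(xyz="water_demo.xyz")
+    mol.generate_coulomb_matrix(size=23, sorting="row-norm")
+    os.remove("water_demo.xyz")
+    return mol.representation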
+
+# struct / class containing all configuration parameters for the combination technique
+class cqml2d_config:
+    # list of regularization parameters for each global level (=> for all levels as in ml_data)
+    regularizations = None
+    # list of scaling parameters for each global level
+    scalings = None
+    # number of local (!) levels (=> number of levels actually used in the CT learning, not (!) len(X))
+    level_count = 0
+    # (global!) base level used in the CT learning (starting from 0, numbering as in ml_data)
+    base_level = 0
+    # (global!) error level used in the CT test phase (starting from 0, numbering as in ml_data)
+    error_level = 0
+    # resolution level stating that the maximum sample count for which the convergence test is done is 2^max_resolution_level
+    max_resolution_level = 12
+    # array containing the jump sizes per local (!) level (i.e. starting from base_level)
+    level_jump_size = None
+
+# this function generates the kernel matrix for a given representation list (for one level)
+def generate_kernel_matrix(X, llambda, scaling):
+
+    # setting hyper parameter
+    sigma = scaling
+    # generate kernel matrix
+    K = laplacian_kernel(X, X, sigma)
+    # adding regularization
+    K[np.diag_indices_from(K)] += llambda
+
+    return K
+
+# this function generates the kernel evaluation matrix between two levels of representations
+def generate_evaluation_matrix(X1, X2, llambda, scaling):
+
+    # setting hyper parameter
+    sigma = scaling
+    # generate kernel matrix
+    K = laplacian_kernel(X1, X2, sigma)
+    # adding regularization
+    K[np.diag_indices_from(K)] += llambda
+
+    return K
+
+# this function learns the kernel coefficients from the given data, limited to a subset of the data (a subset of the learning info is given)
+def learn_from_subset_using_data(K, Y_subset, subset_indices):
+
+    # get subset of the kernel matrix
+    Ksmall = K[np.ix_(subset_indices, subset_indices)]
+
+    # do the learning
+    alpha = cho_solve(Ksmall, Y_subset)
+
+    # return the computed coefficients
+    return alpha
+
+# this function does the kernel evaluation, given a known set of subset indices for the training and test part
+def predict_from_subset(K, coeffs, learning_subset_indices, to_be_predicted_subset):
+
+    # get subset of the kernel matrix
+    Ke = K[np.ix_(to_be_predicted_subset, learning_subset_indices)]
+
+    # predict and return the prediction
+    return np.dot(Ke, coeffs)
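+
+# --------------------------------------------------------------------------
+# Editorial sketch (not part of the original module): learning and prediction
+# with the two helpers above on synthetic data. The train/test split and the
+# hyperparameters (sigma = 400.0, llambda = 1e-10) are arbitrary placeholders.
+def _subset_krr_demo():
+    # synthetic representations (8 samples, 4 features) and labels
+    X_demo = np.random.rand(8, 4)
+    Y_demo = np.random.rand(8)
+    # full (regularized) kernel matrix over all samples
+    K = generate_kernel_matrix(X_demo, 1e-10, 400.0)
+    # train on the first six samples ...
+    train_idx = np.arange(6)
+    alpha = learn_from_subset_using_data(K, Y_demo[train_idx], train_idx)
+    # ... and predict on the remaining two
+    test_idx = np.arange(6, 8)
+    return predict_from_subset(K, alpha, train_idx, test_idx)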
+
+# this function evaluates the combination-technique-based QML model and gives back the error
+def compute_cqml2d_error_using_subsets_fixed_error_level(ml_data_cfg, ml_data, ct_cfg, K, N_subsets):
+
+    # ATTENTION: in this function, data structures will be built such that they are relative to ct_cfg.base_level !!!
+
+    ################################
+    # Combination Technique Learning
+    ################################
+
+    # list containing the subset indices used for learning for each level
+    learning_subset_indices = []
+
+    # list containing the remaining subset indices for each level
+    remaining_subset_indices = []
+
+    # building the subset indices lists; idea: shuffle 1:N randomly and then split up according to N_subsets
+    for l in range(ct_cfg.level_count):
+        indices = np.array(range(len(ml_data.Y[ct_cfg.base_level + l])))
+        np.random.shuffle(indices)
+        learning_subset_indices.append(indices[:N_subsets[l]])
+        remaining_subset_indices.append(indices[N_subsets[l]:])
+
+    # list of the learning coefficients for each level
+    coeffs = []
+
+    # list of predictions for each level
+    predicted_results = []
+
+    # list of hierarchical surpluses for each level
+    hierarchical_surplus = []
+
+    # go over all levels (again: these are the levels relative to ct_cfg.base_level, not the levels as in ml_data!)
+    for l in range(ct_cfg.level_count):
+        # use the combined models from all previous levels to predict energies at the training points of the new level
+        predicted_results.append(np.zeros((len(learning_subset_indices[l]))))
+        for ll in range(l):
+            predicted_results[l] = predicted_results[l] + predict_from_subset(K[ct_cfg.base_level + l][ct_cfg.base_level + ll], coeffs[ll], learning_subset_indices[ll], learning_subset_indices[l])
+
+        # compute the offset / hierarchical surplus between the data on this level and the prediction for this level => this is the DELTA to learn
+        hierarchical_surplus.append(ml_data.Y[ct_cfg.base_level + l][learning_subset_indices[l]].T - predicted_results[l])
+
+        # learn the hierarchical surplus
+        coeffs.append(learn_from_subset_using_data(K[ct_cfg.base_level + l][ct_cfg.base_level + l], hierarchical_surplus[l], learning_subset_indices[l]))
+
+    ##################################
+    # Combination Technique Evaluation
+    ##################################
+
+    # list of the result contributions for each level when doing the error evaluation
+    evaluation_result_contributions = []
+
+    # evaluate the machine learning model on points that have been used for
+    # none (!) of the levels (this is why "remaining_subset_indices[0]", with a "0", is used);
+    # attention: ct_cfg.error_level is the absolute level wrt. ml_data, not the relative level as used in all other data structures.
+    # This becomes necessary since it shall be possible to decouple the learning level from the evaluation level.
+    for l in range(ct_cfg.level_count):
+        evaluation_result_contributions.append(predict_from_subset(K[ct_cfg.error_level][ct_cfg.base_level + l], coeffs[l], learning_subset_indices[l], remaining_subset_indices[0]))
+
+    # add up the hierarchical contributions to get the evaluation of the results on the test points
+    results = evaluation_result_contributions[0]
+    for l in range(1, ct_cfg.level_count):
+        results = results + evaluation_result_contributions[l]
+
+    # measure the error wrt. the model on ct_cfg.error_level (again: this is the absolute level!)
+    error = np.mean(np.abs(results - ml_data.Y[ct_cfg.error_level][remaining_subset_indices[0]]))
+
+    return error
+
+# this function computes, for the given parameters, the combination technique learning and testing and performs a convergence study (with averaging over the errors)
+def compute_cqml2d_convergence(ml_data_cfg, ml_data, ct_cfg, trials):
+
+    # initialize the array which will hold the averaged errors
+    averaged_errors = np.array(0)
+
+    # loop over all trials to compute the error
+    for t in range(trials):
+
+        # array containing the errors
+        errors = []
+
+        # array containing the size of the subsets per level
+        N_subsets = []
+
+        # array of arrays / matrix containing the kernel matrices between all representation levels
+        K = []
+
+        # filling the matrix of kernel matrices
+        for l1 in range(ml_data_cfg.level_count_load):
+            K_current_l1 = []
+            print("Filling matrix of kernel matrices ...")
+            for l2 in range(ml_data_cfg.level_count_load):
+                print("(%d, %d)" % (l1, l2))
+                K_current_l1.append(generate_evaluation_matrix(ml_data.X[l1], ml_data.X[l2], ct_cfg.regularizations[l2], ct_cfg.scalings[l2]))
+            K.append(K_current_l1)
+
+        # array holding, for each convergence point, the number of subsets per level
+        nums = [0] * (ct_cfg.max_resolution_level + 1)
+
+        if len(ct_cfg.level_jump_size) == 1:
+            for i in range(((ct_cfg.level_count - 1) * ct_cfg.level_jump_size[0]), ct_cfg.max_resolution_level + 1):
+
+                # computing the size of the subsets for the current convergence point
+                N_subsets = []
+                for l in range(ct_cfg.level_count):
+                    N_subsets.append(2 ** (i - ct_cfg.level_jump_size[0] * (l)))
+
+                print("Computing error for coarse problem size of %d..." % N_subsets[0])
+
+                # computing the combination technique, getting back the error for the current convergence point
+                error = compute_cqml2d_error_using_subsets_fixed_error_level(ml_data_cfg, ml_data, ct_cfg, K, N_subsets)
+
+                nums[i] = N_subsets
+                errors.append(error)
+
+        else:
+            for i in range(ct_cfg.level_jump_size[ct_cfg.level_count - 1], ct_cfg.max_resolution_level + 1):
+
+                # computing the size of the subsets for the current convergence point
+                N_subsets = []
+                for l in range(ct_cfg.level_count):
+                    N_subsets.append(2 ** (i - ct_cfg.level_jump_size[l]))
+
+                # computing the combination technique, getting back the error for the current convergence point
+                error = compute_cqml2d_error_using_subsets_fixed_error_level(ml_data_cfg, ml_data, ct_cfg, K, N_subsets)
+
+                nums[i] = N_subsets
+                errors.append(error)
+
+        # array holding the costs for each convergence point
+        costs = []
+
+        if (len(ct_cfg.level_jump_size) == 1):
+            for i in range(((ct_cfg.level_count - 1) * ct_cfg.level_jump_size[0]), ct_cfg.max_resolution_level + 1):
+                current_cost = 0
+                for l in range(ct_cfg.level_count):
+                    current_cost = current_cost + nums[i][l] * ml_data.level_costs[ct_cfg.base_level + l]
+                costs.append(current_cost)
+        else:
+            for i in range(ct_cfg.level_jump_size[ct_cfg.level_count - 1], ct_cfg.max_resolution_level + 1):
+                current_cost = 0
+                for l in range(ct_cfg.level_count):
+                    current_cost = current_cost + nums[i][l] * ml_data.level_costs[ct_cfg.base_level + l]
+                costs.append(current_cost)
+
+        # add the errors to the total sum of errors (which will be used for averaging)
+        averaged_errors = averaged_errors + np.array(errors)
+
+    # average the errors
+    averaged_errors = averaged_errors / trials
+
+    return (averaged_errors, costs)
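+
+
+# --------------------------------------------------------------------------
+# Editorial sketch (not part of the original module): the learning loop in
+# compute_cqml2d_error_using_subsets_fixed_error_level() trains each level
+# only on the hierarchical surplus, i.e. the delta between that level's data
+# and the prediction of the models combined so far. This numpy-only toy
+# replays the two-level case with synthetic data and a "perfect" coarse
+# model, just to make the additive structure visible.
+def _hierarchical_surplus_demo():
+    rng = np.random.RandomState(0)
+    n = 16
+    y_coarse = rng.rand(n)                  # cheap, approximate level-0 data
+    y_fine = y_coarse + 0.1 * rng.rand(n)   # expensive, accurate level-1 data
+    # level-0 "model": here simply the coarse data itself
+    prediction_0 = y_coarse
+    # level 1 learns only the surplus on a few expensive samples ...
+    few = np.arange(0, n, 4)
+    surplus = y_fine[few] - prediction_0[few]
+    # ... and the combined model adds this correction on top of level 0
+    combined = prediction_0[few] + surplus
+    # on the level-1 training points the combination reproduces y_fine exactly
+    return np.allclose(combined, y_fine[few])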