Skip to content
This repository was archived by the owner on Dec 8, 2024. It is now read-only.

Commit 4143b49

Browse files
authored
Kernel PCA (#101)
* Added Kernel PCA and test for KPCA
1 parent 9cb742e commit 4143b49

File tree

4 files changed

+233
-3
lines changed

4 files changed

+233
-3
lines changed

qml/kernels/fkpca.f90

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
! MIT License
2+
!
3+
! Copyright (c) 2018 Anders Steen Christensen
4+
!
5+
! Permission is hereby granted, free of charge, to any person obtaining a copy
6+
! of this software and associated documentation files (the "Software"), to deal
7+
! in the Software without restriction, including without limitation the rights
8+
! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
! copies of the Software, and to permit persons to whom the Software is
10+
! furnished to do so, subject to the following conditions:
11+
!
12+
! The above copyright notice and this permission notice shall be included in all
13+
! copies or substantial portions of the Software.
14+
!
15+
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
! SOFTWARE.
22+
23+
24+
subroutine fkpca(k, n, centering, kpca)
25+
26+
implicit none
27+
28+
double precision, dimension(:,:), intent(in) :: k
29+
integer, intent(in) :: n
30+
logical, intent(in) :: centering
31+
double precision, dimension(n,n), intent(out) :: kpca
32+
33+
! Eigenvalues
34+
double precision, dimension(n) :: eigenvals
35+
36+
double precision, allocatable, dimension(:) :: work
37+
38+
integer :: lwork
39+
integer :: info
40+
41+
integer :: i
42+
43+
double precision :: inv_n
44+
double precision, allocatable, dimension(:) :: temp
45+
double precision :: temp_sum
46+
47+
kpca(:,:) = k(:,:)
48+
49+
! This first part centers the matrix,
50+
! basically Kpca = K - G@K - K@G + G@K@G, with G = 1/n
51+
! It is a bit hard to follow, sry, but it is very fast
52+
! and requires very little memory overhead.
53+
54+
if (centering) then
55+
56+
inv_n = 1.0d0 / n
57+
58+
allocate(temp(n))
59+
temp(:) = 0.0d0
60+
61+
!$OMP PARALLEL DO
62+
do i = 1, n
63+
temp(i) = sum(k(i,:)) * inv_n
64+
enddo
65+
!$OMP END PARALLEL DO
66+
67+
temp_sum = sum(temp(:)) * inv_n
68+
69+
!$OMP PARALLEL DO
70+
do i = 1, n
71+
kpca(i,:) = kpca(i,:) + temp_sum
72+
enddo
73+
!$OMP END PARALLEL DO
74+
75+
!$OMP PARALLEL DO
76+
do i = 1, n
77+
kpca(:,i) = kpca(:,i) - temp(i)
78+
enddo
79+
!$OMP END PARALLEL DO
80+
81+
!$OMP PARALLEL DO
82+
do i = 1, n
83+
kpca(i,:) = kpca(i,:) - temp(i)
84+
enddo
85+
!$OMP END PARALLEL DO
86+
87+
deallocate(temp)
88+
89+
endif
90+
91+
! This 2nd part solves the eigenvalue problem with the least
92+
! memory intensive solver, namely DSYEV(). DSYEVD() is twice
93+
! as fast, but requires a lot more memory, which quickly
94+
! becomes prohibitive.
95+
96+
! Dry run which returns the optimal "lwork"
97+
allocate(work(1))
98+
call dsyev("V", "U", n, kpca, n, eigenvals, work, -1, info)
99+
lwork = nint(work(1)) + 1
100+
deallocate(work)
101+
102+
! Get eigenvectors
103+
allocate(work(lwork))
104+
call dsyev("V", "U", n, kpca, n, eigenvals, work, lwork, info)
105+
deallocate(work)
106+
107+
if (info < 0) then
108+
109+
write (*,*) "ERROR: The ", -info, "-th argument to DSYEV() had an illegal value."
110+
111+
else if (info > 0) then
112+
113+
write (*,*) "ERROR: DSYEV() failed to compute an eigenvalue."
114+
115+
end if
116+
117+
! This 3rd part sorts the kernel PCA matrix such that the first PCA is kpca(1)
118+
kpca = kpca(:,n:1:-1)
119+
kpca = transpose(kpca)
120+
121+
!$OMP PARALLEL DO
122+
do i = 1, n
123+
kpca(i,:) = kpca(i,:) * sqrt(eigenvals(n - i + 1))
124+
enddo
125+
!$OMP END PARALLEL DO
126+
127+
end subroutine fkpca

qml/kernels/kernels.py

+31
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
from .fkernels import fget_local_kernels_laplacian
3737
from .fkernels import fget_vector_kernels_gaussian, fget_vector_kernels_gaussian_symmetric
3838

39+
from .fkernels import fkpca
40+
3941
def laplacian_kernel(A, B, sigma):
4042
""" Calculates the Laplacian kernel matrix K, where :math:`K_{ij}`:
4143
@@ -351,3 +353,32 @@ def get_local_kernels_laplacian(A, B, na, nb, sigmas):
351353
nsigmas = len(sigmas)
352354

353355
return fget_local_kernels_laplacian(A.T, B.T, na, nb, sigmas, nma, nmb, nsigmas)
356+
357+
358+
def kpca(K, n=2, centering=True):
359+
""" Calculates `n` first principal components for the kernel :math:`K`.
360+
361+
The PCA is calculated using an OpenMP parallel Fortran routine.
362+
363+
A square, symmetric kernel matrix is required. Centering of the kernel matrix
364+
is enabled by default, although this isn't a strict requirement.
365+
366+
:param K: 2D kernel matrix
367+
:type K: numpy array
368+
:param n: Number of kernel PCAs to return (default=2)
369+
:type n: integer
370+
:param centering: Whether to center the kernel matrix (default=True)
371+
:type centering: bool
372+
373+
:return: array containing the principal components
374+
:rtype: numpy array
375+
"""
376+
377+
assert K.shape[0] == K.shape[1], "ERROR: Square matrix required for Kernel PCA."
378+
assert np.allclose(K, K.T, atol=1e-8), "ERROR: Symmetric matrix required for Kernel PCA."
379+
assert n <= K.shape[0], "ERROR: Requested more principal components than matrix size."
380+
381+
size = K.shape[0]
382+
pca = fkpca(K, size, centering)
383+
384+
return pca[:n]

setup.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,14 @@
4747

4848

4949
ext_fkernels = Extension(name = '.kernels.fkernels',
50-
sources = ['qml/kernels/fkernels.f90'],
50+
sources = [
51+
'qml/kernels/fkernels.f90',
52+
'qml/kernels/fkpca.f90',
53+
],
5154
extra_f90_compile_args = COMPILER_FLAGS,
5255
extra_f77_compile_args = COMPILER_FLAGS,
5356
extra_compile_args = COMPILER_FLAGS,
54-
extra_link_args = LINKER_FLAGS,
57+
extra_link_args = LINKER_FLAGS + MATH_LINKER_FLAGS,
5558
language = FORTRAN,
5659
f2py_options=['--quiet'])
5760

test/test_kernels.py

+70-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@
2323
from __future__ import print_function
2424

2525
import sys
26+
import os
2627
import numpy as np
28+
import scipy
29+
30+
import sklearn
31+
from sklearn.decomposition import KernelPCA
32+
2733
import qml
2834
from qml.kernels import laplacian_kernel
2935
from qml.kernels import gaussian_kernel
@@ -32,6 +38,27 @@
3238
from qml.kernels import linear_kernel
3339
from qml.kernels import matern_kernel
3440
from qml.kernels import sargan_kernel
41+
from qml.kernels import kpca
42+
43+
def get_energies(filename):
44+
""" Returns a dictionary with heats of formation for each xyz-file.
45+
"""
46+
47+
f = open(filename, "r")
48+
lines = f.readlines()
49+
f.close()
50+
51+
energies = dict()
52+
53+
for line in lines:
54+
tokens = line.split()
55+
56+
xyz_name = tokens[0]
57+
hof = float(tokens[1])
58+
59+
energies[xyz_name] = hof
60+
61+
return energies
3562

3663
def test_laplacian_kernel():
3764

@@ -136,7 +163,6 @@ def test_matern_kernel():
136163

137164
for metric in ("l1", "l2"):
138165
for order in (0, 1, 2):
139-
print(metric,order)
140166
matern(metric, order)
141167

142168
def matern(metric, order):
@@ -220,9 +246,52 @@ def sargan(ngamma):
220246
# Check for symmetry:
221247
assert np.allclose(Ksymm, Ksymm.T), "Error in Sargan kernel"
222248

249+
250+
def array_nan_close(a, b):
251+
# Compares arrays, ignoring nans
252+
253+
m = np.isfinite(a) & np.isfinite(b)
254+
return np.allclose(a[m], b[m], atol=1e-8, rtol=0.0)
255+
256+
257+
def test_kpca():
258+
259+
test_dir = os.path.dirname(os.path.realpath(__file__))
260+
261+
# Parse file containing PBE0/def2-TZVP heats of formation and xyz filenam
262+
data = get_energies(test_dir + "/data/hof_qm7.txt")
263+
264+
# Generate a list of qml.data.Compound() objects
265+
mols = []
266+
267+
keys = sorted(data.keys())
268+
269+
np.random.seed(666)
270+
np.random.shuffle(keys)
271+
272+
n_mols = 100
273+
274+
for xyz_file in keys[:n_mols]:
275+
276+
mol = qml.data.Compound(xyz=test_dir + "/qm7/" + xyz_file)
277+
mol.properties = data[xyz_file]
278+
mol.generate_bob()
279+
mols.append(mol)
280+
281+
X = np.array([mol.representation for mol in mols])
282+
K = laplacian_kernel(X, X, 2e5)
283+
284+
pcas_qml = kpca(K, n=10)
285+
pcas_sklearn = KernelPCA(10, eigen_solver="dense", kernel='precomputed').fit_transform(K)
286+
287+
assert array_nan_close(np.abs(pcas_sklearn.T), np.abs(pcas_qml)), "Error in Kernel PCA decomposition."
288+
289+
223290
if __name__ == "__main__":
291+
224292
test_laplacian_kernel()
225293
test_gaussian_kernel()
226294
test_linear_kernel()
227295
test_matern_kernel()
228296
test_sargan_kernel()
297+
test_kpca()

0 commit comments

Comments
 (0)