diff --git a/qml/ml/driver/driver_mpi.f90 b/qml/ml/driver/driver_mpi.f90
new file mode 100644
index 000000000..6101f3501
--- /dev/null
+++ b/qml/ml/driver/driver_mpi.f90
@@ -0,0 +1,193 @@
+! MIT License
+!
+! Copyright (c) 2016 Anders Steen Christensen, Guido Falk von Rudorff
+!
+! Permission is hereby granted, free of charge, to any person obtaining a copy
+! of this software and associated documentation files (the "Software"), to deal
+! in the Software without restriction, including without limitation the rights
+! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+! copies of the Software, and to permit persons to whom the Software is
+! furnished to do so, subject to the following conditions:
+!
+! The above copyright notice and this permission notice shall be included in all
+! copies or substantial portions of the Software.
+!
+! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+! SOFTWARE.
+
+
+! prototype external calculator of the kernel matrix and solver for alpha coefficients
+! makes use of a PBLAS implementation, here the MKL variant
+! core idea is to distribute the kernel matrix across MPI ranks to allow out-of-core datasets to be processed
+! 
+! CSCS: 
+!   load modules PrgEnv-cray/6.0.4 cray-mpich/7.6.0 daint-gpu intel/17.0.4.196  
+!   build with ifort driver_mpi.f90 -o driver_mpi  -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_openmpi_lp64 -lpthread -lm -ldl  -m64 -I${MKLROOT}/include -Og -g -Wall -Wextra -pedantic -fimplicit-none -fcheck=all -fbacktrace
+program qml_driver
+    use mpi
+    implicit none
+
+    double precision, allocatable, dimension(:,:) :: Q
+    double precision, allocatable, dimension(:):: Y
+    
+    double precision :: sigma
+    integer :: rep_size
+    integer :: n_molecules
+
+    double precision, allocatable, dimension(:) :: alphas
+
+    integer :: info
+    integer :: na, i, numroc
+    integer :: local_i, local_j, global_i, global_j
+
+    integer :: lwork
+    double precision, dimension(:), allocatable :: work
+    double precision, dimension(:,:), allocatable :: local_K, local_B
+
+    ! pblacs data structures
+    integer :: local_K_rows, local_K_cols
+    integer :: local_B_rows, local_B_cols
+    integer :: local_id, num_ranks, ranks_rows, ranks_cols, context
+    integer :: local_rank_col, local_rank_row, block_size
+    integer :: ierr
+    integer, dimension(9) :: desca, descb
+    integer, dimension(2) :: dims
+    double precision, dimension(2) :: work_query
+    
+    ! init pblacs and set cartesian grid of MPI ranks
+    call blacs_pinfo(local_id, num_ranks)
+
+    dims = 0
+    call MPI_Dims_create(num_ranks, 2, dims, ierr)
+    ranks_rows = dims(1)
+    ranks_cols = dims(2)
+
+    block_size = 100
+
+    ! create BLACS context
+    call blacs_get(0, 0, context)
+    call blacs_gridinit(context, 'R', ranks_rows, ranks_cols)
+    call blacs_gridinfo(context, ranks_rows, ranks_cols, &
+        local_rank_row, local_rank_col)
+
+    ! Read hyperparameters and arrat suzes
+    open(unit = 9, file = "parameters.fout", form="formatted")
+
+        read(9,*) sigma, rep_size, n_molecules
+
+        ! Allocate labels
+        allocate(Y(n_molecules))
+
+        read(9,*) Y(:n_molecules)
+
+    close(9)
+    
+    ! allocate alphas
+    allocate(alphas(n_molecules))
+    ! Allocate representations
+    allocate(Q(n_molecules, rep_size))
+
+    ! Read representations
+    open(unit = 9, file = "representations.fout", form="formatted")
+        
+        ! Read representaions for each molecule
+        do i = 1, n_molecules 
+            read(9,*) Q(i,:rep_size)
+        enddo
+
+    close(9)
+
+    ! Size of kernel
+    na = size(Q, dim=1)
+
+    ! Allocate kernel and output
+    local_K_rows = numroc(na, block_size, local_rank_row, 0, ranks_rows)
+    local_K_cols = numroc(na, block_size, local_rank_col, 0, ranks_cols)
+    allocate(local_K(local_K_rows, local_K_cols))
+    local_B_rows = numroc(na, block_size, local_rank_row, 0, ranks_rows)
+    local_B_cols = numroc(1, block_size, local_rank_col, 0, ranks_cols)
+    allocate(local_B(local_B_rows, local_B_cols))
+
+    ! Calculate Laplacian kernel
+    do local_j = 1, local_K_cols
+        call l2g(local_j, local_rank_col, ranks_cols, &
+                block_size, global_j)
+        do local_i = 1, local_K_rows
+            call l2g(local_i, local_rank_row, ranks_rows, &
+                block_size, global_i)
+            local_K(local_i, local_j) = exp(-sum(abs(Q(global_j,:) - Q(global_i,:)))/sigma)
+        enddo
+    enddo
+
+    ! Setup variables for LAPACK
+    call descinit(desca, na, na, block_size, block_size, 0, 0, context, MAX(1, local_K_rows), info)
+    call descinit(descb, na, 1, block_size, block_size, 0, 0, context, MAX(1, local_B_rows), info)
+
+    ! Allocate local work arrays
+    call pdgels("N", na, na, 1, local_K, 1, 1, desca, local_B, 1, 1, DESCB, work_query, -1, info)
+    lwork = INT(work_query(1))
+    allocate(work(lwork))
+
+    ! copy data
+    local_B = 0.0d0
+    if (local_B_cols .ne. 0) then
+        do local_i = 1, local_B_rows
+            call l2g(local_i, local_rank_row, ranks_rows, block_size, global_i)
+            local_B(local_i, 1) = y(global_i)
+        enddo
+    end if
+
+    ! Solver
+    call pdgels("N", na, na, 1, local_K, 1, 1, desca, local_B, 1, 1, DESCB, work, lwork, info)
+
+    ! Copy LAPACK output
+    alphas = 0.0d0
+    if (local_B_cols .ne. 0) then
+        do local_i = 1, local_B_rows
+            call l2g(local_i, local_rank_row, ranks_rows, block_size, global_i)
+            alphas(global_i) = local_B(local_i, 1)
+        enddo
+    endif
+    call DGSUM2D(context, "All", "1-tree", na, 1, alphas, 1, -1, -1)
+
+    ! Save alphas to file
+    if (local_id .eq. 0) then
+        open(unit = 9, file = "alphas_mpi.fout", form="formatted")
+            write(9,*) alphas(:)
+        close(9)
+    end if
+   
+    ! Tear down MPI
+    call blacs_exit(0)
+
+    ! Clean up 
+    deallocate(work)
+    deallocate(local_B)
+    deallocate(Q)
+    deallocate(local_K)
+    deallocate(Y)    
+    deallocate(alphas)    
+
+end program qml_driver
+! convert local index to global index in block-cyclic distribution
+
+   subroutine l2g(il,p,np,nb,i)
+
+   implicit none
+   integer :: il   ! local array index, input
+   integer :: p    ! processor array index, input
+   integer :: np   ! processor array dimension, input
+   integer :: nb   ! block size, input
+   integer :: i    ! global array index, output
+   integer :: ilm1   
+
+   ilm1 = il-1
+   i    = (((ilm1/nb) * np) + p)*nb + mod(ilm1,nb) + 1
+
+   return
+   end subroutine l2g