From 89feb9167b8ed0cd95a1c8a027646ca1fec362a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Sat, 16 May 2020 08:57:58 -0600 Subject: [PATCH] Initial implementation of COO / CSR sparse format The key of this is the API, as shown with examples in test_sparse.f90. In particular, I am proposing the following functions / API: Higher level API: Conversion Dense <-> COO: * dense2coo * coo2dense Conversion COO -> CSR: * coo2csr CSR functionality: * csr_matvec * csr_getvalue Dense functionality: * getnnz Lower level API: * csr_has_canonical_format * csr_sum_duplicates * csr_sort_indices In these algorithms, it seems it is possible to just have one (best) implementation with one exception: sorting of indices (as implemented by `csr_sort_indices`), where different algorithms give different performance and it depends on the actual matrix. As such, it might make sense to provide a default in `csr_sort_indices` that is called by `coo2csr` that is as fast as we can make it for most cases (currently it uses quicksort, but there are other faster algoritms for our use case that we should switch over) and then provide other implementations such as `csr_sort_indices_mergesort`, `csr_sort_indices_timsort`, etc. for users so that they can choose the algorithm for sorting indices, or even provide their own. The file stdlib_experimental_sparse.f90 provides an example implementation of these algorithms that was inspired by SciPy. We can also discuss the details of that, but the key that I would like to discuss is the API itself. --- src/CMakeLists.txt | 1 + src/Makefile.manual | 1 + src/stdlib_experimental_sparse.f90 | 335 +++++++++++++++++++++++++++++ src/tests/CMakeLists.txt | 1 + src/tests/sparse/CMakeLists.txt | 1 + src/tests/sparse/test_sparse.f90 | 117 ++++++++++ 6 files changed, 456 insertions(+) create mode 100644 src/stdlib_experimental_sparse.f90 create mode 100644 src/tests/sparse/CMakeLists.txt create mode 100644 src/tests/sparse/test_sparse.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 51fd01d62..81eab715b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,7 @@ set(SRC stdlib_experimental_error.f90 stdlib_experimental_kinds.f90 stdlib_experimental_system.F90 + stdlib_experimental_sparse.f90 ${outFiles} ) diff --git a/src/Makefile.manual b/src/Makefile.manual index 55f0352ed..10264c5f0 100644 --- a/src/Makefile.manual +++ b/src/Makefile.manual @@ -8,6 +8,7 @@ SRC = f18estop.f90 \ stdlib_experimental_optval.f90 \ stdlib_experimental_quadrature.f90 \ stdlib_experimental_quadrature_trapz.f90 \ + stdlib_experimental_sparse.f90 \ stdlib_experimental_stats.f90 \ stdlib_experimental_stats_mean.f90 \ stdlib_experimental_stats_moment.f90 \ diff --git a/src/stdlib_experimental_sparse.f90 b/src/stdlib_experimental_sparse.f90 new file mode 100644 index 000000000..b9448b175 --- /dev/null +++ b/src/stdlib_experimental_sparse.f90 @@ -0,0 +1,335 @@ +module stdlib_experimental_sparse +use stdlib_experimental_kinds, only: dp +implicit none +private +public coo2dense, dense2coo, getnnz, coo2csr, coo2csc, & + csr_has_canonical_format, csr_sum_duplicates, csr_sort_indices, & + coo2csr_canonical, csr_matvec, csr_getvalue + +contains + +! Dense + +subroutine dense2coo(B, Ai, Aj, Ax) +real(dp), intent(in) :: B(:, :) +integer, intent(out) :: Ai(:), Aj(:) +real(dp), intent(out) :: Ax(:) +integer :: i, j, idx +idx = 1 +do j = 1, size(B, 2) + do i = 1, size(B, 1) + if (abs(B(i, j)) < tiny(1._dp)) cycle + Ai(idx) = i + Aj(idx) = j + Ax(idx) = B(i, j) + idx = idx + 1 + end do +end do +end subroutine + +integer function getnnz(B) result(nnz) +real(dp), intent(in) :: B(:, :) +integer :: i, j +nnz = 0 +do j = 1, size(B, 2) + do i = 1, size(B, 1) + if (abs(B(i, j)) < tiny(1._dp)) cycle + nnz = nnz + 1 + end do +end do +end function + +! COO + +subroutine coo2dense(Ai, Aj, Ax, B) +integer, intent(in) :: Ai(:), Aj(:) +real(dp), intent(in) :: Ax(:) +real(dp), intent(out) :: B(:, :) +integer :: n +B = 0 +do n = 1, size(Ai) + B(Ai(n), Aj(n)) = B(Ai(n), Aj(n)) + Ax(n) +end do +end subroutine + +subroutine coo2csr(Ai, Aj, Ax, Bp, Bj, Bx) +! Converts from COO (Ai, Aj, Ax) into CSR (Bp, Bj, Bx) +! Row and column indices are *not* assumed to be ordered. +! Duplicate entries are carried over to the CSR representation. +integer, intent(in) :: Ai(:), Aj(:) +real(dp), intent(in) :: Ax(:) +integer, intent(out) :: Bp(:), Bj(:) +real(dp), intent(out) :: Bx(:) +integer :: n, i, n_row, nnz, cumsum, temp, row, dest +n_row = size(Bp)-1 +nnz = size(Ai) +Bp = 0 +forall(n = 1:nnz) Bp(Ai(n)) = Bp(Ai(n)) + 1 +cumsum = 1 +do i = 1, n_row + temp = Bp(i) + Bp(i) = cumsum + cumsum = cumsum + temp +end do +do n = 1, nnz + row = Ai(n) + dest = Bp(row) + Bj(dest) = Aj(n) + Bx(dest) = Ax(n) + Bp(row) = Bp(row) + 1 +end do +Bp(2:) = Bp(:n_row) +Bp(1) = 1 +end subroutine + +subroutine coo2csc(Ai, Aj, Ax, Bp, Bi, Bx) +! Converts from COO (Ai, Aj, Ax) into CSC (Bp, Bi, Bx) +! Row and column indices are *not* assumed to be ordered. +! Duplicate entries are carried over to the CSC representation. +integer, intent(in) :: Ai(:), Aj(:) +real(dp), intent(in) :: Ax(:) +integer, intent(out) :: Bp(:), Bi(:) +real(dp), intent(out) :: Bx(:) +! Calculate CSR of the transposed matrix: +call coo2csr(Aj, Ai, Ax, Bp, Bi, Bx) +end subroutine + +subroutine coo2csr_canonical(Ai, Aj, Ax, Bp, Bj, Bx, verbose) +! Converts from COO (Ai, Aj, Ax) into CSR (Bp, Bj, Bx) +! Row and column indices are *not* assumed to be ordered. +! Duplicate entries are summed up and the indices are ordered. +integer, intent(in) :: Ai(:), Aj(:) +real(dp), intent(in) :: Ax(:) +integer, allocatable, intent(out) :: Bp(:), Bj(:) +real(dp), allocatable, intent(out) :: Bx(:) +logical, optional, intent(in) :: verbose +integer :: Bj_(size(Ai)) +real(dp) :: Bx_(size(Ai)) +integer :: nnz +logical :: verbose_ +verbose_ = .false. +if (present(verbose)) verbose_ = verbose +allocate(Bp(maxval(Ai)+1)) +if (verbose_) print *, "coo2csr" +call coo2csr(Ai, Aj, Ax, Bp, Bj_, Bx_) +if (verbose_) print *, "csr_sort_indices" +call csr_sort_indices(Bp, Bj_, Bx_) +if (verbose_) print *, "csr_sum_duplicates" +call csr_sum_duplicates(Bp, Bj_, Bx_) +if (verbose_) print *, "done" +nnz = Bp(size(Bp))-1 +allocate(Bj(nnz), Bx(nnz)) +Bj = Bj_(:nnz) +Bx = Bx_(:nnz) +end subroutine + +! CSR + +logical function csr_has_canonical_format(Ap, Aj) result(r) +! Determine whether the matrix structure is canonical CSR. +! Canonical CSR implies that column indices within each row +! are (1) sorted and (2) unique. Matrices that meet these +! conditions facilitate faster matrix computations. +integer, intent(in) :: Ap(:), Aj(:) +integer :: i, j +r = .false. +do i = 1, size(Ap)-1 + if (Ap(i) > Ap(i+1)) return + do j = Ap(i)+1, Ap(i+1)-1 + if (Aj(j-1) >= Aj(j)) return + end do +end do +r = .true. +end function + +subroutine csr_sum_duplicates(Ap, Aj, Ax) +! Sum together duplicate column entries in each row of CSR matrix A +! The column indicies within each row must be in sorted order. +! Explicit zeros are retained. +! Ap, Aj, and Ax will be modified *inplace* +integer, intent(inout) :: Ap(:), Aj(:) +real(dp), intent(inout) :: Ax(:) +integer :: nnz, r1, r2, i, j, jj +real(dp) :: x +nnz = 1 +r2 = 1 +do i = 1, size(Ap) - 1 + r1 = r2 + r2 = Ap(i+1) + jj = r1 + do while (jj < r2) + j = Aj(jj) + x = Ax(jj) + jj = jj + 1 + do while (jj < r2) + if (Aj(jj) == j) then + x = x + Ax(jj) + jj = jj + 1 + else + exit + end if + end do + Aj(nnz) = j + Ax(nnz) = x + nnz = nnz + 1 + end do + Ap(i+1) = nnz +end do +end subroutine + +subroutine csr_sort_indices(Ap, Aj, Ax) +! Sort CSR column indices inplace +integer, intent(inout) :: Ap(:), Aj(:) +real(dp), intent(inout) :: Ax(:) +integer :: i, r1, r2, l, idx(size(Aj)) +do i = 1, size(Ap)-1 + r1 = Ap(i) + r2 = Ap(i+1)-1 + l = r2-r1+1 + idx(:l) = iargsort_quicksort(Aj(r1:r2)) + Aj(r1:r2) = Aj(r1+idx(:l)-1) + Ax(r1:r2) = Ax(r1+idx(:l)-1) +end do +end subroutine + +function csr_matvec(Ap, Aj, Ax, x) result(y) +! Compute y = A*x for CSR matrix A and dense vectors x, y +integer, intent(in) :: Ap(:), Aj(:) +real(dp), intent(in) :: Ax(:), x(:) +real(dp) :: y(size(Ap)-1) +integer :: i +!$omp parallel default(none) shared(Ap, Aj, Ax, x, y) private(i) +!$omp do +do i = 1, size(Ap)-1 + y(i) = dot_product(Ax(Ap(i):Ap(i+1)-1), x(Aj(Ap(i):Ap(i+1)-1))) +end do +!$omp end do +!$omp end parallel +end function + +integer function lower_bound(A, val) result(i) +! Returns the lowest index "i" into the sorted array A so that A(i) >= val +! It uses bisection. +integer, intent(in) :: A(:), val +integer :: l, idx +if (A(1) >= val) then + i = 1 + return +end if +if (A(size(A)) < val) then + i = size(A)+1 + return +end if +l = 1 +i = size(A) +! Now we always have A(l) < val; A(i) >= val and we must make sure that "i" is +! the lowest possible such index. +do while (l + 1 < i) + idx = (l+i) / 2 + if (A(idx) < val) then + l = idx + else + i = idx + end if +end do +end function + + +real(dp) function csr_getvalue(Ap, Aj, Ax, i, j) result(r) +! Returns A(i, j) where the matrix A is given in the CSR format using +! (Ap, Aj, Ax) triple. Assumes A to be in canonical CSR format. +integer, intent(in) :: Ap(:), Aj(:) +real(dp), intent(in) :: Ax(:) +integer, intent(in) :: i, j +integer :: row_start, row_end, offset +row_start = Ap(i) +row_end = Ap(i+1)-1 +offset = lower_bound(Aj(row_start:row_end), j) + row_start - 1 +if (offset <= row_end) then + if (Aj(offset) == j) then + r = Ax(offset) + return + end if +end if +r = 0 +end function + +pure elemental subroutine swap_int(x,y) + integer, intent(in out) :: x,y + integer :: z + z = x + x = y + y = z +end subroutine + +pure subroutine interchange_sort_map_int(vec,map) + integer, intent(in out) :: vec(:) + integer, intent(in out) :: map(:) + integer :: i,j + do i = 1,size(vec) - 1 + j = minloc(vec(i:),1) + if (j > 1) then + call swap_int(vec(i),vec(i + j - 1)) + call swap_int(map(i),map(i + j - 1)) + end if + end do +end subroutine + +pure function iargsort_quicksort(vec_) result(map) + integer, intent(in) :: vec_(:) + integer :: map(size(vec_)) + integer, parameter :: levels = 300 + integer, parameter :: max_interchange_sort_size = 20 + integer :: i,left,right,l_bound(levels),u_bound(levels) + integer :: pivot + integer :: vec(size(vec_)) + + vec = vec_ + + forall(i=1:size(vec)) map(i) = i + + l_bound(1) = 1 + u_bound(1) = size(vec) + i = 1 + do while(i >= 1) + left = l_bound(i) + right = u_bound(i) + if (right - left < max_interchange_sort_size) then + if (left < right) call interchange_sort_map_int(vec(left:right),map(left:right)) + i = i - 1 + else + pivot = (vec(left) + vec(right)) / 2 + left = left - 1 + right = right + 1 + do + do + left = left + 1 + if (vec(left) >= pivot) exit + end do + do + right = right - 1 + if (vec(right) <= pivot) exit + end do + if (left < right) then + call swap_int(vec(left),vec(right)) + call swap_int(map(left),map(right)) + elseif(left == right) then + if (left == l_bound(i)) then + left = left + 1 + else + right = right - 1 + end if + exit + else + exit + end if + end do + u_bound(i + 1) = u_bound(i) + l_bound(i + 1) = left + u_bound(i) = right + i = i + 1 + end if + end do +end function + +end module diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 593d261b6..bc1cfe490 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(ascii) add_subdirectory(io) add_subdirectory(linalg) add_subdirectory(optval) +add_subdirectory(sparse) add_subdirectory(stats) add_subdirectory(system) add_subdirectory(quadrature) diff --git a/src/tests/sparse/CMakeLists.txt b/src/tests/sparse/CMakeLists.txt new file mode 100644 index 000000000..603bf22ca --- /dev/null +++ b/src/tests/sparse/CMakeLists.txt @@ -0,0 +1 @@ +ADDTEST(sparse) diff --git a/src/tests/sparse/test_sparse.f90 b/src/tests/sparse/test_sparse.f90 new file mode 100644 index 000000000..84539d90d --- /dev/null +++ b/src/tests/sparse/test_sparse.f90 @@ -0,0 +1,117 @@ +program test_sparse +use stdlib_experimental_kinds, only: dp +use stdlib_experimental_sparse, only: coo2dense, dense2coo, getnnz, coo2csr, & + csr_has_canonical_format, csr_sum_duplicates, csr_sort_indices, & + coo2csr_canonical, csr_matvec, csr_getvalue +use stdlib_experimental_error, only: check +implicit none + +real(dp), allocatable :: A(:, :), B(:, :), Ax(:), Bx(:), x(:), y(:) +integer, allocatable :: Ai(:), Aj(:), Bp(:), Bj(:) +integer :: nnz + +! Tests on square matrix for: +! * dense <-> COO +! * COO -> CSR +allocate(A(4, 4), B(4, 4)) +A = reshape([1, 7, 0, 0, & + 0, 2, 8, 0, & + 5, 0, 3, 9, & + 0, 6, 0, 4], [4, 4], order=[2, 1]) +nnz = getnnz(A) +allocate(Ai(nnz), Aj(nnz), Ax(nnz)) +call dense2coo(A, Ai, Aj, Ax) +call check(all(Ai == [1, 3, 1, 2, 4, 2, 3, 3, 4])) +call check(all(Aj == [1, 1, 2, 2, 2, 3, 3, 4, 4])) +call check(all(abs(Ax - [1, 5, 7, 2, 6, 8, 3, 9, 4]) < 1e-12_dp)) +call coo2dense(Ai, Aj, Ax, B) +call check(all(abs(A-B) < 1e-12_dp)) +allocate(Bp(maxval(Ai)+1), Bj(size(Ai)), Bx(size(Ai))) +call coo2csr(Ai, Aj, Ax, Bp, Bj, Bx) +call check(all(Bp-1 == [0, 2, 4, 7, 9])) +call check(all(Bj-1 == [0, 1, 1, 2, 0, 2, 3, 1, 3])) +call check(all(abs(Bx - [1, 7, 2, 8, 5, 3, 9, 6, 4]) < 1e-12_dp)) +call check(csr_has_canonical_format(Bp, Bj)) +deallocate(A, B, Ai, Aj, Ax, Bp, Bj, Bx) + + +! Tests on rectangular matrix for: +! * dense <-> COO +! * COO -> CSR +allocate(A(4, 5), B(4, 5)) +A = reshape([1, 7, 0, 0, 1, & + 0, 2, 8, 0, 2, & + 5, 0, 3, 9, 3, & + 0, 6, 0, 4, 4], [4, 5], order=[2, 1]) +nnz = getnnz(A) +allocate(Ai(nnz), Aj(nnz), Ax(nnz)) +call dense2coo(A, Ai, Aj, Ax) +call check(all(Ai == [1, 3, 1, 2, 4, 2, 3, 3, 4, 1, 2, 3, 4])) +call check(all(Aj == [1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5])) +call check(all(abs(Ax - [1, 5, 7, 2, 6, 8, 3, 9, 4, 1, 2, 3, 4]) < 1e-12_dp)) +call coo2dense(Ai, Aj, Ax, B) +call check(all(abs(A-B) < 1e-12_dp)) +allocate(Bp(maxval(Ai)+1), Bj(size(Ai)), Bx(size(Ai))) +call coo2csr(Ai, Aj, Ax, Bp, Bj, Bx) +call check(all(Bp == [1, 4, 7, 11, 14])) +call check(all(Bj == [1, 2, 5, 2, 3, 5, 1, 3, 4, 5, 2, 4, 5])) +call check(all(abs(Bx - [1, 7, 1, 2, 8, 2, 5, 3, 9, 3, 6, 4, 4]) < 1e-12_dp)) +call check(csr_has_canonical_format(Bp, Bj)) +allocate(x(5), y(4)) +x = [1, 2, 3, 4, 5] +y = csr_matvec(Bp, Bj, Bx, x) +call check(all(abs(y-matmul(A, x)) < 1e-12_dp)) +deallocate(A, B, Ai, Aj, Ax, Bp, Bj, Bx, x, y) + + +! Tests on rectangular matrix for: +! * COO -> dense +! * COO -> CSR +allocate(Ai(6), Aj(6), Ax(6), A(3, 4), B(3, 4)) +Ai = [1, 2, 3, 1, 2, 2] +Aj = [1, 3, 2, 1, 4, 3] +Ax = [1, 2, 3, 4, 1, 5] +call coo2dense(Ai, Aj, Ax, B) +A = reshape([5, 0, 0, 0, & + 0, 0, 7, 1, & + 0, 3, 0, 0], [3, 4], order=[2, 1]) +call check(all(abs(A-B) < 1e-12_dp)) +allocate(Bp(maxval(Ai)+1), Bj(size(Ai)), Bx(size(Ai))) +call coo2csr(Ai, Aj, Ax, Bp, Bj, Bx) +call check(all(Bp == [1, 3, 6, 7])) +call check(all(Bj == [1, 1, 3, 4, 3, 2])) +call check(all(abs(Bx - [1, 4, 2, 1, 5, 3]) < 1e-12_dp)) +call check(.not. csr_has_canonical_format(Bp, Bj)) +call csr_sort_indices(Bp, Bj, Bx) +call check(all(Bp == [1, 3, 6, 7])) +call check(all(Bj == [1, 1, 3, 3, 4, 2])) +call check(all(abs(Bx - [1, 4, 2, 5, 1, 3]) < 1e-12_dp)) +call csr_sum_duplicates(Bp, Bj, Bx) +nnz = Bp(size(Bp))-1 +call check(all(Bp == [1, 2, 4, 5])) +call check(all(Bj(:nnz) == [1, 3, 4, 2])) +call check(all(abs(Bx(:nnz) - [5, 7, 1, 3]) < 1e-12_dp)) +call check(csr_has_canonical_format(Bp, Bj(:nnz))) + +call coo2csr_canonical(Ai, Aj, Ax, Bp, Bj, Bx) +call check(all(Bp == [1, 2, 4, 5])) +call check(all(Bj == [1, 3, 4, 2])) +call check(all(abs(Bx - [5, 7, 1, 3]) < 1e-12_dp)) +call check(csr_has_canonical_format(Bp, Bj)) + +call check(abs(csr_getvalue(Bp, Bj, Bx, 1, 1) - 5) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 1, 2) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 1, 3) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 1, 4) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 2, 1) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 2, 2) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 2, 3) - 7) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 2, 4) - 1) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 3, 1) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 3, 2) - 3) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 3, 3) - 0) < tiny(1._dp)) +call check(abs(csr_getvalue(Bp, Bj, Bx, 3, 4) - 0) < tiny(1._dp)) + +deallocate(A, B, Ai, Aj, Ax, Bp, Bj, Bx) + +end