Skip to content

Commit 500e6ed

Browse files
authored
Merge branch 'develop' into topic/sgemm_direct_sme1
2 parents 5a83107 + a64b75a commit 500e6ed

File tree

205 files changed

+9176
-4814
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

205 files changed

+9176
-4814
lines changed

.cirrus.yml

+5-5
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,9 @@ task:
125125
- make USE_OPENMP=1
126126

127127
FreeBSD_task:
128-
name: FreeBSD-gcc12
128+
name: FreeBSD-gcc
129129
freebsd_instance:
130-
image_family: freebsd-13-3
130+
image_family: freebsd-14-2
131131
install_script:
132132
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
133133
compile_script:
@@ -136,9 +136,9 @@ FreeBSD_task:
136136

137137

138138
FreeBSD_task:
139-
name: freebsd-gcc12-ilp64
139+
name: freebsd-gcc-ilp64
140140
freebsd_instance:
141-
image_family: freebsd-13-3
141+
image_family: freebsd-14-2
142142
install_script:
143143
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
144144
compile_script:
@@ -148,7 +148,7 @@ FreeBSD_task:
148148
FreeBSD_task:
149149
name: FreeBSD-clang-openmp
150150
freebsd_instance:
151-
image_family: freebsd-13-3
151+
image_family: freebsd-14-2
152152
install_script:
153153
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
154154
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so

.github/workflows/c910v.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
run: |
3838
sudo apt-get update
3939
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
40-
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
40+
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev
4141
4242
- name: checkout qemu
4343
uses: actions/checkout@v3
@@ -52,6 +52,7 @@ jobs:
5252
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
5353
cd qemu
5454
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
55+
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
5556
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
5657
make -j$(nproc)
5758
make install

.github/workflows/codspeed-bench.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
strategy:
1616
fail-fast: false
1717
matrix:
18-
os: [ubuntu-latest]
18+
os: [ubuntu-22.04]
1919
fortran: [gfortran]
2020
build: [make]
2121
pyver: ["3.12"]
@@ -147,7 +147,7 @@ jobs:
147147
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
148148
149149
- name: Run benchmarks
150-
uses: CodSpeedHQ/action@v2
150+
uses: CodSpeedHQ/action@v3
151151
with:
152152
token: ${{ secrets.CODSPEED_TOKEN }}
153153
run: |

.github/workflows/docs.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
python-version: "3.10"
2424

2525
- name: Install MkDocs and doc theme packages
26-
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
26+
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin
2727

2828
- name: Build docs site
2929
run: mkdocs build

.github/workflows/dynamic_arch.yml

+24-17
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ jobs:
4343
run: |
4444
if [ "$RUNNER_OS" == "Linux" ]; then
4545
sudo apt-get update
46-
sudo apt-get install -y gfortran cmake ccache libtinfo5
46+
sudo apt-get install -y gfortran cmake ccache
47+
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
48+
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
4749
elif [ "$RUNNER_OS" == "macOS" ]; then
4850
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
4951
brew reinstall gcc
@@ -158,7 +160,7 @@ jobs:
158160
strategy:
159161
fail-fast: false
160162
matrix:
161-
msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
163+
msystem: [UCRT64, MINGW32, CLANG64]
162164
idx: [int32, int64]
163165
build-type: [Release]
164166
include:
@@ -174,14 +176,6 @@ jobs:
174176
idx: int32
175177
target-prefix: mingw-w64-clang-x86_64
176178
fc-pkg: fc
177-
# Compiling with Flang 16 seems to cause test errors on machines
178-
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
179-
no-avx512-flags: -DNO_AVX512=1
180-
- msystem: CLANG32
181-
idx: int32
182-
target-prefix: mingw-w64-clang-i686
183-
fc-pkg: cc
184-
c-lapack-flags: -DC_LAPACK=ON
185179
- msystem: UCRT64
186180
idx: int64
187181
idx64-flags: -DBINARY=64 -DINTERFACE64=1
@@ -192,9 +186,6 @@ jobs:
192186
idx64-flags: -DBINARY=64 -DINTERFACE64=1
193187
target-prefix: mingw-w64-clang-x86_64
194188
fc-pkg: fc
195-
# Compiling with Flang 16 seems to cause test errors on machines
196-
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
197-
no-avx512-flags: -DNO_AVX512=1
198189
- msystem: UCRT64
199190
idx: int32
200191
target-prefix: mingw-w64-ucrt-x86_64
@@ -203,8 +194,6 @@ jobs:
203194
exclude:
204195
- msystem: MINGW32
205196
idx: int64
206-
- msystem: CLANG32
207-
idx: int64
208197

209198
defaults:
210199
run:
@@ -280,8 +269,6 @@ jobs:
280269
-DNUM_THREADS=64 \
281270
-DTARGET=CORE2 \
282271
${{ matrix.idx64-flags }} \
283-
${{ matrix.c-lapack-flags }} \
284-
${{ matrix.no-avx512-flags }} \
285272
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
286273
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
287274
..
@@ -369,3 +356,23 @@ jobs:
369356
- name: Build OpenBLAS
370357
run: |
371358
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }}
359+
360+
neoverse_build:
361+
if: "github.repository == 'OpenMathLib/OpenBLAS'"
362+
runs-on: ubuntu-24.04-arm
363+
364+
steps:
365+
- name: Checkout repository
366+
uses: actions/checkout@v3
367+
368+
- name: Install Dependencies
369+
run: |
370+
sudo apt-get update
371+
sudo apt-get install -y gcc gfortran make
372+
373+
- name: Build OpenBLAS
374+
run: |
375+
make -j${nproc} TARGET=NEOVERSEN2
376+
make -j${nproc} TARGET=NEOVERSEN2 lapack-test
377+
378+

.github/workflows/harmonyos.yml

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: harmonyos
2+
3+
on: [push, pull_request]
4+
5+
concurrency:
6+
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
7+
cancel-in-progress: true
8+
9+
permissions:
10+
contents: read # to fetch code (actions/checkout)
11+
12+
jobs:
13+
build:
14+
if: "github.repository == 'OpenMathLib/OpenBLAS'"
15+
runs-on: ubuntu-latest
16+
env:
17+
OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake
18+
COMMON_CMAKE_OPTIONS: |
19+
-DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
20+
-DCMAKE_INSTALL_PREFIX=install \
21+
-DCMAKE_BUILD_TYPE=Release \
22+
steps:
23+
- uses: actions/checkout@v4
24+
- name: ndk-install
25+
run: |
26+
wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz
27+
tar -xf ohos-sdk-windows_linux-public.tar.gz
28+
cd ohos-sdk/linux
29+
unzip -q native-linux-x64-4.1.7.8-Release.zip
30+
cd -
31+
- name: build-armv8
32+
run: |
33+
mkdir build && cd build
34+
${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \
35+
-DTARGET=ARMV8 -DNOFORTRAN=1 ..
36+
${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc)
37+

.github/workflows/loongarch64_clang.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jobs:
4141
- name: Install APT deps
4242
run: |
4343
sudo apt-get update
44-
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
44+
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev
4545
4646
- name: Download and install loongarch64-toolchain
4747
run: |

.github/workflows/mips64.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ jobs:
4141
run: |
4242
sudo apt-get update
4343
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
44-
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
44+
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev
4545
4646
- name: checkout qemu
4747
uses: actions/checkout@v3
4848
with:
4949
repository: qemu/qemu
5050
path: qemu
51-
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
51+
ref: ae35f033b874c627d81d51070187fbf55f0bf1a7
5252

5353
- name: build qemu
5454
run: |

CONTRIBUTORS.md

+13
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,16 @@ In chronological order:
229229

230230
* Christopher Daley <https://github.com/cdaley>
231231
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems
232+
233+
* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
234+
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE
235+
236+
* Annop Wongwathanarat <[email protected]>
237+
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1
238+
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
239+
240+
* Marek Michalowski <https://github.com/michalowski-arm>
241+
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`
242+
243+
244+
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1

Changelog.txt

+95
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,99 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.29
4+
12-Jan-2025
5+
6+
general:
7+
- fixed a potential NULL pointer dereference in multithreaded builds
8+
- added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS
9+
- fixed a build failure when building without LAPACK_DEPRECATED functions
10+
- the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order
11+
to remove many compatibility and deprecation warnings
12+
- added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM)
13+
- fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data
14+
- improved thread scaling of multithreaded SBGEMV
15+
- improved thread scaling of multithreaded TRTRI
16+
- fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler)
17+
- added support for option handling changes in flang-new from LLVM18 onwards
18+
- added support for recent calling conventions changes in Cray and NVIDIA compilers
19+
- added support for compilation with the NAG Fortran compiler
20+
- fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file
21+
- improved the CMakeConfig file generated by the Makefile build
22+
- fixed const-correctness of cblas_?geadd in cblas.h
23+
- fixed a potential inaccuracy in multithreaded BLAS3 calls
24+
- fixed empty implementations of get/set_affinity that print a warning in OpenMP builds
25+
- fixed function signatures for TRTRS in the converted C version of LAPACK
26+
- fixed omission of several single-precision LAPACK symbols in the shared library
27+
- improved build instructions for the provided "pybench" benchmarks
28+
- improved documentation, including added build instructions for WoA and HarmonyOS
29+
as well as descriptions of environment variables that affect build and runtime behavior
30+
- added a separate "make install_tests" target for use with cross-compilations
31+
- integrated improvements and corrections from Reference-LAPACK:
32+
- removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062)
33+
- fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064)
34+
- replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080)
35+
36+
arm:
37+
- fixed build with recent versions of the NDK (missing .type declaration of symbols)
38+
39+
arm64:
40+
- fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to
41+
reads and writes outside the array bounds in some circumstances
42+
- rewrote cpu autodetection to scan all cores and return the highest performing type
43+
- improved the DGEMM performance for SVE targets and small matrix sizes
44+
- improved dimension criteria for forwarding from GEMM to GEMV kernels
45+
- added SVE kernels for ROT and SWAP
46+
- improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1
47+
- added support for using the "small matrix" kernels with CMake as well
48+
- fixed compilation on Windows on Arm
49+
- improved compile-time detection of SVE capability
50+
- added cpu autodetection and initial support for Apple M4
51+
- added support for compilation on systems running IOS
52+
- added support for compilation on NetBSD ("evbarm" architecture)
53+
- fixed NRM2 implementations for generic SVE targets and the Neoverse N2
54+
- fixed compilation for SVE-capable targets with the NVIDIA compiler
55+
56+
x86_64:
57+
- fixed a wrong storage size in the SBGEMV kernel for Cooper Lake
58+
- added cpu autodetection for Intel Granite Rapids
59+
- added cpu autodetection for AMD Ryzen 5 series
60+
- added optimized SOMATCOPY_CT for AVX-capable targets
61+
- fixed the fallback implementation of GEMM3M in GENERIC builds
62+
- tentatively re-enabled builds with the EXPRECISION option
63+
- worked around a miscompilation of tests with mingw32-gfortran14
64+
- added support for compilation with the Intel oneAPI 2025.0 compiler on Windows
65+
66+
power:
67+
- fixed multithreaded SBGEMM
68+
- fixed a CMake build problem on POWER10
69+
- improved the performance of SGEMV
70+
- added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them
71+
- fixed illegal instructions and potential memory overflow in SGEMM on PPCG4
72+
- fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440, G4 and 970
73+
- added improved CGEMM and ZGEMM kernels for POWER10
74+
- added Makefile logic to remove all optimization flags in DEBUG builds
75+
76+
mips64:
77+
- fixed compilation with gcc14
78+
- fixed GEMM parameter selection for the MIPS64_GENERIC target
79+
- fixed a potential build failure when compiling with OpenMP
80+
81+
loongarch64:
82+
- fixed compilation for Loongson3 with recent versions of gmake
83+
- fixed a potential loss of precision in Loongson3A GEMM
84+
- fixed a potential build failure when compiling with OpenMP
85+
- added optimized SOMATCOPY for LASX-capable targets
86+
- introduced a new cpu naming scheme while retaining compatibility
87+
- added support for cross-compiling Loongarch64 targets with CMake
88+
- added support for compilation with LLVM
89+
90+
riscv64:
91+
- removed thread yielding overhead caused by sched_yield
92+
- replaced some non-standard intrinsics with their official names
93+
- fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lengths 128 and 256
94+
- improved the performance of SNRM2/DNRM2 for RVV1.0 targets
95+
- added optimized ?OMATCOPY_CN kernels for RVV1.0 targets
96+
297
====================================================================
398
Version 0.3.28
499
8-Aug-2024

Makefile

+3
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,9 @@ dummy :
426426
install :
427427
$(MAKE) -f Makefile.install install
428428

429+
install_tests :
430+
$(MAKE) -f Makefile.install install_tests
431+
429432
clean ::
430433
@for d in $(SUBDIRS_ALL) ; \
431434
do if test -d $$d; then \

0 commit comments

Comments
 (0)