Merge pull request #58 from Forceflow/develop

Forceflow · web-flow · commit 4629fcb68d3d · 2020-11-12T17:24:14.000+01:00
Pull 0.2.6 from develop
diff --git a/.gitignore b/.gitignore
@@ -156,3 +156,5 @@ $RECYCLE.BIN/
 
 # Mac desktop service store files
 .DS_Store
+*.db-shm
+*.db-wal
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Libmorton v0.2.5
+# Libmorton v0.2.6
 [![Build Status](https://travis-ci.org/Forceflow/libmorton.svg?branch=master)](https://travis-ci.org/Forceflow/libmorton) [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT) [![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.me/forceflow)
 
  * Libmorton is a **C++ header-only library** with methods to efficiently encode/decode 64, 32 and 16-bit Morton codes and coordinates, in 2D and 3D. *Morton order* is also known as *Z-order* or *[the Z-order curve](https://en.wikipedia.org/wiki/Z-order_curve)*.
@@ -21,13 +21,16 @@ inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uin
 inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z);
 </pre>
 
+## Installation
+No installation is required: just download the headers and include them. Libmorton is also packaged for [Microsoft's VCPKG system](https://github.com/Microsoft/vcpkg), if you prefer a more controlled environment for installing C++ packages.
+
 ## Instruction sets
 In the standard case, libmorton only uses operations that are supported on pretty much any CPU you can throw it at. If you know you're compiling for a specific architecture, you might gain a speed boost in encoding/decoding operations by enabling implementations for a specific instruction set. Libmorton ships with support for:
- * **BMI2 instruction set**: Intel Haswell CPU's and newer. Define `__BMI2__` before including `morton.h`. This is definitely a faster method when compared to the standard case.
+ * **BMI2 instruction set**: Intel: Haswell CPUs and newer. AMD: Ryzen CPUs and newer. Define `__BMI2__` before including `morton.h`. This is definitely a faster method when compared to the standard case.
  * **AVX512 instruction set (experimental)**: Intel Ice Lake CPU's and newer. Uses `_mm512_bitshuffle_epi64_mask`. Define `__AVX512BITALG__` before including `morton.h`. For more info on performance, see [this PR](https://github.com/Forceflow/libmorton/pull/40).
-
-## Installation
-No installation is required (just download the headers and include them), but I was informed libmorton is packaged for [Microsoft's VCPKG system](https://github.com/Microsoft/vcpkg) as well, if you want a more controlled environment to install C++ packages in.
+ 
+When using MSVC, these options can be found under _Project Properties -> C/C++ -> Code Generation -> Enable Enhanced Instruction Set_ (libmorton enables its BMI2 code path on MSVC when AVX2 is selected).
+When using GCC (version 9.0 or higher), you can use `-march=haswell` (or `-march=znver2`) for BMI2 support and `-march=icelake-client` for AVX512 support.
 
 ## Testing
 The `test` folder contains tools I use to test correctness and performance of the libmorton implementation. You can regard them as unit tests. This section is under heavy re-writing, but might contain some useful code for advanced usage. 
diff --git a/libmorton/include/morton.h b/libmorton/include/morton.h
@@ -9,11 +9,9 @@
 #include "morton2D.h"
 #include "morton3D.h"
 
-#if defined(__BMI2__) || defined(__AVX2__)
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
 #include "morton_BMI.h"
-#endif
-
-#if defined(__AVX512BITALG__)
+#elif defined(__AVX512BITALG__)
 #include "morton_AVX512BITALG.h"
 #endif
 
@@ -22,31 +20,31 @@ namespace libmorton {
 	//-----------------------------------------------------------------------------------------------
 
 	// ENCODING
-#if defined(__AVX512BITALG__)
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
 	inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
-		return m2D_e_BITALG<uint_fast32_t, uint_fast16_t>(x, y);
+		return m2D_e_BMI<uint_fast32_t, uint_fast16_t>(x, y);
 	}
 	inline uint_fast64_t morton2D_64_encode(const uint_fast32_t x, const uint_fast32_t y) {
-		return m2D_e_BITALG<uint_fast64_t, uint_fast32_t>(x, y);
+		return m2D_e_BMI<uint_fast64_t, uint_fast32_t>(x, y);
 	}
 	inline uint_fast32_t morton3D_32_encode(const uint_fast16_t x, const uint_fast16_t y, const uint_fast16_t z) {
-		return m3D_e_BITALG<uint_fast32_t, uint_fast16_t>(x, y, z);
+		return m3D_e_BMI<uint_fast32_t, uint_fast16_t>(x, y, z);
 	}
 	inline uint_fast64_t morton3D_64_encode(const uint_fast32_t x, const uint_fast32_t y, const uint_fast32_t z) {
-		return m3D_e_BITALG<uint_fast64_t, uint_fast32_t>(x, y, z);
+		return m3D_e_BMI<uint_fast64_t, uint_fast32_t>(x, y, z);
 	}
-#elif defined(__BMI2__) || defined(__AVX2__)
+#elif defined(__AVX512BITALG__)
 	inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
-		return m2D_e_BMI<uint_fast32_t, uint_fast16_t>(x, y);
+		return m2D_e_BITALG<uint_fast32_t, uint_fast16_t>(x, y);
 	}
 	inline uint_fast64_t morton2D_64_encode(const uint_fast32_t x, const uint_fast32_t y) {
-		return m2D_e_BMI<uint_fast64_t, uint_fast32_t>(x, y);
+		return m2D_e_BITALG<uint_fast64_t, uint_fast32_t>(x, y);
 	}
 	inline uint_fast32_t morton3D_32_encode(const uint_fast16_t x, const uint_fast16_t y, const uint_fast16_t z) {
-		return m3D_e_BMI<uint_fast32_t, uint_fast16_t>(x, y, z);
+		return m3D_e_BITALG<uint_fast32_t, uint_fast16_t>(x, y, z);
 	}
 	inline uint_fast64_t morton3D_64_encode(const uint_fast32_t x, const uint_fast32_t y, const uint_fast32_t z) {
-		return m3D_e_BMI<uint_fast64_t, uint_fast32_t>(x, y, z);
+		return m3D_e_BITALG<uint_fast64_t, uint_fast32_t>(x, y, z);
 	}
 #else
 	inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
@@ -64,32 +62,33 @@ namespace libmorton {
 #endif
 
 	// DECODING
-#if defined(__AVX512BITALG__)
+
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
 	inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
-		m2D_d_BITALG<uint_fast32_t, uint_fast16_t>(morton, x, y);
+		m2D_d_BMI<uint_fast32_t, uint_fast16_t>(morton, x, y);
 	}
 	inline void morton2D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y) {
-		m2D_d_BITALG<uint_fast64_t, uint_fast32_t>(morton, x, y);
+		m2D_d_BMI<uint_fast64_t, uint_fast32_t>(morton, x, y);
 	}
 	inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y, uint_fast16_t& z) {
-		m3D_d_BITALG<uint_fast32_t, uint_fast16_t>(morton, x, y, z);
+		m3D_d_BMI<uint_fast32_t, uint_fast16_t>(morton, x, y, z);
 	}
 	inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z) {
-		m3D_d_BITALG<uint_fast64_t, uint_fast32_t>(morton, x, y, z);
+		m3D_d_BMI<uint_fast64_t, uint_fast32_t>(morton, x, y, z);
 	}
-#elif defined(__BMI2__) || defined(__AVX2__)
+#elif defined(__AVX512BITALG__)
 	inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
-		m2D_d_BMI<uint_fast32_t, uint_fast16_t>(morton, x, y);
+		m2D_d_BITALG<uint_fast32_t, uint_fast16_t>(morton, x, y);
 	}
 	inline void morton2D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y) {
-		m2D_d_BMI<uint_fast64_t, uint_fast32_t>(morton, x, y);
+		m2D_d_BITALG<uint_fast64_t, uint_fast32_t>(morton, x, y);
 	}
 	inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y, uint_fast16_t& z) {
-		m3D_d_BMI<uint_fast32_t, uint_fast16_t>(morton, x, y, z);
+		m3D_d_BITALG<uint_fast32_t, uint_fast16_t>(morton, x, y, z);
 	}
 	inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z) {
-		m3D_d_BMI<uint_fast64_t, uint_fast32_t>(morton, x, y, z);
-	}
+		m3D_d_BITALG<uint_fast64_t, uint_fast32_t>(morton, x, y, z);
+}
 #else
 	inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
 		m2D_d_sLUT<uint_fast32_t, uint_fast16_t>(morton, x, y);
diff --git a/libmorton/include/morton_AVX512BITALG.h b/libmorton/include/morton_AVX512BITALG.h
@@ -1,4 +1,5 @@
 #pragma once
+#if defined(__AVX512BITALG__)
 #include <immintrin.h>
 #include <stdint.h>
 
@@ -218,4 +219,5 @@ namespace libmorton {
 	inline void m3D_d_BITALG(const morton m, coord& x, coord& y, coord& z) {
 		bitalg_detail::bitunzip3D(m, x, y, z);
 	}
-}
+}
+#endif
diff --git a/libmorton/include/morton_BMI.h b/libmorton/include/morton_BMI.h
@@ -1,5 +1,5 @@
 #pragma once
-#if defined(__BMI2__) || defined(__AVX2__)
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
 #include <immintrin.h>
 #include <stdint.h>
 
diff --git a/test/libmorton_test.cpp b/test/libmorton_test.cpp
@@ -145,7 +145,7 @@ void registerFunctions() {
 	f3D_32_decode.push_back(decode_3D_32_wrapper("LUT Shifted ET", &m3D_d_sLUT_ET<uint_fast32_t, uint_fast16_t>));
 
 	// Register 3D BMI intrinsics if available
-#if defined(__BMI2__)
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
 	f3D_64_encode.push_back(encode_3D_64_wrapper("BMI2 instruction set", &m3D_e_BMI<uint_fast64_t, uint_fast32_t>));
 	f3D_32_encode.push_back(encode_3D_32_wrapper("BMI2 instruction set", &m3D_e_BMI<uint_fast32_t, uint_fast16_t>));
 	f3D_64_decode.push_back(decode_3D_64_wrapper("BMI2 Instruction set", &m3D_d_BMI<uint_fast64_t, uint_fast32_t>));
diff --git a/test/libmorton_test.h b/test/libmorton_test.h
@@ -22,6 +22,14 @@
 #include "morton_LUT_generators.h"
 #include "../libmorton/include/morton2D.h"
 #include "../libmorton/include/morton3D.h"
+#if defined(__BMI2__) || (defined(__AVX2__) && defined(_MSC_VER))
+#include "morton_BMI.h"
+#endif
+#if defined(__AVX512BITALG__)
+#include "morton_AVX512BITALG.h"
+#endif
+
+// Load main morton include file (should be unnecessary)
 #include "../libmorton/include/morton.h"
 
 using std::string;
diff --git a/test/makefile b/test/makefile
@@ -14,3 +14,6 @@ avx512:
 
 clean:
 	rm -f libmorton_test libmorton_test_bmi2 libmorton_test_avx512
+
+zen2:
+	$(CXX) $(CFLAGS) -march=znver2 libmorton_test.cpp -o libmorton_test_zen2