From f8b738078a3d91152659dfd80de9792555a80b9f Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sat, 13 May 2017 16:25:35 -0400 Subject: [PATCH 1/9] Remove code that are not needed/will not make sense with function multi-versioning and improved cpu selection/detection. * Remove cpuid specific binary To be replaced by function multi-versioning * Remove old CPU target handling in sysimg The feature check is not complete since X86 features bits are also in many other CPUID results, including ones that we are actually interested in like AVX2. * Add invalid CPU name and help handling in codegen initialization. Update `--cpu-target` test since target mismatch will not raise an error anymore. CPU names unknown to LLVM would. --- Make.inc | 5 ----- Makefile | 15 ++------------- base/pkg/pkg.jl | 2 -- src/Makefile | 4 ---- src/codegen.cpp | 20 ++++++++++++++++++++ src/jitlayers.cpp | 29 ----------------------------- src/jloptions.c | 9 +-------- src/julia.h | 2 -- src/julia_internal.h | 4 ---- src/staticdata.c | 38 +++++++------------------------------- src/sys.c | 25 +------------------------ test/cmdlineargs.jl | 11 +++-------- ui/repl.c | 7 ------- 13 files changed, 34 insertions(+), 137 deletions(-) diff --git a/Make.inc b/Make.inc index aaba197352117..b07c6c4af6061 100644 --- a/Make.inc +++ b/Make.inc @@ -81,11 +81,6 @@ HAVE_SSP := 0 WITH_GC_VERIFY := 0 WITH_GC_DEBUG_ENV := 0 -# When set, give julia binaries CPUID specific names. This is useful in cluster environments -# with heterogeneous architectures. N.B.: will not be automatically rebuilt for all -# architectures if julia is updated. 
-CPUID_SPECIFIC_BINARIES ?= 0 - # Prevent picking up $ARCH from the environment variables ARCH:= diff --git a/Makefile b/Makefile index 9d180dd4a8274..cf996d492d715 100644 --- a/Makefile +++ b/Makefile @@ -102,17 +102,11 @@ julia-ui-release julia-ui-debug : julia-ui-% : julia-src-% julia-inference : julia-base julia-ui-$(JULIA_BUILD_MODE) $(build_prefix)/.examples @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT) $(build_private_libdir)/inference.ji JULIA_BUILD_MODE=$(JULIA_BUILD_MODE) -ifneq ($(CPUID_SPECIFIC_BINARIES), 0) -CPUID_TAG = _$(call exec,$(JULIA_EXECUTABLE) --cpuid) -else -CPUID_TAG = -endif - julia-sysimg-release : julia-inference julia-ui-release - @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT) $(build_private_libdir)/sys$(CPUID_TAG).$(SHLIB_EXT) JULIA_BUILD_MODE=release + @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT) $(build_private_libdir)/sys.$(SHLIB_EXT) JULIA_BUILD_MODE=release julia-sysimg-debug : julia-inference julia-ui-debug - @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT) $(build_private_libdir)/sys-debug$(CPUID_TAG).$(SHLIB_EXT) JULIA_BUILD_MODE=debug + @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT) $(build_private_libdir)/sys-debug.$(SHLIB_EXT) JULIA_BUILD_MODE=debug julia-debug julia-release : julia-% : julia-ui-% julia-sysimg-% julia-symlink julia-libccalltest @@ -229,13 +223,8 @@ $$(build_private_libdir)/sys$1.o: $$(build_private_libdir)/inference.ji $$(JULIA fi ) .SECONDARY: $(build_private_libdir)/sys$1.o endef -ifneq ($(CPUID_SPECIFIC_BINARIES),0) -$(eval $(call sysimg_builder,_%,-O3,$(JULIA_EXECUTABLE_release))) -$(eval $(call sysimg_builder,-debug_%,-O0,$(JULIA_EXECUTABLE_debug))) -else $(eval $(call sysimg_builder,,-O3,$(JULIA_EXECUTABLE_release))) $(eval $(call sysimg_builder,-debug,-O0,$(JULIA_EXECUTABLE_debug))) -endif $(build_depsbindir)/stringreplace: $(JULIAHOME)/contrib/stringreplace.c | $(build_depsbindir) @$(call PRINT_CC, $(HOSTCC) -o $(build_depsbindir)/stringreplace $(JULIAHOME)/contrib/stringreplace.c) diff --git a/base/pkg/pkg.jl b/base/pkg/pkg.jl index 
cb9df70635efb..69634126d3427 100644 --- a/base/pkg/pkg.jl +++ b/base/pkg/pkg.jl @@ -86,8 +86,6 @@ init(meta::AbstractString=DEFAULT_META, branch::AbstractString=META_BRANCH) = Di function __init__() vers = "v$(VERSION.major).$(VERSION.minor)" - vers = ccall(:jl_uses_cpuid_tag, Cint, ()) == 0 ? vers : - joinpath(vers,hex(ccall(:jl_cpuid_tag, UInt64, ()), 2*sizeof(UInt64))) unshift!(Base.LOAD_CACHE_PATH, abspath(Dir._pkgroot(), "lib", vers)) end diff --git a/src/Makefile b/src/Makefile index d3ff989009006..daefd063655a3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -109,10 +109,6 @@ SHIPFLAGS += $(FLAGS) SHIPFLAGS += "-DJL_SYSTEM_IMAGE_PATH=\"$(build_private_libdir_rel)/sys.$(SHLIB_EXT)\"" DEBUGFLAGS += "-DJL_SYSTEM_IMAGE_PATH=\"$(build_private_libdir_rel)/sys-debug.$(SHLIB_EXT)\"" -ifneq ($(CPUID_SPECIFIC_BINARIES), 0) -override CPPFLAGS += "-DCPUID_SPECIFIC_BINARIES=1" -endif - FLISP_EXECUTABLE_debug := $(BUILDDIR)/flisp/flisp-debug FLISP_EXECUTABLE_release := $(BUILDDIR)/flisp/flisp ifeq ($(OS),WINNT) diff --git a/src/codegen.cpp b/src/codegen.cpp index 7f13e04fcec9d..ee7bc9edff2a8 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -6848,8 +6848,28 @@ extern "C" void *jl_init_llvm(void) #if defined(FORCE_ELF) TheTriple.setObjectFormat(Triple::ELF); #endif + bool help = false; + if (jl_options.cpu_target && strcmp(jl_options.cpu_target, "help") == 0) { + help = true; + jl_options.cpu_target = "native"; + } std::string TheCPU; SmallVector targetFeatures = getTargetFeatures(TheCPU); + { + std::string errorstr; + const Target *target = TargetRegistry::lookupTarget("", TheTriple, errorstr); + assert(target); + std::unique_ptr MSTI( + target->createMCSubtargetInfo(TheTriple.str(), "", "")); + if (!MSTI->isCPUStringValid(TheCPU)) + jl_errorf("Invalid CPU name %s.", TheCPU.c_str()); + if (help) { + // This is the only way I can find to print the help message once. + // It'll be nice if we can iterate through the features and print our own help + // message... 
+ MSTI->setDefaultFeatures("help", ""); + } + } jl_TargetMachine = eb.selectTarget( TheTriple, "", diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 7218e53cb180c..a164c2bd531e7 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -1020,12 +1020,6 @@ void jl_add_to_shadow(Module *m) jl_merge_module(shadow_output, std::move(clone)); } -#ifdef HAVE_CPUID -extern "C" { - extern void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType); -} -#endif - static void emit_offset_table(Module *mod, const std::vector &vars, StringRef name) { assert(!vars.empty()); @@ -1070,14 +1064,6 @@ static void jl_gen_llvm_globaldata(Module *mod, const char *sysimg_data, size_t "jl_tls_offset_idx")); #endif - Constant *feature_string = ConstantDataArray::getString(jl_LLVMContext, jl_options.cpu_target); - addComdat(new GlobalVariable(*mod, - feature_string->getType(), - true, - GlobalVariable::ExternalLinkage, - feature_string, - "jl_sysimg_cpu_target")); - // reflect the address of the jl_RTLD_DEFAULT_handle variable // back to the caller, so that we can check for consistency issues GlobalValue *jlRTLD_DEFAULT_var = mod->getNamedValue("jl_RTLD_DEFAULT_handle"); @@ -1088,21 +1074,6 @@ static void jl_gen_llvm_globaldata(Module *mod, const char *sysimg_data, size_t jlRTLD_DEFAULT_var, "jl_RTLD_DEFAULT_handle_pointer")); -#ifdef HAVE_CPUID - // For native also store the cpuid - if (strcmp(jl_options.cpu_target,"native") == 0) { - uint32_t info[4]; - - jl_cpuid((int32_t*)info, 1); - addComdat(new GlobalVariable(*mod, - T_uint64, - true, - GlobalVariable::ExternalLinkage, - ConstantInt::get(T_uint64,((uint64_t)info[2])|(((uint64_t)info[3])<<32)), - "jl_sysimg_cpu_cpuid")); - } -#endif - if (sysimg_data) { Constant *data = ConstantDataArray::get(jl_LLVMContext, ArrayRef((const unsigned char*)sysimg_data, sysimg_len)); diff --git a/src/jloptions.c b/src/jloptions.c index 7bd9729eaaccc..8c95a7f13692e 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -20,16 +20,9 @@ char *shlib_ext = 
".dylib"; char *shlib_ext = ".so"; #endif -static char system_image_path[256] = "\0" JL_SYSTEM_IMAGE_PATH; +static const char system_image_path[256] = "\0" JL_SYSTEM_IMAGE_PATH; JL_DLLEXPORT const char *jl_get_default_sysimg_path(void) { -#ifdef CPUID_SPECIFIC_BINARIES - char *path = &system_image_path[1]; - size_t existing_length = strlen(path) - strlen(shlib_ext); - path += existing_length; - snprintf(path, sizeof(system_image_path) - existing_length, - "_%" PRIx64 "%s", jl_cpuid_tag(), shlib_ext); -#endif return &system_image_path[1]; } diff --git a/src/julia.h b/src/julia.h index 30e2c26548c70..95c091affb5ee 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1280,8 +1280,6 @@ JL_DLLEXPORT long jl_getallocationgranularity(void); JL_DLLEXPORT int jl_is_debugbuild(void); JL_DLLEXPORT jl_sym_t *jl_get_UNAME(void); JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void); -JL_DLLEXPORT uint64_t jl_cpuid_tag(void); -JL_DLLEXPORT int jl_uses_cpuid_tag(void); // environment entries JL_DLLEXPORT jl_value_t *jl_environ(int i); diff --git a/src/julia_internal.h b/src/julia_internal.h index 7cd30911f54fd..e80a57dcf18ac 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -734,10 +734,6 @@ void *jl_dlopen_soname(const char *pfx, size_t n, unsigned flags); // libuv wrappers: JL_DLLEXPORT int jl_fs_rename(const char *src_path, const char *dst_path); -#if defined(_CPU_X86_) || defined(_CPU_X86_64_) -#define HAVE_CPUID -#endif - #ifdef SEGV_EXCEPTION extern JL_DLLEXPORT jl_value_t *jl_segv_exception; #endif diff --git a/src/staticdata.c b/src/staticdata.c index a5d8146fca578..2a2af6b3008e4 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -129,10 +129,6 @@ static uint32_t read_uint32(ios_t *s) // --- Static Compile --- -#ifdef HAVE_CPUID -extern void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType); -#endif - extern int globalUnique; static void *jl_sysimg_handle = NULL; static uint64_t sysimage_base = 0; @@ -158,9 +154,6 @@ JL_DLLEXPORT int jl_running_on_valgrind(void) static void 
jl_load_sysimg_so(void) { -#ifndef _OS_WINDOWS_ - Dl_info dlinfo; -#endif int imaging_mode = jl_generating_output() && !jl_options.incremental; // in --build mode only use sysimg data, not precompiled native code if (!imaging_mode && jl_options.use_sysimage_native_code==JL_OPTIONS_USE_SYSIMAGE_NATIVE_CODE_YES) { @@ -181,31 +174,11 @@ static void jl_load_sysimg_so(void) *sysimg_gvars(sysimg_gvars_base, tls_offset_idx - 1) = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset); #endif - const char *cpu_target = (const char*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_cpu_target"); - if (strcmp(cpu_target,jl_options.cpu_target) != 0) - jl_error("Julia and the system image were compiled for different architectures.\n" - "Please delete or regenerate sys.{so,dll,dylib}."); -#ifdef HAVE_CPUID - uint32_t info[4]; - jl_cpuid((int32_t*)info, 1); - if (strcmp(cpu_target, "native") == 0) { - if (!RUNNING_ON_VALGRIND) { - uint64_t saved_cpuid = *(uint64_t*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_cpu_cpuid"); - if (saved_cpuid != (((uint64_t)info[2])|(((uint64_t)info[3])<<32))) - jl_error("Target architecture mismatch. Please delete or regenerate sys.{so,dll,dylib}."); - } - } - else if (strcmp(cpu_target,"core2") == 0) { - int HasSSSE3 = (info[2] & 1<<9); - if (!HasSSSE3) - jl_error("The current host does not support SSSE3, but the system image was compiled for Core2.\n" - "Please delete or regenerate sys.{so,dll,dylib}."); - } -#endif #ifdef _OS_WINDOWS_ sysimage_base = (intptr_t)jl_sysimg_handle; #else + Dl_info dlinfo; if (dladdr((void*)sysimg_gvars_base, &dlinfo) != 0) { sysimage_base = (intptr_t)dlinfo.dli_fbase; } @@ -1360,13 +1333,16 @@ JL_DLLEXPORT void jl_preload_sysimg_so(const char *fname) // Allow passing in a module handle directly, rather than a path JL_DLLEXPORT void jl_set_sysimg_so(void *handle) { - // set cpu target if unspecified by user and available from sysimg - // otherwise default to native. 
void* *jl_RTLD_DEFAULT_handle_pointer = (void**)jl_dlsym_e(handle, "jl_RTLD_DEFAULT_handle_pointer"); if (!jl_RTLD_DEFAULT_handle_pointer || (void*)&jl_RTLD_DEFAULT_handle != *jl_RTLD_DEFAULT_handle_pointer) jl_error("System image file failed consistency check: maybe opened the wrong version?"); + // TODO make sure the sysimg and the JIT agrees on the ABI. + // This shouldn't be a problem for any required C types on any platforms we support + // but could be a problem from optional types. In particular, we need to make sure + // the two agrees on the usable register sizes so that functions that take + // a vector as input can use consistent calling convention. if (jl_options.cpu_target == NULL) - jl_options.cpu_target = (const char *)jl_dlsym(handle, "jl_sysimg_cpu_target"); + jl_options.cpu_target = "native"; jl_sysimg_handle = handle; } diff --git a/src/sys.c b/src/sys.c index 128664fc45d5f..40f0a549ad01e 100644 --- a/src/sys.c +++ b/src/sys.c @@ -455,7 +455,7 @@ JL_DLLEXPORT JL_STREAM *jl_stderr_stream(void) { return JL_STDERR; } // CPUID -#ifdef HAVE_CPUID +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) { #if defined _MSC_VER @@ -478,31 +478,8 @@ JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) ); #endif } -JL_DLLEXPORT uint64_t jl_cpuid_tag(void) -{ - uint32_t info[4]; - jl_cpuid((int32_t *)info, 1); - return (((uint64_t)info[2]) | (((uint64_t)info[3]) << 32)); -} -#elif defined(CPUID_SPECIFIC_BINARIES) -#error "CPUID not available on this CPU. 
Turn off CPUID_SPECIFIC_BINARIES" -#else -// For architectures that don't have CPUID -JL_DLLEXPORT uint64_t jl_cpuid_tag(void) -{ - return 0; -} #endif -JL_DLLEXPORT int jl_uses_cpuid_tag(void) -{ -#ifdef CPUID_SPECIFIC_BINARIES - return 1; -#else - return 0; -#endif -} - // -- set/clear the FZ/DAZ flags on x86 & x86-64 -- #ifdef __SSE__ diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 8220001ca127c..12fcfbb03ae76 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -95,14 +95,9 @@ let exename = `$(Base.julia_cmd()) --sysimage-native-code=yes --startup-file=no` @test !success(`$exename -L`) @test !success(`$exename --load`) - # --cpu-target - # NOTE: this test only holds true if image_file is a shared library. - if Libdl.dlopen_e(unsafe_string(Base.JLOptions().image_file)) != C_NULL - @test !success(`$exename -C invalidtarget --sysimage-native-code=yes`) - @test !success(`$exename --cpu-target=invalidtarget --sysimage-native-code=yes`) - else - warn("--cpu-target test not runnable") - end + # --cpu-target (requires LLVM enabled) + @test !success(`$exename -C invalidtarget`) + @test !success(`$exename --cpu-target=invalidtarget`) # --procs @test readchomp(`$exename -q -p 2 -e "println(nworkers())"`) == "2" diff --git a/ui/repl.c b/ui/repl.c index e08c5607b42b1..1add10b0c56f5 100644 --- a/ui/repl.c +++ b/ui/repl.c @@ -158,8 +158,6 @@ static NOINLINE int true_main(int argc, char *argv[]) return 0; } -extern JL_DLLEXPORT uint64_t jl_cpuid_tag(); - #ifndef _OS_WINDOWS_ int main(int argc, char *argv[]) { @@ -224,11 +222,6 @@ int wmain(int argc, wchar_t *argv[], wchar_t *envp[]) argv[i] = (wchar_t*)arg; } #endif - if (argc >= 2 && strcmp((char *)argv[1], "--cpuid") == 0) { - /* Used by the build system to name CPUID-specific binaries */ - printf("%" PRIx64, jl_cpuid_tag()); - return 0; - } libsupport_init(); int lisp_prompt = (argc >= 2 && strcmp((char*)argv[1],"--lisp") == 0); if (lisp_prompt) { From cc1d4de851c390e1e1afcb909d5cbdbb4c06f056 Mon Sep 
17 00:00:00 2001 From: Yichao Yu Date: Sat, 13 May 2017 16:39:59 -0400 Subject: [PATCH 2/9] Move CPU feature detection functions to processor.cpp Use C++ instead of C since there will be a lot of string handling and indirect interaction with LLVM in this file. The meta-programing capability is also pretty useful. --- src/Makefile | 2 +- src/julia_internal.h | 5 ++ src/processor.cpp | 153 +++++++++++++++++++++++++++++++++++++++++++ src/sys.c | 130 ------------------------------------ 4 files changed, 159 insertions(+), 131 deletions(-) create mode 100644 src/processor.cpp diff --git a/src/Makefile b/src/Makefile index daefd063655a3..439f62acb9049 100644 --- a/src/Makefile +++ b/src/Makefile @@ -41,7 +41,7 @@ SRCS := \ simplevector APInt-C runtime_intrinsics runtime_ccall precompile \ threadgroup threading stackwalk gc gc-debug gc-pages method \ jlapi signal-handling safepoint jloptions timing subtype rtutils \ - crc32c + crc32c processor ifeq ($(USEMSVC), 1) SRCS += getopt diff --git a/src/julia_internal.h b/src/julia_internal.h index e80a57dcf18ac..10da0b7cfbee1 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -734,6 +734,11 @@ void *jl_dlopen_soname(const char *pfx, size_t n, unsigned flags); // libuv wrappers: JL_DLLEXPORT int jl_fs_rename(const char *src_path, const char *dst_path); +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) +JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType); +JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType); +#endif + #ifdef SEGV_EXCEPTION extern JL_DLLEXPORT jl_value_t *jl_segv_exception; #endif diff --git a/src/processor.cpp b/src/processor.cpp new file mode 100644 index 0000000000000..d4078d680a405 --- /dev/null +++ b/src/processor.cpp @@ -0,0 +1,153 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +// Processor feature detection + +#include "julia.h" +#include "julia_internal.h" + +extern "C" { + +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) + +// CPUID + +JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) +{ +#if defined _MSC_VER + __cpuid(CPUInfo, InfoType); +#else + __asm__ __volatile__ ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType) + ); +#endif +} + +JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType) +{ +#if defined _MSC_VER + __cpuidex(CPUInfo, InfoType, subInfoType); +#else + __asm__ __volatile__ ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType), + "c" (subInfoType) + ); +#endif +} + +// -- set/clear the FZ/DAZ flags on x86 & x86-64 -- +static uint32_t get_subnormal_flags(void) +{ + // CPU capabilities not yet inspected. + int32_t info[4]; + jl_cpuid(info, 0); + if (info[0] >= 1) { + jl_cpuid(info, 1); + if (info[3] & (1 << 26)) { + // SSE2 supports both FZ and DAZ + return 0x00008040; + } + else if (info[3] & (1 << 25)) { + // SSE supports only the FZ flag + return 0x00008000; + } + } + return 0; +} + +// Cache of information recovered from `cpuid` since executing `cpuid` it at runtime is slow. +static uint32_t subnormal_flags = get_subnormal_flags(); + +// Returns non-zero if subnormals go to 0; zero otherwise. +JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return _mm_getcsr() & subnormal_flags; +} + +// Return zero on success, non-zero on failure. 
+JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t flags = subnormal_flags; + if (flags) { + uint32_t state = _mm_getcsr(); + if (isZero) + state |= flags; + else + state &= ~flags; + _mm_setcsr(state); + return 0; + } + else { + // Report a failure only if user is trying to enable FTZ/DAZ. + return isZero; + } +} + +#elif defined(_CPU_AARCH64_) + +// FZ, bit [24] +static const uint32_t fpcr_fz_mask = 1 << 24; + +static inline uint32_t get_fpcr_aarch64(void) +{ + uint32_t fpcr; + asm volatile("mrs %0, fpcr" : "=r"(fpcr)); + return fpcr; +} + +static inline void set_fpcr_aarch64(uint32_t fpcr) +{ + asm volatile("msr fpcr, %0" :: "r"(fpcr)); +} + +JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; +} + +JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t fpcr = get_fpcr_aarch64(); + fpcr = isZero ? (fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); + set_fpcr_aarch64(fpcr); + return 0; +} + +#else + +JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return 0; +} + +JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + return isZero; +} + +#endif + +} diff --git a/src/sys.c b/src/sys.c index 40f0a549ad01e..9ed1d2b5a701d 100644 --- a/src/sys.c +++ b/src/sys.c @@ -453,136 +453,6 @@ JL_DLLEXPORT JL_STREAM *jl_stdin_stream(void) { return JL_STDIN; } JL_DLLEXPORT JL_STREAM *jl_stdout_stream(void) { return JL_STDOUT; } JL_DLLEXPORT JL_STREAM *jl_stderr_stream(void) { return JL_STDERR; } -// CPUID - -#if defined(_CPU_X86_) || defined(_CPU_X86_64_) -JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) -{ -#if defined _MSC_VER - __cpuid(CPUInfo, InfoType); -#else - __asm__ __volatile__ ( - #if defined(__i386__) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;": - "=S" (CPUInfo[1]) , - #else - "cpuid": - "=b" (CPUInfo[1]), - #endif - "=a" (CPUInfo[0]), - "=c" (CPUInfo[2]), - "=d" (CPUInfo[3]) : - "a" (InfoType) - ); -#endif -} 
-#endif - -// -- set/clear the FZ/DAZ flags on x86 & x86-64 -- -#ifdef __SSE__ - -// Cache of information recovered from jl_cpuid. -// In a multithreaded environment, there will be races on subnormal_flags, -// but they are harmless idempotent races. If we ever embrace C11, then -// subnormal_flags should be declared atomic. -static volatile int32_t subnormal_flags = 1; - -static int32_t get_subnormal_flags(void) -{ - uint32_t f = subnormal_flags; - if (f & 1) { - // CPU capabilities not yet inspected. - f = 0; - int32_t info[4]; - jl_cpuid(info, 0); - if (info[0] >= 1) { - jl_cpuid(info, 0x00000001); - if (info[3] & (1 << 26)) { - // SSE2 supports both FZ and DAZ - f = 0x00008040; - } - else if (info[3] & (1 << 25)) { - // SSE supports only the FZ flag - f = 0x00008000; - } - } - subnormal_flags = f; - } - return f; -} - -// Returns non-zero if subnormals go to 0; zero otherwise. -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - uint32_t flags = get_subnormal_flags(); - return _mm_getcsr() & flags; -} - -// Return zero on success, non-zero on failure. -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint32_t flags = get_subnormal_flags(); - if (flags) { - uint32_t state = _mm_getcsr(); - if (isZero) - state |= flags; - else - state &= ~flags; - _mm_setcsr(state); - return 0; - } - else { - // Report a failure only if user is trying to enable FTZ/DAZ. - return isZero; - } -} - -#elif defined(_CPU_AARCH64_) - -// FZ, bit [24] -static const uint32_t fpcr_fz_mask = 1 << 24; - -static inline uint32_t get_fpcr_aarch64(void) -{ - uint32_t fpcr; - asm volatile("mrs %0, fpcr" : "=r"(fpcr)); - return fpcr; -} - -static inline void set_fpcr_aarch64(uint32_t fpcr) -{ - asm volatile("msr fpcr, %0" :: "r"(fpcr)); -} - -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; -} - -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint32_t fpcr = get_fpcr_aarch64(); - fpcr = isZero ? 
(fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); - set_fpcr_aarch64(fpcr); - return 0; -} - -#else - -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return 0; -} - -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - return isZero; -} - -#endif - // -- processor native alignment information -- JL_DLLEXPORT void jl_native_alignment(uint_t *int8align, uint_t *int16align, uint_t *int32align, From 0f2e51c6c9c285a52da7a7c1d4bee092c14b1771 Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sat, 13 May 2017 20:51:24 -0400 Subject: [PATCH 3/9] Add X86 CPU name and feature detection Also implement the API needed by other part of the runtime, especially LLVM JIT, cloning pass and sysimg initialization. Most of the detection (both name and features) code is copied from LLVM. However, it's very hard for us to use LLVM because there are informations we need that is not provided by LLVM (some of which is non-trivial to expose in a target independent way), including, 1. Feature dependencies 2. Features related to a CPU name 3. Feature name validation (needed when we expose this to julia code) 4. Effect on ABI (vector register size) 5. Serialization format 6. Cloning heuristic Additionally, the detection code itself is only a small part of the code for each arch and we need to support multiple LLVM versions so copying the LLVM code shouldn't cause too much problem by itself. 
--- src/Makefile | 1 + src/features_x86.h | 94 +++++ src/julia_internal.h | 5 - src/processor.cpp | 873 ++++++++++++++++++++++++++++++++++---- src/processor.h | 200 +++++++++ src/processor_x86.cpp | 961 ++++++++++++++++++++++++++++++++++++++++++ src/runtime_ccall.cpp | 39 +- 7 files changed, 2083 insertions(+), 90 deletions(-) create mode 100644 src/features_x86.h create mode 100644 src/processor.h create mode 100644 src/processor_x86.cpp diff --git a/src/Makefile b/src/Makefile index 439f62acb9049..99f84cdd25178 100644 --- a/src/Makefile +++ b/src/Makefile @@ -183,6 +183,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ intrinsics.cpp jitlayers.h intrinsics.h debuginfo.h codegen_shared.h cgutils.cpp ccall.cpp abi_*.cpp) +$(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) $(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics.h $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(SRCDIR)/debuginfo.h $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h diff --git a/src/features_x86.h b/src/features_x86.h new file mode 100644 index 0000000000000..f5c567cba4e7e --- /dev/null +++ b/src/features_x86.h @@ -0,0 +1,94 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#ifdef _CPU_X86_ +// avx is unusable on 32bit before LLVM 5.0 due to LLVM bug (try to encode too many registers) +#define JL_X86_AVX_MIN_VER 50000 +#define JL_X86_64ONLY_VER(x) UINT32_MAX +#else +#define JL_X86_AVX_MIN_VER 0 +#define JL_X86_64ONLY_VER(x) x +#endif + +// X86 features definition +// EAX=1: ECX +JL_FEATURE_DEF(sse3, 0, 0) +JL_FEATURE_DEF(pclmul, 1, 0) +JL_FEATURE_DEF(ssse3, 9, 0) +JL_FEATURE_DEF(fma, 12, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(cx16, 13, JL_X86_64ONLY_VER(0)) // cx16 requires 64bit +JL_FEATURE_DEF_NAME(sse41, 19, 0, "sse4.1") +JL_FEATURE_DEF_NAME(sse42, 20, 0, "sse4.2") +JL_FEATURE_DEF(movbe, 22, 0) +JL_FEATURE_DEF(popcnt, 23, 0) +JL_FEATURE_DEF(aes, 25, 0) +JL_FEATURE_DEF(xsave, 26, 0) +JL_FEATURE_DEF(avx, 28, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(f16c, 29, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(rdrnd, 30, 0) + +// EAX=1: EDX +// JL_FEATURE_DEF(, 32 + ?, ????) + +// EAX=7,ECX=0: EBX +JL_FEATURE_DEF(fsgsbase, 32 * 2 + 0, 0) +// JL_FEATURE_DEF(sgx, 32 * 2 + 2, 0) // Disable for now since it's very hard to detect +JL_FEATURE_DEF(bmi, 32 * 2 + 3, 0) +// JL_FEATURE_DEF(hle, 32 * 2 + 4, 0) // Not used and gone in LLVM 5.0 +JL_FEATURE_DEF(avx2, 32 * 2 + 5, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(bmi2, 32 * 2 + 8, 0) +// JL_FEATURE_DEF(invpcid, 32 * 2 + 10, 0) // Not used and gone in LLVM 5.0 +JL_FEATURE_DEF(rtm, 32 * 2 + 11, 0) +JL_FEATURE_DEF(mpx, 32 * 2 + 14, 0) +// Disable avx512 pre-5.0 since it can't handle address space +JL_FEATURE_DEF(avx512f, 32 * 2 + 16, 50000) +JL_FEATURE_DEF(avx512dq, 32 * 2 + 17, 50000) +JL_FEATURE_DEF(rdseed, 32 * 2 + 18, 0) +JL_FEATURE_DEF(adx, 32 * 2 + 19, 0) +// JL_FEATURE_DEF(smap, 32 * 2 + 20, 0) // Not used and gone in LLVM 5.0 +JL_FEATURE_DEF(avx512ifma, 32 * 2 + 21, 50000) +// JL_FEATURE_DEF(pcommit, 32 * 2 + 22, 0) // Deprecated +JL_FEATURE_DEF(clflushopt, 32 * 2 + 23, 0) +JL_FEATURE_DEF(clwb, 32 * 2 + 24, 0) +JL_FEATURE_DEF(avx512pf, 32 * 2 + 26, 50000) 
+JL_FEATURE_DEF(avx512er, 32 * 2 + 27, 50000) +JL_FEATURE_DEF(avx512cd, 32 * 2 + 28, 50000) +JL_FEATURE_DEF(sha, 32 * 2 + 29, 0) +JL_FEATURE_DEF(avx512bw, 32 * 2 + 30, 50000) +JL_FEATURE_DEF(avx512vl, 32 * 2 + 31, 50000) + +// EAX=7,ECX=0: ECX +JL_FEATURE_DEF(prefetchwt1, 32 * 3 + 0, 0) +JL_FEATURE_DEF(avx512vbmi, 32 * 3 + 1, 50000) +JL_FEATURE_DEF(pku, 32 * 3 + 4, 0) // ospke +JL_FEATURE_DEF(avx512vpopcntdq, 32 * 3 + 14, 50000) + +// EAX=7,ECX=0: EDX +// JL_FEATURE_DEF(avx512_4vnniw, 32 * 4 + 2, ?????) +// JL_FEATURE_DEF(avx512_4fmaps, 32 * 4 + 3, ?????) + +// EAX=0x80000001: ECX +// ignore sahf on 32bit x86 since it is required +JL_FEATURE_DEF(sahf, 32 * 5 + 0, JL_X86_64ONLY_VER(0)) +JL_FEATURE_DEF(lzcnt, 32 * 5 + 5, 0) +JL_FEATURE_DEF(sse4a, 32 * 5 + 6, 0) +JL_FEATURE_DEF(prfchw, 32 * 5 + 8, 0) +JL_FEATURE_DEF(xop, 32 * 5 + 11, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(lwp, 32 * 5 + 15, 50000) +JL_FEATURE_DEF(fma4, 32 * 5 + 16, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(tbm, 32 * 5 + 21, 0) +JL_FEATURE_DEF(mwaitx, 32 * 5 + 29, 0) + +// EAX=0x80000001: EDX +// 3dnow is here but we don't care... +// JL_FEATURE_DEF(, 32 * 6 + ?, ?????) 
+ +// EAX=0xd: EAX +JL_FEATURE_DEF(xsaveopt, 32 * 7 + 0, 0) +JL_FEATURE_DEF(xsavec, 32 * 7 + 1, 0) +JL_FEATURE_DEF(xsaves, 32 * 7 + 3, 0) + +// EAX=0x80000008: EBX +JL_FEATURE_DEF(clzero, 32 * 8 + 0, 50000) + +#undef JL_X86_AVX_MIN_VER +#undef JL_X86_64ONLY_VER diff --git a/src/julia_internal.h b/src/julia_internal.h index 10da0b7cfbee1..e80a57dcf18ac 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -734,11 +734,6 @@ void *jl_dlopen_soname(const char *pfx, size_t n, unsigned flags); // libuv wrappers: JL_DLLEXPORT int jl_fs_rename(const char *src_path, const char *dst_path); -#if defined(_CPU_X86_) || defined(_CPU_X86_64_) -JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType); -JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType); -#endif - #ifdef SEGV_EXCEPTION extern JL_DLLEXPORT jl_value_t *jl_segv_exception; #endif diff --git a/src/processor.cpp b/src/processor.cpp index d4078d680a405..f08fef5db4c79 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -2,112 +2,809 @@ // Processor feature detection +#include "processor.h" + #include "julia.h" #include "julia_internal.h" -extern "C" { +#include +#include -#if defined(_CPU_X86_) || defined(_CPU_X86_64_) +#include "llvm-version.h" +#include +#include + +#include "julia_assert.h" + +// CPU target string is a list of strings separated by `;` each string starts with a CPU +// or architecture name and followed by an optional list of features separated by `,`. +// A "generic" or empty CPU name means the basic required feature set of the target ISA +// which is at least the architecture the C/C++ runtime is compiled with. + +// CPU dispatch needs to determine the version to be used by the sysimg as well as +// the target and feature used by the JIT. Currently the only limitation on JIT target +// and feature is matching register size between the sysimg and JIT so that SIMD vectors +// can be passed correctly. 
This means disabling AVX and AVX2 if AVX was not enabled +// in sysimg and disabling AVX512 if it was not enabled in sysimg. +// This also possibly means that SVE needs to be disabled on AArch64 if sysimg doesn't have it +// enabled. + +// CPU dispatch starts by first deciding the max feature set and CPU requested for JIT. +// This is the host or the target specified on the command line with features unavailable +// on the host disabled. All sysimg targets that require features not available in this set +// will be ignored. + +// The next step is matching CPU name. +// If exact name match with compatible feature set exists, all versions without name match +// are ignored. +// This step will query LLVM first so it can accept CPU names that is recognized by LLVM but +// not by us (yet) when LLVM is enabled. -// CPUID +// If there are still more than one candidates, a feature match is performed. +// The ones with the largest register size will be used +// (i.e. AVX512 > AVX2/AVX > SSE, SVE > ASIMD). If there's a tie, the one with the most features +// enabled will be used. If there's still a tie the one that appears later in the list will be +// used. (i.e. the order in the version list is significant in this case). -JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) +// Features that are not recognized will be passed to LLVM directly during codegen +// but ignored otherwise. + +// Two special features are supported: +// 1. `clone_all` +// +// This forces the target to have all functions in sysimg cloned. +// When used in negative form (i.e. `-clone_all`), this disables full clone that's +// enabled by default for certain targets. +// +// 2. `base([0-9]*)` +// +// This specifies the (0-based) base target index. The base target is the target +// that the current target is based on, i.e. the functions that are not being cloned +// will use the version in the base target. 
This option causes the base target to be +// fully cloned (as if `clone_all` is specified for it) if it is not the default target (0). +// The index can only be smaller than the current index. + +bool jl_processor_print_help = false; + +namespace { + +// Helper functions to test/set feature bits + +template +static inline bool test_bits(T1 v, T2 mask, T3 test) { -#if defined _MSC_VER - __cpuid(CPUInfo, InfoType); -#else - __asm__ __volatile__ ( -#if defined(__i386__) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" : - "=S" (CPUInfo[1]), -#else - "cpuid" : - "=b" (CPUInfo[1]), -#endif - "=a" (CPUInfo[0]), - "=c" (CPUInfo[2]), - "=d" (CPUInfo[3]) : - "a" (InfoType) - ); -#endif + return T3(v & mask) == test; } -JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType) +template +static inline bool test_all_bits(T1 v, T2 mask) { -#if defined _MSC_VER - __cpuidex(CPUInfo, InfoType, subInfoType); -#else - __asm__ __volatile__ ( -#if defined(__i386__) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" : - "=S" (CPUInfo[1]), -#else - "cpuid" : - "=b" (CPUInfo[1]), -#endif - "=a" (CPUInfo[0]), - "=c" (CPUInfo[2]), - "=d" (CPUInfo[3]) : - "a" (InfoType), - "c" (subInfoType) - ); -#endif + return test_bits(v, mask, mask); +} + +template +static inline bool test_nbit(const T1 &bits, T2 _bitidx) +{ + auto bitidx = static_cast(_bitidx); + auto u32idx = bitidx / 32; + auto bit = bitidx % 32; + return (bits[u32idx] & (1 << bit)) != 0; +} + +template +static inline void unset_bits(T &bits) +{ + (void)bits; +} + +template +static inline void unset_bits(T &bits, T1 _bitidx, Rest... 
rest) +{ + auto bitidx = static_cast(_bitidx); + auto u32idx = bitidx / 32; + auto bit = bitidx % 32; + bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit); + unset_bits(bits, rest...); } -// -- set/clear the FZ/DAZ flags on x86 & x86-64 -- -static uint32_t get_subnormal_flags(void) +template +static inline void set_bit(T &bits, T1 _bitidx, bool val) { - // CPU capabilities not yet inspected. - int32_t info[4]; - jl_cpuid(info, 0); - if (info[0] >= 1) { - jl_cpuid(info, 1); - if (info[3] & (1 << 26)) { - // SSE2 supports both FZ and DAZ - return 0x00008040; + auto bitidx = static_cast(_bitidx); + auto u32idx = bitidx / 32; + auto bit = bitidx % 32; + if (val) { + bits[u32idx] = bits[u32idx] | uint32_t(1 << bit); + } + else { + bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit); + } +} + +// Helper functions to create feature masks + +// This can be `std::array` on C++14 +template +struct FeatureList { + uint32_t eles[n]; + uint32_t &operator[](size_t pos) + { + return eles[pos]; + } + constexpr const uint32_t &operator[](size_t pos) const + { + return eles[pos]; + } + inline int nbits() const + { + int cnt = 0; + for (size_t i = 0; i < n; i++) + cnt += llvm::countPopulation(eles[i]); + return cnt; + } + inline bool empty() const + { + for (size_t i = 0; i < n; i++) { + if (eles[i]) { + return false; + } } - else if (info[3] & (1 << 25)) { - // SSE supports only the FZ flag - return 0x00008000; + return true; + } +}; + +static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx) +{ + return mask; +} + +template +static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx, + T bit, Rest... args) +{ + return add_feature_mask_u32(mask | ((int(bit) >= 0 && int(bit) / 32 == (int)u32idx) ? + (1 << (int(bit) % 32)) : 0), + u32idx, args...); +} + +template +static inline constexpr uint32_t get_feature_mask_u32(uint32_t u32idx, Args... 
args) +{ + return add_feature_mask_u32(uint32_t(0), u32idx, args...); +} + +template struct seq{}; +template +struct gen_seq : gen_seq{}; +template +struct gen_seq<0, Is...> : seq{}; + +template +static inline constexpr FeatureList +_get_feature_mask(seq, Args... args) +{ + return FeatureList{{get_feature_mask_u32(I, args...)...}}; +} + +template +static inline constexpr FeatureList get_feature_masks(Args... args) +{ + return _get_feature_mask(gen_seq(), args...); +} + +template +static inline constexpr FeatureList +_feature_mask_or(seq, const FeatureList &a, const FeatureList &b) +{ + return FeatureList{{(a[I] | b[I])...}}; +} + +template +static inline constexpr FeatureList operator|(const FeatureList &a, const FeatureList &b) +{ + return _feature_mask_or(gen_seq(), a, b); +} + +template +static inline constexpr FeatureList +_feature_mask_and(seq, const FeatureList &a, const FeatureList &b) +{ + return FeatureList{{(a[I] & b[I])...}}; +} + +template +static inline constexpr FeatureList operator&(const FeatureList &a, const FeatureList &b) +{ + return _feature_mask_and(gen_seq(), a, b); +} + +template +static inline constexpr FeatureList +_feature_mask_not(seq, const FeatureList &a) +{ + return FeatureList{{(~a[I])...}}; +} + +template +static inline constexpr FeatureList operator~(const FeatureList &a) +{ + return _feature_mask_not(gen_seq(), a); +} + +template +static inline void mask_features(const FeatureList masks, uint32_t *features) +{ + for (size_t i = 0; i < n; i++) { + features[i] = features[i] & masks[i]; + } +} + +// Turn feature list to a string the LLVM accept +static inline std::string join_feature_strs(const std::vector &strs) +{ + size_t nstr = strs.size(); + if (!nstr) + return std::string(""); + std::string str = strs[0]; + for (size_t i = 1; i < nstr; i++) + str += ',' + strs[i]; + return str; +} + +static inline void append_ext_features(std::string &features, const std::string &ext_features) +{ + if (ext_features.empty()) + return; + if 
(!features.empty()) + features.push_back(','); + features.append(ext_features); +} + +static inline void append_ext_features(std::vector &features, + const std::string &ext_features) +{ + if (ext_features.empty()) + return; + const char *start = ext_features.c_str(); + const char *p = start; + for (; *p; p++) { + if (*p == ',') { + features.emplace_back(start, p - start); + start = p + 1; } } - return 0; + if (p > start) { + features.emplace_back(start, p - start); + } } -// Cache of information recovered from `cpuid` since executing `cpuid` it at runtime is slow. -static uint32_t subnormal_flags = get_subnormal_flags(); +/** + * Target specific type/constant definitions, always enable. + */ + +struct FeatureName { + const char *name; + uint32_t bit; // bit index into a `uint32_t` array; + uint32_t llvmver; // 0 if it is available on the oldest LLVM version we support +}; -// Returns non-zero if subnormals go to 0; zero otherwise. -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +template +struct CPUSpec { + const char *name; + CPU cpu; + CPU fallback; + uint32_t llvmver; + FeatureList features; +}; + +struct FeatureDep { + uint32_t feature; + uint32_t dep; +}; + +// Recursively enable all features that the current feature set depends on. +template +static inline void enable_depends(FeatureList &features, const FeatureDep *deps, size_t ndeps) +{ + bool changed = true; + while (changed) { + changed = false; + for (ssize_t i = ndeps - 1; i >= 0; i--) { + auto &dep = deps[i]; + if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep)) + continue; + set_bit(features, dep.dep, true); + changed = true; + } + } +} + +// Recursively disable all features that the current feature set does not provide. 
+template +static inline void disable_depends(FeatureList &features, const FeatureDep *deps, size_t ndeps) { - return _mm_getcsr() & subnormal_flags; + bool changed = true; + while (changed) { + changed = false; + for (ssize_t i = ndeps - 1; i >= 0; i--) { + auto &dep = deps[i]; + if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep)) + continue; + unset_bits(features, dep.feature); + changed = true; + } + } } -// Return zero on success, non-zero on failure. -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +template +static const CPUSpec *find_cpu(uint32_t cpu, const CPUSpec *cpus, uint32_t ncpus) { - uint32_t flags = subnormal_flags; - if (flags) { - uint32_t state = _mm_getcsr(); - if (isZero) - state |= flags; - else - state &= ~flags; - _mm_setcsr(state); + for (uint32_t i = 0; i < ncpus; i++) { + if (cpu == uint32_t(cpus[i].cpu)) { + return &cpus[i]; + } + } + return nullptr; +} + +template +static const CPUSpec *find_cpu(llvm::StringRef name, const CPUSpec *cpus, + uint32_t ncpus) +{ + for (uint32_t i = 0; i < ncpus; i++) { + if (name == cpus[i].name) { + return &cpus[i]; + } + } + return nullptr; +} + +template +static const char *find_cpu_name(uint32_t cpu, const CPUSpec *cpus, uint32_t ncpus) +{ + if (auto *spec = find_cpu(cpu, cpus, ncpus)) + return spec->name; + return "generic"; +} + +JL_UNUSED static uint32_t find_feature_bit(const FeatureName *features, size_t nfeatures, + const char *str, size_t len) +{ + for (size_t i = 0; i < nfeatures; i++) { + auto &feature = features[i]; + if (strncmp(feature.name, str, len) == 0 && feature.name[len] == 0) { + return feature.bit; + } + } + return (uint32_t)-1; +} + +// This is how we save the target identification. +// CPU name is saved as string instead of binary data like features because +// 1. CPU ID is less stable (they are not bound to hardware/OS API) +// 2. We need to support CPU names that are not recognized by us and therefore doesn't have an ID +// 3. 
CPU name is trivial to parse +static inline std::vector serialize_target_data(llvm::StringRef name, + uint32_t nfeature, + const uint32_t *features_en, + const uint32_t *features_dis, + llvm::StringRef ext_features) +{ + std::vector res; + auto add_data = [&] (const void *data, size_t sz) { + size_t old_sz = res.size(); + res.resize(old_sz + sz); + memcpy(&res[old_sz], data, sz); + }; + add_data(&nfeature, 4); + add_data(features_en, 4 * nfeature); + add_data(features_dis, 4 * nfeature); + uint32_t namelen = name.size(); + add_data(&namelen, 4); + add_data(name.data(), namelen); + uint32_t ext_features_len = ext_features.size(); + add_data(&ext_features_len, 4); + add_data(ext_features.data(), ext_features_len); + return res; +} + +template +static inline std::vector serialize_target_data(llvm::StringRef name, + const FeatureList &features_en, + const FeatureList &features_dis, + llvm::StringRef ext_features) +{ + return serialize_target_data(name, n, &features_en[0], &features_dis[0], ext_features); +} + +template +struct TargetData { + std::string name; + std::string ext_features; + struct { + FeatureList features; + uint32_t flags; + } en, dis; + int base; +}; + +// In addition to the serialized data, the first `uint32_t` gives the number of targets saved +// and each target has a `uint32_t` flag before the serialized target data. 
+template +static inline std::vector> deserialize_target_data(const uint8_t *data) +{ + auto load_data = [&] (void *dest, size_t sz) { + memcpy(dest, data, sz); + data += sz; + }; + auto load_string = [&] () { + uint32_t len; + load_data(&len, 4); + std::string res((const char*)data, len); + data += len; + return res; + }; + uint32_t ntarget; + load_data(&ntarget, 4); + std::vector> res(ntarget); + for (uint32_t i = 0; i < ntarget; i++) { + auto &target = res[i]; + load_data(&target.en.flags, 4); + target.dis.flags = 0; + // Starting serialized target data + uint32_t nfeature; + load_data(&nfeature, 4); + assert(nfeature == n); + load_data(&target.en.features[0], 4 * n); + load_data(&target.dis.features[0], 4 * n); + target.name = load_string(); + target.ext_features = load_string(); + target.base = 0; + } + return res; +} + +// Try getting clone base argument. Return 1-based index. Return 0 if match failed. +static inline int get_clone_base(const char *start, const char *end) +{ + const char *prefix = "base("; + const int prefix_len = strlen(prefix); + if (end - start <= prefix_len) + return 0; + if (memcmp(start, prefix, prefix_len) != 0) + return 0; + start += prefix_len; + if (*start > '9' || *start < '0') + return 0; + char *digit_end; + auto idx = strtol(start, &digit_end, 10); + if (idx < 0) return 0; + if (*digit_end != ')' || digit_end + 1 != end) + return 0; + return (int)idx + 1; +} + +// Parse cmdline string. This handles `clone_all` and `base` special features. +// Other feature names will be passed to `feature_cb` for target dependent parsing. 
+template +static inline std::vector> +parse_cmdline(const char *option, F &&feature_cb) +{ + if (!option) + option = "native"; + std::vector> res; + TargetData arg{}; + auto reset_arg = [&] { + res.push_back(arg); + arg.name.clear(); + arg.ext_features.clear(); + memset(&arg.en.features[0], 0, 4 * n); + memset(&arg.dis.features[0], 0, 4 * n); + arg.en.flags = 0; + arg.dis.flags = 0; + }; + const char *start = option; + for (const char *p = option; ; p++) { + switch (*p) { + case ',': + case ';': + case '\0': { + bool done = *p == '\0'; + bool next_target = *p == ';' || done; + if (arg.name.empty()) { + if (p == start) + jl_error("Invalid target option: empty CPU name"); + arg.name.append(start, p - start); + if (arg.name == "help") { + arg.name = "native"; + jl_processor_print_help = true; + } + start = p + 1; + if (next_target) + reset_arg(); + if (done) + return res; + continue; + } + bool disable = false; + const char *full = start; + const char *fname = full; + start = p + 1; + if (*full == '-') { + disable = true; + fname++; + } + else if (*full == '+') { + fname++; + } + if (llvm::StringRef(fname, p - fname) == "clone_all") { + if (!disable) { + arg.en.flags |= JL_TARGET_CLONE_ALL; + arg.dis.flags &= ~JL_TARGET_CLONE_ALL; + } + else { + arg.dis.flags |= JL_TARGET_CLONE_ALL; + arg.en.flags &= ~JL_TARGET_CLONE_ALL; + } + } + else if (int base = get_clone_base(fname, p)) { + if (disable) + jl_error("Invalid target option: disabled base index."); + base -= 1; + if (base >= (int)res.size()) + jl_error("Invalid target option: base index must refer to a previous target."); + if (res[base].dis.flags & JL_TARGET_CLONE_ALL || + !(res[base].en.flags & JL_TARGET_CLONE_ALL)) + jl_error("Invalid target option: base target must be clone_all."); + arg.base = base; + } + else if (llvm::StringRef(fname, p - fname) == "help") { + jl_processor_print_help = true; + } + else { + FeatureList &list = disable ? 
arg.dis.features : arg.en.features; + if (!feature_cb(fname, p - fname, list)) { + if (!arg.ext_features.empty()) + arg.ext_features += ','; + arg.ext_features += disable ? '-' : '+'; + arg.ext_features.append(fname, p - fname); + } + } + if (next_target) + reset_arg(); + if (done) { + return res; + } + } + JL_FALLTHROUGH; + default: + continue; + } + } +} + +// Cached version of command line parsing +template +static inline std::vector> &get_cmdline_targets(F &&feature_cb) +{ + static std::vector> targets = + parse_cmdline(jl_options.cpu_target, std::forward(feature_cb)); + return targets; +} + +// Load sysimg, use the `callback` for dispatch and perform all relocations +// for the selected target. +template +static inline jl_sysimg_fptrs_t parse_sysimg(void *hdl, F &&callback) +{ + jl_sysimg_fptrs_t res = {nullptr, 0, nullptr, 0, nullptr, nullptr}; + // .data base + auto data_base = (char*)jl_dlsym(hdl, "jl_sysimg_gvars_base"); + // .text base + res.base = (const char*)jl_dlsym(hdl, "jl_sysimg_fvars_base"); + auto offsets = ((const int32_t*)jl_dlsym(hdl, "jl_sysimg_fvars_offsets")) + 1; + uint32_t nfunc = ((const uint32_t*)offsets)[-1]; + res.offsets = offsets; + + void *ids = jl_dlsym(hdl, "jl_dispatch_target_ids"); + uint32_t target_idx = callback(ids); + + auto reloc_slots = ((const int32_t*)jl_dlsym(hdl, "jl_dispatch_reloc_slots")) + 1; + auto nreloc = ((const uint32_t*)reloc_slots)[-1]; + auto clone_idxs = (const uint32_t*)jl_dlsym(hdl, "jl_dispatch_fvars_idxs"); + auto clone_offsets = (const int32_t*)jl_dlsym(hdl, "jl_dispatch_fvars_offsets"); + uint32_t tag_len = clone_idxs[0]; + clone_idxs += 1; + assert(tag_len & jl_sysimg_tag_mask); + std::vector base_offsets = {res.offsets}; + // Find target + for (uint32_t i = 0;i < target_idx;i++) { + uint32_t len = jl_sysimg_val_mask & tag_len; + if (jl_sysimg_tag_mask & tag_len) { + if (i != 0) + clone_offsets += nfunc; + clone_idxs += len + 1; + } + else { + clone_offsets += len; + clone_idxs += len + 2; + } + 
tag_len = clone_idxs[-1]; + base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr); + } + + bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0; + // Fill in return value + if (clone_all) { + // clone_all + if (target_idx != 0) { + res.offsets = clone_offsets; + } } else { - // Report a failure only if user is trying to enable FTZ/DAZ. - return isZero; + uint32_t base_idx = clone_idxs[0]; + assert(base_idx < target_idx); + if (target_idx != 0) { + res.offsets = base_offsets[base_idx]; + assert(res.offsets); + } + clone_idxs++; + res.nclones = tag_len; + res.clone_offsets = clone_offsets; + res.clone_idxs = clone_idxs; } + // Do relocation + uint32_t reloc_i = 0; + uint32_t len = jl_sysimg_val_mask & tag_len; + for (uint32_t i = 0; i < len; i++) { + uint32_t idx = clone_idxs[i]; + int32_t offset; + if (clone_all) { + offset = res.offsets[idx]; + } + else if (idx & jl_sysimg_tag_mask) { + idx = idx & jl_sysimg_val_mask; + offset = clone_offsets[i]; + } + else { + continue; + } + bool found = false; + for (; reloc_i < nreloc; reloc_i++) { + auto reloc_idx = ((const uint32_t*)reloc_slots)[reloc_i * 2]; + if (reloc_idx == idx) { + found = true; + auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]); + *slot = offset + res.base; + } + else if (reloc_idx > idx) { + break; + } + } + assert(found && "Cannot find GOT entry for cloned function."); + (void)found; + } + + return res; } +template +static inline void check_cmdline(T &&cmdline, bool imaging) +{ + assert(cmdline.size() > 0); + // It's unclear what does specifying multiple target when not generating + // sysimg means. Make it an error for now. 
+ if (!imaging) { + if (cmdline.size() > 1) { + jl_error("More than one command line CPU targets specified " + "without a `--output-` flag specified"); + } + if (cmdline[0].en.flags & JL_TARGET_CLONE_ALL) { + jl_error("\"clone_all\" feature specified " + "without a `--output-` flag specified"); + } + } +} + +struct SysimgMatch { + uint32_t best_idx{(uint32_t)-1}; + int vreg_size{0}; +}; + +// Find the best match in the sysimg. +// Select the best one based on the largest vector register and largest compatible feature set. +template +static inline SysimgMatch match_sysimg_targets(S &&sysimg, T &&target, F &&max_vector_size) +{ + SysimgMatch match; + bool match_name = false; + int feature_size = 0; + for (uint32_t i = 0; i < sysimg.size(); i++) { + auto &imgt = sysimg[i]; + if (!(imgt.en.features & target.dis.features).empty()) { + // Check sysimg enabled features against runtime disabled features + // This is valid (and all what we can do) + // even if one or both of the targets are unknown. 
+ continue; + } + if (imgt.name == target.name) { + if (!match_name) { + match_name = true; + match.vreg_size = 0; + feature_size = 0; + } + } + else if (match_name) { + continue; + } + int new_vsz = max_vector_size(imgt.en.features); + if (match.vreg_size > new_vsz) + continue; + int new_feature_size = imgt.en.features.nbits(); + if (match.vreg_size < new_vsz) { + match.best_idx = i; + match.vreg_size = new_vsz; + feature_size = new_feature_size; + continue; + } + if (new_feature_size < feature_size) + continue; + match.best_idx = i; + feature_size = new_feature_size; + } + if (match.best_idx == (uint32_t)-1) + jl_error("Unable to find compatible target in system image."); + return match; +} + +// Debug helper + +template +static inline void dump_cpu_spec(uint32_t cpu, const FeatureList &features, + const FeatureName *feature_names, uint32_t nfeature_names, + const CPUSpec *cpus, uint32_t ncpus) +{ + bool cpu_found = false; + for (uint32_t i = 0;i < ncpus;i++) { + if (cpu == uint32_t(cpus[i].cpu)) { + cpu_found = true; + jl_safe_printf("CPU: %s\n", cpus[i].name); + break; + } + } + if (!cpu_found) + jl_safe_printf("CPU: generic\n"); + jl_safe_printf("Features:"); + bool first = true; + for (uint32_t i = 0;i < nfeature_names;i++) { + if (test_nbit(&features[0], feature_names[i].bit)) { + if (first) { + jl_safe_printf(" %s", feature_names[i].name); + first = false; + } + else { + jl_safe_printf(", %s", feature_names[i].name); + } + } + } + jl_safe_printf("\n"); +} + +} + +#if defined(_CPU_X86_) || defined(_CPU_X86_64_) + +#include "processor_x86.cpp" + #elif defined(_CPU_AARCH64_) +// TODO +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(jl_get_cpu_name_llvm().c_str()); +} + // FZ, bit [24] static const uint32_t fpcr_fz_mask = 1 << 24; @@ -123,12 +820,12 @@ static inline void set_fpcr_aarch64(uint32_t fpcr) asm volatile("msr fpcr, %0" :: "r"(fpcr)); } -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +extern "C" JL_DLLEXPORT int32_t 
jl_get_zero_subnormals(void) { return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; } -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) { uint32_t fpcr = get_fpcr_aarch64(); fpcr = isZero ? (fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); @@ -138,16 +835,30 @@ JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) #else -JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(jl_get_cpu_name_llvm().c_str()); +} + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + jl_safe_printf("CPU: generic\n"); + jl_safe_printf("Features:\n"); +} + +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) { return 0; } -JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) { return isZero; } #endif - -} diff --git a/src/processor.h b/src/processor.h new file mode 100644 index 0000000000000..7b43aaca8a750 --- /dev/null +++ b/src/processor.h @@ -0,0 +1,200 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "support/dtypes.h" + +#include "julia.h" + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Related sysimg exported symbols + * + * In the following text function refer to an abstract identity. + * It corresponds to a `Function` that we emit in the codegen and there might be multiple copy + * of it in the system image. Only one of those copy will be used in a given session. + * Function pointers refer to a real piece of code in the system image. + * Each function might have multiple function pointers in the system image + * and each function pointer will correspond to only one function. 
+ * + * # Global function and base pointers + * `jl_sysimg_gvars_base`: + * The address of this symbol is the base data pointer + * (all other data pointers are stored as offsets to this address) + * `jl_sysimg_fvars_base`: + * The address of this symbol is the base function pointer + * (all other function pointers are stored as offsets to this address) + * `jl_sysimg_fvars_offsets`: [static data] + * The array of function pointer offsets (`int32_t`) from the base pointer. + * This includes all julia functions in sysimg as well as all other functions that are cloned. + * The default function pointer is used if the function is cloned. + * The first element is the size of the array, which should **NOT** be used as the number + * of julia functions in the sysimg. + * Each entry in this array uniquely identifies a function which we are interested in + * (the function may have multiple function pointers corresponding to different versions). + * In other sysimg info, all information of functions are stored as function index which are + * `uint32_t` index in this array. + * + * # Target data and dispatch slots (Only needed by runtime during loading) + * `jl_dispatch_target_ids`: [static data] serialize target data. + * This contains the number of targets which is needed to decode `jl_dispatch_fvars_idxs` + * in additional to the name and feature set of each target. + * `jl_dispatch_reloc_slots`: [static data] location and index of relocation slots. + * Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`. + * The first element is an `uint32_t` giving the number of relocations. + * This is needed for functions whose address is used in a way that requires dispatch. + * We currently only support one type of relocation (i.e. absolute pointer) which is enough + * for all use in functions as well as global GOT slot (for "PLT" callback). + * Note that not all functions being cloned are assigned a slot. 
+ * This array is sorted by the function indices. + * There can be more than one slot per-function, + * i.e. there can be duplicated function indices. + * + * # Target functions + * `jl_dispatch_fvars_idxs`: [static data] Target specific functions indices. + * For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index + * of the base target followed by an array of tagged function indices. + * The base target index is required to be smaller than the index of the current target + * and must be the default (`0`) or a `clone_all` target. + * If it's not `0`, the function pointer array for the `clone_all` target will be used as + * the base function pointer offsets instead. + * The tag bits for both the length and the indices are the top bit. + * A tagged length indicates that all of the functions are cloned and the indices follows + * are the ones that requires relocation. The base target index is omitted in this case. + * Otherwise, the length is the total number of functions that we are interested in + * for this target, which includes all cloned julia functions and + * all other cloned functions that requires relocation. + * A tagged index means that the function pointer should be filled into the GOT slots + * identified by `jl_dispatch_reloc_slots`. There could be more than one slot per function. + * (Note that a tagged index could corresponds to a functions pointer that's the same as + * the base one since this is the only way we currently represent relocations.) + * A tagged length implicitly tags all the indices and the indices will not have the tag bit + * set. The lengths in this variable is needed to decode `jl_dispatch_fvars_offsets`. + * `jl_dispatch_fvars_offsets`: [static data] Target specific function pointer offsets. + * This contains all the cloned functions that we are interested and it needs to be decoded + * and used along with `jl_dispatch_fvars_idxs`. 
+ * For the default target, there's no entries in this variable, if there's any relocations + * needed for the default target, the function pointers are taken from the global offset + * arrays directly. + * For a `clone_all` target (i.e. with the length in `jl_dispatch_fvars_idxs` tagged), this + * variable contains an offset array the same length as the global one. Only the indices + * appeared in `jl_dispatch_fvars_idxs` needs relocation and the dispatch code should return + * this array as the original/base function offsets. + * For other targets, this variable contains an offset array with the length defined in + * `jl_dispatch_fvars_idxs`. Tagged indices needs relocations. + */ + +enum { + JL_TARGET_VEC_CALL = 1 << 0, + // Clone all functions + JL_TARGET_CLONE_ALL = 1 << 1, + // Clone when there's scalar math operations that can benefit from target specific + // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags. + JL_TARGET_CLONE_MATH = 1 << 2, + // Clone when the function has a loop + JL_TARGET_CLONE_LOOP = 1 << 3, + // Clone when the function uses any vectors + // When this is specified, the cloning pass should also record if any of the cloned functions + // used this in any function call (including the signature of the function itself) + JL_TARGET_CLONE_SIMD = 1 << 4, + // The CPU name is unknown + JL_TARGET_UNKNOWN_NAME = 1 << 5, +}; + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +typedef enum { +#define JL_FEATURE_DEF(name, bit, llvmver) JL_X86_##name = bit, +#include "features_x86.h" +#undef JL_FEATURE_DEF +} jl_cpu_feature_t; +#undef JL_FEATURE_DEF_NAME + +int jl_test_cpu_feature(jl_cpu_feature_t feature); + +static const uint32_t jl_sysimg_tag_mask = 0x80000000u; +static const uint32_t jl_sysimg_val_mask = ~((uint32_t)0x80000000u); + +typedef struct { + // base function pointer + const char *base; + // number of functions + uint32_t noffsets; + // function pointer offsets + const int32_t 
*offsets; + + // Following fields contains the information about the selected target. + // All of these fields are 0 if the selected targets have all the functions cloned. + // Instead the offsets are stored in `noffsets` and `offsets`. + + // number of cloned functions + uint32_t nclones; + // function pointer offsets of cloned functions + const int32_t *clone_offsets; + // sorted indices of the cloned functions (including the tag bit) + const uint32_t *clone_idxs; +} jl_sysimg_fptrs_t; + +/** + * Initialize the processor dispatch system with sysimg `hdl` (also initialize the sysimg itself). + * The dispatch system will find the best implementation to be used in this session. + * The decision will be based on the host CPU and features as well as the `cpu_target` + * option. This must be called before initializing JIT and should only be called once. + * An error will be raised if this is called more than once or none of the implementation + * supports the current system. + * + * Return the data about the function pointers selected. + */ +jl_sysimg_fptrs_t jl_init_processor_sysimg(void *hdl); + +// Return the name of the host CPU as a julia string. +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void); +// Dump the name and feature set of the host CPU +// For debugging only +JL_DLLEXPORT void jl_dump_host_cpu(void); + +#ifdef __cplusplus +} + +#include +#include +#include + +extern bool jl_processor_print_help; + +/** + * Returns the CPU name and feature string to be used by LLVM JIT. + * + * If the detected/specified CPU name is not available on the LLVM version specified, + * a fallback CPU name will be used. Unsupported features will be ignored. + */ +std::pair> jl_get_llvm_target(bool imaging, uint32_t &flags); + +/** + * Returns the CPU name and feature string to be used by LLVM disassembler. + * + * This will return a generic CPU name and a full feature string. 
+ */ +const std::pair &jl_get_llvm_disasm_target(void); + +struct jl_target_spec_t { + // LLVM target name + std::string cpu_name; + // LLVM feature string + std::string cpu_features; + // serialized identification data + std::vector data; + // Clone condition. + uint32_t flags; + // Base target index. + int base; +}; +/** + * Return the list of targets to clone + */ +std::vector jl_get_llvm_clone_targets(void); +std::string jl_get_cpu_name_llvm(void); +#endif diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp new file mode 100644 index 0000000000000..ac412d889ab13 --- /dev/null +++ b/src/processor_x86.cpp @@ -0,0 +1,961 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// X86 specific processor detection and dispatch + +// CPUID + +extern "C" JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) +{ +#if defined _MSC_VER + __cpuid(CPUInfo, InfoType); +#else + asm volatile ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType) + ); +#endif +} + +extern "C" JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType) +{ +#if defined _MSC_VER + __cpuidex(CPUInfo, InfoType, subInfoType); +#else + asm volatile ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType), + "c" (subInfoType) + ); +#endif +} + +namespace X86 { + +enum class CPU : uint32_t { + generic = 0, + intel_nocona, + intel_prescott, + intel_atom_bonnell, + intel_atom_silvermont, + intel_atom_goldmont, + intel_core2, + intel_core2_penryn, + intel_yonah, + intel_corei7_nehalem, + intel_corei7_westmere, + intel_corei7_sandybridge, + 
intel_corei7_ivybridge, + intel_corei7_haswell, + intel_corei7_broadwell, + intel_corei7_skylake, + intel_corei7_skylake_avx512, + intel_corei7_cannonlake, + intel_knights_landing, + + amd_fam10h, + amd_athlon_fx, + amd_athlon_64, + amd_athlon_64_sse3, + amd_bdver1, + amd_bdver2, + amd_bdver3, + amd_bdver4, + amd_btver1, + amd_btver2, + amd_k8, + amd_k8_sse3, + amd_opteron, + amd_opteron_sse3, + amd_barcelona, + amd_znver1, +}; + +static constexpr size_t feature_sz = 9; +static constexpr FeatureName feature_names[] = { +#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, +#include "features_x86.h" +#undef JL_FEATURE_DEF +#undef JL_FEATURE_DEF_NAME +}; +static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); + +template +static inline constexpr FeatureList get_feature_masks(Args... args) +{ + return ::get_feature_masks(args...); +} + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +static constexpr auto feature_masks = get_feature_masks( +#define JL_FEATURE_DEF(name, bit, llvmver) bit, +#include "features_x86.h" +#undef JL_FEATURE_DEF + -1); + +namespace Feature { +enum : uint32_t { +#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, +#include "features_x86.h" +#undef JL_FEATURE_DEF +}; +#undef JL_FEATURE_DEF_NAME +static constexpr FeatureDep deps[] = { + {ssse3, sse3}, + {fma, avx}, + {sse41, ssse3}, + {sse42, sse41}, + {avx, sse42}, + {f16c, avx}, + {avx2, avx}, + {avx512f, avx2}, + {avx512dq, avx512f}, + {avx512ifma, avx512f}, + {avx512pf, avx512f}, + {avx512er, avx512f}, + {avx512cd, avx512f}, + {avx512bw, avx512f}, + {avx512vl, avx512f}, + {avx512vbmi, avx512bw}, + {avx512vpopcntdq, avx512f}, + {sse4a, sse3}, + {xop, fma4}, + {fma4, avx}, + {fma4, sse4a} +}; + +// We require cx16 on 64bit by default. 
This can be overwritten with `-cx16` +// This isn't really compatible with 32bit but we mask it off there with required LLVM version +constexpr auto generic = get_feature_masks(cx16); +constexpr auto bonnell = get_feature_masks(sse3, ssse3, cx16, movbe, sahf); +constexpr auto silvermont = bonnell | get_feature_masks(sse41, sse42, popcnt, + pclmul, aes, prfchw); +constexpr auto goldmont = silvermont | get_feature_masks(mpx, sha, rdrnd, rdseed, xsave, + xsaveopt, xsavec, xsaves, clflushopt); +constexpr auto yonah = get_feature_masks(sse3); +constexpr auto prescott = yonah; +constexpr auto core2 = get_feature_masks(sse3, ssse3, cx16, sahf); +constexpr auto nocona = get_feature_masks(sse3, cx16); +constexpr auto penryn = nocona | get_feature_masks(ssse3, sse41, sahf); +constexpr auto nehalem = penryn | get_feature_masks(sse42, popcnt); +constexpr auto westmere = nehalem | get_feature_masks(aes, pclmul); +constexpr auto sandybridge = westmere | get_feature_masks(avx, xsave, xsaveopt); +constexpr auto ivybridge = sandybridge | get_feature_masks(rdrnd, f16c, fsgsbase); +constexpr auto haswell = ivybridge | get_feature_masks(avx2, bmi, bmi2, fma, lzcnt, movbe); +constexpr auto broadwell = haswell | get_feature_masks(adx, rdseed, prfchw); +constexpr auto skylake = broadwell | get_feature_masks(mpx, rtm, xsavec, xsaves, + clflushopt); // ignore sgx; hle +constexpr auto knl = broadwell | get_feature_masks(avx512f, avx512er, avx512cd, avx512pf, + prefetchwt1); +constexpr auto skx = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, + pku, clwb); +constexpr auto cannonlake = skx | get_feature_masks(avx512vbmi, avx512ifma, sha); + +constexpr auto k8_sse3 = get_feature_masks(sse3, cx16); +constexpr auto amdfam10 = k8_sse3 | get_feature_masks(sse4a, lzcnt, popcnt, sahf); + +constexpr auto btver1 = amdfam10 | get_feature_masks(ssse3, prfchw); +constexpr auto btver2 = btver1 | get_feature_masks(sse41, sse42, avx, aes, pclmul, bmi, f16c, + movbe, xsave, 
xsaveopt); + +constexpr auto bdver1 = amdfam10 | get_feature_masks(xop, fma4, avx, ssse3, sse41, sse42, aes, + prfchw, pclmul, xsave, lwp); +constexpr auto bdver2 = bdver1 | get_feature_masks(f16c, bmi, tbm, fma); +constexpr auto bdver3 = bdver2 | get_feature_masks(xsaveopt, fsgsbase); +constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx); + +constexpr auto znver1 = haswell | get_feature_masks(adx, clflushopt, clzero, mwaitx, prfchw, + rdseed, sha, sse4a, xsavec, xsaves); + +} + +static constexpr CPUSpec cpus[] = { + {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, + {"bonnell", CPU::intel_atom_bonnell, CPU::generic, 0, Feature::bonnell}, + {"silvermont", CPU::intel_atom_silvermont, CPU::generic, 0, Feature::silvermont}, + {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 50000, Feature::goldmont}, + {"core2", CPU::intel_core2, CPU::generic, 0, Feature::core2}, + {"yonah", CPU::intel_yonah, CPU::generic, 0, Feature::yonah}, + {"prescott", CPU::intel_prescott, CPU::generic, 0, Feature::prescott}, + {"nocona", CPU::intel_nocona, CPU::generic, 0, Feature::nocona}, + {"penryn", CPU::intel_core2_penryn, CPU::generic, 0, Feature::penryn}, + {"nehalem", CPU::intel_corei7_nehalem, CPU::generic, 0, Feature::nehalem}, + {"westmere", CPU::intel_corei7_westmere, CPU::generic, 0, Feature::westmere}, + {"sandybridge", CPU::intel_corei7_sandybridge, CPU::generic, 0, Feature::sandybridge}, + {"ivybridge", CPU::intel_corei7_ivybridge, CPU::generic, 0, Feature::ivybridge}, + {"haswell", CPU::intel_corei7_haswell, CPU::generic, 0, Feature::haswell}, + {"broadwell", CPU::intel_corei7_broadwell, CPU::generic, 0, Feature::broadwell}, + {"skylake", CPU::intel_corei7_skylake, CPU::generic, 0, Feature::skylake}, + {"knl", CPU::intel_knights_landing, CPU::generic, 0, Feature::knl}, + {"skylake-avx512", CPU::intel_corei7_skylake_avx512, CPU::generic, 0, Feature::skx}, + {"cannonlake", CPU::intel_corei7_cannonlake, CPU::intel_corei7_skylake_avx512, 40000, + 
Feature::cannonlake}, + + {"athlon64", CPU::amd_athlon_64, CPU::generic, 0, Feature::generic}, + {"athlon-fx", CPU::amd_athlon_fx, CPU::generic, 0, Feature::generic}, + {"k8", CPU::amd_k8, CPU::generic, 0, Feature::generic}, + {"opteron", CPU::amd_opteron, CPU::generic, 0, Feature::generic}, + + {"athlon64-sse3", CPU::amd_athlon_64_sse3, CPU::generic, 0, Feature::k8_sse3}, + {"k8-sse3", CPU::amd_k8_sse3, CPU::generic, 0, Feature::k8_sse3}, + {"opteron-sse3", CPU::amd_opteron_sse3, CPU::generic, 0, Feature::k8_sse3}, + + {"amdfam10", CPU::amd_fam10h, CPU::generic, 0, Feature::amdfam10}, + {"barcelona", CPU::amd_barcelona, CPU::generic, 0, Feature::amdfam10}, + + {"btver1", CPU::amd_btver1, CPU::generic, 0, Feature::btver1}, + {"btver2", CPU::amd_btver2, CPU::generic, 0, Feature::btver2}, + + {"bdver1", CPU::amd_bdver1, CPU::generic, 0, Feature::bdver1}, + {"bdver2", CPU::amd_bdver2, CPU::generic, 0, Feature::bdver2}, + {"bdver3", CPU::amd_bdver3, CPU::generic, 0, Feature::bdver3}, + {"bdver4", CPU::amd_bdver4, CPU::generic, 0, Feature::bdver4}, + + {"znver1", CPU::amd_znver1, CPU::generic, 0, Feature::znver1}, +}; +static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); + +// For CPU model and feature detection on X86 + +const int SIG_INTEL = 0x756e6547; // Genu +const int SIG_AMD = 0x68747541; // Auth + +static uint64_t get_xcr0(void) +{ +#if defined _MSC_VER + return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +#else + uint32_t eax, edx; + asm volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + return (uint64_t(edx) << 32) | eax; +#endif +} + +static CPU get_intel_processor_name(uint32_t family, uint32_t model, uint32_t brand_id, + const uint32_t *features) +{ + if (brand_id != 0) + return CPU::generic; + switch (family) { + case 3: + case 4: + case 5: + return CPU::generic; + case 6: + switch (model) { + case 0x01: // Pentium Pro processor + case 0x03: // Intel Pentium II OverDrive processor, Pentium II processor, model 03 + case 0x05: // Pentium II 
processor, model 05, Pentium II Xeon processor, + // model 05, and Intel Celeron processor, model 05 + case 0x06: // Celeron processor, model 06 + case 0x07: // Pentium III processor, model 07, and Pentium III Xeon processor, model 07 + case 0x08: // Pentium III processor, model 08, Pentium III Xeon processor, + // model 08, and Celeron processor, model 08 + case 0x0a: // Pentium III Xeon processor, model 0Ah + case 0x0b: // Pentium III processor, model 0Bh + case 0x09: // Intel Pentium M processor, Intel Celeron M processor model 09. + case 0x0d: // Intel Pentium M processor, Intel Celeron M processor, model + // 0Dh. All processors are manufactured using the 90 nm process. + case 0x15: // Intel EP80579 Integrated Processor and Intel EP80579 + // Integrated Processor with Intel QuickAssist Technology + return CPU::generic; + case 0x0e: // Intel Core Duo processor, Intel Core Solo processor, model + // 0Eh. All processors are manufactured using the 65 nm process. + return CPU::intel_yonah; + case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile + // processor, Intel Core 2 Quad processor, Intel Core 2 Quad + // mobile processor, Intel Core 2 Extreme processor, Intel + // Pentium Dual-Core processor, Intel Xeon processor, model + // 0Fh. All processors are manufactured using the 65 nm process. + case 0x16: // Intel Celeron processor model 16h. All processors are + // manufactured using the 65 nm process + return CPU::intel_core2; + case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model + // 17h. All processors are manufactured using the 45 nm process. + // + // 45nm: Penryn , Wolfdale, Yorkfield (XE) + case 0x1d: // Intel Xeon processor MP. All processors are manufactured using + // the 45 nm process. + return CPU::intel_core2_penryn; + case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 45 nm process. + case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. 
+ // As found in a Summer 2010 model iMac. + case 0x1f: + case 0x2e: // Nehalem EX + return CPU::intel_corei7_nehalem; + case 0x25: // Intel Core i7, laptop version. + case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 32 nm process. + case 0x2f: // Westmere EX + return CPU::intel_corei7_westmere; + case 0x2a: // Intel Core i7 processor. All processors are manufactured + // using the 32 nm process. + case 0x2d: + return CPU::intel_corei7_sandybridge; + case 0x3a: + case 0x3e: // Ivy Bridge EP + return CPU::intel_corei7_ivybridge; + + // Haswell: + case 0x3c: + case 0x3f: + case 0x45: + case 0x46: + return CPU::intel_corei7_haswell; + + // Broadwell: + case 0x3d: + case 0x47: + case 0x4f: + case 0x56: + return CPU::intel_corei7_broadwell; + + // Skylake: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop + return CPU::intel_corei7_skylake; + + // Skylake Xeon: + case 0x55: + return CPU::intel_corei7_skylake; + + case 0x1c: // Most 45 nm Intel Atom processors + case 0x26: // 45 nm Atom Lincroft + case 0x27: // 32 nm Atom Medfield + case 0x35: // 32 nm Atom Midview + case 0x36: // 32 nm Atom Midview + return CPU::intel_atom_bonnell; + + // Atom Silvermont codes from the Intel software optimization guide. + case 0x37: + case 0x4a: + case 0x4d: + case 0x5a: + case 0x5d: + case 0x4c: // really airmont + return CPU::intel_atom_silvermont; + + // Goldmont: + case 0x5c: + case 0x5f: + return CPU::intel_atom_goldmont; + + case 0x57: + return CPU::intel_knights_landing; + + default: + return CPU::generic; + } + break; + case 15: { + switch (model) { + case 0: // Pentium 4 processor, Intel Xeon processor. All processors are + // model 00h and manufactured using the 0.18 micron process. + case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon + // processor MP, and Intel Celeron processor. 
All processors are + // model 01h and manufactured using the 0.18 micron process. + case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M, + // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron + // processor, and Mobile Intel Celeron processor. All processors + // are model 02h and manufactured using the 0.13 micron process. + default: + return CPU::generic; + + case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D + // processor. All processors are model 03h and manufactured using + // the 90 nm process. + case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition, + // Pentium D processor, Intel Xeon processor, Intel Xeon + // processor MP, Intel Celeron D processor. All processors are + // model 04h and manufactured using the 90 nm process. + case 6: // Pentium 4 processor, Pentium D processor, Pentium processor + // Extreme Edition, Intel Xeon processor, Intel Xeon processor + // MP, Intel Celeron D processor. All processors are model 06h + // and manufactured using the 65 nm process. 
+#ifdef _CPU_X86_64_ + return CPU::intel_nocona; +#else + return CPU::intel_prescott; +#endif + } + } + default: + break; /*"generic"*/ + } + return CPU::generic; +} + +static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_t *features) +{ + switch (family) { + case 4: + case 5: + case 6: + default: + return CPU::generic; + case 15: + if (test_nbit(features, Feature::sse3)) + return CPU::amd_k8_sse3; + switch (model) { + case 1: + return CPU::amd_opteron; + case 5: + return CPU::amd_athlon_fx; + default: + return CPU::amd_athlon_64; + } + case 16: + switch (model) { + case 2: + return CPU::amd_barcelona; + case 4: + case 8: + default: + return CPU::amd_fam10h; + } + case 20: + return CPU::amd_btver1; + case 21: + if (!test_nbit(features, Feature::avx)) + return CPU::amd_btver1; + if (model >= 0x50 && model <= 0x6f) + return CPU::amd_bdver4; + if (model >= 0x30 && model <= 0x3f) + return CPU::amd_bdver3; + if (model >= 0x10 && model <= 0x1f) + return CPU::amd_bdver2; + if (model <= 0x0f) + return CPU::amd_bdver1; + return CPU::amd_btver1; // fallback + case 22: + if (!test_nbit(features, Feature::avx)) + return CPU::amd_btver1; + return CPU::amd_btver2; + case 23: + if (test_nbit(features, Feature::adx)) + return CPU::amd_znver1; + return CPU::amd_btver1; + } +} + +template +static inline void features_disable_avx512(T &features) +{ + using namespace Feature; + unset_bits(features, avx512f, avx512dq, avx512ifma, avx512pf, avx512er, avx512cd, + avx512bw, avx512vl, avx512vbmi); +} + +template +static inline void features_disable_avx(T &features) +{ + using namespace Feature; + unset_bits(features, avx, Feature::fma, f16c, xsave, avx2, xop, fma4, + xsaveopt, xsavec, xsaves); +} + +static inline const std::pair> &get_host_cpu() +{ + static const auto host_cpu = [] () NOINLINE { + FeatureList features = {}; + + int32_t info0[4]; + jl_cpuid(info0, 0); + uint32_t maxleaf = info0[0]; + if (maxleaf < 1) + return std::make_pair(uint32_t(CPU::generic), 
features); + int32_t info1[4]; + jl_cpuid(info1, 1); + + auto vendor = info0[1]; + auto brand_id = info1[1] & 0xff; + + auto family = (info1[0] >> 8) & 0xf; // Bits 8 - 11 + auto model = (info1[0] >> 4) & 0xf; // Bits 4 - 7 + if (family == 6 || family == 0xf) { + if (family == 0xf) + // Examine extended family ID if family ID is F. + family += (info1[0] >> 20) & 0xff; // Bits 20 - 27 + // Examine extended model ID if family ID is 6 or F. + model += ((info1[0] >> 16) & 0xf) << 4; // Bits 16 - 19 + } + + // Fill in the features + features[0] = info1[2]; + features[1] = info1[3]; + if (maxleaf >= 7) { + int32_t info7[4]; + jl_cpuidex(info7, 7, 0); + features[2] = info7[1]; + features[3] = info7[2]; + features[4] = info7[3]; + } + int32_t infoex0[4]; + jl_cpuid(infoex0, 0x80000000); + uint32_t maxexleaf = infoex0[0]; + if (maxexleaf >= 0x80000001) { + int32_t infoex1[4]; + jl_cpuid(infoex1, 0x80000001); + features[5] = infoex1[2]; + features[6] = infoex1[3]; + } + if (maxleaf >= 0xd) { + int32_t infod[4]; + jl_cpuidex(infod, 0xd, 0x1); + features[7] = infod[0]; + } + if (maxexleaf >= 0x80000008) { + int32_t infoex8[4]; + jl_cpuidex(infoex8, 0x80000008, 0); + features[8] = infoex8[1]; + } + + // Fix up AVX bits to account for OS support and match LLVM model + uint64_t xcr0 = 0; + const uint32_t avx_mask = (1 << 27) | (1 << 28); + bool hasavx = test_all_bits(features[0], avx_mask); + if (hasavx) { + xcr0 = get_xcr0(); + hasavx = test_all_bits(xcr0, 0x6); + } + unset_bits(features, 32 + 27); + if (!hasavx) + features_disable_avx(features); + bool hasavx512save = hasavx && test_all_bits(xcr0, 0xe0); + if (!hasavx512save) + features_disable_avx512(features); + // Ignore feature bits that we are not interested in. 
+ mask_features(feature_masks, &features[0]); + + uint32_t cpu; + if (vendor == SIG_INTEL) { + cpu = uint32_t(get_intel_processor_name(family, model, brand_id, &features[0])); + } + else if (vendor == SIG_AMD) { + cpu = uint32_t(get_amd_processor_name(family, model, &features[0])); + } + else { + cpu = uint32_t(CPU::generic); + } + + return std::make_pair(cpu, features); + }(); + return host_cpu; +} + +static inline const CPUSpec *find_cpu(uint32_t cpu) +{ + return ::find_cpu(cpu, cpus, ncpu_names); +} + +static inline const CPUSpec *find_cpu(llvm::StringRef name) +{ + return ::find_cpu(name, cpus, ncpu_names); +} + +static inline const char *find_cpu_name(uint32_t cpu) +{ + return ::find_cpu_name(cpu, cpus, ncpu_names); +} + +static inline const std::string &host_cpu_name() +{ + static std::string name = + (CPU)get_host_cpu().first != CPU::generic ? + std::string(find_cpu_name(get_host_cpu().first)) : + jl_get_cpu_name_llvm(); + return name; +} + +static inline const char *normalize_cpu_name(llvm::StringRef name) +{ + if (name == "atom") + return "bonnell"; + if (name == "slm") + return "silvermont"; + if (name == "glm") + return "goldmont"; + if (name == "corei7") + return "nehalem"; + if (name == "corei7-avx") + return "sandybridge"; + if (name == "core-avx-i") + return "ivybridge"; + if (name == "core-avx2") + return "haswell"; + if (name == "skx") + return "skylake-avx512"; +#ifdef _CPU_X86_ + // i686 isn't a supported target but it's a common default one so just make it mean pentium4. 
+ if (name == "pentium4" || name == "i686") + return "generic"; +#else + if (name == "x86-64" || name == "x86_64") + return "generic"; +#endif + return nullptr; +} + +template +static inline void enable_depends(FeatureList &features) +{ + ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); +} + +template +static inline void disable_depends(FeatureList &features) +{ + ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); +} + +static const std::vector> &get_cmdline_targets(void) +{ + auto feature_cb = [] (const char *str, size_t len, FeatureList &list) { + auto fbit = find_feature_bit(feature_names, nfeature_names, str, len); + if (fbit == (uint32_t)-1) + return false; + set_bit(list, fbit, true); + return true; + }; + auto &targets = ::get_cmdline_targets(feature_cb); + for (auto &t: targets) { + if (auto nname = normalize_cpu_name(t.name)) { + t.name = nname; + } + } + return targets; +} + +static std::vector> jit_targets; + +static TargetData arg_target_data(const TargetData &arg, bool require_host) +{ + TargetData res = arg; + const FeatureList *cpu_features = nullptr; + if (res.name == "native") { + res.name = host_cpu_name(); + cpu_features = &get_host_cpu().second; + } + else if (auto spec = find_cpu(res.name)) { + cpu_features = &spec->features; + } + else { + res.en.flags |= JL_TARGET_UNKNOWN_NAME; + } + if (cpu_features) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] |= (*cpu_features)[i]; + } + } + enable_depends(res.en.features); + for (size_t i = 0; i < feature_sz; i++) + res.en.features[i] &= ~res.dis.features[i]; + if (require_host) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] &= get_host_cpu().second[i]; + } + } + disable_depends(res.en.features); + if (cpu_features) { + // If the base feature is known, fill in the disable features + for (size_t i = 0; i < feature_sz; i++) { + res.dis.features[i] = feature_masks[i] & ~res.en.features[i]; + }
+ } + return res; +} + +static int max_vector_size(const FeatureList &features) +{ + if (test_nbit(features, Feature::avx512f)) + return 64; + if (test_nbit(features, Feature::avx)) + return 32; + // SSE is required + return 16; +} + +static uint32_t sysimg_init_cb(const void *id) +{ + // First see what target is requested for the JIT. + auto &cmdline = get_cmdline_targets(); + TargetData target = arg_target_data(cmdline[0], true); + // Then find the best match in the sysimg + auto sysimg = deserialize_target_data((const uint8_t*)id); + // We translate `generic` to `pentium4` or `x86-64` before sending it to LLVM + // (see `get_llvm_target_noext`) which will be serialized into the sysimg target data. + // Translate them back so we can actually match them. + for (auto &t: sysimg) { + if (auto nname = normalize_cpu_name(t.name)) { + t.name = nname; + } + } + auto match = match_sysimg_targets(sysimg, target, max_vector_size); + // Now we've decided on which sysimg version to use. + // Make sure the JIT target is compatible with it and save the JIT target. + if (match.vreg_size != max_vector_size(target.en.features) && + (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) { + if (match.vreg_size < 64) { + features_disable_avx512(target.en.features); + } + if (match.vreg_size < 32) { + features_disable_avx(target.en.features); + } + } + jit_targets.push_back(std::move(target)); + return match.best_idx; +} + +static void ensure_jit_target(bool imaging) +{ + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, imaging); + if (!jit_targets.empty()) + return; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, jit_targets.empty()); + jit_targets.push_back(std::move(data)); + } + auto ntargets = jit_targets.size(); + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = jit_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + // The most useful one in general... 
+ t.en.flags |= JL_TARGET_CLONE_LOOP; + auto &features0 = jit_targets[t.base].en.features; + // Special case for KNL since it's so different + if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { + if (t.name == "knl" && jit_targets[t.base].name != "knl") { + t.en.flags |= JL_TARGET_CLONE_ALL; + break; + } + } + static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4}; + static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, + Feature::sse41, Feature::sse42, + Feature::avx, Feature::avx2, + Feature::sse4a, Feature::avx512f, + Feature::avx512dq, Feature::avx512ifma, + Feature::avx512pf, Feature::avx512er, + Feature::avx512cd, Feature::avx512bw, + Feature::avx512vl, Feature::avx512vbmi, + Feature::avx512vpopcntdq}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } + } +} + +static std::pair> +get_llvm_target_noext(const TargetData &data) +{ + std::string name = data.name; + auto *spec = find_cpu(name); + while (spec) { + if (spec->llvmver <= JL_LLVM_VERSION) + break; + spec = find_cpu((uint32_t)spec->fallback); + name = spec->name; + } + if (name == "generic") { + // Use translate `generic` into what we actually require +#ifdef _CPU_X86_ + name = "pentium4"; +#else + name = "x86-64"; +#endif + } + std::vector features; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) + continue; + if (test_nbit(data.en.features, fename.bit)) { + features.insert(features.begin(), std::string("+") + fename.name); + } + else if (test_nbit(data.dis.features, fename.bit)) { + features.push_back(std::string("-") + fename.name); + } + } + features.push_back("+sse2"); + features.push_back("+mmx"); + features.push_back("+fxsr"); +#if JL_LLVM_VERSION < 50000 +# ifdef _CPU_X86_ + // LLVM has 
bug on < 5.0 when using avx in 32bit mode. + features.push_back("-avx"); +# endif + // Scatter-gather can't handle address space on < 5.0 + // This is a base requirement for AVX512 so we have to turn all AVX512 features off + // Gather is available in AVX2 too but fortunately LLVM doesn't use them. + features.push_back("-avx512f"); + features.push_back("-avx512dq"); +#endif + return std::make_pair(std::move(name), std::move(features)); +} + +static std::pair> +get_llvm_target_vec(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + append_ext_features(res0.second, data.ext_features); + return res0; +} + +static std::pair +get_llvm_target_str(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + auto features = join_feature_strs(res0.second); + append_ext_features(features, data.ext_features); + return std::make_pair(std::move(res0.first), std::move(features)); +} + +} + +using namespace X86; + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names, + cpus, ncpu_names); +} + +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(host_cpu_name().c_str()); +} + +jl_sysimg_fptrs_t jl_init_processor_sysimg(void *hdl) +{ + if (!jit_targets.empty()) + jl_error("JIT targets already initialized"); + return parse_sysimg(hdl, sysimg_init_cb); +} + +std::pair> jl_get_llvm_target(bool imaging, uint32_t &flags) +{ + ensure_jit_target(imaging); + flags = jit_targets[0].en.flags; + return get_llvm_target_vec(jit_targets[0]); +} + +const std::pair &jl_get_llvm_disasm_target(void) +{ + static const auto res = get_llvm_target_str(TargetData{"generic", "", + {feature_masks, 0}, {{}, 0}, 0}); + return res; +} + +std::vector jl_get_llvm_clone_targets(void) +{ + if (jit_targets.empty()) + jl_error("JIT targets not initialized"); + std::vector res; + for (auto &target: jit_targets) { + auto features_en = target.en.features; + auto features_dis =
target.dis.features; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) { + unset_bits(features_en, fename.bit); + unset_bits(features_dis, fename.bit); + } + } + X86::disable_depends(features_en); + jl_target_spec_t ele; + std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); + ele.data = serialize_target_data(target.name, features_en, features_dis, + target.ext_features); + ele.flags = target.en.flags; + ele.base = target.base; + res.push_back(ele); + } + return res; +} + +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) +{ + if (feature >= 32 * feature_sz) + return 0; + return test_nbit(&get_host_cpu().second[0], feature); +} + +// -- set/clear the FZ/DAZ flags on x86 & x86-64 -- + +// Cache of information recovered from `cpuid` since executing `cpuid` at runtime is slow. +static uint32_t subnormal_flags = [] { + int32_t info[4]; + jl_cpuid(info, 0); + if (info[0] >= 1) { + jl_cpuid(info, 1); + if (info[3] & (1 << 26)) { + // SSE2 supports both FZ and DAZ + return 0x00008040; + } + else if (info[3] & (1 << 25)) { + // SSE supports only the FZ flag + return 0x00008000; + } + } + return 0; +}(); + +// Returns non-zero if subnormals go to 0; zero otherwise. +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return _mm_getcsr() & subnormal_flags; +} + +// Return zero on success, non-zero on failure. +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t flags = subnormal_flags; + if (flags) { + uint32_t state = _mm_getcsr(); + if (isZero) + state |= flags; + else + state &= ~flags; + _mm_setcsr(state); + return 0; + } + else { + // Report a failure only if user is trying to enable FTZ/DAZ.
+ return isZero; + } +} diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp index 49a0640943bca..bbc5d34b1ba55 100644 --- a/src/runtime_ccall.cpp +++ b/src/runtime_ccall.cpp @@ -7,6 +7,7 @@ #include #include "julia.h" #include "julia_internal.h" +#include "processor.h" #include "julia_assert.h" using namespace llvm; @@ -173,11 +174,41 @@ void *jl_load_and_lookup(const char *f_lib, const char *f_name, void **hnd) } // miscellany -extern "C" JL_DLLEXPORT -jl_value_t *jl_get_cpu_name(void) +std::string jl_get_cpu_name_llvm(void) +{ + return llvm::sys::getHostCPUName().str(); +} + +std::string jl_get_cpu_features_llvm(void) { - StringRef HostCPUName = llvm::sys::getHostCPUName(); - return jl_pchar_to_string(HostCPUName.data(), HostCPUName.size()); + StringMap HostFeatures; + llvm::sys::getHostCPUFeatures(HostFeatures); + std::string attr; + for (auto &ele: HostFeatures) { + if (ele.getValue()) { + if (!attr.empty()) { + attr.append(",+"); + } + else { + attr.append("+"); + } + attr.append(ele.getKey().str()); + } + } + // Explicitly disabled features need to be added at the end so that + // they are not reenabled by other features that implies them by default. + for (auto &ele: HostFeatures) { + if (!ele.getValue()) { + if (!attr.empty()) { + attr.append(",-"); + } + else { + attr.append("-"); + } + attr.append(ele.getKey().str()); + } + } + return attr; } extern "C" JL_DLLEXPORT From c3d75daad4e5ab4aa962bd8e66cdf3abbcdb4dea Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sun, 28 May 2017 11:34:13 -0400 Subject: [PATCH 4/9] Add AArch32 and AArch64 CPU name and feature detection code Also implement internal runtime API. The detection code avoid using `/proc/cpuinfo` whenever possible and should be much more reliable than the one in LLVM. It also contains a much larger CPUID table to decode CPU names. Compare to X86, the feature encoding/decoding is more complex due to the way LLVM takes attributes. 
Certain information (arch version) also needs to be moved between name and feature list. --- src/features_aarch32.h | 28 + src/features_aarch64.h | 25 + src/processor.cpp | 36 +- src/processor.h | 6 + src/processor_arm.cpp | 1443 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1504 insertions(+), 34 deletions(-) create mode 100644 src/features_aarch32.h create mode 100644 src/features_aarch64.h create mode 100644 src/processor_arm.cpp diff --git a/src/features_aarch32.h b/src/features_aarch32.h new file mode 100644 index 0000000000000..803d576c61548 --- /dev/null +++ b/src/features_aarch32.h @@ -0,0 +1,28 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// AArch32 features definition +// hwcap +JL_FEATURE_DEF(neon, 12, 0) +JL_FEATURE_DEF(vfp3, 13, 0) +// JL_FEATURE_DEF(vfpv3d16, 14, 0) // d16 +JL_FEATURE_DEF(vfp4, 16, 0) +JL_FEATURE_DEF_NAME(hwdiv_arm, 17, 0, "hwdiv-arm") +JL_FEATURE_DEF(hwdiv, 18, 0) +JL_FEATURE_DEF(d32, 19, 0) // -d16 + +// hwcap2 +JL_FEATURE_DEF(crypto, 32 + 0, 0) +JL_FEATURE_DEF(crc, 32 + 4, 0) +// JL_FEATURE_DEF(ras, 32 + ???, 0) +// JL_FEATURE_DEF(fullfp16, 32 + ???, 0) + +// custom bits to match llvm model +JL_FEATURE_DEF(aclass, 32 * 2 + 0, 0) +JL_FEATURE_DEF(rclass, 32 * 2 + 1, 0) +JL_FEATURE_DEF(mclass, 32 * 2 + 2, 0) +JL_FEATURE_DEF(v7, 32 * 2 + 3, 0) +JL_FEATURE_DEF(v8, 32 * 2 + 4, 0) +JL_FEATURE_DEF(v8_1a, 32 * 2 + 5, 0) +JL_FEATURE_DEF(v8_2a, 32 * 2 + 6, 0) +JL_FEATURE_DEF(v8_3a, 32 * 2 + 7, 60000) +JL_FEATURE_DEF(v8_m_main, 32 * 2 + 8, 0) diff --git a/src/features_aarch64.h b/src/features_aarch64.h new file mode 100644 index 0000000000000..1cb869f06c4f0 --- /dev/null +++ b/src/features_aarch64.h @@ -0,0 +1,25 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +// AArch64 features definition +// hwcap +JL_FEATURE_DEF(crypto, 3, 0) +JL_FEATURE_DEF(crc, 7, 0) +JL_FEATURE_DEF(lse, 8, 40000) // ARMv8.1-Atomics +JL_FEATURE_DEF(fullfp16, 9, 0) +JL_FEATURE_DEF(rdm, 12, 50000) // ARMv8.1-SIMD +JL_FEATURE_DEF(jscvt, 13, UINT32_MAX) // Linux Kernel HWCAP name +JL_FEATURE_DEF(fcma, 14, UINT32_MAX) // Linux Kernel HWCAP name +JL_FEATURE_DEF(rcpc, 15, 60000) +JL_FEATURE_DEF(dcpop, 16, UINT32_MAX) // Linux Kernel HWCAP name +// JL_FEATURE_DEF(dotprod, ???, 60000) // ARMv8.2-DotProd +// JL_FEATURE_DEF(ras, ???, 0) +// JL_FEATURE_DEF(sve, ???, UINT32_MAX) + +// hwcap2 +// JL_FEATURE_DEF(?, 32 + ?, 0) + +// custom bits to match llvm model +JL_FEATURE_DEF(v8_1a, 32 * 2 + 0, 0) +JL_FEATURE_DEF(v8_2a, 32 * 2 + 1, 0) +JL_FEATURE_DEF(v8_3a, 32 * 2 + 2, 60000) +// JL_FEATURE_DEF(v8_4a, 32 * 2 + 3, ???) diff --git a/src/processor.cpp b/src/processor.cpp index f08fef5db4c79..ba5072efadf6a 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -797,41 +797,9 @@ static inline void dump_cpu_spec(uint32_t cpu, const FeatureList &features, #include "processor_x86.cpp" -#elif defined(_CPU_AARCH64_) +#elif defined(_CPU_AARCH64_) || defined(_CPU_ARM_) -// TODO -JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) -{ - return jl_cstr_to_string(jl_get_cpu_name_llvm().c_str()); -} - -// FZ, bit [24] -static const uint32_t fpcr_fz_mask = 1 << 24; - -static inline uint32_t get_fpcr_aarch64(void) -{ - uint32_t fpcr; - asm volatile("mrs %0, fpcr" : "=r"(fpcr)); - return fpcr; -} - -static inline void set_fpcr_aarch64(uint32_t fpcr) -{ - asm volatile("msr fpcr, %0" :: "r"(fpcr)); -} - -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint32_t fpcr = get_fpcr_aarch64(); - fpcr = isZero ? 
(fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); - set_fpcr_aarch64(fpcr); - return 0; -} +#include "processor_arm.cpp" #else diff --git a/src/processor.h b/src/processor.h index 7b43aaca8a750..66d2b135b3291 100644 --- a/src/processor.h +++ b/src/processor.h @@ -110,6 +110,12 @@ typedef enum { #define JL_FEATURE_DEF(name, bit, llvmver) JL_X86_##name = bit, #include "features_x86.h" #undef JL_FEATURE_DEF +#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch32_##name = bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch64_##name = bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF } jl_cpu_feature_t; #undef JL_FEATURE_DEF_NAME diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp new file mode 100644 index 0000000000000..5f8f3fae1a3da --- /dev/null +++ b/src/processor_arm.cpp @@ -0,0 +1,1443 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// ARM (AArch32/AArch64) specific processor detection and dispatch + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_CPU_AARCH64_) || __GLIBC_PREREQ(2, 16) +# include +#else +# define DYN_GETAUXVAL +#endif + +namespace ARM { +enum class CPU : uint32_t { + generic = 0, + + // Architecture targets + armv7_a, + armv7_m, + armv7e_m, + armv7_r, + armv8_a, + armv8_m_base, + armv8_m_main, + armv8_r, + armv8_1_a, + armv8_2_a, + armv8_3_a, + // armv8_4_a, + + // ARM + // armv6l + arm_mpcore, + arm_1136jf_s, + arm_1156t2f_s, + arm_1176jzf_s, + arm_cortex_m0, + arm_cortex_m1, + // armv7ml + arm_cortex_m3, + arm_cortex_m4, + arm_cortex_m7, + // armv7l + arm_cortex_a5, + arm_cortex_a7, + arm_cortex_a8, + arm_cortex_a9, + arm_cortex_a12, + arm_cortex_a15, + arm_cortex_a17, + arm_cortex_r4, + arm_cortex_r5, + arm_cortex_r7, + arm_cortex_r8, + // armv8ml + arm_cortex_m23, + arm_cortex_m33, + // armv8l + arm_cortex_a32, + arm_cortex_r52, + // aarch64 + arm_cortex_a35, + arm_cortex_a53, + arm_cortex_a55, + 
arm_cortex_a57, + arm_cortex_a72, + arm_cortex_a73, + arm_cortex_a75, + + // Cavium + // aarch64 + cavium_thunderx, + cavium_thunderx88, + cavium_thunderx88p1, + cavium_thunderx81, + cavium_thunderx83, + cavium_thunderx2t99, + cavium_thunderx2t99p1, + + // NVIDIA + // aarch64 + nvidia_denver1, + nvidia_denver2, + + // AppliedMicro + // aarch64 + apm_xgene1, + apm_xgene2, + apm_xgene3, + + // Qualcomm + // armv7l + qualcomm_scorpion, + qualcomm_krait, + // aarch64 + qualcomm_kyro, + qualcomm_falkor, + qualcomm_saphira, + + // Samsung + // aarch64 + samsung_exynos_m1, + samsung_exynos_m2, + samsung_exynos_m3, + + // Apple + // armv7l + apple_swift, + // aarch64 + apple_cyclone, + apple_typhoon, + apple_twister, + apple_hurricane, + + // Marvell + // armv7l + marvell_pj4, + + // Intel + // armv7l + intel_3735d, +}; + +#ifdef _CPU_AARCH64_ +static constexpr size_t feature_sz = 3; +static constexpr FeatureName feature_names[] = { +#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF +#undef JL_FEATURE_DEF_NAME +}; +static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); + +template +static inline constexpr FeatureList get_feature_masks(Args... args) +{ + return ::get_feature_masks(args...); +} + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +static constexpr auto feature_masks = get_feature_masks( +#define JL_FEATURE_DEF(name, bit, llvmver) bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF + -1); +static const auto real_feature_masks = + feature_masks & FeatureList{{(uint32_t)-1, (uint32_t)-1, 0}}; + +namespace Feature { +enum : uint32_t { +#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF +}; +#undef JL_FEATURE_DEF_NAME +// This does not cover all dependencies (e.g. 
the ones that depends on arm versions) +static constexpr FeatureDep deps[] = { + {0, 0} // dummy +}; + +constexpr auto generic = get_feature_masks(); +constexpr auto armv8a_crc = get_feature_masks(crc); +constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto); +constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a, lse, rdm); // lor, hpd +constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a); // ras +constexpr auto armv8_2a_crypto = armv8_2a | get_feature_masks(crypto); +constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a, rcpc); +constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto); + +constexpr auto arm_cortex_a32 = generic; // TODO? (crc, crypto) +constexpr auto arm_cortex_a35 = generic; // TODO? (crc, crypto) +constexpr auto arm_cortex_a53 = armv8a_crc; +constexpr auto arm_cortex_a55 = armv8_2a_crypto | get_feature_masks(rcpc); // dotprod; +constexpr auto arm_cortex_a57 = armv8a_crc; +constexpr auto arm_cortex_a72 = armv8a_crc; +constexpr auto arm_cortex_a73 = armv8a_crc; +constexpr auto arm_cortex_a75 = armv8_2a_crypto | get_feature_masks(rcpc); // dotprod; +constexpr auto cavium_thunderx = armv8a_crc_crypto; +constexpr auto cavium_thunderx88 = armv8a_crc_crypto; +constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto; +constexpr auto cavium_thunderx81 = armv8a_crc_crypto; +constexpr auto cavium_thunderx83 = armv8a_crc_crypto; +constexpr auto cavium_thunderx2t99 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto cavium_thunderx2t99p1 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto nvidia_denver1 = generic; // TODO? (crc, crypto) +constexpr auto nvidia_denver2 = armv8a_crc_crypto; +constexpr auto apm_xgene1 = generic; +constexpr auto apm_xgene2 = generic; // TODO? +constexpr auto apm_xgene3 = generic; // TODO? 
+constexpr auto qualcomm_kyro = armv8a_crc_crypto; +constexpr auto qualcomm_falkor = armv8a_crc_crypto; +constexpr auto qualcomm_saphira = armv8_3a_crypto; +constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; +constexpr auto apple_cyclone = armv8a_crc_crypto; +constexpr auto apple_typhoon = armv8a_crc_crypto; +constexpr auto apple_twister = armv8a_crc_crypto; +constexpr auto apple_hurricane = armv8a_crc_crypto; + +} + +static constexpr CPUSpec cpus[] = { + {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, + {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, + {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, + {"armv8.3-a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, + {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, + {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, + {"cortex-a55", CPU::arm_cortex_a55, CPU::arm_cortex_a53, UINT32_MAX, Feature::arm_cortex_a55}, + {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57}, + {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72}, + {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73}, + {"cortex-a75", CPU::arm_cortex_a75, CPU::arm_cortex_a73, UINT32_MAX, Feature::arm_cortex_a75}, + {"thunderx", CPU::cavium_thunderx, CPU::generic, 50000, Feature::cavium_thunderx}, + {"thunderxt88", CPU::cavium_thunderx88, CPU::generic, 50000, Feature::cavium_thunderx88}, + {"thunderxt88p1", CPU::cavium_thunderx88p1, CPU::cavium_thunderx88, UINT32_MAX, + Feature::cavium_thunderx88p1}, + {"thunderxt81", CPU::cavium_thunderx81, CPU::generic, 50000, Feature::cavium_thunderx81}, + {"thunderxt83", CPU::cavium_thunderx83, CPU::generic, 50000, Feature::cavium_thunderx83}, + {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::generic, 50000, + 
Feature::cavium_thunderx2t99}, + {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::cavium_thunderx2t99, UINT32_MAX, + Feature::cavium_thunderx2t99p1}, + {"denver1", CPU::nvidia_denver1, CPU::generic, UINT32_MAX, Feature::nvidia_denver1}, + {"denver2", CPU::nvidia_denver2, CPU::generic, UINT32_MAX, Feature::nvidia_denver2}, + {"xgene1", CPU::apm_xgene1, CPU::generic, UINT32_MAX, Feature::apm_xgene1}, + {"xgene2", CPU::apm_xgene2, CPU::generic, UINT32_MAX, Feature::apm_xgene2}, + {"xgene3", CPU::apm_xgene3, CPU::generic, UINT32_MAX, Feature::apm_xgene3}, + {"kyro", CPU::qualcomm_kyro, CPU::generic, 0, Feature::qualcomm_kyro}, + {"falkor", CPU::qualcomm_falkor, CPU::generic, 40000, Feature::qualcomm_falkor}, + {"saphira", CPU::qualcomm_saphira, CPU::qualcomm_falkor, 60000, Feature::qualcomm_saphira}, + {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, 0, Feature::samsung_exynos_m1}, + {"exynos-m2", CPU::samsung_exynos_m2, CPU::samsung_exynos_m1, 40000, + Feature::samsung_exynos_m2}, + {"exynos-m3", CPU::samsung_exynos_m3, CPU::samsung_exynos_m2, 40000, + Feature::samsung_exynos_m3}, + {"cyclone", CPU::apple_cyclone, CPU::generic, 0, Feature::apple_cyclone}, + {"typhoon", CPU::apple_typhoon, CPU::apple_cyclone, UINT32_MAX, Feature::apple_typhoon}, + {"twister", CPU::apple_twister, CPU::apple_typhoon, UINT32_MAX, Feature::apple_twister}, + {"hurricane", CPU::apple_hurricane, CPU::apple_twister, UINT32_MAX, Feature::apple_hurricane}, +}; +#else +static constexpr size_t feature_sz = 3; +static constexpr FeatureName feature_names[] = { +#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +#undef JL_FEATURE_DEF_NAME +}; +static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); + +template +static inline constexpr FeatureList get_feature_masks(Args... 
args) +{ + return ::get_feature_masks(args...); +} + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +static constexpr auto feature_masks = get_feature_masks( +#define JL_FEATURE_DEF(name, bit, llvmver) bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF + -1); +static const auto real_feature_masks = + feature_masks & FeatureList{{(uint32_t)-1, (uint32_t)-1, 0}}; + +namespace Feature { +enum : uint32_t { +#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +}; +#undef JL_FEATURE_DEF_NAME +// This does not cover all dependencies (e.g. the ones that depends on arm versions) +static constexpr FeatureDep deps[] = { + {neon, vfp3}, + {vfp4, vfp3}, + {crypto, neon}, +}; + +// These are the real base requirements of the specific architectures +constexpr auto _armv7m = get_feature_masks(v7, mclass, hwdiv); +constexpr auto _armv7a = get_feature_masks(v7, aclass); +constexpr auto _armv7r = get_feature_masks(v7, rclass); +constexpr auto _armv8m = get_feature_masks(v7, v8, mclass, hwdiv); +constexpr auto _armv8a = get_feature_masks(v7, v8, aclass, neon, vfp3, vfp4, d32, + hwdiv, hwdiv_arm); +constexpr auto _armv8r = get_feature_masks(v7, v8, rclass, neon, vfp3, vfp4, d32, + hwdiv, hwdiv_arm); + +// Set `generic` to match the feature requirement of the `C` code. +// we'll require at least these when compiling the sysimg. 
+#if __ARM_ARCH >= 8 +# if !defined(__ARM_ARCH_PROFILE) +constexpr auto generic = get_feature_masks(v7, v8, hwdiv); +# elif __ARM_ARCH_PROFILE == 'A' +constexpr auto generic = _armv8a; +# elif __ARM_ARCH_PROFILE == 'R' +constexpr auto generic = _armv8r; +# elif __ARM_ARCH_PROFILE == 'M' +constexpr auto generic = _armv8m; +# else +constexpr auto generic = get_feature_masks(v7, v8, hwdiv); +# endif +#elif __ARM_ARCH == 7 +# if !defined(__ARM_ARCH_PROFILE) +constexpr auto generic = get_feature_masks(v7); +# elif __ARM_ARCH_PROFILE == 'A' +constexpr auto generic = _armv7a; +# elif __ARM_ARCH_PROFILE == 'R' +constexpr auto generic = _armv7r; +# elif __ARM_ARCH_PROFILE == 'M' +constexpr auto generic = _armv7m; +# else +constexpr auto generic = get_feature_masks(v7); +# endif +#else +constexpr auto generic = get_feature_masks(); +#endif + +// All feature sets below should use or be or'ed with one of these (or generic). +// This makes sure that, for example, the `generic` target on `armv7-a` binary is equivalent +// to the `armv7-a` target. 
+constexpr auto armv7m = generic | _armv7m; +constexpr auto armv7a = generic | _armv7a; +constexpr auto armv7r = generic | _armv7r; +constexpr auto armv8m = generic | _armv8m; +constexpr auto armv8a = generic | _armv8a; +constexpr auto armv8r = generic | _armv8r; + +// armv7l +constexpr auto arm_cortex_a5 = armv7a; +constexpr auto arm_cortex_a7 = armv7a | get_feature_masks(vfp3, vfp4, neon); +constexpr auto arm_cortex_a8 = armv7a | get_feature_masks(d32, vfp3, neon); +constexpr auto arm_cortex_a9 = armv7a; +constexpr auto arm_cortex_a12 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_a15 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_a17 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_r4 = armv7r | get_feature_masks(vfp3, hwdiv); +constexpr auto arm_cortex_r5 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto arm_cortex_r7 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto arm_cortex_r8 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto qualcomm_scorpion = armv7a | get_feature_masks(v7, aclass, vfp3, neon); +constexpr auto qualcomm_krait = armv7a | get_feature_masks(vfp3, vfp4, neon, hwdiv, hwdiv_arm); +constexpr auto apple_swift = armv7a | get_feature_masks(d32, vfp3, vfp4, neon, hwdiv, hwdiv_arm); +constexpr auto marvell_pj4 = armv7a | get_feature_masks(vfp3); +constexpr auto intel_3735d = armv7a | get_feature_masks(vfp3, neon); +// armv8ml +constexpr auto arm_cortex_m23 = armv8m; // unsupported +constexpr auto arm_cortex_m33 = armv8m | get_feature_masks(v8_m_main); // unsupported +// armv8l +constexpr auto armv8a_crc = armv8a | get_feature_masks(crc); +constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a); +constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a); +constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto); +constexpr auto armv8_2a_crypto = armv8_2a | 
get_feature_masks(crypto); +constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a); +constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto); + +constexpr auto arm_cortex_a32 = armv8a; // TODO? (crc, crypto) +constexpr auto arm_cortex_r52 = armv8r; // TODO? (crc, crypto) +constexpr auto arm_cortex_a35 = armv8a; // TODO? (crc, crypto) +constexpr auto arm_cortex_a53 = armv8a_crc; +constexpr auto arm_cortex_a55 = armv8_2a_crypto; +constexpr auto arm_cortex_a57 = armv8a_crc; +constexpr auto arm_cortex_a72 = armv8a_crc; +constexpr auto arm_cortex_a73 = armv8a_crc; +constexpr auto arm_cortex_a75 = armv8_2a_crypto; +constexpr auto cavium_thunderx = armv8a_crc_crypto; +constexpr auto cavium_thunderx88 = armv8a_crc_crypto; +constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto; +constexpr auto cavium_thunderx81 = armv8a_crc_crypto; +constexpr auto cavium_thunderx83 = armv8a_crc_crypto; +constexpr auto cavium_thunderx2t99 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto cavium_thunderx2t99p1 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto nvidia_denver1 = armv8a; // TODO? (crc, crypto) +constexpr auto nvidia_denver2 = armv8a_crc_crypto; +constexpr auto apm_xgene1 = armv8a; +constexpr auto apm_xgene2 = armv8a; // TODO? +constexpr auto apm_xgene3 = armv8a; // TODO? 
+constexpr auto qualcomm_kyro = armv8a_crc_crypto; +constexpr auto qualcomm_falkor = armv8a_crc_crypto; +constexpr auto qualcomm_saphira = armv8_3a_crypto; +constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; +constexpr auto apple_cyclone = armv8a_crc_crypto; +constexpr auto apple_typhoon = armv8a_crc_crypto; +constexpr auto apple_twister = armv8a_crc_crypto; +constexpr auto apple_hurricane = armv8a_crc_crypto; + +} + +static constexpr CPUSpec cpus[] = { + {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, + // armv6 + {"mpcore", CPU::arm_mpcore, CPU::generic, 0, Feature::generic}, + {"arm1136jf-s", CPU::arm_1136jf_s, CPU::generic, 0, Feature::generic}, + {"arm1156t2f-s", CPU::arm_1156t2f_s, CPU::generic, 0, Feature::generic}, + {"arm1176jzf-s", CPU::arm_1176jzf_s, CPU::generic, 0, Feature::generic}, + {"cortex-m0", CPU::arm_cortex_m0, CPU::generic, 0, Feature::generic}, + {"cortex-m1", CPU::arm_cortex_m1, CPU::generic, 0, Feature::generic}, + // armv7ml + {"armv7-m", CPU::armv7_m, CPU::generic, 0, Feature::armv7m}, + {"armv7e-m", CPU::armv7e_m, CPU::generic, 0, Feature::armv7m}, + {"cortex-m3", CPU::arm_cortex_m3, CPU::generic, 0, Feature::armv7m}, + {"cortex-m4", CPU::arm_cortex_m4, CPU::generic, 0, Feature::armv7m}, + {"cortex-m7", CPU::arm_cortex_m7, CPU::generic, 0, Feature::armv7m}, + // armv7l + {"armv7-a", CPU::armv7_a, CPU::generic, 0, Feature::armv7a}, + {"armv7-r", CPU::armv7_r, CPU::generic, 0, Feature::armv7r}, + {"cortex-a5", CPU::arm_cortex_a5, CPU::generic, 0, Feature::arm_cortex_a5}, + {"cortex-a7", CPU::arm_cortex_a7, CPU::generic, 0, Feature::arm_cortex_a7}, + {"cortex-a8", CPU::arm_cortex_a8, CPU::generic, 0, Feature::arm_cortex_a8}, + {"cortex-a9", CPU::arm_cortex_a9, CPU::generic, 0, Feature::arm_cortex_a9}, + {"cortex-a12", CPU::arm_cortex_a12, CPU::generic, 0, Feature::arm_cortex_a12}, + {"cortex-a15", 
CPU::arm_cortex_a15, CPU::generic, 0, Feature::arm_cortex_a15}, + {"cortex-a17", CPU::arm_cortex_a17, CPU::generic, 0, Feature::arm_cortex_a17}, + {"cortex-r4", CPU::arm_cortex_r4, CPU::generic, 0, Feature::arm_cortex_r4}, + {"cortex-r5", CPU::arm_cortex_r5, CPU::generic, 0, Feature::arm_cortex_r5}, + {"cortex-r7", CPU::arm_cortex_r7, CPU::generic, 0, Feature::arm_cortex_r7}, + {"cortex-r8", CPU::arm_cortex_r8, CPU::generic, 0, Feature::arm_cortex_r8}, + {"scorpion", CPU::qualcomm_scorpion, CPU::armv7_a, UINT32_MAX, Feature::qualcomm_scorpion}, + {"krait", CPU::qualcomm_krait, CPU::generic, 0, Feature::qualcomm_krait}, + {"swift", CPU::apple_swift, CPU::generic, 0, Feature::apple_swift}, + {"pj4", CPU::marvell_pj4, CPU::armv7_a, UINT32_MAX, Feature::marvell_pj4}, + {"3735d", CPU::intel_3735d, CPU::armv7_a, UINT32_MAX, Feature::intel_3735d}, + + // armv8ml + {"armv8-m.base", CPU::armv8_m_base, CPU::generic, 0, Feature::armv8m}, + {"armv8-m.main", CPU::armv8_m_main, CPU::generic, 0, Feature::armv8m}, + {"cortex-m23", CPU::arm_cortex_m23, CPU::armv8_m_base, 50000, Feature::arm_cortex_m23}, + {"cortex-m33", CPU::arm_cortex_m33, CPU::armv8_m_main, 50000, Feature::arm_cortex_m33}, + + // armv8l + {"armv8-a", CPU::armv8_a, CPU::generic, 0, Feature::armv8a}, + {"armv8-r", CPU::armv8_r, CPU::generic, 0, Feature::armv8r}, + {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, + {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, + {"armv8.3-a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, + {"cortex-a32", CPU::arm_cortex_a32, CPU::generic, 0, Feature::arm_cortex_a32}, + {"cortex-r52", CPU::arm_cortex_r52, CPU::armv8_r, 40000, Feature::arm_cortex_r52}, + {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, + {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, + {"cortex-a55", CPU::arm_cortex_a55, CPU::arm_cortex_a53, 60000, Feature::arm_cortex_a55}, + {"cortex-a57", CPU::arm_cortex_a57, 
CPU::generic, 0, Feature::arm_cortex_a57}, + {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72}, + {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73}, + {"cortex-a75", CPU::arm_cortex_a75, CPU::arm_cortex_a73, 60000, Feature::arm_cortex_a75}, + {"thunderx", CPU::cavium_thunderx, CPU::armv8_a, UINT32_MAX, Feature::cavium_thunderx}, + {"thunderx88", CPU::cavium_thunderx88, CPU::armv8_a, UINT32_MAX, Feature::cavium_thunderx88}, + {"thunderx88p1", CPU::cavium_thunderx88p1, CPU::armv8_a, UINT32_MAX, + Feature::cavium_thunderx88p1}, + {"thunderx81", CPU::cavium_thunderx81, CPU::armv8_a, UINT32_MAX, + Feature::cavium_thunderx81}, + {"thunderx83", CPU::cavium_thunderx83, CPU::armv8_a, UINT32_MAX, + Feature::cavium_thunderx83}, + {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::armv8_a, UINT32_MAX, + Feature::cavium_thunderx2t99}, + {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::armv8_a, UINT32_MAX, + Feature::cavium_thunderx2t99p1}, + {"denver1", CPU::nvidia_denver1, CPU::arm_cortex_a53, UINT32_MAX, Feature::nvidia_denver1}, + {"denver2", CPU::nvidia_denver2, CPU::arm_cortex_a57, UINT32_MAX, Feature::nvidia_denver2}, + {"xgene1", CPU::apm_xgene1, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene1}, + {"xgene2", CPU::apm_xgene2, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene2}, + {"xgene3", CPU::apm_xgene3, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene3}, + {"kyro", CPU::qualcomm_kyro, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_kyro}, + {"falkor", CPU::qualcomm_falkor, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_falkor}, + {"saphira", CPU::qualcomm_saphira, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_saphira}, + {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, 0, Feature::samsung_exynos_m1}, + {"exynos-m2", CPU::samsung_exynos_m2, CPU::samsung_exynos_m1, 40000, + Feature::samsung_exynos_m2}, + {"exynos-m3", CPU::samsung_exynos_m3, CPU::samsung_exynos_m2, 40000, + Feature::samsung_exynos_m3}, + {"cyclone", 
CPU::apple_cyclone, CPU::generic, 0, Feature::apple_cyclone}, + {"typhoon", CPU::apple_typhoon, CPU::apple_cyclone, UINT32_MAX, Feature::apple_typhoon}, + {"twister", CPU::apple_twister, CPU::apple_typhoon, UINT32_MAX, Feature::apple_twister}, + {"hurricane", CPU::apple_hurricane, CPU::apple_twister, UINT32_MAX, Feature::apple_hurricane}, +}; +#endif +static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); + +// auxval reader + +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif + +#if defined(DYN_GETAUXVAL) +static bool getauxval_dlsym(unsigned long type, unsigned long *val) +{ + static auto getauxval_p = (unsigned long (*)(unsigned long)) + jl_dlsym_e(jl_dlopen(nullptr, JL_RTLD_LOCAL), "getauxval"); + if (getauxval_p) { + *val = getauxval_p(type); + return true; + } + return false; +} + +static unsigned long getauxval_procfs(unsigned long type) +{ + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd == -1) + return 0; + unsigned long val = 0; + unsigned long buff[2]; + while (read(fd, buff, sizeof(buff)) == sizeof(buff)) { + if (buff[0] == 0) + break; + if (buff[0] == type) { + val = buff[1]; + break; + } + } + close(fd); + return val; +} + +static inline unsigned long jl_getauxval(unsigned long type) +{ + unsigned long val; + if (getauxval_dlsym(type, &val)) + return val; + return getauxval_procfs(type); +} +#else +static inline unsigned long jl_getauxval(unsigned long type) +{ + return getauxval(type); +} +#endif + +struct CPUID { + uint8_t implementer; + uint8_t variant; + uint16_t part; + bool operator<(const CPUID &right) const + { + if (implementer < right.implementer) + return true; + if (implementer > right.implementer) + return false; + if (part < right.part) + return true; + if (part > right.part) + return false; + return variant < right.variant; + } +}; + +// /sys/devices/system/cpu/cpu/regs/identification/midr_el1 reader +static inline void get_cpuinfo_sysfs(std::set &res) +{ + // This only 
works on a 64bit 4.7+ kernel + auto dir = opendir("/sys/devices/system/cpu"); + if (!dir) + return; + while (auto entry = readdir(dir)) { + if (entry->d_type != DT_DIR) + continue; + if (strncmp(entry->d_name, "cpu", 3) != 0) + continue; + std::stringstream stm; + stm << "/sys/devices/system/cpu/" << entry->d_name << "/regs/identification/midr_el1"; + std::ifstream file(stm.str()); + if (!file) + continue; + uint64_t val = 0; + file >> std::hex >> val; + if (!file) + continue; + CPUID cpuid = { + uint8_t(val >> 24), + uint8_t((val >> 20) & 0xf), + uint16_t((val >> 4) & 0xfff) + }; + res.insert(cpuid); + } + closedir(dir); +} + +// Use an external template since lambdas can't be templated in C++11 +template +static inline bool try_read_procfs_line(llvm::StringRef line, const char *prefix, T &out, + bool &flag, F &&reset) +{ + if (!line.startswith(prefix)) + return false; + if (flag) + reset(); + flag = !line.substr(strlen(prefix)).ltrim("\t :").getAsInteger(0, out); + return true; +} + +// /proc/cpuinfo reader +static inline void get_cpuinfo_procfs(std::set &res) +{ + std::ifstream file("/proc/cpuinfo"); + CPUID cpuid = {0, 0, 0}; + bool impl = false; + bool part = false; + bool var = false; + auto reset = [&] () { + if (impl && part) + res.insert(cpuid); + impl = false; + part = false; + var = false; + memset(&cpuid, 0, sizeof(cpuid)); + }; + for (std::string line; std::getline(file, line);) { + if (line.empty()) { + reset(); + continue; + } + try_read_procfs_line(line, "CPU implementer", cpuid.implementer, impl, reset) || + try_read_procfs_line(line, "CPU variant", cpuid.variant, var, reset) || + try_read_procfs_line(line, "CPU part", cpuid.part, part, reset); + } + reset(); +} + +static std::set get_cpuinfo(void) +{ + std::set res; + get_cpuinfo_sysfs(res); + if (res.empty()) + get_cpuinfo_procfs(res); + return res; +} + +static CPU get_cpu_name(CPUID cpuid) +{ + switch (cpuid.implementer) { + case 0x41: // ARM + switch (cpuid.part) { + case 0xb02: return
CPU::arm_mpcore; + case 0xb36: return CPU::arm_1136jf_s; + case 0xb56: return CPU::arm_1156t2f_s; + case 0xb76: return CPU::arm_1176jzf_s; + case 0xc20: return CPU::arm_cortex_m0; + case 0xc21: return CPU::arm_cortex_m1; + case 0xc23: return CPU::arm_cortex_m3; + case 0xc24: return CPU::arm_cortex_m4; + case 0xc27: return CPU::arm_cortex_m7; + case 0xd20: return CPU::arm_cortex_m23; + case 0xd21: return CPU::arm_cortex_m33; + case 0xc05: return CPU::arm_cortex_a5; + case 0xc07: return CPU::arm_cortex_a7; + case 0xc08: return CPU::arm_cortex_a8; + case 0xc09: return CPU::arm_cortex_a9; + case 0xc0d: return CPU::arm_cortex_a12; + case 0xc0f: return CPU::arm_cortex_a15; + case 0xc0e: return CPU::arm_cortex_a17; + case 0xc14: return CPU::arm_cortex_r4; + case 0xc15: return CPU::arm_cortex_r5; + case 0xc17: return CPU::arm_cortex_r7; + case 0xc18: return CPU::arm_cortex_r8; + case 0xd13: return CPU::arm_cortex_r52; + case 0xd01: return CPU::arm_cortex_a32; + case 0xd04: return CPU::arm_cortex_a35; + case 0xd03: return CPU::arm_cortex_a53; + case 0xd05: return CPU::arm_cortex_a55; + case 0xd07: return CPU::arm_cortex_a57; + case 0xd08: return CPU::arm_cortex_a72; + case 0xd09: return CPU::arm_cortex_a73; + case 0xd0a: return CPU::arm_cortex_a75; + default: return CPU::generic; + } + case 0x42: // Broadcom (Cavium) + switch (cpuid.part) { + case 0x516: return CPU::cavium_thunderx2t99p1; + default: return CPU::generic; + } + case 0x43: // Cavium + switch (cpuid.part) { + case 0xa0: return CPU::cavium_thunderx; + case 0xa1: + if (cpuid.variant == 0) + return CPU::cavium_thunderx88p1; + return CPU::cavium_thunderx88; + case 0xa2: return CPU::cavium_thunderx81; + case 0xa3: return CPU::cavium_thunderx83; + case 0xaf: return CPU::cavium_thunderx2t99; + default: return CPU::generic; + } + case 0x4e: // NVIDIA + switch (cpuid.part) { + case 0x000: return CPU::nvidia_denver1; + case 0x003: return CPU::nvidia_denver2; + default: return CPU::generic; + } + case 0x50: // 
AppliedMicro + // x-gene 2 + // x-gene 3 + switch (cpuid.part) { + case 0x000: return CPU::apm_xgene1; + default: return CPU::generic; + } + case 0x51: // Qualcomm + switch (cpuid.part) { + case 0x00f: + case 0x02d: + return CPU::qualcomm_scorpion; + case 0x04d: + case 0x06f: + return CPU::qualcomm_krait; + case 0x201: + case 0x205: + case 0x211: + return CPU::qualcomm_kyro; + case 0x800: + case 0x801: + return CPU::arm_cortex_a73; // second-generation Kryo + case 0xc00: + return CPU::qualcomm_falkor; + case 0xc01: + return CPU::qualcomm_saphira; + default: return CPU::generic; + } + case 0x53: // Samsung + // exynos-m2 + // exynos-m3 + switch (cpuid.part) { + case 0x001: return CPU::samsung_exynos_m1; + default: return CPU::generic; + } + case 0x56: // Marvell + switch (cpuid.part) { + case 0x581: + case 0x584: + return CPU::marvell_pj4; + default: return CPU::generic; + } + case 0x67: // Apple + // swift + // cyclone + // twister + // hurricane + switch (cpuid.part) { + case 0x072: return CPU::apple_typhoon; + default: return CPU::generic; + } + case 0x69: // Intel + switch (cpuid.part) { + case 0x001: return CPU::intel_3735d; + default: return CPU::generic; + } + default: + return CPU::generic; + } +} + +static std::pair get_elf_arch(void) +{ +#ifdef _CPU_AARCH64_ + return std::make_pair(8, 'A'); +#else + int ver = 0; + char profile = 0; + struct utsname name; + if (uname(&name) >= 0) { + // name.machine is the elf_platform in the kernel. 
+ if (strcmp(name.machine, "armv6l") == 0) { + ver = 6; + } + else if (strcmp(name.machine, "armv7l") == 0) { + ver = 7; + } + else if (strcmp(name.machine, "armv7ml") == 0) { + ver = 7; + profile = 'M'; + } + else if (strcmp(name.machine, "armv8l") == 0 || strcmp(name.machine, "aarch64") == 0) { + ver = 8; + } + } + if (__ARM_ARCH > ver) + ver = __ARM_ARCH; +# if __ARM_ARCH > 6 && defined(__ARM_ARCH_PROFILE) + profile = __ARM_ARCH_PROFILE; +# endif + return std::make_pair(ver, profile); +#endif +} + +static inline const CPUSpec *find_cpu(uint32_t cpu) +{ + return ::find_cpu(cpu, cpus, ncpu_names); +} + +static inline const CPUSpec *find_cpu(llvm::StringRef name) +{ + return ::find_cpu(name, cpus, ncpu_names); +} + +static inline const char *find_cpu_name(uint32_t cpu) +{ + return ::find_cpu_name(cpu, cpus, ncpu_names); +} + +static std::pair feature_arch_version(const FeatureList &feature) +{ +#ifdef _CPU_AARCH64_ + return std::make_pair(8, false); +#else + if (test_nbit(feature, Feature::v8)) + return std::make_pair(8, test_nbit(feature, Feature::mclass)); + if (test_nbit(feature, Feature::v7)) + return std::make_pair(7, test_nbit(feature, Feature::mclass)); + return std::make_pair(6, false); +#endif +} + +static CPU generic_for_arch(std::pair arch) +{ +#ifdef _CPU_AARCH64_ + return CPU::generic; +#else +# if defined(__ARM_ARCH_PROFILE) + char klass = __ARM_ARCH_PROFILE; +# else + char klass = arch.second ? 
'M' : 'A'; +# endif + if (arch.first >= 8) { + if (klass == 'M') { + return CPU::armv8_m_base; + } + else if (klass == 'R') { + return CPU::armv8_r; + } + else { + return CPU::armv8_a; + } + } + else if (arch.first == 7) { + if (klass == 'M') { + return CPU::armv7_m; + } + else if (klass == 'R') { + return CPU::armv7_r; + } + else { + return CPU::armv7_a; + } + } + return CPU::generic; +#endif +} + +static bool check_cpu_arch_ver(uint32_t cpu, std::pair arch) +{ + auto spec = find_cpu(cpu); + // This happens on AArch64 and indicates that the cpu name isn't a valid aarch64 CPU + if (!spec) + return false; + auto cpu_arch = feature_arch_version(spec->features); + if (arch.second != cpu_arch.second) + return false; + if (arch.first > cpu_arch.first) + return false; + return true; +} + +static void shrink_big_little(std::vector> &list, + const CPU *cpus, uint32_t ncpu) +{ + auto find = [&] (uint32_t name) { + for (uint32_t i = 0; i < ncpu; i++) { + if (cpus[i] == CPU(name)) { + return (int)i; + } + } + return -1; + }; + int maxidx = -1; + for (auto &ele: list) { + int idx = find(ele.first); + if (idx > maxidx) { + maxidx = idx; + } + } + if (maxidx >= 0) { + list.erase(std::remove_if(list.begin(), list.end(), [&] (std::pair &ele) { + int idx = find(ele.first); + return idx != -1 && idx < maxidx; + }), list.end()); + } +} + +static inline const std::pair> &get_host_cpu() +{ + static const auto host_cpu = [] { + FeatureList features = {}; + // Here we assume that only the lower 32bit are used on aarch64 + // Change the cast here when that's not the case anymore (and when there's features in the + // high bits that we want to detect). 
+ features[0] = (uint32_t)jl_getauxval(AT_HWCAP); + features[1] = (uint32_t)jl_getauxval(AT_HWCAP2); + auto cpuinfo = get_cpuinfo(); + auto arch = get_elf_arch(); +#ifdef _CPU_ARM_ + if (arch.first >= 7) { + if (arch.second == 'M') { + set_bit(features, Feature::mclass, true); + } + else if (arch.second == 'R') { + set_bit(features, Feature::rclass, true); + } + else if (arch.second == 'A') { + set_bit(features, Feature::aclass, true); + } + } + switch (arch.first) { + case 8: + set_bit(features, Feature::v8, true); + JL_FALLTHROUGH; + case 7: + set_bit(features, Feature::v7, true); + break; + default: + break; + } +#endif + + std::set cpus; + std::vector> list; + for (auto info: cpuinfo) { + auto name = (uint32_t)get_cpu_name(info); + if (name == 0) + continue; + if (!check_cpu_arch_ver(name, arch)) + continue; + if (cpus.insert(name).second) { + features = features | find_cpu(name)->features; + list.emplace_back(name, info); + } + } + // Not all elements/pairs are valid + static constexpr CPU v8order[] = { + CPU::arm_cortex_a32, + CPU::arm_cortex_a35, + CPU::arm_cortex_a53, + CPU::arm_cortex_a55, + CPU::arm_cortex_a57, + CPU::arm_cortex_a72, + CPU::arm_cortex_a73, + CPU::arm_cortex_a75, + CPU::nvidia_denver2, + CPU::samsung_exynos_m1 + }; + shrink_big_little(list, v8order, sizeof(v8order) / sizeof(CPU)); +#ifdef _CPU_ARM_ + // Not all elements/pairs are valid + static constexpr CPU v7order[] = { + CPU::arm_cortex_a5, + CPU::arm_cortex_a7, + CPU::arm_cortex_a8, + CPU::arm_cortex_a9, + CPU::arm_cortex_a12, + CPU::arm_cortex_a15, + CPU::arm_cortex_a17 + }; + shrink_big_little(list, v7order, sizeof(v7order) / sizeof(CPU)); +#endif + uint32_t cpu = 0; + if (list.empty()) { + cpu = (uint32_t)generic_for_arch(arch); + } + else { + // This also covers `list.size() > 1` case which means there's a unknown combination + // consists of CPU's we know. Unclear what else we could try so just randomly return + // one... 
+ cpu = list[0].first; + } + // Ignore feature bits that we are not interested in. + mask_features(feature_masks, &features[0]); + + return std::make_pair(cpu, features); + }(); + return host_cpu; +} + +static bool is_generic_cpu_name(uint32_t cpu) +{ + switch ((CPU)cpu) { + case CPU::generic: + case CPU::armv7_a: + case CPU::armv7_m: + case CPU::armv7e_m: + case CPU::armv7_r: + case CPU::armv8_a: + case CPU::armv8_m_base: + case CPU::armv8_m_main: + case CPU::armv8_r: + case CPU::armv8_1_a: + case CPU::armv8_2_a: + case CPU::armv8_3_a: + return true; + default: + return false; + } +} + +static inline const std::string &host_cpu_name() +{ + static std::string name = [] { + if (is_generic_cpu_name(get_host_cpu().first)) { + auto llvm_name = jl_get_cpu_name_llvm(); + if (llvm_name != "generic") { + return llvm_name; + } + } + return std::string(find_cpu_name(get_host_cpu().first)); + }(); + return name; +} + +template +static inline void enable_depends(FeatureList &features) +{ + if (test_nbit(features, Feature::v8_3a)) + set_bit(features, Feature::v8_2a, true); + if (test_nbit(features, Feature::v8_2a)) + set_bit(features, Feature::v8_1a, true); + if (test_nbit(features, Feature::v8_1a)) + set_bit(features, Feature::crc, true); +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::v8_1a)) { + set_bit(features, Feature::v8, true); + set_bit(features, Feature::aclass, true); + } + if (test_nbit(features, Feature::v8_m_main)) { + set_bit(features, Feature::v8, true); + set_bit(features, Feature::mclass, true); + } + if (test_nbit(features, Feature::v8)) { + set_bit(features, Feature::v7, true); + if (test_nbit(features, Feature::aclass)) { + set_bit(features, Feature::neon, true); + set_bit(features, Feature::vfp3, true); + set_bit(features, Feature::vfp4, true); + set_bit(features, Feature::hwdiv_arm, true); + set_bit(features, Feature::hwdiv, true); + set_bit(features, Feature::d32, true); + } + } + ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / 
sizeof(FeatureDep)); +#else + if (test_nbit(features, Feature::v8_1a)) { + set_bit(features, Feature::lse, true); + set_bit(features, Feature::rdm, true); + } +#endif +} + +template +static inline void disable_depends(FeatureList &features) +{ +#ifdef _CPU_ARM_ + ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); +#endif +} + +static const std::vector> &get_cmdline_targets(void) +{ + auto feature_cb = [] (const char *str, size_t len, FeatureList &list) { + auto fbit = find_feature_bit(feature_names, nfeature_names, str, len); + if (fbit == (uint32_t)-1) + return false; + set_bit(list, fbit, true); + return true; + }; + return ::get_cmdline_targets(feature_cb); +} + +static std::vector> jit_targets; + +static TargetData arg_target_data(const TargetData &arg, bool require_host) +{ + TargetData res = arg; + const FeatureList *cpu_features = nullptr; + if (res.name == "native") { + res.name = host_cpu_name(); + cpu_features = &get_host_cpu().second; + } + else if (auto spec = find_cpu(res.name)) { + cpu_features = &spec->features; + } + else { + res.en.flags |= JL_TARGET_UNKNOWN_NAME; + } + if (cpu_features) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] |= (*cpu_features)[i]; + } + } + enable_depends(res.en.features); + for (size_t i = 0; i < feature_sz; i++) + res.en.features[i] &= ~res.dis.features[i]; + if (require_host) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] &= get_host_cpu().second[i]; + } + } + disable_depends(res.en.features); + if (cpu_features) { + // If the base feature if known, fill in the disable features + for (size_t i = 0; i < feature_sz; i++) { + res.dis.features[i] = feature_masks[i] & ~res.en.features[i]; + } + } + return res; +} + +static int max_vector_size(const FeatureList &features) +{ +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::neon)) + return 16; + return 8; +#else + // TODO SVE + return 16; +#endif +} + +static uint32_t sysimg_init_cb(const void 
*id) +{ + // First see what target is requested for the JIT. + auto &cmdline = get_cmdline_targets(); + TargetData target = arg_target_data(cmdline[0], true); + // Then find the best match in the sysimg + auto sysimg = deserialize_target_data((const uint8_t*)id); + auto match = match_sysimg_targets(sysimg, target, max_vector_size); + // Now we've decided on which sysimg version to use. + // Make sure the JIT target is compatible with it and save the JIT target. + if (match.vreg_size != max_vector_size(target.en.features) && + (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) { +#ifdef _CPU_ARM_ + unset_bits(target.en.features, Feature::neon); +#endif + } + jit_targets.push_back(std::move(target)); + return match.best_idx; +} + +static void ensure_jit_target(bool imaging) +{ + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, imaging); + if (!jit_targets.empty()) + return; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, jit_targets.empty()); + jit_targets.push_back(std::move(data)); + } + auto ntargets = jit_targets.size(); + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = jit_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + // The most useful one in general... 
+ t.en.flags |= JL_TARGET_CLONE_LOOP; +#ifdef _CPU_ARM_ + auto &features0 = jit_targets[t.base].en.features; + static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + static constexpr uint32_t clone_simd[] = {Feature::neon}; + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } +#endif + } +} + +static std::pair> +get_llvm_target_noext(const TargetData &data) +{ + std::string name = data.name; + auto *spec = find_cpu(name); + while (spec) { + if (spec->llvmver <= JL_LLVM_VERSION) + break; + spec = find_cpu((uint32_t)spec->fallback); + name = spec->name; + } + auto features = data.en.features; + if (spec) { + if (is_generic_cpu_name((uint32_t)spec->cpu)) { + features = features | spec->features; + name = "generic"; + } + } + std::vector feature_strs; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) + continue; + if (fename.bit >= 32 * 2) + break; + const char *fename_str = fename.name; + bool enable = test_nbit(features, fename.bit); + bool disable = test_nbit(data.dis.features, fename.bit); +#ifdef _CPU_ARM_ + if (fename.bit == Feature::d32) { + if (enable) { + feature_strs.push_back("-d16"); + } + else if (disable) { + feature_strs.push_back("+d16"); + } + continue; + } +#endif + if (enable) { + feature_strs.insert(feature_strs.begin(), std::string("+") + fename_str); + } + else if (disable) { + feature_strs.push_back(std::string("-") + fename_str); + } + } + if (test_nbit(features, Feature::v8_2a)) + feature_strs.push_back("+v8.2a"); + if (test_nbit(features, Feature::v8_1a)) + feature_strs.push_back("+v8.1a"); +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::v8_m_main)) { + feature_strs.push_back("+v8m.main"); + 
feature_strs.push_back("+armv8-m.main"); + } + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+aclass"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+rclass"); + if (test_nbit(features, Feature::mclass)) + feature_strs.push_back("+mclass"); + if (test_nbit(features, Feature::v8)) { + feature_strs.push_back("+v8"); + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+armv8-a"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+armv8-r"); + if (test_nbit(features, Feature::mclass)) { + feature_strs.push_back("+v8m"); + feature_strs.push_back("+armv8-m.base"); + } + } + if (test_nbit(features, Feature::v7)) { + feature_strs.push_back("+v7"); + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+armv7-a"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+armv7-r"); + if (test_nbit(features, Feature::mclass)) + feature_strs.push_back("+armv7-m"); + } + feature_strs.push_back("+v6"); + feature_strs.push_back("+vfp2"); +#else + feature_strs.push_back("+neon"); + feature_strs.push_back("+fp-armv8"); +#endif + return std::make_pair(std::move(name), std::move(feature_strs)); +} + +static std::pair> +get_llvm_target_vec(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + append_ext_features(res0.second, data.ext_features); + return res0; +} + +static std::pair +get_llvm_target_str(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + auto features = join_feature_strs(res0.second); + append_ext_features(features, data.ext_features); + return std::make_pair(std::move(res0.first), std::move(features)); +} + +static FeatureList get_max_feature(void) +{ +#ifdef _CPU_ARM_ + auto arch = get_elf_arch(); + auto features = real_feature_masks; + if (arch.second == 0) + arch.second = 'A'; + set_bit(features, Feature::v7, true); + set_bit(features, Feature::v8, true); + if (arch.second == 'M') { + set_bit(features, 
Feature::mclass, true); + set_bit(features, Feature::v8_m_main, true); + } + else if (arch.second == 'R') { + set_bit(features, Feature::rclass, true); + } + else if (arch.second == 'A') { + set_bit(features, Feature::aclass, true); + set_bit(features, Feature::v8_1a, true); + set_bit(features, Feature::v8_2a, true); + } + return features; +#else + // There isn't currently any conflicting features on AArch64 + return feature_masks; +#endif +} + +} + +using namespace ARM; + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names, + cpus, ncpu_names); +} + +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(host_cpu_name().c_str()); +} + +jl_sysimg_fptrs_t jl_init_processor_sysimg(void *hdl) +{ + if (!jit_targets.empty()) + jl_error("JIT targets already initialized"); + return parse_sysimg(hdl, sysimg_init_cb); +} + +std::pair> jl_get_llvm_target(bool imaging, uint32_t &flags) +{ + ensure_jit_target(imaging); + flags = jit_targets[0].en.flags; + return get_llvm_target_vec(jit_targets[0]); +} + +const std::pair &jl_get_llvm_disasm_target(void) +{ + // RAS is not currently detectable AFAICT + auto max_feature = get_max_feature(); + static const auto res = get_llvm_target_str(TargetData{host_cpu_name(), + JL_LLVM_VERSION >= 60000 ? 
"+dotprod,+ras" : "+ras", + {max_feature, 0}, {feature_masks & ~max_feature, 0}, 0}); + return res; +} + +std::vector jl_get_llvm_clone_targets(void) +{ + if (jit_targets.empty()) + jl_error("JIT targets not initialized"); + std::vector res; + for (auto &target: jit_targets) { + auto features_en = target.en.features; + auto features_dis = target.dis.features; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) { + unset_bits(features_en, fename.bit); + unset_bits(features_dis, fename.bit); + } + } + ARM::disable_depends(features_en); + jl_target_spec_t ele; + std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); + ele.data = serialize_target_data(target.name, features_en, features_dis, + target.ext_features); + ele.flags = target.en.flags; + ele.base = target.base; + res.push_back(ele); + } + return res; +} + +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) +{ + if (feature >= 32 * feature_sz) + return 0; + return test_nbit(&get_host_cpu().second[0], feature); +} + +#ifdef _CPU_AARCH64_ +// FZ, bit [24] +static constexpr uint32_t fpcr_fz_mask = 1 << 24; + +static inline uint32_t get_fpcr_aarch64(void) +{ + uint32_t fpcr; + asm volatile("mrs %0, fpcr" : "=r"(fpcr)); + return fpcr; +} + +static inline void set_fpcr_aarch64(uint32_t fpcr) +{ + asm volatile("msr fpcr, %0" :: "r"(fpcr)); +} + +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t fpcr = get_fpcr_aarch64(); + fpcr = isZero ? 
(fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); + set_fpcr_aarch64(fpcr); + return 0; +} +#else +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + return isZero; +} +#endif From f9885df3b5edc438efdffae9c77def49169e8bf2 Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sat, 1 Jul 2017 23:43:42 -0400 Subject: [PATCH 5/9] Implement fallback CPU detection and dispatch API Fallback to LLVM for CPU name and feature detection. Still allow cloning but is restricted to exact name matching dispatch. --- src/processor.cpp | 26 +----- src/processor.h | 1 + src/processor_fallback.cpp | 162 +++++++++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+), 25 deletions(-) create mode 100644 src/processor_fallback.cpp diff --git a/src/processor.cpp b/src/processor.cpp index ba5072efadf6a..fbc33a407d2aa 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -803,30 +803,6 @@ static inline void dump_cpu_spec(uint32_t cpu, const FeatureList &features, #else -JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) -{ - return jl_cstr_to_string(jl_get_cpu_name_llvm().c_str()); -} - -JL_DLLEXPORT void jl_dump_host_cpu(void) -{ - jl_safe_printf("CPU: generic\n"); - jl_safe_printf("Features:\n"); -} - -extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - return isZero; -} +#include "processor_fallback.cpp" #endif diff --git a/src/processor.h b/src/processor.h index 66d2b135b3291..512b8a2936e16 100644 --- a/src/processor.h +++ b/src/processor.h @@ -203,4 +203,5 @@ struct jl_target_spec_t { */ std::vector jl_get_llvm_clone_targets(void); std::string jl_get_cpu_name_llvm(void); +std::string jl_get_cpu_features_llvm(void); #endif diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp new file 
mode 100644 index 0000000000000..416f1dd211a2b --- /dev/null +++ b/src/processor_fallback.cpp @@ -0,0 +1,162 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Fallback processor detection and dispatch + +namespace Fallback { + +static inline const std::string &host_cpu_name() +{ + static std::string name = jl_get_cpu_name_llvm(); + return name; +} + +static const std::vector> &get_cmdline_targets(void) +{ + auto feature_cb = [] (const char*, size_t, FeatureList<1>&) { + return false; + }; + return ::get_cmdline_targets<1>(feature_cb); +} + +static std::vector> jit_targets; + +static TargetData<1> arg_target_data(const TargetData<1> &arg, bool require_host) +{ + TargetData<1> res = arg; + if (res.name == "native") { + res.name = host_cpu_name(); + append_ext_features(res.ext_features, jl_get_cpu_features_llvm()); + } + else { + res.en.flags |= JL_TARGET_UNKNOWN_NAME; + } + return res; +} + +static uint32_t sysimg_init_cb(const void *id) +{ + // First see what target is requested for the JIT. + auto &cmdline = get_cmdline_targets(); + TargetData<1> target = arg_target_data(cmdline[0], true); + // Find the last name match or use the default one. + uint32_t best_idx = 0; + auto sysimg = deserialize_target_data<1>((const uint8_t*)id); + for (uint32_t i = 0; i < sysimg.size(); i++) { + auto &imgt = sysimg[i]; + if (imgt.name == target.name) { + best_idx = i; + } + } + target = sysimg[best_idx]; + jit_targets.push_back(std::move(target)); + return best_idx; +} + +static void ensure_jit_target(bool imaging) +{ + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, imaging); + if (!jit_targets.empty()) + return; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, jit_targets.empty()); + jit_targets.push_back(std::move(data)); + } + auto ntargets = jit_targets.size(); + // Now decide the clone condition. 
+ for (size_t i = 1; i < ntargets; i++) { + auto &t = jit_targets[i]; + t.en.flags |= JL_TARGET_CLONE_ALL; + } +} + +static std::pair> +get_llvm_target_noext(const TargetData<1> &data) +{ + return std::make_pair(data.name, std::vector{}); +} + +static std::pair> +get_llvm_target_vec(const TargetData<1> &data) +{ + auto res0 = get_llvm_target_noext(data); + append_ext_features(res0.second, data.ext_features); + return res0; +} + +static std::pair +get_llvm_target_str(const TargetData<1> &data) +{ + auto res0 = get_llvm_target_noext(data); + auto features = join_feature_strs(res0.second); + append_ext_features(features, data.ext_features); + return std::make_pair(std::move(res0.first), std::move(features)); +} + +} + +using namespace Fallback; + +jl_sysimg_fptrs_t jl_init_processor_sysimg(void *hdl) +{ + if (!jit_targets.empty()) + jl_error("JIT targets already initialized"); + return parse_sysimg(hdl, sysimg_init_cb); +} + +std::pair> jl_get_llvm_target(bool imaging, uint32_t &flags) +{ + ensure_jit_target(imaging); + flags = jit_targets[0].en.flags; + return get_llvm_target_vec(jit_targets[0]); +} + +const std::pair &jl_get_llvm_disasm_target(void) +{ + static const auto res = get_llvm_target_str(TargetData<1>{host_cpu_name(), + jl_get_cpu_features_llvm(), {{}, 0}, {{}, 0}, 0}); + return res; +} + +std::vector jl_get_llvm_clone_targets(void) +{ + if (jit_targets.empty()) + jl_error("JIT targets not initialized"); + std::vector res; + for (auto &target: jit_targets) { + jl_target_spec_t ele; + std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); + ele.data = serialize_target_data(target.name, target.en.features, + target.dis.features, target.ext_features); + ele.flags = target.en.flags; + ele.base = 0; + res.push_back(ele); + } + return res; +} + +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(host_cpu_name().c_str()); +} + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + jl_safe_printf("CPU: %s\n", 
host_cpu_name().c_str()); + jl_safe_printf("Features: %s\n", jl_get_cpu_features_llvm().c_str()); +} + +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t) +{ + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + return isZero; +} From 8b43eb9fe7dd203b490f6fbff19a7a4c4f9b56ca Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sun, 2 Jul 2017 05:12:13 -0400 Subject: [PATCH 6/9] Use processor.cpp API in disasm.cpp Remove ARM specific workaround and do not assume the CPU name contains all information about the code. This should now allow disassemble of code that's not required by the base ISA or even not supported on the current CPU. This is especially important on ARM, which does not disassemble many armv7 instructions previously. --- src/Makefile | 2 +- src/disasm.cpp | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/Makefile b/src/Makefile index 99f84cdd25178..fc8d65c341f99 100644 --- a/src/Makefile +++ b/src/Makefile @@ -186,7 +186,7 @@ $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ $(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) $(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics.h $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(SRCDIR)/debuginfo.h -$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h +$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/table.c $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h diff --git a/src/disasm.cpp b/src/disasm.cpp index 54f79502307d4..aa9140d578ff8 100644 --- a/src/disasm.cpp +++ b/src/disasm.cpp @@ -56,7 +56,6 @@ #include 
#include #include -#include #include "llvm/Support/TargetSelect.h" #include #include "llvm/Support/FormattedStream.h" @@ -72,6 +71,7 @@ #include "julia.h" #include "julia_internal.h" +#include "processor.h" using namespace llvm; #include "debuginfo.h" @@ -636,15 +636,9 @@ static void jl_dump_asm_internal( std::string TripleName = sys::getDefaultTargetTriple(); Triple TheTriple(Triple::normalize(TripleName)); - std::string MCPU = sys::getHostCPUName(); -#ifdef _CPU_ARM_ - // The Raspberry Pi CPU is misdetected by LLVM (at least of version - // 3.6); correct this. - if (MCPU == "arm1176jz-s") - MCPU = "arm1176jzf-s"; -#endif - SubtargetFeatures Features; - Features.getDefaultSubtargetFeatures(TheTriple); + const auto &target = jl_get_llvm_disasm_target(); + const auto &cpu = target.first; + const auto &features = target.second; std::string err; const Target *TheTarget = TargetRegistry::lookupTarget(TripleName, err); @@ -670,7 +664,7 @@ static void jl_dump_asm_internal( // Set up Subtarget and Disassembler std::unique_ptr - STI(TheTarget->createMCSubtargetInfo(TripleName, MCPU, Features.getString())); + STI(TheTarget->createMCSubtargetInfo(TripleName, cpu, features)); std::unique_ptr DisAsm(TheTarget->createMCDisassembler(*STI, Ctx)); if (!DisAsm) { jl_printf(JL_STDERR, "ERROR: no disassembler for target %s\n", @@ -696,9 +690,9 @@ static void jl_dump_asm_internal( CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); #if JL_LLVM_VERSION >= 40000 MCTargetOptions Options; - MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU, Options); + MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, cpu, Options); #else - MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU); + MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, cpu); #endif } From 229864e0b53c07af73999f2ed2c67451ef6d66c5 Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sun, 2 Jul 2017 11:31:59 -0400 Subject: [PATCH 7/9] Use processor.cpp API to initialize LLVM JIT This is currently ignoring the 
sysimg data. Will be fixed when we have the actual cloning pass. --- src/Makefile | 2 +- src/codegen.cpp | 199 ++--------------------------------------------- src/staticdata.c | 5 -- 3 files changed, 8 insertions(+), 198 deletions(-) diff --git a/src/Makefile b/src/Makefile index fc8d65c341f99..db1ac4fd6fb93 100644 --- a/src/Makefile +++ b/src/Makefile @@ -182,7 +182,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase # additional dependency links $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ - intrinsics.cpp jitlayers.h intrinsics.h debuginfo.h codegen_shared.h cgutils.cpp ccall.cpp abi_*.cpp) + intrinsics.cpp jitlayers.h intrinsics.h debuginfo.h codegen_shared.h cgutils.cpp ccall.cpp abi_*.cpp processor.h) $(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) $(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics.h $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(SRCDIR)/debuginfo.h diff --git a/src/codegen.cpp b/src/codegen.cpp index ee7bc9edff2a8..b075a2fdd366b 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -98,6 +98,7 @@ namespace llvm { #include "julia_internal.h" #include "jitlayers.h" #include "codegen_shared.h" +#include "processor.h" #include "julia_assert.h" // LLVM version compatibility macros @@ -6603,189 +6604,6 @@ static void init_julia_llvm_env(Module *m) addOptimizationPasses(jl_globalPM, jl_options.opt_level); } -static inline std::string getNativeTarget() -{ - std::string cpu = sys::getHostCPUName(); -#if defined(_CPU_ARM_) - // Try slightly harder than LLVM at determining the CPU architecture. 
- if (cpu == "generic") { - // This is the most reliable way I can find - // `/proc/cpuinfo` changes between kernel versions - struct utsname name; - if (uname(&name) >= 0) { - // name.machine is the elf_platform in the kernel. - if (strcmp(name.machine, "armv6l") == 0) { - return "armv6"; - } - if (strcmp(name.machine, "armv7l") == 0) { - return "armv7"; - } - if (strcmp(name.machine, "armv7ml") == 0) { - // Thumb - return "armv7-m"; - } - if (strcmp(name.machine, "armv8l") == 0 || - strcmp(name.machine, "aarch64") == 0) { - return "armv8"; - } - } - } -#endif - return cpu; -} - -#if defined(_CPU_ARM_) || defined(_CPU_AARCH64_) -// Check if the cpu name is a ARM/AArch64 arch name and return a -// string that can be used as LLVM feature name -static inline void checkARMArchFeature(std::string &cpu, - StringMap &HostFeatures) -{ -#if defined(_CPU_ARM_) - if (cpu == "generic") { - HostFeatures["neon"] = false; - return; - } -#endif - StringRef cpu_s = cpu; - if (!cpu_s.startswith("armv")) - return; - // Generic names -#if defined(_CPU_ARM_) - if (!cpu_s.startswith("armv8")) { - // Turn off `neon` for generic archs on ARM - // since LLVM seems to enable it for all armv7-a processors. - HostFeatures["neon"] = false; - } - // "v7" and "v8" are not available in the form of `armv*` - // in the feature list - if (cpu == "armv7") { - HostFeatures["v7"] = true; - } - else if (cpu == "armv8") { - HostFeatures["v8"] = true; - } - else { - HostFeatures[cpu] = true; - } -#else - // These two are allowed on 32bit. Allow them on 64bits too for consistency since - // they basically mean "generic" on aarch64. - // In particular, "armv8-a" is the generic value for the GCC `-march` option. 
- if (cpu != "armv8" && cpu != "armv8-a") - HostFeatures[cpu.substr(3)] = true; -#endif - cpu = "generic"; -} -#endif - -// Helper to figure out what features to set for the LLVM target -// If the user specifies native (or does not specify) we default -// using the API provided by LLVM -static inline SmallVector getTargetFeatures(std::string &cpu) -{ - StringMap HostFeatures; - if (jl_options.cpu_target && !strcmp(jl_options.cpu_target,"native")) { - // On earlier versions of LLVM this is empty - llvm::sys::getHostCPUFeatures(HostFeatures); - } - - // Platform specific overides follow -#if defined(_CPU_X86_64_) - // Require cx16 (cmpxchg16b) - // We need this for 128-bit atomic operations. We only need this - // when threading is enabled; however, to test whether this - // excludes important systems, we require this even when threading - // is disabled. - HostFeatures["cx16"] = true; -#endif - - // Figure out if we know the cpu_target - cpu = ((jl_options.cpu_target && strcmp(jl_options.cpu_target,"native")) ? - jl_options.cpu_target : getNativeTarget()); -#if defined(_CPU_ARM_) - // Figure out what we are compiling against from the C defines. - // This might affect ABI but is fine since - // 1. We define the C ABI explicitly. - // 2. This does not change when running the same binary on different - // machines. - // This shouldn't affect making generic binaries since that requires a - // generic C -march anyway. - HostFeatures["vfp2"] = true; - - // Arch version -#if __ARM_ARCH >= 8 -# if defined(__ARM_ARCH_PROFILE) && __ARM_ARCH_PROFILE == 'A' - HostFeatures["armv8-a"] = true; -# else - HostFeatures["v8"] = true; -# endif -#elif __ARM_ARCH >= 7 - // v7 + aclass emits slightly different code than armv7-a - // In particular LLVM does not use the armv7-a instruction for barrier - // with v7 + aclass. 
-# if defined(__ARM_ARCH_PROFILE) && __ARM_ARCH_PROFILE == 'A' - HostFeatures["armv7-a"] = true; -# elif defined(__ARM_ARCH_PROFILE) && __ARM_ARCH_PROFILE == 'R' - HostFeatures["armv7-r"] = true; -# elif defined(__ARM_ARCH_PROFILE) && __ARM_ARCH_PROFILE == 'M' - // Thumb - HostFeatures["armv7-m"] = true; -# else - HostFeatures["v7"] = true; -# endif -#else - // minimum requirement - HostFeatures["v6"] = true; -#endif - - // ARM profile - // Only do this on ARM and not AArch64 since LLVM aarch64 backend - // doesn't support setting profiles. - // AFAIK there's currently no 64bit R and M profile either - // (v8r and v8m are both 32bit) -#if defined(__ARM_ARCH_PROFILE) -# if __ARM_ARCH_PROFILE == 'A' - HostFeatures["aclass"] = true; -# elif __ARM_ARCH_PROFILE == 'R' - HostFeatures["rclass"] = true; -# elif __ARM_ARCH_PROFILE == 'M' - // Thumb - HostFeatures["mclass"] = true; -# endif -#endif -#endif // _CPU_ARM_ - - // On ARM and AArch64, allow using cpu_target to specify a CPU architecture - // which is specified in the feature set in LLVM. -#if defined(_CPU_ARM_) || defined(_CPU_AARCH64_) - // Supported ARM arch names on LLVM 3.8: - // armv6, armv6-m, armv6j, armv6k, armv6kz, armv6s-m, armv6t2, - // armv7, armv7-a, armv7-m, armv7-r, armv7e-m, armv7k, armv7s, - // armv8, armv8-a, armv8.1-a, armv8.2-a - // Additional ARM arch names on LLVM 3.9: - // armv8-m.base, armv8-m.main - // - // Supported AArch64 arch names on LLVM 3.8: - // armv8.1a, armv8.2a - checkARMArchFeature(cpu, HostFeatures); -#endif - - SmallVector attr; - for (auto it = HostFeatures.begin(); it != HostFeatures.end(); it++) { - if (it->getValue()) { - attr.append(1, it->getKey().str()); - } - } - // Explicitly disabled features need to be added at the end so that - // they are not reenabled by other features that implies them by default. 
- for (auto it = HostFeatures.begin(); it != HostFeatures.end(); it++) { - if (!it->getValue()) { - attr.append(1, std::string("-") + it->getKey().str()); - } - } - return attr; -} - extern "C" void *jl_init_llvm(void) { const char *const argv_tailmerge[] = {"", "-enable-tail-merge=0"}; // NOO TOUCHIE; NO TOUCH! See #922 @@ -6848,14 +6666,11 @@ extern "C" void *jl_init_llvm(void) #if defined(FORCE_ELF) TheTriple.setObjectFormat(Triple::ELF); #endif - bool help = false; - if (jl_options.cpu_target && strcmp(jl_options.cpu_target, "help") == 0) { - help = true; - jl_options.cpu_target = "native"; - } - std::string TheCPU; - SmallVector targetFeatures = getTargetFeatures(TheCPU); - { + uint32_t target_flags = 0; + auto target = jl_get_llvm_target(imaging_mode, target_flags); + auto &TheCPU = target.first; + SmallVector targetFeatures(target.second.begin(), target.second.end()); + if (jl_processor_print_help || (target_flags & JL_TARGET_UNKNOWN_NAME)) { std::string errorstr; const Target *target = TargetRegistry::lookupTarget("", TheTriple, errorstr); assert(target); @@ -6863,7 +6678,7 @@ extern "C" void *jl_init_llvm(void) target->createMCSubtargetInfo(TheTriple.str(), "", "")); if (!MSTI->isCPUStringValid(TheCPU)) jl_errorf("Invalid CPU name %s.", TheCPU.c_str()); - if (help) { + if (jl_processor_print_help) { // This is the only way I can find to print the help message once. // It'll be nice if we can iterate through the features and print our own help // message... 
diff --git a/src/staticdata.c b/src/staticdata.c index 2a2af6b3008e4..72986e5312c6c 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -1336,11 +1336,6 @@ JL_DLLEXPORT void jl_set_sysimg_so(void *handle) void* *jl_RTLD_DEFAULT_handle_pointer = (void**)jl_dlsym_e(handle, "jl_RTLD_DEFAULT_handle_pointer"); if (!jl_RTLD_DEFAULT_handle_pointer || (void*)&jl_RTLD_DEFAULT_handle != *jl_RTLD_DEFAULT_handle_pointer) jl_error("System image file failed consistency check: maybe opened the wrong version?"); - // TODO make sure the sysimg and the JIT agrees on the ABI. - // This shouldn't be a problem for any required C types on any platforms we support - // but could be a problem from optional types. In particular, we need to make sure - // the two agrees on the usable register sizes so that functions that take - // a vector as input can use consistent calling convention. if (jl_options.cpu_target == NULL) jl_options.cpu_target = "native"; jl_sysimg_handle = handle; From 6ebdd797812a1833f01496dd18dfd31eee6b235e Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Thu, 29 Jun 2017 04:28:06 -0400 Subject: [PATCH 8/9] Implement function multi versioning in sysimg * Implementing function cloning pass. * Hook up debug info lookup and sysimg loading to the processor initialization API. 
--- src/Makefile | 6 +- src/anticodegen.c | 4 +- src/debuginfo.cpp | 27 +- src/jitlayers.cpp | 30 +- src/jitlayers.h | 3 +- src/julia_internal.h | 4 +- src/llvm-multiversioning.cpp | 1067 ++++++++++++++++++++++++++++++++++ src/processor.h | 2 +- src/staticdata.c | 41 +- 9 files changed, 1135 insertions(+), 49 deletions(-) create mode 100644 src/llvm-multiversioning.cpp diff --git a/src/Makefile b/src/Makefile index db1ac4fd6fb93..0bbbedba99b5a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -52,7 +52,7 @@ LLVMLINK := ifeq ($(JULIACODEGEN),LLVM) SRCS += codegen jitlayers disasm debuginfo llvm-simdloop llvm-ptls llvm-muladd \ llvm-late-gc-lowering llvm-lower-handlers llvm-gc-invariant-verifier \ - llvm-propagate-addrspaces llvm-alloc-opt cgmemmgr + llvm-propagate-addrspaces llvm-multiversioning llvm-alloc-opt cgmemmgr FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) LLVM_LIBS := all ifeq ($(USE_POLLY),1) @@ -185,10 +185,12 @@ $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ intrinsics.cpp jitlayers.h intrinsics.h debuginfo.h codegen_shared.h cgutils.cpp ccall.cpp abi_*.cpp processor.h) $(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) $(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics.h -$(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(SRCDIR)/debuginfo.h +$(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: \ + $(addprefix $(SRCDIR)/,debuginfo.h processor.h) $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/table.c +$(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/processor.h $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc-pages.o 
$(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h diff --git a/src/anticodegen.c b/src/anticodegen.c index b19afe21d57a8..8c87d306408f8 100644 --- a/src/anticodegen.c +++ b/src/anticodegen.c @@ -36,10 +36,10 @@ int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int no return 0; } -void jl_register_fptrs(uint64_t sysimage_base, const char *base, const int32_t *offsets, +void jl_register_fptrs(uint64_t sysimage_base, const struct _jl_sysimg_fptrs_t *fptrs, jl_method_instance_t **linfos, size_t n) { - (void)sysimage_base; (void)base; (void)offsets; (void)linfos; (void)n; + (void)sysimage_base; (void)fptrs; (void)linfos; (void)n; } void jl_compile_linfo(jl_method_instance_t *li) { } diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp index b380062668e34..82f3cbf9e28ff 100644 --- a/src/debuginfo.cpp +++ b/src/debuginfo.cpp @@ -37,6 +37,7 @@ using llvm_file_magic = sys::fs::file_magic; #if defined(_OS_LINUX_) # include #endif +#include "processor.h" #include #include @@ -706,20 +707,14 @@ openDebugInfo(StringRef debuginfopath, const debug_link_info &info) } static uint64_t jl_sysimage_base; -static const char *sysimg_fvars_base = nullptr; -static const int32_t *sysimg_fvars_offsets; +static jl_sysimg_fptrs_t sysimg_fptrs; static jl_method_instance_t **sysimg_fvars_linfo; static size_t sysimg_fvars_n; -static const void *sysimg_fvars(size_t idx) -{ - return sysimg_fvars_base + sysimg_fvars_offsets[idx]; -} -void jl_register_fptrs(uint64_t sysimage_base, const char *base, const int32_t *offsets, +void jl_register_fptrs(uint64_t sysimage_base, const jl_sysimg_fptrs_t *fptrs, jl_method_instance_t **linfos, size_t n) { jl_sysimage_base = (uintptr_t)sysimage_base; - sysimg_fvars_base = base; - sysimg_fvars_offsets = offsets; + sysimg_fptrs = *fptrs; sysimg_fvars_linfo = linfos; sysimg_fvars_n = n; } @@ -738,7 +733,7 @@ static void get_function_name_and_base(const object::ObjectFile *object, bool in int64_t slide, bool untrusted_dladdr) { // Assume we only need 
base address for sysimg for now - if (!insysimage || !sysimg_fvars_base) + if (!insysimage || !sysimg_fptrs.base) saddr = nullptr; bool needs_saddr = saddr && (!*saddr || untrusted_dladdr); bool needs_name = name && (!*name || untrusted_dladdr); @@ -1091,9 +1086,17 @@ static int jl_getDylibFunctionInfo(jl_frame_t **frames, size_t pointer, int skip return 1; } frame0->fromC = !isSysImg; - if (isSysImg && sysimg_fvars_base && saddr) { + if (isSysImg && sysimg_fptrs.base && saddr) { + intptr_t diff = (uintptr_t)saddr - (uintptr_t)sysimg_fptrs.base; + for (size_t i = 0; i < sysimg_fptrs.nclones; i++) { + if (diff == sysimg_fptrs.clone_offsets[i]) { + uint32_t idx = sysimg_fptrs.clone_idxs[i] & jl_sysimg_val_mask; + frame0->linfo = sysimg_fvars_linfo[idx]; + break; + } + } for (size_t i = 0; i < sysimg_fvars_n; i++) { - if (saddr == sysimg_fvars(i)) { + if (diff == sysimg_fptrs.offsets[i]) { frame0->linfo = sysimg_fvars_linfo[i]; break; } diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index a164c2bd531e7..46594b440fe10 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -97,7 +97,7 @@ void addTargetPasses(legacy::PassManagerBase *PM, TargetMachine *TM) // this defines the set of optimization passes defined for Julia at various optimization levels. // it assumes that the TLI and TTI wrapper passes have already been added. 
-void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) +void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump_native) { #ifdef JL_DEBUG_BUILD PM->add(createGCInvariantVerifierPass(true)); @@ -133,6 +133,8 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) PM->add(createLateLowerGCFramePass()); PM->add(createLowerPTLSPass(imaging_mode)); #endif + if (dump_native) + PM->add(createMultiVersioningPass()); return; } PM->add(createPropagateJuliaAddrspaces()); @@ -172,6 +174,8 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) PM->add(createAllocOptPass()); #endif PM->add(createInstructionCombiningPass()); // Cleanup for scalarrepl. + if (dump_native) + PM->add(createMultiVersioningPass()); PM->add(createSROAPass()); // Break up aggregate allocas PM->add(createInstructionCombiningPass()); // Cleanup for scalarrepl. PM->add(createJumpThreadingPass()); // Thread jumps. @@ -1022,20 +1026,18 @@ void jl_add_to_shadow(Module *m) static void emit_offset_table(Module *mod, const std::vector &vars, StringRef name) { + // Emit a global variable with all the variable addresses. + // The cloning pass will convert them into offsets. assert(!vars.empty()); - addComdat(GlobalAlias::create(GlobalVariable::ExternalLinkage, name + "_base", vars[0])); - auto vbase = ConstantExpr::getPtrToInt(vars[0], T_size); size_t nvars = vars.size(); - std::vector offsets(nvars); - for (size_t i = 0; i < nvars; i++) { - auto ptrdiff = ConstantExpr::getSub(ConstantExpr::getPtrToInt(vars[i], T_size), vbase); - offsets[i] = sizeof(void*) == 8 ? 
ConstantExpr::getTrunc(ptrdiff, T_uint32) : ptrdiff; - } - ArrayType *vars_type = ArrayType::get(T_uint32, nvars); - addComdat(new GlobalVariable(*mod, vars_type, true, - GlobalVariable::ExternalLinkage, - ConstantArray::get(vars_type, ArrayRef(offsets)), - name + "_offsets")); + std::vector addrs(nvars); + for (size_t i = 0; i < nvars; i++) + addrs[i] = ConstantExpr::getBitCast(vars[i], T_psize); + ArrayType *vars_type = ArrayType::get(T_psize, nvars); + new GlobalVariable(*mod, vars_type, true, + GlobalVariable::ExternalLinkage, + ConstantArray::get(vars_type, addrs), + name); } @@ -1144,7 +1146,7 @@ void jl_dump_native(const char *bc_fname, const char *unopt_bc_fname, const char } if (bc_fname || obj_fname) - addOptimizationPasses(&PM, jl_options.opt_level); + addOptimizationPasses(&PM, jl_options.opt_level, true); if (bc_fname) { // call output handler directly to avoid special case handling of `-` filename diff --git a/src/jitlayers.h b/src/jitlayers.h index 9b20c9d704a47..c7e48f10f111f 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -43,7 +43,7 @@ extern size_t jltls_offset_idx; typedef struct {Value *gv; int32_t index;} jl_value_llvm; // uses 1-based indexing void addTargetPasses(legacy::PassManagerBase *PM, TargetMachine *TM); -void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level); +void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump_native=false); void* jl_emit_and_add_to_shadow(GlobalVariable *gv, void *gvarinit = NULL); GlobalVariable *jl_emit_sysimg_slot(Module *m, Type *typ, const char *name, uintptr_t init, size_t &idx); @@ -191,6 +191,7 @@ Pass *createLateLowerGCFramePass(); Pass *createLowerExcHandlersPass(); Pass *createGCInvariantVerifierPass(bool Strong); Pass *createPropagateJuliaAddrspaces(); +Pass *createMultiVersioningPass(); Pass *createAllocOptPass(); // Whether the Function is an llvm or julia intrinsic. 
static inline bool isIntrinsicFunction(Function *F) diff --git a/src/julia_internal.h b/src/julia_internal.h index e80a57dcf18ac..6901776d535df 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1001,7 +1001,9 @@ extern jl_sym_t *nospecialize_sym; extern jl_sym_t *boundscheck_sym; extern jl_sym_t *gc_preserve_begin_sym; extern jl_sym_t *gc_preserve_end_sym; -void jl_register_fptrs(uint64_t sysimage_base, const char *base, const int32_t *offsets, +struct _jl_sysimg_fptrs_t; + +void jl_register_fptrs(uint64_t sysimage_base, const struct _jl_sysimg_fptrs_t *fptrs, jl_method_instance_t **linfos, size_t n); extern arraylist_t partial_inst; diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp new file mode 100644 index 0000000000000..22ef07f8d0b8b --- /dev/null +++ b/src/llvm-multiversioning.cpp @@ -0,0 +1,1067 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Function multi-versioning +#define DEBUG_TYPE "julia_multiversioning" +#undef DEBUG + +// LLVM pass to clone function for different archs + +#include "llvm-version.h" +#include "support/dtypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "julia.h" +#include "julia_internal.h" +#include "processor.h" + +#include +#include +#include +#include + +#include "codegen_shared.h" +#include "julia_assert.h" + +using namespace llvm; + +extern std::pair tbaa_make_child(const char *name, MDNode *parent=nullptr, + bool isConstant=false); + +namespace { + +// These are valid detail cloning conditions in the target flags. +constexpr uint32_t clone_mask = + JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH; + +struct MultiVersioning; + +// Treat identical mapping as missing and return `def` in that case. +// We mainly need this to identify cloned function using value map after LLVM cloning +// functions fills the map with identity entries. 
+template +Value *map_get(T &&vmap, Value *key, Value *def=nullptr) +{ + auto val = vmap.lookup(key); + if (!val || key == val) + return def; + return val; +} + +// Iterate through uses of a particular type. +// Recursively scan through `ConstantExpr` and `ConstantAggregate` use. +template +struct ConstantUses { + template + struct Info { + Use *use; + T *val; + // If `samebits == true`, the offset the original value appears in the constant. + size_t offset; + // This specify whether the original value appears in the current value in exactly + // the same bit pattern (with possibly an offset determined by `offset`). + bool samebits; + Info(Use *use, T *val, size_t offset, bool samebits) : + use(use), + val(val), + offset(offset), + samebits(samebits) + { + } + Info(Use *use, size_t offset, bool samebits) : + use(use), + val(cast(use->getUser())), + offset(offset), + samebits(samebits) + { + } + }; + using UseInfo = Info; + struct Frame : Info { + template + Frame(Args &&... args) : + Info(std::forward(args)...), + cur(this->val->use_empty() ? nullptr : &*this->val->use_begin()), + _next(cur ? 
cur->getNext() : nullptr) + { + } + private: + void next() + { + cur = _next; + if (!cur) + return; + _next = cur->getNext(); + } + Use *cur; + Use *_next; + friend struct ConstantUses; + }; + ConstantUses(Constant *c, Module &M) + : stack{Frame(nullptr, c, 0u, true)}, + M(M) + { + forward(); + } + UseInfo get_info() const + { + auto &top = stack.back(); + return UseInfo(top.cur, top.offset, top.samebits); + } + const SmallVector &get_stack() const + { + return stack; + } + void next() + { + stack.back().next(); + forward(); + } + bool done() + { + return stack.empty(); + } +private: + void forward(); + SmallVector stack; + Module &M; +}; + +template +void ConstantUses::forward() +{ + assert(!stack.empty()); + auto frame = &stack.back(); + const DataLayout &DL = M.getDataLayout(); + auto pop = [&] { + stack.pop_back(); + if (stack.empty()) { + return false; + } + frame = &stack.back(); + return true; + }; + auto push = [&] (Use *use, Constant *c, size_t offset, bool samebits) { + stack.emplace_back(use, c, offset, samebits); + frame = &stack.back(); + }; + auto handle_constaggr = [&] (Use *use, ConstantAggregate *aggr) { + if (!frame->samebits) { + push(use, aggr, 0, false); + return; + } + if (auto strct = dyn_cast(aggr)) { + auto layout = DL.getStructLayout(strct->getType()); + push(use, strct, frame->offset + layout->getElementOffset(use->getOperandNo()), true); + } + else if (auto ary = dyn_cast(aggr)) { + auto elty = ary->getType()->getElementType(); + push(use, ary, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); + } + else if (auto vec = dyn_cast(aggr)) { + auto elty = vec->getType()->getElementType(); + push(use, vec, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); + } + else { + jl_safe_printf("Unknown ConstantAggregate:\n"); + llvm_dump(aggr); + abort(); + } + }; + auto handle_constexpr = [&] (Use *use, ConstantExpr *expr) { + if (!frame->samebits) { + push(use, expr, 0, false); + return; + } + auto opcode = 
expr->getOpcode(); + if (opcode == Instruction::PtrToInt || opcode == Instruction::IntToPtr || + opcode == Instruction::AddrSpaceCast || opcode == Instruction::BitCast) { + push(use, expr, frame->offset, true); + } + else { + push(use, expr, 0, false); + } + }; + while (true) { + auto use = frame->cur; + if (!use) { + if (!pop()) + return; + continue; + } + auto user = use->getUser(); + if (isa(user)) + return; + frame->next(); + if (auto aggr = dyn_cast(user)) { + handle_constaggr(use, aggr); + } + else if (auto expr = dyn_cast(user)) { + handle_constexpr(use, expr); + } + } +} + +struct CloneCtx { + struct Target { + int idx; + uint32_t flags; + std::unique_ptr vmap; // ValueToValueMapTy is not movable.... + // function ids that needs relocation to be initialized + std::set relocs{}; + Target(int idx, const jl_target_spec_t &spec) : + idx(idx), + flags(spec.flags), + vmap(new ValueToValueMapTy) + { + } + }; + struct Group : Target { + std::vector clones; + std::set clone_fs; + Group(int base, const jl_target_spec_t &spec) : + Target(base, spec), + clones{}, + clone_fs{} + {} + Function *base_func(Function *orig_f) const + { + if (idx == 0) + return orig_f; + return cast(vmap->lookup(orig_f)); + } + }; + CloneCtx(MultiVersioning *pass, Module &M); + void clone_bases(); + void collect_func_infos(); + void clone_all_partials(); + void fix_gv_uses(); + void fix_inst_uses(); + void emit_metadata(); +private: + void prepare_vmap(ValueToValueMapTy &vmap); + bool is_vector(FunctionType *ty) const; + void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap); + uint32_t collect_func_info(Function &F); + void check_partial(Group &grp, Target &tgt); + void clone_partial(Group &grp, Target &tgt); + void add_features(Function *F, StringRef name, StringRef features) const; + template + T *add_comdat(T *G) const; + uint32_t get_func_id(Function *F); + template + Constant *rewrite_gv_init(const Stack& stack); + template + Value *rewrite_inst_use(const Stack& 
stack, Value *replace, Instruction *insert_before); + std::pair get_reloc_slot(Function *F); + Constant *get_ptrdiff32(Constant *ptr, Constant *base) const; + template + Constant *emit_offset_table(const std::vector &vars, StringRef name) const; + + LLVMContext &ctx; + Type *T_size; + Type *T_int32; + Type *T_void; + PointerType *T_psize; + PointerType *T_pvoidfunc; + MDNode *tbaa_const; + MultiVersioning *pass; + std::vector specs; + std::vector groups{}; + std::vector fvars; + std::vector gvars; + Module &M; + // Map from original functiton to one based index in `fvars` + std::map func_ids{}; + std::vector orig_funcs{}; + std::vector func_infos{}; + std::set cloned{}; + // GV addresses and their corresponding function id (i.e. 0-based index in `fvars`) + std::vector> gv_relocs{}; + // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized. + std::map const_relocs; + bool has_veccall{false}; + bool has_cloneall{false}; +}; + +struct MultiVersioning: public ModulePass { + static char ID; + MultiVersioning() + : ModulePass(ID) + {} + +private: + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override + { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } + friend struct CloneCtx; +}; + +template +static inline std::vector consume_gv(Module &M, const char *name) +{ + // Get information about sysimg export functions from the two global variables. + // Strip them from the Module so that it's easier to handle the uses. + GlobalVariable *gv = M.getGlobalVariable(name); + assert(gv && gv->hasInitializer()); + auto *ary = cast(gv->getInitializer()); + unsigned nele = ary->getNumOperands(); + std::vector res(nele); + for (unsigned i = 0; i < nele; i++) + res[i] = cast(ary->getOperand(i)->stripPointerCasts()); + assert(gv->use_empty()); + gv->eraseFromParent(); + if (ary->use_empty()) + ary->destroyConstant(); + return res; +} + +// Collect basic information about targets and functions. 
+CloneCtx::CloneCtx(MultiVersioning *pass, Module &M) + : ctx(M.getContext()), + T_size(M.getDataLayout().getIntPtrType(ctx, 0)), + T_int32(Type::getInt32Ty(ctx)), + T_void(Type::getVoidTy(ctx)), + T_psize(PointerType::get(T_size, 0)), + T_pvoidfunc(FunctionType::get(T_void, false)->getPointerTo()), + tbaa_const(tbaa_make_child("jtbaa_const", nullptr, true).first), + pass(pass), + specs(jl_get_llvm_clone_targets()), + fvars(consume_gv(M, "jl_sysimg_fvars")), + gvars(consume_gv(M, "jl_sysimg_gvars")), + M(M) +{ + groups.emplace_back(0, specs[0]); + uint32_t ntargets = specs.size(); + for (uint32_t i = 1; i < ntargets; i++) { + auto &spec = specs[i]; + if (spec.flags & JL_TARGET_CLONE_ALL) { + has_cloneall = true; + groups.emplace_back(i, spec); + } + else { + auto base = spec.base; + bool found = false; + for (auto &grp: groups) { + if (grp.idx == base) { + found = true; + grp.clones.emplace_back(i, spec); + break; + } + } + (void)found; + } + } + uint32_t nfvars = fvars.size(); + for (uint32_t i = 0; i < nfvars; i++) + func_ids[fvars[i]] = i + 1; + for (auto &F: M) { + if (F.empty()) + continue; + orig_funcs.push_back(&F); + } +} + +void CloneCtx::prepare_vmap(ValueToValueMapTy &vmap) +{ + // Workaround LLVM `CloneFunctionInfo` bug (?) pre-5.0 + // The `DICompileUnit`s are being cloned but are not added to the `llvm.dbg.cu` metadata + // which triggers assertions when generating native code/in the verifier. + // Fix this by forcing an identical mapping for all `DICompileUnit` recorded. + // The `DISubprogram` cloning on LLVM 5.0 handles this + // but it doesn't hurt to enforce the identity either. 
+ auto &MD = vmap.MD(); + for (auto cu: M.debug_compile_units()) { + MD[cu].reset(cu); + } +} + +void CloneCtx::clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap) +{ + Function::arg_iterator DestI = new_f->arg_begin(); + for (Function::const_arg_iterator J = F->arg_begin(); J != F->arg_end(); ++J) { + DestI->setName(J->getName()); + vmap[&*J] = &*DestI++; + } + SmallVector Returns; + CloneFunctionInto(new_f, F, vmap, true, Returns); +} + +// Clone all clone_all targets. Makes sure that the base targets are all available. +void CloneCtx::clone_bases() +{ + if (!has_cloneall) + return; + uint32_t ngrps = groups.size(); + for (uint32_t gid = 1; gid < ngrps; gid++) { + auto &grp = groups[gid]; + auto suffix = ".clone_" + std::to_string(grp.idx); + auto &vmap = *grp.vmap; + // Fill in old->new mapping. We need to do this before cloning the function so that + // the intra target calls are automatically fixed up on cloning. + for (auto F: orig_funcs) { + Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(), + F->getName() + suffix, &M); + new_f->copyAttributesFrom(F); + vmap[F] = new_f; + } + prepare_vmap(vmap); + for (auto F: orig_funcs) { + clone_function(F, cast(vmap.lookup(F)), vmap); + } + } +} + +bool CloneCtx::is_vector(FunctionType *ty) const +{ + if (ty->getReturnType()->isVectorTy()) + return true; + for (auto arg: ty->params()) { + if (arg->isVectorTy()) { + return true; + } + } + return false; +} + +uint32_t CloneCtx::collect_func_info(Function &F) +{ + uint32_t flag = 0; + if (!pass->getAnalysis(F).getLoopInfo().empty()) + flag |= JL_TARGET_CLONE_LOOP; + if (is_vector(F.getFunctionType())) { + flag |= JL_TARGET_CLONE_SIMD; + has_veccall = true; + } + for (auto &bb: F) { + for (auto &I: bb) { + if (auto call = dyn_cast(&I)) { + if (is_vector(call->getFunctionType())) { + has_veccall = true; + flag |= JL_TARGET_CLONE_SIMD; + } + if (auto callee = call->getCalledFunction()) { + auto name = callee->getName(); + if 
(name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) { + flag |= JL_TARGET_CLONE_MATH; + } + } + } + else if (auto store = dyn_cast(&I)) { + if (store->getValueOperand()->getType()->isVectorTy()) { + flag |= JL_TARGET_CLONE_SIMD; + } + } + else if (I.getType()->isVectorTy()) { + flag |= JL_TARGET_CLONE_SIMD; + } + if (auto mathOp = dyn_cast(&I)) { + if (mathOp->getFastMathFlags().any()) { + flag |= JL_TARGET_CLONE_MATH; + } + } + if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) { + return flag; + } + } + } + return flag; +} + +void CloneCtx::collect_func_infos() +{ + uint32_t nfuncs = orig_funcs.size(); + func_infos.resize(nfuncs); + for (uint32_t i = 0; i < nfuncs; i++) { + func_infos[i] = collect_func_info(*orig_funcs[i]); + } +} + +void CloneCtx::clone_all_partials() +{ + // First decide what to clone + // Do this before actually cloning the functions + // so that the call graph is easier to understand + for (auto &grp: groups) { + for (auto &tgt: grp.clones) { + check_partial(grp, tgt); + } + } + for (auto &grp: groups) { + for (auto &tgt: grp.clones) + clone_partial(grp, tgt); + // Also set feature strings for base target functions + // now that all the actual cloning is done. + auto &base_spec = specs[grp.idx]; + for (auto orig_f: orig_funcs) { + add_features(grp.base_func(orig_f), base_spec.cpu_name, base_spec.cpu_features); + } + } + func_infos.clear(); // We don't need this anymore +} + +void CloneCtx::check_partial(Group &grp, Target &tgt) +{ + auto flag = specs[tgt.idx].flags & clone_mask; + auto suffix = ".clone_" + std::to_string(tgt.idx); + auto &vmap = *tgt.vmap; + uint32_t nfuncs = func_infos.size(); + + std::set all_origs; + // Use a simple heuristic to decide which function we need to clone. + for (uint32_t i = 0; i < nfuncs; i++) { + if (!(func_infos[i] & flag)) + continue; + auto orig_f = orig_funcs[i]; + // Fill in old->new mapping. 
We need to do this before cloning the function so that + // the intra target calls are automatically fixed up on cloning. + auto F = grp.base_func(orig_f); + Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(), + F->getName() + suffix, &M); + new_f->copyAttributesFrom(F); + vmap[F] = new_f; + if (!has_cloneall) + cloned.insert(orig_f); + grp.clone_fs.insert(i); + all_origs.insert(orig_f); + } + std::set sets[2] = {all_origs, {}}; + auto *cur_set = &sets[0]; + auto *next_set = &sets[1]; + // Reduce dispatch by expand the cloning set to functions that are directly called by + // and calling cloned functions. + auto &graph = pass->getAnalysis().getCallGraph(); + while (!cur_set->empty()) { + for (auto orig_f: *cur_set) { + // Use the uncloned function since it's already in the call graph + auto node = graph[orig_f]; + for (const auto &I: *node) { + auto child_node = I.second; + auto orig_child_f = child_node->getFunction(); + if (!orig_child_f) + continue; + // Already cloned + if (all_origs.count(orig_child_f)) + continue; + bool calling_clone = false; + for (const auto &I2: *child_node) { + auto orig_child_f2 = I2.second->getFunction(); + if (!orig_child_f2) + continue; + if (all_origs.count(orig_child_f2)) { + calling_clone = true; + break; + } + } + if (!calling_clone) + continue; + next_set->insert(orig_child_f); + all_origs.insert(orig_child_f); + auto child_f = grp.base_func(orig_child_f); + Function *new_f = Function::Create(child_f->getFunctionType(), + child_f->getLinkage(), + child_f->getName() + suffix, &M); + new_f->copyAttributesFrom(child_f); + vmap[child_f] = new_f; + } + } + std::swap(cur_set, next_set); + next_set->clear(); + } + for (uint32_t i = 0; i < nfuncs; i++) { + // Only need to handle expanded functions + if (func_infos[i] & flag) + continue; + auto orig_f = orig_funcs[i]; + if (all_origs.count(orig_f)) { + if (!has_cloneall) + cloned.insert(orig_f); + grp.clone_fs.insert(i); + } + } +} + +void 
CloneCtx::clone_partial(Group &grp, Target &tgt) +{ + auto &spec = specs[tgt.idx]; + auto &vmap = *tgt.vmap; + uint32_t nfuncs = orig_funcs.size(); + prepare_vmap(vmap); + for (uint32_t i = 0; i < nfuncs; i++) { + auto orig_f = orig_funcs[i]; + auto F = grp.base_func(orig_f); + if (auto new_v = map_get(vmap, F)) { + auto new_f = cast(new_v); + assert(new_f != F); + clone_function(F, new_f, vmap); + // We can set the feature strings now since no one is going to + // clone these functions again. + add_features(new_f, spec.cpu_name, spec.cpu_features); + } + } +} + +void CloneCtx::add_features(Function *F, StringRef name, StringRef features) const +{ + auto attr = F->getFnAttribute("target-features"); + if (attr.isStringAttribute()) { + std::string new_features = attr.getValueAsString(); + new_features += ","; + new_features += features; + F->addFnAttr("target-features", new_features); + } + else { + F->addFnAttr("target-features", features); + } + F->addFnAttr("target-cpu", name); +} + +uint32_t CloneCtx::get_func_id(Function *F) +{ + auto &ref = func_ids[F]; + if (!ref) { + fvars.push_back(F); + ref = fvars.size(); + } + return ref - 1; +} + +template +Constant *CloneCtx::rewrite_gv_init(const Stack& stack) +{ + // Null initialize so that LLVM put it in the correct section. 
+ SmallVector args; + Constant *res = ConstantPointerNull::get(cast(stack[0].val->getType())); + uint32_t nlevel = stack.size(); + for (uint32_t i = 1; i < nlevel; i++) { + auto &frame = stack[i]; + auto val = frame.val; + Use *use = frame.use; + unsigned idx = use->getOperandNo(); + unsigned nargs = val->getNumOperands(); + args.resize(nargs); + for (unsigned j = 0; j < nargs; j++) { + if (idx == j) { + args[j] = res; + } + else { + args[j] = cast(val->getOperand(j)); + } + } + if (auto expr = dyn_cast(val)) { + res = expr->getWithOperands(args); + } + else if (auto ary = dyn_cast(val)) { + res = ConstantArray::get(ary->getType(), args); + } + else if (auto strct = dyn_cast(val)) { + res = ConstantStruct::get(strct->getType(), args); + } + else if (isa(val)) { + res = ConstantVector::get(args); + } + else { + jl_safe_printf("Unknown const use."); + llvm_dump(val); + abort(); + } + } + return res; +} + +void CloneCtx::fix_gv_uses() +{ + auto single_pass = [&] (Function *orig_f) { + bool changed = false; + for (auto uses = ConstantUses(orig_f, M); !uses.done(); uses.next()) { + changed = true; + auto &stack = uses.get_stack(); + auto info = uses.get_info(); + // We only support absolute pointer relocation. + assert(info.samebits); + // And only for non-constant global variable initializers + auto val = cast(info.val); + assert(info.use->getOperandNo() == 0); + assert(!val->isConstant()); + auto fid = get_func_id(orig_f); + auto addr = ConstantExpr::getPtrToInt(val, T_size); + if (info.offset) + addr = ConstantExpr::getAdd(addr, ConstantInt::get(T_size, info.offset)); + gv_relocs.emplace_back(addr, fid); + val->setInitializer(rewrite_gv_init(stack)); + } + return changed; + }; + for (auto orig_f: orig_funcs) { + if (!has_cloneall && !cloned.count(orig_f)) + continue; + while (single_pass(orig_f)) { + } + } +} + +std::pair CloneCtx::get_reloc_slot(Function *F) +{ + // Null initialize so that LLVM put it in the correct section. 
+ auto id = get_func_id(F); + auto &slot = const_relocs[id]; + if (!slot) + slot = new GlobalVariable(M, T_pvoidfunc, false, GlobalVariable::InternalLinkage, + ConstantPointerNull::get(T_pvoidfunc), + F->getName() + ".reloc_slot"); + return std::make_pair(id, slot); +} + +template +Value *CloneCtx::rewrite_inst_use(const Stack& stack, Value *replace, Instruction *insert_before) +{ + SmallVector args; + uint32_t nlevel = stack.size(); + for (uint32_t i = 1; i < nlevel; i++) { + auto &frame = stack[i]; + auto val = frame.val; + Use *use = frame.use; + unsigned idx = use->getOperandNo(); + if (auto expr = dyn_cast(val)) { + auto inst = expr->getAsInstruction(); + inst->replaceUsesOfWith(val->getOperand(idx), replace); + inst->insertBefore(insert_before); + replace = inst; + continue; + } + unsigned nargs = val->getNumOperands(); + args.resize(nargs); + for (unsigned j = 0; j < nargs; j++) { + auto op = val->getOperand(j); + if (idx == j) { + args[j] = UndefValue::get(op->getType()); + } + else { + args[j] = cast(op); + } + } + if (auto ary = dyn_cast(val)) { + replace = InsertValueInst::Create(ConstantArray::get(ary->getType(), args), + replace, {idx}, "", insert_before); + } + else if (auto strct = dyn_cast(val)) { + replace = InsertValueInst::Create(ConstantStruct::get(strct->getType(), args), + replace, {idx}, "", insert_before); + } + else if (isa(val)) { + replace = InsertElementInst::Create(ConstantVector::get(args), replace, + ConstantInt::get(T_size, idx), "", + insert_before); + } + else { + jl_safe_printf("Unknown const use."); + llvm_dump(val); + abort(); + } + } + return replace; +} + +void CloneCtx::fix_inst_uses() +{ + uint32_t nfuncs = orig_funcs.size(); + for (auto &grp: groups) { + auto suffix = ".clone_" + std::to_string(grp.idx); + for (uint32_t i = 0; i < nfuncs; i++) { + if (!grp.clone_fs.count(i)) + continue; + auto orig_f = orig_funcs[i]; + auto F = grp.base_func(orig_f); + bool changed; + do { + changed = false; + for (auto uses = 
ConstantUses(F, M); !uses.done(); uses.next()) { + auto info = uses.get_info(); + auto use_i = info.val; + auto use_f = use_i->getFunction(); + if (!use_f->getName().endswith(suffix)) + continue; + Instruction *insert_before = use_i; + if (auto phi = dyn_cast(use_i)) + insert_before = phi->getIncomingBlock(*info.use)->getTerminator(); + uint32_t id; + GlobalVariable *slot; + std::tie(id, slot) = get_reloc_slot(orig_f); + Instruction *ptr = new LoadInst(T_pvoidfunc, slot, "", false, insert_before); + ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); + ptr = new BitCastInst(ptr, F->getType(), "", insert_before); + use_i->setOperand(info.use->getOperandNo(), + rewrite_inst_use(uses.get_stack(), ptr, + insert_before)); + + grp.relocs.insert(id); + for (auto &tgt: grp.clones) { + // The enclosing function of the use is cloned, + // no need to deal with this use on this target. + if (map_get(*tgt.vmap, use_f)) + continue; + tgt.relocs.insert(id); + } + + changed = true; + } + } while (changed); + } + } +} + +template +inline T *CloneCtx::add_comdat(T *G) const +{ +#if defined(_OS_WINDOWS_) + // Add comdat information to make MSVC link.exe happy + // it's valid to emit this for ld.exe too, + // but makes it very slow to link for no benefit +#if defined(_COMPILER_MICROSOFT_) + Comdat *jl_Comdat = G->getParent()->getOrInsertComdat(G->getName()); + // ELF only supports Comdat::Any + jl_Comdat->setSelectionKind(Comdat::NoDuplicates); + G->setComdat(jl_Comdat); +#endif + // add __declspec(dllexport) to everything marked for export + if (G->getLinkage() == GlobalValue::ExternalLinkage) + G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); + else + G->setDLLStorageClass(GlobalValue::DefaultStorageClass); +#endif + return G; +} + +Constant *CloneCtx::get_ptrdiff32(Constant *ptr, Constant *base) const +{ + if (ptr->getType()->isPointerTy()) + ptr = ConstantExpr::getPtrToInt(ptr, T_size); + auto ptrdiff = ConstantExpr::getSub(ptr, base); + return sizeof(void*) == 8 ? 
ConstantExpr::getTrunc(ptrdiff, T_int32) : ptrdiff; +} + +template +Constant *CloneCtx::emit_offset_table(const std::vector &vars, StringRef name) const +{ + assert(!vars.empty()); + add_comdat(GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage, + name + "_base", + ConstantExpr::getBitCast(vars[0], T_psize), &M)); + auto vbase = ConstantExpr::getPtrToInt(vars[0], T_size); + uint32_t nvars = vars.size(); + std::vector offsets(nvars + 1); + offsets[0] = ConstantInt::get(T_int32, nvars); + offsets[1] = ConstantInt::get(T_int32, 0); + for (uint32_t i = 1; i < nvars; i++) + offsets[i + 1] = get_ptrdiff32(vars[i], vbase); + ArrayType *vars_type = ArrayType::get(T_int32, nvars + 1); + add_comdat(new GlobalVariable(M, vars_type, true, + GlobalVariable::ExternalLinkage, + ConstantArray::get(vars_type, offsets), + name + "_offsets")); + return vbase; +} + +void CloneCtx::emit_metadata() +{ + // Store back the information about exported functions. + auto fbase = emit_offset_table(fvars, "jl_sysimg_fvars"); + auto gbase = emit_offset_table(gvars, "jl_sysimg_gvars"); + uint32_t nfvars = fvars.size(); + + uint32_t ntargets = specs.size(); + SmallVector targets(ntargets); + for (auto &grp: groups) { + targets[grp.idx] = &grp; + for (auto &tgt: grp.clones) { + targets[tgt.idx] = &tgt; + } + } + + // Generate `jl_dispatch_target_ids` + { + const uint32_t base_flags = has_veccall ? 
JL_TARGET_VEC_CALL : 0; + std::vector data; + auto push_i32 = [&] (uint32_t v) { + uint8_t buff[4]; + memcpy(buff, &v, 4); + data.insert(data.end(), buff, buff + 4); + }; + push_i32(ntargets); + for (uint32_t i = 0; i < ntargets; i++) { + push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME)); + auto &specdata = specs[i].data; + data.insert(data.end(), specdata.begin(), specdata.end()); + } + auto value = ConstantDataArray::get(ctx, data); + add_comdat(new GlobalVariable(M, value->getType(), true, + GlobalVariable::ExternalLinkage, + value, "jl_dispatch_target_ids")); + } + + // Generate `jl_dispatch_reloc_slots` + std::set shared_relocs; + { + std::stable_sort(gv_relocs.begin(), gv_relocs.end(), + [] (const std::pair &lhs, + const std::pair &rhs) { + return lhs.second < rhs.second; + }); + std::vector values{nullptr}; + uint32_t gv_reloc_idx = 0; + uint32_t ngv_relocs = gv_relocs.size(); + for (uint32_t id = 0; id < nfvars; id++) { + // TODO: + // explicitly set section? so that we are sure the relocation slots + // are in the same section as `gbase`. 
+ auto id_v = ConstantInt::get(T_int32, id); + for (; gv_reloc_idx < ngv_relocs && gv_relocs[gv_reloc_idx].second == id; + gv_reloc_idx++) { + shared_relocs.insert(id); + values.push_back(id_v); + values.push_back(get_ptrdiff32(gv_relocs[gv_reloc_idx].first, gbase)); + } + auto it = const_relocs.find(id); + if (it != const_relocs.end()) { + values.push_back(id_v); + values.push_back(get_ptrdiff32(it->second, gbase)); + } + } + values[0] = ConstantInt::get(T_int32, values.size() / 2); + ArrayType *vars_type = ArrayType::get(T_int32, values.size()); + add_comdat(new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, + ConstantArray::get(vars_type, values), + "jl_dispatch_reloc_slots")); + } + + // Generate `jl_dispatch_fvars_idxs` and `jl_dispatch_fvars_offsets` + { + std::vector idxs; + std::vector offsets; + for (uint32_t i = 0; i < ntargets; i++) { + auto tgt = targets[i]; + auto &spec = specs[i]; + uint32_t len_idx = idxs.size(); + idxs.push_back(0); // We will fill in the real value later. + uint32_t count = 0; + if (i == 0 || spec.flags & JL_TARGET_CLONE_ALL) { + auto grp = static_cast(tgt); + count = jl_sysimg_tag_mask; + for (uint32_t j = 0; j < nfvars; j++) { + if (shared_relocs.count(j) || tgt->relocs.count(j)) { + count++; + idxs.push_back(j); + } + if (i != 0) { + offsets.push_back(get_ptrdiff32(grp->base_func(fvars[j]), fbase)); + } + } + } + else { + auto baseidx = spec.base; + auto grp = static_cast(targets[baseidx]); + idxs.push_back(baseidx); + for (uint32_t j = 0; j < nfvars; j++) { + auto base_f = grp->base_func(fvars[j]); + if (shared_relocs.count(j)) { + count++; + idxs.push_back(jl_sysimg_tag_mask | j); + auto f = map_get(*tgt->vmap, base_f, base_f); + offsets.push_back(get_ptrdiff32(cast(f), fbase)); + } + else if (auto f = map_get(*tgt->vmap, base_f)) { + count++; + idxs.push_back(tgt->relocs.count(j) ? 
(jl_sysimg_tag_mask | j) : j); + offsets.push_back(get_ptrdiff32(cast(f), fbase)); + } + } + } + idxs[len_idx] = count; + } + auto idxval = ConstantDataArray::get(ctx, idxs); + add_comdat(new GlobalVariable(M, idxval->getType(), true, + GlobalVariable::ExternalLinkage, + idxval, "jl_dispatch_fvars_idxs")); + ArrayType *offsets_type = ArrayType::get(T_int32, offsets.size()); + add_comdat(new GlobalVariable(M, offsets_type, true, + GlobalVariable::ExternalLinkage, + ConstantArray::get(offsets_type, offsets), + "jl_dispatch_fvars_offsets")); + } +} + +bool MultiVersioning::runOnModule(Module &M) +{ + // Group targets and identify cloning bases. + // Also initialize function info maps (we'll update these maps as we go) + // Maps that we need includes, + // + // * Original function -> ID (initialize from `fvars` and allocate ID lazily) + // * Cloned function -> Original function (add as we clone functions) + // * Original function -> Base function (target specific and updated by LLVM) + // * ID -> relocation slots (const). + CloneCtx clone(this, M); + + // Collect a list of original functions and clone base functions + clone.clone_bases(); + + // Collect function info (type of instruction used) + clone.collect_func_infos(); + + // If any partially cloned target exist decide which functions to clone for these targets. + // Clone functions for each group and collect a list of them. + // We can also add feature strings for cloned functions + // now that no additional cloning needs to be done. + clone.clone_all_partials(); + + // Scan **ALL** cloned functions (including full cloning for base target) + // for global variables initialization use. + // Replace them with `null` slot to be initialized at runtime and record relocation slot. + // These relocations must be initialized for **ALL** targets. + clone.fix_gv_uses(); + + // For each group, scan all functions cloned by **PARTIALLY** cloned targets for + // instruction use. 
+ // A function needs a const relocation slot if it is cloned and is called by a + // uncloned function for at least one partially cloned target in the group. + // This is also the condition that a use in an uncloned function needs to be replaced with + // a slot load (i.e. if both the caller and the callee are always cloned or not cloned + // on all targets, the caller site does not need a relocation slot). + // A target needs a slot to be initialized iff at least one caller is not initialized. + clone.fix_inst_uses(); + + // Store back sysimg information with the correct format. + // At this point, we should have fixed up all the uses of the cloned functions + // and collected all the shared/target-specific relocations. + clone.emit_metadata(); + + return true; +} + +char MultiVersioning::ID = 0; +static RegisterPass X("JuliaMultiVersioning", "JuliaMultiVersioning Pass", + false /* Only looks at CFG */, + false /* Analysis Pass */); + +} + +Pass *createMultiVersioningPass() +{ + return new MultiVersioning(); +} diff --git a/src/processor.h b/src/processor.h index 512b8a2936e16..8ab8fecbb23bf 100644 --- a/src/processor.h +++ b/src/processor.h @@ -124,7 +124,7 @@ int jl_test_cpu_feature(jl_cpu_feature_t feature); static const uint32_t jl_sysimg_tag_mask = 0x80000000u; static const uint32_t jl_sysimg_val_mask = ~((uint32_t)0x80000000u); -typedef struct { +typedef struct _jl_sysimg_fptrs_t { // base function pointer const char *base; // number of functions diff --git a/src/staticdata.c b/src/staticdata.c index 72986e5312c6c..e06907947fb1c 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -9,6 +9,7 @@ #include "julia.h" #include "julia_internal.h" #include "builtin_proto.h" +#include "processor.h" #ifndef _OS_WINDOWS_ #include @@ -134,19 +135,13 @@ static void *jl_sysimg_handle = NULL; static uint64_t sysimage_base = 0; static uintptr_t *sysimg_gvars_base = NULL; static const int32_t *sysimg_gvars_offsets = NULL; -static const char *sysimg_fvars_base = NULL; 
-static const int32_t *sysimg_fvars_offsets = NULL; +static jl_sysimg_fptrs_t sysimg_fptrs; static inline uintptr_t *sysimg_gvars(uintptr_t *base, size_t idx) { return base + sysimg_gvars_offsets[idx] / sizeof(base[0]); } -static inline uintptr_t sysimg_fvars(const char *base, size_t idx) -{ - return (uintptr_t)(base + sysimg_fvars_offsets[idx]); -} - JL_DLLEXPORT int jl_running_on_valgrind(void) { return RUNNING_ON_VALGRIND; @@ -160,9 +155,8 @@ static void jl_load_sysimg_so(void) sysimg_gvars_base = (uintptr_t*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_base"); sysimg_gvars_offsets = (const int32_t*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_offsets"); - sysimg_fvars_base = (const char*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_fvars_base"); - sysimg_fvars_offsets = (const int32_t*)jl_dlsym(jl_sysimg_handle, - "jl_sysimg_fvars_offsets"); + sysimg_gvars_offsets += 1; + assert(sysimg_fptrs.base); globalUnique = *(size_t*)jl_dlsym(jl_sysimg_handle, "jl_globalUnique"); #ifdef JULIA_ENABLE_THREADING size_t tls_getter_idx = *(size_t*)jl_dlsym(jl_sysimg_handle, @@ -187,6 +181,9 @@ static void jl_load_sysimg_so(void) } #endif } + else { + memset(&sysimg_fptrs, 0, sizeof(sysimg_fptrs)); + } const char *sysimg_data = (const char*)jl_dlsym(jl_sysimg_handle, "jl_system_image_data"); size_t len = *(size_t*)jl_dlsym(jl_sysimg_handle, "jl_system_image_size"); jl_restore_system_image_data(sysimg_data, len); @@ -931,14 +928,15 @@ static jl_value_t *jl_read_value(jl_serializer_state *s) static void jl_update_all_fptrs(jl_serializer_state *s) { - const char *fvars_base = sysimg_fvars_base; + jl_sysimg_fptrs_t fvars = sysimg_fptrs; // make these NULL now so we skip trying to restore GlobalVariable pointers later sysimg_gvars_base = NULL; - sysimg_fvars_base = NULL; + sysimg_fptrs.base = NULL; int sysimg_fvars_max = s->fptr_record->size / sizeof(void*); size_t i; uintptr_t base = (uintptr_t)&s->s->buf[0]; jl_method_instance_t **linfos = (jl_method_instance_t**)&s->fptr_record->buf[0]; + 
uint32_t clone_idx = 0; for (i = 0; i < sysimg_fvars_max; i++) { uintptr_t val = (uintptr_t)&linfos[i]; uint32_t offset = load_uint32_be(&val); @@ -950,18 +948,28 @@ static void jl_update_all_fptrs(jl_serializer_state *s) offset = ~offset; } jl_method_instance_t *li = (jl_method_instance_t*)(base + offset); - if (fvars_base == NULL) { + if (fvars.base == NULL) { li->jlcall_api = 0; } else { + uintptr_t base = (uintptr_t)fvars.base; assert(jl_is_method(li->def.method) && li->jlcall_api && li->jlcall_api != 2); linfos[i] = li; - jl_fptr_to_llvm((jl_fptr_t)sysimg_fvars(fvars_base, i), li, cfunc); + int32_t offset = fvars.offsets[i]; + for (; clone_idx < fvars.nclones; clone_idx++) { + uint32_t idx = fvars.clone_idxs[clone_idx] & jl_sysimg_val_mask; + if (idx < i) + continue; + if (idx == i) + offset = fvars.clone_offsets[clone_idx]; + break; + } + jl_fptr_to_llvm((jl_fptr_t)(base + offset), li, cfunc); } } } - if (fvars_base) { - jl_register_fptrs(sysimage_base, fvars_base, sysimg_fvars_offsets, linfos, sysimg_fvars_max); + if (fvars.base) { + jl_register_fptrs(sysimage_base, &fvars, linfos, sysimg_fvars_max); } } @@ -1339,6 +1347,7 @@ JL_DLLEXPORT void jl_set_sysimg_so(void *handle) if (jl_options.cpu_target == NULL) jl_options.cpu_target = "native"; jl_sysimg_handle = handle; + sysimg_fptrs = jl_init_processor_sysimg(handle); } static void jl_restore_system_image_from_stream(ios_t *f) From ded8d464e8caeac5098192dc023e0b2520ccbcbe Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Mon, 25 Sep 2017 16:25:46 -0400 Subject: [PATCH 9/9] Finish up function multiversioning support * Enable test function multiversioning on the CI We can't do too much cloning on the CI before hitting the timeout or memory limit... Also avoid turning on cloning on circle CI since we seem to be very close to the memory limit. 
* Add devdoc --- .travis.yml | 1 + contrib/windows/appveyor_build.sh | 1 + doc/src/devdocs/sysimg.md | 65 +++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/.travis.yml b/.travis.yml index f4636e759ecff..d7cf3002f686b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -101,6 +101,7 @@ before_install: export JULIA_CPU_CORES=2; export JULIA_TEST_MAXRSS_MB=600; TESTSTORUN="all --skip linalg/triangular subarray"; fi # TODO: re enable these if possible without timing out + - echo "override JULIA_CPU_TARGET=generic;native" >> Make.user - git clone -q git://git.kitenet.net/moreutils script: - echo BUILDOPTS=$BUILDOPTS diff --git a/contrib/windows/appveyor_build.sh b/contrib/windows/appveyor_build.sh index 4f1ab3451016c..33f101cf18eb6 100755 --- a/contrib/windows/appveyor_build.sh +++ b/contrib/windows/appveyor_build.sh @@ -53,6 +53,7 @@ else echo 'LIBBLAS = -L$(JULIAHOME)/usr/bin -lopenblas' >> Make.user echo 'LIBBLASNAME = libopenblas' >> Make.user fi +echo "override JULIA_CPU_TARGET=generic;native" >> Make.user # Set XC_HOST if in Cygwin or Linux case $(uname) in diff --git a/doc/src/devdocs/sysimg.md b/doc/src/devdocs/sysimg.md index 2248911710e90..ba0221d475208 100644 --- a/doc/src/devdocs/sysimg.md +++ b/doc/src/devdocs/sysimg.md @@ -38,3 +38,68 @@ and `force` set to `true`, one would execute: ``` julia build_sysimg.jl /tmp/sys core2 ~/userimg.jl --force ``` + +## System image optimized for multiple microarchitectures + +The system image can be compiled simultaneously for multiple CPU microarchitectures +under the same instruction set architecture (ISA). Multiple versions of the same function +may be created with minimum dispatch point inserted into shared functions +in order to take advantage of different ISA extensions or other microarchitecture features. +The version that offers the best performance will be selected automatically at runtime +based on available features. 
+
+### Specifying multiple system image targets
+
+A multi-microarch system image can be enabled by passing multiple targets
+during system image compilation. This can be done either with the `JULIA_CPU_TARGET` make option
+or with the `-C` command line option when running the compilation command manually.
+Multiple targets are separated by `;` in the option.
+The syntax for each target is a CPU name followed by multiple features separated by `,`.
+All features supported by LLVM are supported and a feature can be disabled with a `-` prefix.
+(`+` prefix is also allowed and ignored to be consistent with LLVM syntax).
+Additionally, two special features are supported to control the function cloning behavior.
+
+1. `clone_all`
+
+    By default, only functions that are the most likely to benefit from
+    the microarchitecture features will be cloned.
+    When `clone_all` is specified for a target, however,
+    **all** functions in the system image will be cloned for the target.
+    The negative form `-clone_all` can be used to prevent the built-in
+    heuristic from cloning all functions.
+
+2. `base(<n>)`
+
+    Where `<n>` is a placeholder for a non-negative number (e.g. `base(0)`, `base(1)`).
+    By default, a partially cloned (i.e. not `clone_all`) target will use functions
+    from the default target (first one specified) if a function is not cloned.
+    This behavior can be changed by specifying a different base with the `base(<n>)` option.
+    The `n`th target (0-based) will be used as the base target instead of the default (`0`th) one.
+    The base target has to be either `0` or another `clone_all` target.
+    Specifying a non-default `clone_all` target as the base target will cause an error.
+
+### Implementation overview
+
+This is a brief overview of the different parts involved in the implementation.
+See the code comments for each component for more implementation details.
+
+1. System image compilation
+
+    The parsing and cloning decisions are made in `src/processor*`.
+    We currently support cloning of functions based on the presence of loops,
+    simd instructions, or other math operations (e.g. fastmath, fma, muladd).
+    This information is passed on to `src/llvm-multiversioning.cpp` which does the actual cloning.
+    In addition to doing the cloning and inserting dispatch slots
+    (see comments in `MultiVersioning::runOnModule` for how this is done),
+    the pass also generates metadata so that the runtime can load and initialize the
+    system image correctly.
+    A detailed description of the metadata is available in `src/processor.h`.
+
+2. System image loading
+
+    The loading and initialization of the system image is done in `src/processor*` by
+    parsing the metadata saved during system image generation.
+    Host feature detection and selection decisions are made in `src/processor_*.cpp`
+    depending on the ISA. The target selection will prefer an exact CPU name match,
+    larger vector register size, and largest number of features.
+    An overview of this process is in `src/processor.cpp`.