
Commit ea9bd29

Merge tag 'kvm-x86-fixes-6.14-rcN.2' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 6.14-rcN #2

 - Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.

 - Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
   the host's value, which can lead to unexpected bus lock #DBs.

 - Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
   emulate BTF.  KVM's lack of context switching has meant BTF has always been
   broken to some extent.

 - Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the
   guest can enable DebugSwap without KVM's knowledge.

 - Fix a bug in mmu_stress_test where a vCPU could finish the "writes to RO
   memory" phase without actually generating a write-protection fault.

 - Fix a printf() goof in the SEV smoke test that causes build failures with
   -Werror.

 - Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
   isn't supported by KVM.
2 parents 1cdad67 + f9dc8fb commit ea9bd29


12 files changed: +91 -35 lines


arch/x86/include/asm/kvm_host.h

+1
@@ -780,6 +780,7 @@ struct kvm_vcpu_arch {
         u32 pkru;
         u32 hflags;
         u64 efer;
+        u64 host_debugctl;
         u64 apic_base;
         struct kvm_lapic *apic; /* kernel irqchip context */
         bool load_eoi_exitmap_pending;

arch/x86/kvm/cpuid.c

+1-1
@@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)

                 entry->ecx = entry->edx = 0;
                 if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
-                        entry->eax = entry->ebx;
+                        entry->eax = entry->ebx = 0;
                         break;
                 }

arch/x86/kvm/svm/sev.c

+17-7
@@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)

 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
 {
+        struct kvm *kvm = svm->vcpu.kvm;
+
         /*
          * All host state for SEV-ES guests is categorized into three swap types
          * based on how it is handled by hardware during a world switch:
@@ -4613,14 +4615,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are

         /*
          * If DebugSwap is enabled, debug registers are loaded but NOT saved by
-         * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
-         * saves and loads debug registers (Type-A).
+         * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+         * not save or load debug registers. Sadly, KVM can't prevent SNP
+         * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+         * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+         * the guest has actually enabled (or not!) in the VMSA.
+         *
+         * If DebugSwap is *possible*, save the masks so that they're restored
+         * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+         * rely on the CPU to restore the host values; KVM will restore them as
+         * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+         * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+         * don't need to be restored per se, KVM just needs to ensure they are
+         * loaded with the correct values *if* the CPU writes the MSRs.
          */
-        if (sev_vcpu_has_debug_swap(svm)) {
-                hostsa->dr0 = native_get_debugreg(0);
-                hostsa->dr1 = native_get_debugreg(1);
-                hostsa->dr2 = native_get_debugreg(2);
-                hostsa->dr3 = native_get_debugreg(3);
+        if (sev_vcpu_has_debug_swap(svm) ||
+            (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
                 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
                 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
                 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
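
Spelled out, the new condition saves the host's DR address masks whenever DebugSwap could possibly be in effect: either the vCPU has DebugSwap enabled as far as KVM knows, or it belongs to an SNP guest on DebugSwap-capable hardware, where the guest can enable the feature without KVM's knowledge. A throwaway sketch that merely enumerates that predicate (standalone C; the three booleans are hypothetical stand-ins for sev_vcpu_has_debug_swap(), sev_snp_guest() and cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP)):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the condition guarding the hostsa->drN_addr_mask saves above. */
static bool save_dr_masks(bool vcpu_debug_swap, bool snp_guest, bool cpu_has_debug_swap)
{
        return vcpu_debug_swap || (snp_guest && cpu_has_debug_swap);
}

int main(void)
{
        for (int v = 0; v <= 1; v++)
                for (int s = 0; s <= 1; s++)
                        for (int c = 0; c <= 1; c++)
                                printf("vcpu_swap=%d snp=%d cpu_swap=%d -> save masks: %d\n",
                                       v, s, c, save_dr_masks(v, s, c));
        return 0;
}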

arch/x86/kvm/svm/svm.c

+49
@@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
                         break;
                 }
+
+                /*
+                 * AMD changed the architectural behavior of bits 5:2. On CPUs
+                 * without BusLockTrap, bits 5:2 control "external pins", but
+                 * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+                 * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
+                 * the guest to set bits 5:2 despite not actually virtualizing
+                 * Performance-Monitoring/Breakpoint external pins. Drop bits
+                 * 5:2 for backwards compatibility.
+                 */
+                data &= ~GENMASK(5, 2);
+
+                /*
+                 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+                 * way to communicate lack of support to the guest.
+                 */
+                if (data & DEBUGCTLMSR_BTF) {
+                        kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+                        data &= ~DEBUGCTLMSR_BTF;
+                }
+
                 if (data & DEBUGCTL_RESERVED_BITS)
                         return 1;

@@ -4189,6 +4210,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in

         guest_state_enter_irqoff();

+        /*
+         * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+         * VMRUN controls whether or not physical IRQs are masked (KVM always
+         * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+         * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+         * into guest state if delivery of an event during VMRUN triggers a
+         * #VMEXIT, and the guest_state transitions already tell lockdep that
+         * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+         * this path, so IRQs aren't actually unmasked while running host code.
+         */
+        raw_local_irq_enable();
+
         amd_clear_divider();

         if (sev_es_guest(vcpu->kvm))
@@ -4197,6 +4230,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
         else
                 __svm_vcpu_run(svm, spec_ctrl_intercepted);

+        raw_local_irq_disable();
+
         guest_state_exit_irqoff();
 }

@@ -4253,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
         clgi();
         kvm_load_guest_xsave_state(vcpu);

+        /*
+         * Hardware only context switches DEBUGCTL if LBR virtualization is
+         * enabled. Manually load DEBUGCTL if necessary (and restore it after
+         * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+         * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+         */
+        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+                update_debugctlmsr(svm->vmcb->save.dbgctl);
+
         kvm_wait_lapic_expire(vcpu);

         /*
@@ -4280,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

+        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+                update_debugctlmsr(vcpu->arch.host_debugctl);
+
         kvm_load_host_xsave_state(vcpu);
         stgi();
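
Taken together with the DEBUGCTL_RESERVED_BITS change in svm.h below, the net effect on a guest WRMSR to DEBUGCTL is that only LBR (bit 0) survives: bits 5:2 are silently dropped, BTF (bit 1) is suppressed with a pr_unimpl message, and any other bit now fails the reserved-bits check. A minimal standalone sketch of the bit arithmetic, with GENMASK_ULL() re-created locally for user space (the bit positions, LBR = bit 0 and BTF = bit 1, follow the architectural DEBUGCTL layout):

#include <stdio.h>
#include <stdint.h>

/* Local re-creation of the kernel's GENMASK_ULL() for a 64-bit mask. */
#define GENMASK_ULL(h, l)       (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define DEBUGCTLMSR_LBR         (1ULL << 0)
#define DEBUGCTLMSR_BTF         (1ULL << 1)

int main(void)
{
        uint64_t data = 0x3f;   /* guest tries to set LBR, BTF and the legacy "pin" bits */

        data &= ~GENMASK_ULL(5, 2);             /* drop bits 5:2, i.e. clear 0x3c */
        if (data & DEBUGCTLMSR_BTF)
                data &= ~DEBUGCTLMSR_BTF;       /* suppress BTF, i.e. clear 0x02 */

        printf("GENMASK_ULL(5, 2) = %#llx\n", (unsigned long long)GENMASK_ULL(5, 2));
        printf("DEBUGCTL value actually kept = %#llx\n", (unsigned long long)data);    /* 0x1 */
        return 0;
}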

arch/x86/kvm/svm/svm.h

+1-1
@@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
 /* svm.c */
 #define MSR_INVALID 0xffffffffU

-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)

 extern bool dump_invalid_vmcb;

arch/x86/kvm/svm/vmenter.S

+1-9
@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
         mov VCPU_RDI(%_ASM_DI), %_ASM_DI

         /* Enter guest mode */
-        sti
-
 3:      vmrun %_ASM_AX
 4:
-        cli
-
         /* Pop @svm to RAX while it's the only available register. */
         pop %_ASM_AX

@@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
         mov KVM_VMCB_pa(%rax), %rax

         /* Enter guest mode */
-        sti
-
 1:      vmrun %rax
-
-2:      cli
-
+2:
         /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
         FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT

arch/x86/kvm/vmx/vmx.c

+2-6
@@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
  */
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-        struct vcpu_vmx *vmx = to_vmx(vcpu);
-
         if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
                 shrink_ple_window(vcpu);

         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

         vmx_vcpu_pi_load(vcpu, cpu);
-
-        vmx->host_debugctlmsr = get_debugctlmsr();
 }

 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
         }

         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-        if (vmx->host_debugctlmsr)
-                update_debugctlmsr(vmx->host_debugctlmsr);
+        if (vcpu->arch.host_debugctl)
+                update_debugctlmsr(vcpu->arch.host_debugctl);

 #ifndef CONFIG_X86_64
         /*

arch/x86/kvm/vmx/vmx.h

-2
@@ -340,8 +340,6 @@ struct vcpu_vmx {
         /* apic deadline value in host tsc */
         u64 hv_deadline_tsc;

-        unsigned long host_debugctlmsr;
-
         /*
          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
          * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included

arch/x86/kvm/x86.c

+2
@@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 set_debugreg(0, 7);
         }

+        vcpu->arch.host_debugctl = get_debugctlmsr();
+
         guest_timing_enter_irqoff();

         for (;;) {

tools/testing/selftests/kvm/mmu_stress_test.c

+13-8
@@ -18,6 +18,7 @@
 #include "ucall_common.h"

 static bool mprotect_ro_done;
+static bool all_vcpus_hit_ro_fault;

 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
@@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)

         /*
          * Write to the region while mprotect(PROT_READ) is underway. Keep
-         * looping until the memory is guaranteed to be read-only, otherwise
-         * vCPUs may complete their writes and advance to the next stage
-         * prematurely.
+         * looping until the memory is guaranteed to be read-only and a fault
+         * has occurred, otherwise vCPUs may complete their writes and advance
+         * to the next stage prematurely.
          *
          * For architectures that support skipping the faulting instruction,
          * generate the store via inline assembly to ensure the exact length
@@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 #else
                 vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
 #endif
-        } while (!READ_ONCE(mprotect_ro_done));
+        } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));

         /*
          * Only architectures that write the entire range can explicitly sync,
@@ -81,6 +82,7 @@ struct vcpu_info {

 static int nr_vcpus;
 static atomic_t rendezvous;
+static atomic_t nr_ro_faults;

 static void rendezvous_with_boss(void)
 {
@@ -148,12 +150,16 @@ static void *vcpu_worker(void *data)
          * be stuck on the faulting instruction for other architectures. Go to
          * stage 3 without a rendezvous
          */
-        do {
-                r = _vcpu_run(vcpu);
-        } while (!r);
+        r = _vcpu_run(vcpu);
         TEST_ASSERT(r == -1 && errno == EFAULT,
                     "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);

+        atomic_inc(&nr_ro_faults);
+        if (atomic_read(&nr_ro_faults) == nr_vcpus) {
+                WRITE_ONCE(all_vcpus_hit_ro_fault, true);
+                sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
+        }
+
 #if defined(__x86_64__) || defined(__aarch64__)
         /*
          * Verify *all* writes from the guest hit EFAULT due to the VMA now
@@ -378,7 +384,6 @@ int main(int argc, char *argv[])
         rendezvous_with_vcpus(&time_run2, "run 2");

         mprotect(mem, slot_size, PROT_READ);
-        usleep(10);
         mprotect_ro_done = true;
         sync_global_to_guest(vm, mprotect_ro_done);
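
The selftest change boils down to replacing a sleep-based guess with a counter-plus-flag handshake: each vCPU's worker bumps nr_ro_faults only after it has actually observed the EFAULT, and the last one to arrive publishes all_vcpus_hit_ro_fault to the guest, so no vCPU can leave the read-only phase early. Below is a host-only analogue of that pattern (hypothetical names, plain pthreads and C11 atomics, with the guest/host split collapsed into one thread per "vCPU"), not the selftest code itself; build with cc -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_bool phase_started;       /* analogue of mprotect_ro_done */
static atomic_int nr_done;              /* analogue of nr_ro_faults */
static atomic_bool all_done;            /* analogue of all_vcpus_hit_ro_fault */

static void *worker(void *arg)
{
        long id = (long)arg;

        /* Spin until the boss has started the phase. */
        while (!atomic_load(&phase_started))
                ;

        /* Report in; the last worker to arrive publishes the flag. */
        if (atomic_fetch_add(&nr_done, 1) + 1 == NR_WORKERS)
                atomic_store(&all_done, true);

        /* Nobody advances until *every* worker has reported, which is the
         * guarantee the old usleep(10) only approximated. */
        while (!atomic_load(&all_done))
                ;

        printf("worker %ld is past the read-only phase\n", id);
        return NULL;
}

int main(void)
{
        pthread_t threads[NR_WORKERS];
        long i;

        for (i = 0; i < NR_WORKERS; i++)
                pthread_create(&threads[i], NULL, worker, (void *)i);

        atomic_store(&phase_started, true);     /* boss: start the phase */

        for (i = 0; i < NR_WORKERS; i++)
                pthread_join(threads[i], NULL);
        return 0;
}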

tools/testing/selftests/kvm/x86/nested_exceptions_test.c

+2
@@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,

         GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
         GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
+        GUEST_ASSERT(!ctrl->int_state);
 }

 static void l1_svm_code(struct svm_test_data *svm)
@@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code)
         GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
         GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
         GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
+        GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO));
 }

 static void l1_vmx_code(struct vmx_pages *vmx)

tools/testing/selftests/kvm/x86/sev_smoke_test.c

+2-1
@@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
         bool bad = false;
         for (i = 0; i < 4095; i++) {
                 if (from_host[i] != from_guest[i]) {
-                        printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
+                        printf("mismatch at %u | %02hhx %02hhx\n",
+                               i, from_host[i], from_guest[i]);
                         bad = true;
                 }
         }
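
For context on the "printf() goof": passing an int index to a %hhx conversion (which expects an unsigned char) trips -Wformat on compilers that type-check format strings, and -Werror turns that warning into the build failure mentioned in the merge description; printing the index with %u sidesteps it. A tiny standalone reproduction, assuming the index is an int as the switch to %u suggests (not the selftest itself):

#include <stdio.h>

int main(void)
{
        int i = 300;

        printf("mismatch at %02hhx\n", i);      /* -Wformat: '%hhx' expects unsigned char, got int */
        printf("mismatch at %u\n", i);          /* fine for a non-negative index */
        return 0;
}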
