-
Notifications
You must be signed in to change notification settings - Fork 13.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
An extra memcpy with -Zmir-opt-level=2 #77613
Comments
Something is really wrong here, as removing `if let Some(clip) = clip {
clip
} else {
return
};` brings it back. |
Here's the llvm-ir for define void @_ZN7example1g17h14264d45cdc1c8ddE(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip) unnamed_addr #1 !dbg !18 {
start:
%_9 = alloca %SpecificDisplayItem, align 8
%_8 = alloca %DI, align 8
%item = alloca %SpecificDisplayItem, align 8
%_2 = call align 1 dereferenceable(1) i8* @"_ZN4core6option15Option$LT$T$GT$6unwrap17hd51eb875dba3900fE"(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip, %"std::panic::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc21 to %"std::panic::Location"*)), !dbg !21
br label %bb1, !dbg !21
bb1: ; preds = %start
%0 = bitcast %SpecificDisplayItem* %item to i64*, !dbg !22
store i64 0, i64* %0, align 8, !dbg !22
%1 = bitcast %SpecificDisplayItem* %_9 to i8*, !dbg !23
%2 = bitcast %SpecificDisplayItem* %item to i8*, !dbg !23
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 184, i1 false), !dbg !23
%3 = bitcast %DI* %_8 to %SpecificDisplayItem*, !dbg !24
%4 = bitcast %SpecificDisplayItem* %3 to i8*, !dbg !24
%5 = bitcast %SpecificDisplayItem* %_9 to i8*, !dbg !24
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %5, i64 184, i1 false), !dbg !24
call void @_ZN7example7do_item17h87ff3f7e8c231d7dE(%DI* noalias readonly align 8 dereferenceable(184) %_8), !dbg !25
br label %bb2, !dbg !25
bb2: ; preds = %bb1
ret void, !dbg !26
} and mir-opt-level=2 define void @_ZN7example1g17h14264d45cdc1c8ddE(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip) unnamed_addr #1 !dbg !18 {
start:
%item = alloca %SpecificDisplayItem, align 8
%_6 = alloca %DI, align 8
%_2 = call align 1 dereferenceable(1) i8* @"_ZN4core6option15Option$LT$T$GT$6unwrap17hd51eb875dba3900fE"(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip, %"std::panic::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc21 to %"std::panic::Location"*)), !dbg !21
br label %bb1, !dbg !21
bb1: ; preds = %start
%0 = bitcast %SpecificDisplayItem* %item to i64*, !dbg !22
store i64 0, i64* %0, align 8, !dbg !22
%1 = bitcast %DI* %_6 to %SpecificDisplayItem*, !dbg !23
%2 = bitcast %SpecificDisplayItem* %1 to i8*, !dbg !23
%3 = bitcast %SpecificDisplayItem* %item to i8*, !dbg !23
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 184, i1 false), !dbg !23
call void @_ZN7example7do_item17h87ff3f7e8c231d7dE(%DI* noalias readonly align 8 dereferenceable(184) %_6), !dbg !24
br label %bb2, !dbg !24
bb2: ; preds = %bb1
ret void, !dbg !25
} |
I was only able to reproduce this with
|
I think I know what the issue is, broadly. The tl;dr is that LLVM doesn't take into account context from multiple basic blocks when applying the memcpy optimization. And certain combinations of opt-level and mir-opt-level generate code where the context required to optimize away the memcpy is in multiple basic blocks. Disclaimer: this is my first real brush with MIR and LLVM-IR, so some of the following has a good chance of being wrong. Part of the issue comes from pub fn g(clip: bool) {
if clip {
return;
}
let item = SpecificDisplayItem::PopStackingContext;
do_item(&DI {
item,
});
} Here is the LLVM-IR just before the LLVM optimization pass that would remove the memcpy: define void @_ZN3lib1g17h19b6423a2d95f243E(i1 zeroext %clip) unnamed_addr #0 {
start:
%item.sroa.2 = alloca [22 x i64], align 8
%_5 = alloca %DI, align 8
br i1 %clip, label %bb4, label %bb1
bb1: ; preds = %start
%0 = bitcast %DI* %_5 to i8*
call void @llvm.lifetime.start.p0i8(i64 184, i8* nonnull %0)
%item.sroa.0.0..sroa_idx = getelementptr inbounds %DI, %DI* %_5, i64 0, i32 0, i64 0
store i64 0, i64* %item.sroa.0.0..sroa_idx, align 8
%item.sroa.2.0..sroa_idx1 = getelementptr inbounds %DI, %DI* %_5, i64 0, i32 1, i32 2
%item.sroa.2.0..sroa_cast = bitcast [22 x i64]* %item.sroa.2.0..sroa_idx1 to i8*
%item.sroa.2.0.sroa_cast = bitcast [22 x i64]* %item.sroa.2 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %item.sroa.2.0..sroa_cast, i8* nonnull align 8 dereferenceable(176) %item.sroa.2.0.sroa_cast, i64 176, i1 false)
call fastcc void @_ZN3lib7do_item17h4ec77638412d024eE(%DI* noalias nonnull readonly align 8 dereferenceable(184) %_5)
call void @llvm.lifetime.end.p0i8(i64 184, i8* nonnull %0)
br label %bb4
bb4: ; preds = %start, %bb1
ret void
} The problem is that the source of the memcpy, You can also tell LLVM that Why doesn't the issue occur with define void @_ZN3lib1g17h19b6423a2d95f243E(i1 zeroext %clip) unnamed_addr #0 {
start:
%_8.sroa.4 = alloca [22 x i64], align 8
%_7 = alloca %DI, align 8
%item.sroa.4 = alloca [22 x i64], align 8
br i1 %clip, label %bb4, label %bb1
bb1: ; preds = %start
%item.sroa.4.0.sroa_cast = bitcast [22 x i64]* %item.sroa.4 to i8*
call void @llvm.lifetime.start.p0i8(i64 176, i8* nonnull %item.sroa.4.0.sroa_cast)
%0 = bitcast %DI* %_7 to i8*
call void @llvm.lifetime.start.p0i8(i64 184, i8* nonnull %0)
%_8.sroa.4.0.sroa_cast10 = bitcast [22 x i64]* %_8.sroa.4 to i8*
call void @llvm.lifetime.start.p0i8(i64 176, i8* nonnull %_8.sroa.4.0.sroa_cast10)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0.sroa_cast10, i8* nonnull align 8 dereferenceable(176) %item.sroa.4.0.sroa_cast, i64 176, i1 false)
%_8.sroa.0.0..sroa_idx = getelementptr inbounds %DI, %DI* %_7, i64 0, i32 0, i64 0
store i64 0, i64* %_8.sroa.0.0..sroa_idx, align 8
%_8.sroa.4.0..sroa_idx7 = getelementptr inbounds %DI, %DI* %_7, i64 0, i32 1, i32 2
%_8.sroa.4.0..sroa_cast = bitcast [22 x i64]* %_8.sroa.4.0..sroa_idx7 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0..sroa_cast, i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0.sroa_cast10, i64 176, i1 false)
call void @llvm.lifetime.end.p0i8(i64 176, i8* nonnull %_8.sroa.4.0.sroa_cast10)
call fastcc void @_ZN3lib7do_item17h4ec77638412d024eE(%DI* noalias nonnull readonly align 8 dereferenceable(184) %_7)
call void @llvm.lifetime.end.p0i8(i64 184, i8* nonnull %0)
call void @llvm.lifetime.end.p0i8(i64 176, i8* nonnull %item.sroa.4.0.sroa_cast)
br label %bb4
bb4: ; preds = %start, %bb1
ret void
} Here, the sources of the memcpys in Possible fixes:
1 would be desirable regardless of any other fix. 2 might be good to enable other optimizations. 3 may or may not be worthwhile. |
FWIW I have a new patch to enable that at https://reviews.llvm.org/D89207. Will take a while until this is enabled by default though. |
Extra memcpy is no longer present on nightly, presumably as a result of #82806. |
With -Zmir-opt-level=1 this compiles to
With mir-opt-level=2 it compiles to:
This is with Rust nightly on rust.godbolt.org
The text was updated successfully, but these errors were encountered: