improve performance issue of @nospecialize-d keyword func call

aviatesk · aviatesk · commit bf0ce6a3a613 · 2022-10-05T21:49:35.000+09:00
This commit tries to fix and improve performance for calling keyword funcs whose arguments types are not fully known but `@nospecialize`-d. The final result would look like (this particular example is taken from our Julia-level compiler implementation): ```julia abstract type CallInfo end struct NoCallInfo <: CallInfo end struct NewInstruction stmt::Any type::Any info::CallInfo line::Union{Int32,Nothing} # if nothing, copy the line from previous statement in the insertion location flag::Union{UInt8,Nothing} # if nothing, IR flags will be recomputed on insertion function NewInstruction(@nospecialize(stmt), @nospecialize(type), @nospecialize(info::CallInfo), line::Union{Int32,Nothing}, flag::Union{UInt8,Nothing}) return new(stmt, type, info, line, flag) end end @nospecialize function NewInstruction(newinst::NewInstruction; stmt=newinst.stmt, type=newinst.type, info::CallInfo=newinst.info, line::Union{Int32,Nothing}=newinst.line, flag::Union{UInt8,Nothing}=newinst.flag) return NewInstruction(stmt, type, info, line, flag) end @Specialize using BenchmarkTools struct VirtualKwargs stmt::Any type::Any info::CallInfo end vkws = VirtualKwargs(nothing, Any, NoCallInfo()) newinst = NewInstruction(nothing, Any, NoCallInfo(), nothing, nothing) runner(newinst, vkws) = NewInstruction(newinst; vkws.stmt, vkws.type, vkws.info) @benchmark runner($newinst, $vkws) ``` > on master ``` BenchmarkTools.Trial: 10000 samples with 186 evaluations. Range (min … max): 559.898 ns … 4.173 μs ┊ GC (min … max): 0.00% … 85.29% Time (median): 605.608 ns ┊ GC (median): 0.00% Time (mean ± σ): 638.170 ns ± 125.080 ns ┊ GC (mean ± σ): 0.06% ± 0.85% █▇▂▆▄ ▁█▇▄▂ ▂ ██████▅██████▇▇▇██████▇▇▇▆▆▅▄▅▄▂▄▄▅▇▆▆▆▆▆▅▆▆▄▄▅▅▄▃▄▄▄▅▃▅▅▆▅▆▆ █ 560 ns Histogram: log(frequency) by time 1.23 μs < Memory estimate: 32 bytes, allocs estimate: 2. ``` > on this commit ```julia BenchmarkTools.Trial: 10000 samples with 1000 evaluations. Range (min … max): 3.080 ns … 83.177 ns ┊ GC (min … max): 0.00% … 0.00% Time (median): 3.098 ns ┊ GC (median): 0.00% Time (mean ± σ): 3.118 ns ± 0.885 ns ┊ GC (mean ± σ): 0.00% ± 0.00% ▂▅▇█▆▅▄▂ ▂▄▆▆▇████████▆▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▂▂▂ ▃ 3.08 ns Histogram: frequency by time 3.19 ns < Memory estimate: 0 bytes, allocs estimate: 0. ``` So for this particular case it achieves roughly 200x speed up. This is because this commit allows inlining of a call to keyword sorter as well as removal of `NamedTuple` call. Especially this commit is composed of the following improvements: - add early return case for `structdiff`: This change improves the return type inference for a case when compared `NamedTuple`s are type unstable but there is no difference in their names, e.g. given two `NamedTuple{(:a,:b),T} where T<:Tuple{Any,Any}`s. And in such case the optimizer will remove `structdiff` and succeeding `pairs` calls, letting the keyword sorter to be inlined. - add special SROA handling for `NamedTuple` generated by keyword sorter: With the change on `structdiff`, IR for a call with type-unstable keyword arguments after inlining would look like: ``` %1 = tuple(a, b, c)::Tuple{Any, Any, Any} %2 = NamedTuple{(:a, :b, :c)(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any} %3 = Core.getfield(%2, :a)::Any %4 = Core.getfield(%2, :b)::Any %5 = Core.getfield(%2, :c)::Any [... other body of the keyword func ...] ``` We can implement a bit hacky special handling within our SROA pass that checks if this definition (%2) is partly well-known `NamedTuple` construction, where its names are fully known, and also checks if its call argument (%1) is fully-known `tuple` call. In a case when the length of the `NamedTuple` names and the length of the arguments for the `tuple` call, we can safely replace those `getfield` calls with the corresponding `tuple` call argument, while letting the later DCE pass to delete the constructions of tuple and named-tuple altogether. With these changes, the IR for the example `NewInstruction` constructor is fairly optimized, like: ```julia julia> Base.code_ircode((NewInstruction,Any,Any,CallInfo)) do newinst, stmt, type, info NewInstruction(newinst; stmt, type, info) end |> only 2 1 ── %1 = Base.getfield(_2, :line)::Union{Nothing, Int32} │╻╷ Type##kw │ %2 = Base.getfield(_2, :flag)::Union{Nothing, UInt8} ││┃ getproperty │ %3 = (isa)(%1, Nothing)::Bool ││ │ %4 = (isa)(%2, Nothing)::Bool ││ │ %5 = (Core.Intrinsics.and_int)(%3, %4)::Bool ││ └─── goto #3 if not %5 ││ 2 ── %7 = %new(Main.NewInstruction, _3, _4, _5, nothing, nothing)::NewInstruction NewInstruction └─── goto #10 ││ 3 ── %9 = (isa)(%1, Int32)::Bool ││ │ %10 = (isa)(%2, Nothing)::Bool ││ │ %11 = (Core.Intrinsics.and_int)(%9, %10)::Bool ││ └─── goto #5 if not %11 ││ 4 ── %13 = π (%1, Int32) ││ │ %14 = %new(Main.NewInstruction, _3, _4, _5, %13, nothing)::NewInstruction│││╻ NewInstruction └─── goto #10 ││ 5 ── %16 = (isa)(%1, Nothing)::Bool ││ │ %17 = (isa)(%2, UInt8)::Bool ││ │ %18 = (Core.Intrinsics.and_int)(%16, %17)::Bool ││ └─── goto #7 if not %18 ││ 6 ── %20 = π (%2, UInt8) ││ │ %21 = %new(Main.NewInstruction, _3, _4, _5, nothing, %20)::NewInstruction│││╻ NewInstruction └─── goto #10 ││ 7 ── %23 = (isa)(%1, Int32)::Bool ││ │ %24 = (isa)(%2, UInt8)::Bool ││ │ %25 = (Core.Intrinsics.and_int)(%23, %24)::Bool ││ └─── goto #9 if not %25 ││ 8 ── %27 = π (%1, Int32) ││ │ %28 = π (%2, UInt8) ││ │ %29 = %new(Main.NewInstruction, _3, _4, _5, %27, %28)::NewInstruction │││╻ NewInstruction └─── goto #10 ││ 9 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} └─── unreachable ││ 10 ┄ %33 = φ (#2 => %7, #4 => %14, #6 => %21, #8 => %29)::NewInstruction ││ └─── goto #11 ││ 11 ─ return %33 │ => NewInstruction ```
diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
@@ -415,6 +415,42 @@ function lift_leaves(compact::IncrementalCompact,
                 return nothing
             else
                 typ = argextype(leaf, compact)
+
+                # Here we implement a bit hacky special case to optimize a function call with
+                # type-unstable but `@nospecialize`-d keyword arguments, whose IR at this point
+                # would look like:
+                # ```
+                # %1 = tuple(a, b, c)::Tuple{Any, Any, Any}
+                # %2 = NamedTuple{(:a, :b, :c)(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any}
+                # %3  = Core.getfield(%2, :a)::Any
+                # %4  = Core.getfield(%2, :b)::Any
+                # %5  = Core.getfield(%2, :c)::Any
+                # [... other body of the keyword func ...]
+                # ```
+                # First, we check if this definition (%2) is partly well-known
+                # `NamedTuple` construction, where its names are fully known, and also
+                # if its call argument (%1) is fully-known `tuple` call.
+                # In a case when the length of the `NamedTuple` names and the length of
+                # the arguments for the `tuple` call, we can safely replace those `getfield`
+                # calls with the corresponding `tuple` call argument, while letting the
+                # later DCE pass to delete the constructions of tuple and named-tuple
+                if isa(typ′, DataType) && typ′.name === _NAMEDTUPLE_NAME && length(typ′.parameters) == 2
+                    ns = typ′.parameters[1]
+                    if is_known_call(def, NamedTuple{ns}, compact) && length(def.args) == 2
+                        arg = def.args[2]
+                        if isa(arg, AnySSAValue)
+                            argexpr = compact[arg][:inst]
+                            if is_known_call(argexpr, tuple, compact) && length(ns) == length(argexpr.args)-1
+                                # ok, we know this NamedTuple construction is nothrow,
+                                # let's mark this NamedTuple as DCE-eligible
+                                compact[leaf::AnySSAValue][:flag] |= IR_FLAG_EFFECT_FREE
+                                lift_arg!(compact, argexpr.args[field+1], cache_key, argexpr, field+1, lifted_leaves)
+                                continue
+                            end
+                        end
+                    end
+                end
+
                 if !isa(typ, Const)
                     # TODO: (disabled since #27126)
                     # If the leaf is an old ssa value, insert a getfield here
@@ -469,7 +505,7 @@ function lift_arg!(
         end
     end
     lifted_leaves[cache_key] = LiftedValue(lifted)
-    nothing
+    return nothing
 end
 
 function walk_to_def(compact::IncrementalCompact, @nospecialize(leaf))
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
@@ -1596,6 +1596,11 @@ function apply_type_tfunc(@nospecialize(headtypetype), @nospecialize args...)
     end
     if istuple
         return Type{<:appl}
+    elseif isa(appl, DataType) && appl.name === _NAMEDTUPLE_NAME && appl.parameters[1] === ()
+        # if the first parameter of `NamedTuple` is known to be empty tuple,
+        # the second argument should also be empty tuple type,
+        # so refine it here
+        return Const(NamedTuple{(),Tuple{}})
     end
     ans = Type{appl}
     for i = length(outervars):-1:1
diff --git a/base/namedtuple.jl b/base/namedtuple.jl
@@ -335,22 +335,27 @@ reverse(nt::NamedTuple) = NamedTuple{reverse(keys(nt))}(reverse(values(nt)))
 end
 
 """
-    structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn},Type{NamedTuple{bn}}}) where {an,bn}
+    structdiff(a::NamedTuple, b::Union{NamedTuple,Type{NamedTuple}})
 
 Construct a copy of named tuple `a`, except with fields that exist in `b` removed.
 `b` can be a named tuple, or a type of the form `NamedTuple{field_names}`.
 """
 function structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn}, Type{NamedTuple{bn}}}) where {an, bn}
     if @generated
         names = diff_names(an, bn)
+        isempty(names) && return (;) # just a fast pass
         idx = Int[ fieldindex(a, names[n]) for n in 1:length(names) ]
         types = Tuple{Any[ fieldtype(a, idx[n]) for n in 1:length(idx) ]...}
         vals = Any[ :(getfield(a, $(idx[n]))) for n in 1:length(idx) ]
-        :( NamedTuple{$names,$types}(($(vals...),)) )
+        return :( NamedTuple{$names,$types}(($(vals...),)) )
     else
         names = diff_names(an, bn)
+        # N.B this early return is necessary to get a better type stability,
+        # and also allows us to cut off the cost from constructing
+        # potentially type unstable closure passed to the `map` below
+        isempty(names) && return (;)
         types = Tuple{Any[ fieldtype(typeof(a), names[n]) for n in 1:length(names) ]...}
-        NamedTuple{names,types}(map(Fix1(getfield, a), names))
+        return NamedTuple{names,types}(map(n::Symbol->getfield(a, n), names))
     end
 end
 
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
@@ -2336,6 +2336,12 @@ end
 # Equivalence of Const(T.instance) and T for singleton types
 @test Const(nothing) ⊑ Nothing && Nothing ⊑ Const(nothing)
 
+# `apply_type_tfunc` should always return accurate result for empty NamedTuple case
+import Core: Const
+import Core.Compiler: apply_type_tfunc
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple{}) === Const(typeof((;)))
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple) === Const(typeof((;)))
+
 # Don't pessimize apply_type to anything worse than Type and yield Bottom for invalid Unions
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union}}) == Type{Union{}}
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union},Any}) == Type