
Commit 5f3d0b7

Committed on Jan 30, 2015
Use AVX unaligned memcpy only if AVX2 is available
memcpy with unaligned 256-bit AVX register loads/stores is slow on older
processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

[BZ #17801]
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
New.
(index_AVX_Fast_Unaligned_Load): Likewise.
(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
1 parent: b658fdd · commit: 5f3d0b7
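In C terms, the change amounts to a single-bit gate: the AVX unaligned copy variants are chosen only when bit_AVX_Fast_Unaligned_Load is set, and __init_cpu_features sets that bit only for CPUs that report AVX2. Below is a minimal, self-contained sketch of that predicate (a toy feature word and simplified names, not the actual glibc resolver code):

/* Sketch only: a toy stand-in for glibc's __cpu_features feature word.
   The bit value matches the one this patch adds in init-arch.h.  */
#include <stdint.h>
#include <stdio.h>

#define bit_AVX_Fast_Unaligned_Load (1 << 11)

/* After this commit the AVX unaligned memcpy/memmove variants are
   selected by this test instead of a plain "AVX usable" test.  */
static int
use_avx_unaligned_copy (uint32_t feature_word)
{
  return (feature_word & bit_AVX_Fast_Unaligned_Load) != 0;
}

int
main (void)
{
  /* Sandy Bridge has AVX but not AVX2, so the bit stays clear and the
     SSE2/SSSE3 paths are kept; Haswell and later report AVX2.  */
  uint32_t sandy_bridge = 0;
  uint32_t haswell = bit_AVX_Fast_Unaligned_Load;

  printf ("Sandy Bridge -> AVX unaligned copy: %d\n",
          use_avx_unaligned_copy (sandy_bridge));
  printf ("Haswell -> AVX unaligned copy: %d\n",
          use_avx_unaligned_copy (haswell));
  return 0;
}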

File tree

10 files changed (+37 -10 lines)

 

ChangeLog

+18

@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #17801]
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+	New.
+	(index_AVX_Fast_Unaligned_Load): Likewise.
+	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
 	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.

NEWS

+2 -2

@@ -17,8 +17,8 @@ Version 2.21
   17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
   17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
   17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
-  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
-  17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+  17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
 
 * A new semaphore algorithm has been implemented in generic C code for all
   machines.  Previous custom assembly implementations of semaphore were

sysdeps/x86_64/multiarch/init-arch.c

+7 -2

@@ -171,9 +171,14 @@ __init_cpu_features (void)
   /* Determine if AVX is usable.  */
   if (CPUID_AVX)
     __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-  /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+     AVX registers are faster on processors with AVX2.  */
   if (CPUID_AVX2)
-    __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+    __cpu_features.feature[index_AVX2_Usable]
+      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
   /* Determine if FMA is usable.  */
   if (CPUID_FMA)
     __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
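A note on the #if/#error guard in this hunk: the combined OR stores both bits into one element of the feature array, which is only correct while index_AVX2_Usable and index_AVX_Fast_Unaligned_Load name the same word. A small illustrative sketch of that invariant, with simplified stand-ins rather than glibc's real cpu_features layout:

/* Illustrative sketch, not glibc's actual data structures.  Each feature
   has an index_* macro selecting a 32-bit word and a bit_* mask within it;
   OR-ing two masks in one statement is valid only when the indices are
   equal, which the #error guard checks at compile time.  */
#include <stdint.h>

#define FEATURE_INDEX_1   0
#define FEATURE_INDEX_MAX 1

#define bit_AVX2_Usable               (1 << 10)
#define bit_AVX_Fast_Unaligned_Load   (1 << 11)
#define index_AVX2_Usable             FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1

#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif

static uint32_t feature[FEATURE_INDEX_MAX];

void
note_avx2_usable (void)
{
  /* One store records both facts: AVX2 is usable, and unaligned 256-bit
     AVX loads/stores are fast on this CPU.  */
  feature[index_AVX2_Usable] |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
}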

sysdeps/x86_64/multiarch/init-arch.h

+4

@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable (1 << 8)
 #define bit_Slow_SSE4_2 (1 << 9)
 #define bit_AVX2_Usable (1 << 10)
+#define bit_AVX_Fast_Unaligned_Load (1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
 
 #else /* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable FEATURE_INDEX_1
 # define index_Slow_SSE4_2 FEATURE_INDEX_1
 # define index_AVX2_Usable FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif /* __ASSEMBLER__ */
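For reference, the new HAS_AVX_FAST_UNALIGNED_LOAD macro is just another instance of the HAS_ARCH_FEATURE pattern visible in the third hunk; hand-expanding it (for illustration only, not code present in the tree) gives roughly:

/* Hand expansion of HAS_AVX_FAST_UNALIGNED_LOAD: HAS_ARCH_FEATURE pastes
   the feature name onto the index_ and bit_ prefixes, so the test reads
   the FEATURE_INDEX_1 feature word and checks bit 11.  */
#define HAS_AVX_FAST_UNALIGNED_LOAD \
  ((__get_cpu_features ()->feature[index_AVX_Fast_Unaligned_Load] \
    & (bit_AVX_Fast_Unaligned_Load)) != 0)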

sysdeps/x86_64/multiarch/memcpy.S

+1 -1

@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax

sysdeps/x86_64/multiarch/memcpy_chk.S

+1 -1

@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret

sysdeps/x86_64/multiarch/memmove.c

+1 -1

@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX
+	    HAS_AVX_FAST_UNALIGNED_LOAD
 	    ? __memmove_avx_unaligned
 	    : (HAS_SSSE3
 	       ? (HAS_FAST_COPY_BACKWARD

sysdeps/x86_64/multiarch/memmove_chk.c

+1 -1

@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
 	    (HAS_SSSE3
 	     ? (HAS_FAST_COPY_BACKWARD
 	        ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)

sysdeps/x86_64/multiarch/mempcpy.S

+1 -1

@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret

sysdeps/x86_64/multiarch/mempcpy_chk.S

+1 -1

@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
