-
Notifications
You must be signed in to change notification settings - Fork 222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
copy_misaligned_words: use inline asm on ARM, simplify fallback implementation #808
base: master
Are you sure you want to change the base?
Conversation
No luck unfortunately, memcpy gets some minor improvements but memmove regresses further. Report
|
For reference:

With this PR:

compiler_builtins::mem::memcpy:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
.cfi_def_cfa_offset 36
.cfi_offset lr, -4
.cfi_offset r11, -8
.cfi_offset r10, -12
.cfi_offset r9, -16
.cfi_offset r8, -20
.cfi_offset r7, -24
.cfi_offset r6, -28
.cfi_offset r5, -32
.cfi_offset r4, -36
.pad #12
sub sp, sp, #12
.cfi_def_cfa_offset 48
cmp r2, #16
blo .LBB427_9
rsb r3, r0, #0
and r4, r3, #3
add lr, r0, r4
cmp r0, lr
bhs .LBB427_4
mov r3, r4
mov r7, r0
mov r6, r1
.LBB427_3:
ldrb r5, [r6], #1
subs r3, r3, #1
strb r5, [r7], #1
bne .LBB427_3
.LBB427_4:
sub r12, r2, r4
add r1, r1, r4
bic r2, r12, #3
ands r4, r1, #3
add r3, lr, r2
bne .LBB427_12
cmp lr, r3
bhs .LBB427_8
mov r4, r1
.LBB427_7:
ldr r5, [r4], #4
str r5, [lr], #4
cmp lr, r3
blo .LBB427_7
.LBB427_8:
add r1, r1, r2
and r2, r12, #3
add r7, r3, r2
cmp r3, r7
blo .LBB427_10
b .LBB427_11
.LBB427_9:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB427_11
.LBB427_10:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB427_10
.LBB427_11:
add sp, sp, #12
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.LBB427_12:
add r5, sp, #8
mov r7, #0
orr r6, r5, r4
add r8, r5, #4
lsl r9, r4, #3
cmp r6, r8
str r7, [sp, #8]
bhs .LBB427_16
ldrb r7, [r1]
mov r5, r6
strb r7, [r5], #1
cmp r5, r8
beq .LBB427_15
ldrb r5, [r1, #1]
strb r5, [r6, #1]
add r5, r6, #2
cmp r5, r8
ldrbne r6, [r1, #2]
strbne r6, [r5]
.LBB427_15:
ldr r7, [sp, #8]
.LBB427_16:
add r5, lr, #4
sub r6, r1, r4
cmp r5, r3
rsb r5, r9, #0
str r5, [sp]
bhs .LBB427_19
and r11, r5, #24
.LBB427_18:
ldr r8, [r6, #4]!
add r10, lr, #4
lsl r5, r8, r11
orr r5, r5, r7, lsr r9
str r5, [lr], #8
cmp lr, r3
mov lr, r10
mov r7, r8
blo .LBB427_18
b .LBB427_20
.LBB427_19:
mov r8, r7
mov r10, lr
.LBB427_20:
add r11, sp, #4
ldr r5, [sp]
orr r7, r11, r4
mov lr, #0
cmp r11, r7
str lr, [sp, #4]
bhs .LBB427_24
add lr, r6, #4
mov r7, #0
.LBB427_22:
ldrb r6, [lr, r7]
strb r6, [r11, r7]
add r7, r7, #1
cmp r4, r7
bne .LBB427_22
ldr lr, [sp, #4]
.LBB427_24:
and r4, r5, #24
lsl r7, lr, r4
orr r7, r7, r8, lsr r9
str r7, [r10]
b .LBB427_8
compiler_builtins::mem::memmove:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
.cfi_def_cfa_offset 36
.cfi_offset lr, -4
.cfi_offset r11, -8
.cfi_offset r10, -12
.cfi_offset r9, -16
.cfi_offset r8, -20
.cfi_offset r7, -24
.cfi_offset r6, -28
.cfi_offset r5, -32
.cfi_offset r4, -36
.pad #20
sub sp, sp, #20
.cfi_def_cfa_offset 56
sub r3, r0, r1
cmp r3, r2
bhs .LBB227_13
add r4, r1, r2
add r10, r0, r2
cmp r2, #16
blo .LBB227_10
and r12, r10, #3
bic r3, r10, #3
rsb r9, r12, #0
cmp r3, r10
bhs .LBB227_5
add r7, r1, r2
mov r5, r10
sub r7, r7, #1
.LBB227_4:
ldrb r6, [r7], #-1
strb r6, [r5, #-1]!
cmp r3, r5
blo .LBB227_4
.LBB227_5:
sub r12, r2, r12
add r4, r4, r9
bic r7, r12, #3
ands r11, r4, #3
sub r6, r3, r7
rsb lr, r7, #0
bne .LBB227_25
cmp r6, r3
bhs .LBB227_9
add r1, r12, r1
mov r2, r3
sub r1, r1, #4
.LBB227_8:
ldr r5, [r1], #-4
str r5, [r2, #-4]!
cmp r6, r2
blo .LBB227_8
.LBB227_9:
add r4, r4, lr
add r10, r3, lr
and r2, r12, #3
.LBB227_10:
sub r1, r10, r2
cmp r1, r10
bhs .LBB227_24
sub r2, r4, #1
.LBB227_12:
ldrb r3, [r2], #-1
strb r3, [r10, #-1]!
cmp r1, r10
blo .LBB227_12
b .LBB227_24
.LBB227_13:
cmp r2, #16
blo .LBB227_22
rsb r3, r0, #0
and r12, r3, #3
add r4, r0, r12
cmp r0, r4
bhs .LBB227_17
mov r3, r12
mov r6, r0
mov r5, r1
.LBB227_16:
ldrb r7, [r5], #1
subs r3, r3, #1
strb r7, [r6], #1
bne .LBB227_16
.LBB227_17:
sub r2, r2, r12
add r12, r1, r12
bic r5, r2, #3
ands r7, r12, #3
add r3, r4, r5
bne .LBB227_33
cmp r4, r3
bhs .LBB227_21
mov r1, r12
.LBB227_20:
ldr r7, [r1], #4
str r7, [r4], #4
cmp r4, r3
blo .LBB227_20
.LBB227_21:
add r1, r12, r5
and r2, r2, #3
add r7, r3, r2
cmp r3, r7
blo .LBB227_23
b .LBB227_24
.LBB227_22:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB227_24
.LBB227_23:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB227_23
.LBB227_24:
add sp, sp, #20
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.LBB227_25:
sub r7, r4, r11
str r7, [sp, #8]
add r7, sp, #16
lsl r8, r11, #3
orr r7, r7, r11
str r8, [sp, #12]
str r7, [sp, #4]
mov r5, #0
mov r8, r7
add r7, sp, #16
cmp r7, r8
str r5, [sp, #16]
bhs .LBB227_29
ldr r5, [sp, #8]
cmp r11, #1
ldrb r5, [r5]
strb r5, [sp, #16]
beq .LBB227_28
ldr r5, [sp, #8]
cmp r11, #2
ldrb r5, [r5, #1]
strb r5, [sp, #17]
ldrne r5, [sp, #8]
ldrbne r5, [r5, #2]
strbne r5, [sp, #18]
.LBB227_28:
ldr r5, [sp, #16]
.LBB227_29:
ldr r8, [sp, #12]
add r6, r6, #4
cmp r6, r3
rsb r7, r8, #0
bhs .LBB227_40
sub r2, r2, r11
str r7, [sp]
add r8, r1, r2
and r7, r7, #24
.LBB227_31:
add r1, r8, r9
ldr r2, [sp, #12]
sub r8, r8, #4
ldr r1, [r1, #-4]
lsr r2, r1, r2
orr r2, r2, r5, lsl r7
add r5, r10, r9
sub r10, r10, #4
str r2, [r5, #-4]
add r2, r10, r9
cmp r6, r2
mov r5, r1
blo .LBB227_31
add r5, r8, r9
ldr r8, [sp, #12]
ldr r7, [sp]
str r5, [sp, #8]
b .LBB227_41
.LBB227_33:
add r6, sp, #16
mov r8, #0
orr r1, r6, r7
add r10, r6, #4
lsl r9, r7, #3
cmp r1, r10
str r8, [sp, #16]
bhs .LBB227_37
ldrb lr, [r12]
mov r6, r1
strb lr, [r6], #1
cmp r6, r10
beq .LBB227_36
add lr, r1, #2
ldrb r6, [r12, #1]
cmp lr, r10
strb r6, [r1, #1]
ldrbne r6, [r12, #2]
strbne r6, [lr]
.LBB227_36:
ldr r8, [sp, #16]
.LBB227_37:
str r1, [sp, #12]
add r1, r4, #4
cmp r1, r3
rsb r1, r9, #0
sub r6, r12, r7
str r1, [sp, #8]
bhs .LBB227_47
and lr, r1, #24
.LBB227_39:
ldr r10, [r6, #4]!
add r11, r4, #4
lsl r1, r10, lr
orr r1, r1, r8, lsr r9
str r1, [r4], #8
cmp r4, r3
mov r4, r11
mov r8, r10
blo .LBB227_39
b .LBB227_48
.LBB227_40:
mov r1, r5
mov r2, r3
.LBB227_41:
add r5, sp, #16
mov r6, #0
add r9, r5, #4
ldr r5, [sp, #4]
str r6, [sp, #16]
cmp r5, r9
bhs .LBB227_46
ldr r6, [sp, #8]
mov r10, r7
add r6, r6, r11
ldrb r7, [r6, #-4]!
strb r7, [r5]
add r7, r5, #1
cmp r7, r9
beq .LBB227_45
ldrb r7, [r6, #1]
strb r7, [r5, #1]
add r7, r5, #2
cmp r7, r9
beq .LBB227_45
ldrb r7, [r6, #2]
strb r7, [r5, #2]
add r7, r5, #3
cmp r7, r9
ldrbne r5, [r6, #3]
strbne r5, [r7]
.LBB227_45:
ldr r6, [sp, #16]
mov r7, r10
.LBB227_46:
lsr r5, r6, r8
and r6, r7, #24
orr r1, r5, r1, lsl r6
str r1, [r2, #-4]
b .LBB227_9
.LBB227_47:
mov r10, r8
mov r11, r4
.LBB227_48:
ldr r1, [sp, #12]
add lr, sp, #16
mov r4, #0
cmp lr, r1
str r4, [sp, #16]
bhs .LBB227_52
add r6, r6, #4
.LBB227_50:
ldrb r1, [r6, r4]
strb r1, [lr, r4]
add r4, r4, #1
cmp r7, r4
bne .LBB227_50
ldr r4, [sp, #16]
.LBB227_52:
ldr r1, [sp, #8]
and r1, r1, #24
lsl r1, r4, r1
orr r1, r1, r10, lsr r9
str r1, [r11]
b .LBB227_21

Current master with #799:

compiler_builtins::mem::memcpy:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
.cfi_def_cfa_offset 36
.cfi_offset lr, -4
.cfi_offset r11, -8
.cfi_offset r10, -12
.cfi_offset r9, -16
.cfi_offset r8, -20
.cfi_offset r7, -24
.cfi_offset r6, -28
.cfi_offset r5, -32
.cfi_offset r4, -36
.pad #16
sub sp, sp, #16
.cfi_def_cfa_offset 52
cmp r2, #16
blo .LBB226_9
rsb r3, r0, #0
and r4, r3, #3
add lr, r0, r4
cmp r0, lr
bhs .LBB226_4
mov r3, r4
mov r7, r0
mov r6, r1
.LBB226_3:
ldrb r5, [r6], #1
subs r3, r3, #1
strb r5, [r7], #1
bne .LBB226_3
.LBB226_4:
sub r12, r2, r4
add r1, r1, r4
bic r2, r12, #3
ands r5, r1, #3
add r3, lr, r2
bne .LBB226_12
cmp lr, r3
bhs .LBB226_8
mov r4, r1
.LBB226_7:
ldr r5, [r4], #4
str r5, [lr], #4
cmp lr, r3
blo .LBB226_7
.LBB226_8:
add r1, r1, r2
and r2, r12, #3
add r7, r3, r2
cmp r3, r7
blo .LBB226_10
b .LBB226_11
.LBB226_9:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB226_11
.LBB226_10:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB226_10
.LBB226_11:
add sp, sp, #16
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.LBB226_12:
add r4, sp, #12
rsb r7, r5, #4
mov r6, #0
orr r4, r4, r5
tst r7, #1
str r6, [sp, #12]
ldrbne r6, [r1]
lsl r9, r5, #3
strbne r6, [r4]
movne r6, #1
tst r7, #2
addne r7, r1, r6
addne r4, r4, r6
sub r6, r1, r5
ldrhne r7, [r7]
strhne r7, [r4]
add r4, lr, #4
ldr r7, [sp, #12]
cmp r4, r3
rsb r4, r9, #0
str r4, [sp]
bhs .LBB226_15
and r10, r4, #24
.LBB226_14:
ldr r8, [r6, #4]!
lsl r4, r8, r10
orr r7, r4, r7, lsr r9
add r4, lr, #4
str r7, [lr], #8
cmp lr, r3
mov r7, r8
mov lr, r4
blo .LBB226_14
b .LBB226_16
.LBB226_15:
mov r8, r7
mov r4, lr
.LBB226_16:
mov lr, #0
cmp r5, #1
strb lr, [sp, #8]
strb lr, [sp, #6]
bne .LBB226_18
add r11, sp, #8
mov r5, #0
mov r10, #0
mov r7, #0
b .LBB226_19
.LBB226_18:
ldrb r7, [r6, #5]
add r11, sp, #6
ldrb r5, [r6, #4]
strb r5, [sp, #8]
lsl r10, r7, #8
mov r7, #2
.LBB226_19:
tst r1, #1
beq .LBB226_21
add r6, r6, #4
ldrb r7, [r6, r7]
strb r7, [r11]
ldrb r7, [sp, #6]
ldrb r5, [sp, #8]
lsl lr, r7, #16
.LBB226_21:
ldr r6, [sp]
orr r7, r10, lr
orr r7, r7, r5
and r6, r6, #24
lsl r7, r7, r6
orr r7, r7, r8, lsr r9
str r7, [r4]
b .LBB226_8
compiler_builtins::mem::memmove:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
.cfi_def_cfa_offset 36
.cfi_offset lr, -4
.cfi_offset r11, -8
.cfi_offset r10, -12
.cfi_offset r9, -16
.cfi_offset r8, -20
.cfi_offset r7, -24
.cfi_offset r6, -28
.cfi_offset r5, -32
.cfi_offset r4, -36
.pad #40
sub sp, sp, #40
.cfi_def_cfa_offset 76
sub r3, r0, r1
cmp r3, r2
bhs .LBB227_13
add r4, r1, r2
add r8, r0, r2
cmp r2, #16
blo .LBB227_10
and r12, r8, #3
bic r3, r8, #3
rsb r11, r12, #0
cmp r3, r8
bhs .LBB227_5
add r7, r1, r2
mov r5, r8
sub r7, r7, #1
.LBB227_4:
ldrb r6, [r7], #-1
strb r6, [r5, #-1]!
cmp r3, r5
blo .LBB227_4
.LBB227_5:
sub r12, r2, r12
add r4, r4, r11
bic r7, r12, #3
ands r9, r4, #3
sub r6, r3, r7
rsb lr, r7, #0
bne .LBB227_25
cmp r6, r3
bhs .LBB227_9
add r1, r12, r1
mov r2, r3
sub r1, r1, #4
.LBB227_8:
ldr r5, [r1], #-4
str r5, [r2, #-4]!
cmp r6, r2
blo .LBB227_8
.LBB227_9:
add r4, r4, lr
add r8, r3, lr
and r2, r12, #3
.LBB227_10:
sub r1, r8, r2
cmp r1, r8
bhs .LBB227_24
sub r2, r4, #1
.LBB227_12:
ldrb r3, [r2], #-1
strb r3, [r8, #-1]!
cmp r1, r8
blo .LBB227_12
b .LBB227_24
.LBB227_13:
cmp r2, #16
blo .LBB227_22
rsb r3, r0, #0
and r12, r3, #3
add r7, r0, r12
cmp r0, r7
bhs .LBB227_17
mov r3, r12
mov r5, r0
mov r4, r1
.LBB227_16:
ldrb r6, [r4], #1
subs r3, r3, #1
strb r6, [r5], #1
bne .LBB227_16
.LBB227_17:
sub r2, r2, r12
add r11, r1, r12
bic r5, r2, #3
ands r6, r11, #3
add r3, r7, r5
bne .LBB227_27
cmp r7, r3
bhs .LBB227_21
mov r1, r11
.LBB227_20:
ldr r6, [r1], #4
str r6, [r7], #4
cmp r7, r3
blo .LBB227_20
.LBB227_21:
add r1, r11, r5
and r2, r2, #3
add r7, r3, r2
cmp r3, r7
blo .LBB227_23
b .LBB227_24
.LBB227_22:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB227_24
.LBB227_23:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB227_23
.LBB227_24:
add sp, sp, #40
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.LBB227_25:
sub r5, r4, r9
mov r10, #0
lsl r7, r9, #3
cmp r9, #1
strb r10, [sp, #32]
strb r10, [sp, #30]
str r7, [sp, #12]
str r5, [sp, #8]
bne .LBB227_30
mov r5, #0
add r7, sp, #32
str r5, [sp, #4]
b .LBB227_40
.LBB227_27:
add r1, sp, #24
rsb r4, r6, #4
orr r12, r1, r6
tst r4, #1
mov r8, #0
ldrbne r1, [r11]
str r8, [sp, #24]
movne r8, #1
strbne r1, [r12]
tst r4, #2
addne r1, r11, r8
addne r4, r12, r8
lsl lr, r6, #3
sub r12, r11, r6
ldrhne r1, [r1]
strhne r1, [r4]
add r1, r7, #4
ldr r4, [sp, #24]
cmp r1, r3
rsb r1, lr, #0
str r1, [sp, #12]
bhs .LBB227_32
and r10, r1, #24
.LBB227_29:
ldr r8, [r12, #4]!
add r9, r7, #4
lsl r1, r8, r10
orr r1, r1, r4, lsr lr
str r1, [r7], #8
cmp r7, r3
mov r7, r9
mov r4, r8
blo .LBB227_29
b .LBB227_33
.LBB227_30:
ldrb r10, [r5]
tst r4, #1
ldrb r5, [r5, #1]
str r5, [sp, #4]
strb r10, [sp, #32]
bne .LBB227_39
mov r7, #0
b .LBB227_41
.LBB227_32:
mov r8, r4
mov r9, r7
.LBB227_33:
mov r7, #0
cmp r6, #1
strb r7, [sp, #20]
strb r7, [sp, #18]
bne .LBB227_35
add r1, sp, #20
mov r6, #0
mov r10, #0
mov r4, #0
b .LBB227_36
.LBB227_35:
ldrb r1, [r12, #5]
mov r4, #2
ldrb r6, [r12, #4]
strb r6, [sp, #20]
lsl r10, r1, #8
add r1, sp, #18
.LBB227_36:
tst r11, #1
beq .LBB227_38
add r7, r12, #4
ldrb r7, [r7, r4]
strb r7, [r1]
ldrb r1, [sp, #18]
ldrb r6, [sp, #20]
lsl r7, r1, #16
.LBB227_38:
orr r1, r10, r7
ldr r7, [sp, #12]
orr r1, r1, r6
and r7, r7, #24
lsl r1, r1, r7
orr r1, r1, r8, lsr lr
str r1, [r9]
b .LBB227_21
.LBB227_39:
add r7, sp, #30
mov r10, #2
.LBB227_40:
ldr r5, [sp, #8]
ldrb r5, [r5, r10]
strb r5, [r7]
ldrb r7, [sp, #30]
ldrb r10, [sp, #32]
lsl r7, r7, #16
.LBB227_41:
ldr r5, [sp, #4]
orr r7, r7, r5, lsl #8
add r5, r6, #4
str r5, [sp]
cmp r5, r3
ldr r5, [sp, #12]
orr r10, r7, r10
rsb r7, r5, #0
str r7, [sp, #4]
bhs .LBB227_45
sub r2, r2, r9
ldr r5, [sp]
add r6, r1, r2
and r1, r7, #24
str r1, [sp, #8]
.LBB227_43:
add r1, r6, r11
ldr r2, [sp, #12]
ldr r7, [sp, #8]
sub r6, r6, #4
ldr r1, [r1, #-4]
lsr r2, r1, r2
orr r2, r2, r10, lsl r7
add r7, r8, r11
sub r8, r8, #4
mov r10, r1
str r2, [r7, #-4]
add r2, r8, r11
cmp r5, r2
blo .LBB227_43
add r7, r6, r11
b .LBB227_46
.LBB227_45:
mov r1, r10
mov r2, r3
ldr r7, [sp, #8]
.LBB227_46:
add r5, sp, #36
mov r6, #0
orr r8, r5, r9
add r5, r7, r9
sub r7, r5, #4
rsb r5, r9, #4
str r6, [sp, #36]
tst r5, #1
ldrbne r6, [r7]
strbne r6, [r8]
movne r6, #1
tst r5, #2
addne r5, r7, r6
addne r6, r8, r6
ldrhne r5, [r5]
strhne r5, [r6]
ldr r5, [sp, #36]
ldr r6, [sp, #12]
lsr r5, r5, r6
ldr r6, [sp, #4]
and r6, r6, #24
orr r1, r5, r1, lsl r6
str r1, [r2, #-4]
b .LBB227_9

Before #799:

compiler_builtins::mem::memcpy:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r11, lr}
push {r4, r5, r6, r7, r8, r9, r11, lr}
.cfi_def_cfa_offset 32
.cfi_offset lr, -4
.cfi_offset r11, -8
.cfi_offset r9, -12
.cfi_offset r8, -16
.cfi_offset r7, -20
.cfi_offset r6, -24
.cfi_offset r5, -28
.cfi_offset r4, -32
cmp r2, #16
blo .LBB226_9
rsb r3, r0, #0
and r4, r3, #3
add r12, r0, r4
cmp r0, r12
bhs .LBB226_4
mov r3, r4
mov r6, r0
mov r5, r1
.LBB226_3:
ldrb r7, [r5], #1
subs r3, r3, #1
strb r7, [r6], #1
bne .LBB226_3
.LBB226_4:
sub lr, r2, r4
add r1, r1, r4
bic r2, lr, #3
tst r1, #3
add r3, r12, r2
bne .LBB226_12
cmp r12, r3
bhs .LBB226_8
mov r4, r1
.LBB226_7:
ldr r5, [r4], #4
str r5, [r12], #4
cmp r12, r3
blo .LBB226_7
.LBB226_8:
add r1, r1, r2
and r2, lr, #3
add r7, r3, r2
cmp r3, r7
blo .LBB226_10
b .LBB226_11
.LBB226_9:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB226_11
.LBB226_10:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB226_10
.LBB226_11:
pop {r4, r5, r6, r7, r8, r9, r11, pc}
.LBB226_12:
cmp r12, r3
bhs .LBB226_8
bic r7, r1, #3
lsl r6, r1, #3
add r5, r7, #4
rsb r6, r6, #0
ldr r7, [r7]
mov r4, #24
and r8, r4, r1, lsl #3
and r9, r6, #24
.LBB226_14:
ldr r4, [r5], #4
lsl r6, r4, r9
orr r6, r6, r7, lsr r8
str r6, [r12], #4
cmp r12, r3
mov r7, r4
blo .LBB226_14
b .LBB226_8
compiler_builtins::mem::memmove:
.fnstart
.cfi_startproc
.save {r4, r5, r6, r7, r8, r9, r10, lr}
push {r4, r5, r6, r7, r8, r9, r10, lr}
.cfi_def_cfa_offset 32
.cfi_offset lr, -4
.cfi_offset r10, -8
.cfi_offset r9, -12
.cfi_offset r8, -16
.cfi_offset r7, -20
.cfi_offset r6, -24
.cfi_offset r5, -28
.cfi_offset r4, -32
sub r3, r0, r1
cmp r3, r2
bhs .LBB227_13
add r4, r1, r2
add r3, r0, r2
cmp r2, #16
blo .LBB227_10
and r5, r3, #3
bic r12, r3, #3
rsb r8, r5, #0
cmp r12, r3
bhs .LBB227_5
add r7, r1, r2
sub r7, r7, #1
.LBB227_4:
ldrb r6, [r7], #-1
strb r6, [r3, #-1]!
cmp r12, r3
blo .LBB227_4
.LBB227_5:
sub r2, r2, r5
add r4, r4, r8
bic r7, r2, #3
tst r4, #3
sub r3, r12, r7
rsb lr, r7, #0
bne .LBB227_25
cmp r3, r12
bhs .LBB227_9
add r1, r2, r1
mov r5, r12
sub r1, r1, #4
.LBB227_8:
ldr r6, [r1], #-4
str r6, [r5, #-4]!
cmp r3, r5
blo .LBB227_8
.LBB227_9:
add r4, r4, lr
add r3, r12, lr
and r2, r2, #3
.LBB227_10:
sub r1, r3, r2
cmp r1, r3
bhs .LBB227_24
sub r2, r4, #1
.LBB227_12:
ldrb r7, [r2], #-1
strb r7, [r3, #-1]!
cmp r1, r3
blo .LBB227_12
b .LBB227_24
.LBB227_13:
cmp r2, #16
blo .LBB227_22
rsb r3, r0, #0
and r5, r3, #3
add r12, r0, r5
cmp r0, r12
bhs .LBB227_17
mov r3, r5
mov r7, r0
mov r6, r1
.LBB227_16:
ldrb r4, [r6], #1
subs r3, r3, #1
strb r4, [r7], #1
bne .LBB227_16
.LBB227_17:
sub r2, r2, r5
add r5, r1, r5
bic r4, r2, #3
tst r5, #3
add r3, r12, r4
bne .LBB227_28
cmp r12, r3
bhs .LBB227_21
mov r1, r5
.LBB227_20:
ldr r7, [r1], #4
str r7, [r12], #4
cmp r12, r3
blo .LBB227_20
.LBB227_21:
add r1, r5, r4
and r2, r2, #3
add r7, r3, r2
cmp r3, r7
blo .LBB227_23
b .LBB227_24
.LBB227_22:
mov r3, r0
add r7, r3, r2
cmp r3, r7
bhs .LBB227_24
.LBB227_23:
ldrb r7, [r1], #1
subs r2, r2, #1
strb r7, [r3], #1
bne .LBB227_23
.LBB227_24:
pop {r4, r5, r6, r7, r8, r9, r10, pc}
.LBB227_25:
cmp r3, r12
bhs .LBB227_9
mov r1, #24
bic r6, r4, #3
and r8, r1, r4, lsl #3
lsl r1, r4, #3
rsb r7, r1, #0
ldr r1, [r6]
and r9, r7, #24
sub r5, r6, #4
mov r7, r12
.LBB227_27:
ldr r10, [r5], #-4
lsr r6, r10, r8
orr r1, r6, r1, lsl r9
str r1, [r7, #-4]!
mov r1, r10
cmp r3, r7
blo .LBB227_27
b .LBB227_9
.LBB227_28:
cmp r12, r3
bhs .LBB227_21
mov r1, #24
bic r7, r5, #3
and lr, r1, r5, lsl #3
lsl r1, r5, #3
add r6, r7, #4
rsb r1, r1, #0
ldr r7, [r7]
and r8, r1, #24
.LBB227_30:
ldr r9, [r6], #4
lsl r1, r9, r8
orr r1, r1, r7, lsr lr
str r1, [r12], #4
cmp r12, r3
mov r7, r9
blo .LBB227_30
b .LBB227_21

Edit: IR for two of the versions is at https://gist.github.com/tgross35/95d2e6821db82a7a4b160a560c3281a5 |
Hm, bummer. Any other ideas for what we could do to make this faster without causing UB? |
I have one idea: use inline assembly to do the partial loads, at least on a few targets we primarily care about. However, that's beyond my inline asm skill level. ;) Maybe @beetrees wants to give it a shot? Also, kind of the entire point of this codepath was to be portable, so it's a bit of a bummer. But at least most of the logic would still be portable... |
I feel like there has to be a way to hint LLVM better, or else there is a missed optimization. Asked at https://rust-lang.zulipchat.com/#narrow/channel/187780-t-compiler.2Fwg-llvm/topic/Hinting.20to.20LLVM.20that.20it.20can.20read.20uninitialized.20data/with/507486212 |
This should cover all tier 1 targets:

/// `addr` must be aligned to `align_of::<usize>()`.
// Loads one `usize` from `addr` with a single hand-written load instruction
// instead of an ordinary Rust pointer read. The instruction is selected per
// target architecture by the `cfg` attributes below.
pub unsafe fn load(addr: *mut usize) -> usize {
// Assigned only inside the `cfg`-gated asm blocks below.
// NOTE(review): no branch is visible for architectures other than
// x86/x86_64/arm/aarch64/arm64ec, so `out` would be left unassigned there —
// confirm a fallback exists wherever this snippet is integrated.
let mut out;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe {
// x86 family: `mov` from memory at `addr` into the output register
// (destination operand first). `lateout` lets `out` share a register with
// `addr`. Declared options: `nostack`, `readonly`, `preserves_flags`.
core::arch::asm!("mov {out}, [{addr}]", addr = in(reg) addr, out = lateout(reg) out, options(nostack, readonly, preserves_flags));
}
#[cfg(any(target_arch = "arm", target_arch = "aarch64", target_arch = "arm64ec"))]
unsafe {
// ARM family: same load expressed as `ldr`, with the same operand
// constraints and options as the x86 branch.
core::arch::asm!("ldr {out}, [{addr}]", addr = in(reg) addr, out = lateout(reg) out, options(nostack, readonly, preserves_flags));
}
// The value produced by whichever asm block was compiled in.
out
} |
02c5049
to
86a3fe3
Compare
Thanks! I've applied the ARM branch in this PR. x86 uses the […]. I've kept the simplified fallback implementation using […]. @tgross35 should be ready for the next round of benchmarks. :) |
86a3fe3
to
ba18877
Compare
ba18877
to
1fd36a5
Compare
We are just not having any luck; this didn't recover any of the
Full run
|
Maybe one day somebody will port https://github.com/ARM-software/optimized-routines/blob/850309be878e7d15d064ea7d5589bb0266499288/string/arm/memcpy.S to a |
Hm, the assembly should be quite close to what it was before my changes now, shouldn't it? Except for the last loop iteration being unrolled. But for larger sizes, only the loop should matter. I played around with the loop conditions a bit to make the loop more like how it was before. Does that make a difference? |
Maybe this is faster than #799?
@tgross35 would be great if you could run the benchmarks again. :)