Skip to content

Commit d311236

Browse files
committed
Refs #189. Fixed a bug in s/cdot on x86_64 where an invalid read could pick up NaN.
1 parent 36e0982 commit d311236

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

kernel/x86_64/dot_sse.S

+4-3
Original file line number | Diff line number | Diff line change
@@ -530,7 +530,7 @@
530530
#endif
531531
movsd -32 * SIZE(Y), %xmm8
532532

533-
pshufd $0x39, %xmm4, %xmm5
533+
pshufd $0x29, %xmm4, %xmm5
534534

535535
mulps %xmm8, %xmm5
536536
addps %xmm5, %xmm3
@@ -750,7 +750,8 @@
750750
xorps %xmm5, %xmm5
751751
movhlps %xmm4, %xmm5
752752

753-
mulps -32 * SIZE(Y), %xmm5
753+
movlps -32 * SIZE(Y), %xmm4
754+
mulps %xmm4, %xmm5
754755
addps %xmm5, %xmm0
755756

756757
addq $2 * SIZE, X
@@ -992,7 +993,7 @@
992993
movsd -32 * SIZE(Y), %xmm8
993994

994995
movss %xmm5, %xmm4
995-
shufps $0x93, %xmm5, %xmm4
996+
shufps $0x93, %xmm4, %xmm4
996997

997998
mulps %xmm8, %xmm4
998999
addps %xmm4, %xmm3

kernel/x86_64/zdot_sse.S

+4-4
Original file line number | Diff line number | Diff line change
@@ -699,7 +699,7 @@
699699
movsd -32 * SIZE(X), %xmm4
700700

701701
pshufd $0xb1, %xmm4, %xmm12
702-
shufps $0x39, %xmm8, %xmm8
702+
shufps $0x59, %xmm8, %xmm8
703703
mulps %xmm8, %xmm4
704704
addps %xmm4, %xmm0
705705
mulps %xmm8, %xmm12
@@ -1336,7 +1336,7 @@
13361336

13371337
movss %xmm9, %xmm8
13381338
pshufd $0xb1, %xmm4, %xmm12
1339-
shufps $0x93, %xmm8, %xmm8
1339+
shufps $0x03, %xmm8, %xmm8
13401340
mulps %xmm8, %xmm4
13411341
addps %xmm4, %xmm0
13421342
mulps %xmm8, %xmm12
@@ -1697,7 +1697,7 @@
16971697
movsd -32 * SIZE(Y), %xmm4
16981698

16991699
pshufd $0xb1, %xmm4, %xmm12
1700-
shufps $0x39, %xmm8, %xmm8
1700+
shufps $0xa9, %xmm8, %xmm8
17011701
mulps %xmm8, %xmm4
17021702
addps %xmm4, %xmm0
17031703
mulps %xmm8, %xmm12
@@ -2024,7 +2024,7 @@
20242024

20252025
movss %xmm9, %xmm8
20262026
pshufd $0xb1, %xmm4, %xmm12
2027-
shufps $0x93, %xmm8, %xmm8
2027+
shufps $0x03, %xmm8, %xmm8
20282028
mulps %xmm8, %xmm4
20292029
addps %xmm4, %xmm0
20302030
mulps %xmm8, %xmm12

0 commit comments

Comments
 (0)