Skip to content

Commit 9ea30f3

Browse files
authored May 9, 2019
Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125)
* Mark iamax_sse.S as unsuitable for MIN due to issue #2116
* Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116
1 parent e1fc020 commit 9ea30f3

File tree

2 files changed

+58
-52
lines changed

2 files changed

+58
-52
lines changed
 

‎kernel/x86_64/KERNEL

+2-2
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S
171171
endif
172172

173173
ifndef ISAMINKERNEL
174-
ISAMINKERNEL = iamax_sse.S
174+
ISAMINKERNEL = iamax.S
175175
endif
176176

177177
ifndef IDAMINKERNEL
@@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S
207207
endif
208208

209209
ifndef ISMINKERNEL
210-
ISMINKERNEL = iamax_sse.S
210+
ISMINKERNEL = iamax.S
211211
endif
212212

213213
ifndef IDMINKERNEL

‎kernel/x86_64/iamax_sse.S

+56-50
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
/* or implied, of The University of Texas at Austin. */
3737
/*********************************************************************/
3838

39+
/* This kernel was found to give wrong results when used for ISMIN/ISAMIN
40+
with increment != 1, although it appears to be correct for corresponding
41+
MAX operations. See issue 2116 */
42+
3943
#define ASSEMBLER
4044
#include "common.h"
4145

@@ -48,9 +52,11 @@
4852
#define XX %r10
4953
#define MM %r11
5054

55+
#define MAXPS maxps
56+
#define MAXSS maxss
5157
#ifdef USE_MIN
52-
#define maxps minps
53-
#define maxss minss
58+
#define MAXPS minps
59+
#define MAXSS minss
5460
#endif
5561

5662
#include "l1param.h"
@@ -103,7 +109,7 @@
103109
#ifdef USE_ABS
104110
andps %xmm15, %xmm4
105111
#endif
106-
maxss %xmm4, %xmm0
112+
MAXSS %xmm4, %xmm0
107113
decq M
108114
addq $SIZE, X
109115
ALIGN_3
@@ -117,7 +123,7 @@
117123
#ifdef USE_ABS
118124
andps %xmm15, %xmm4
119125
#endif
120-
maxps %xmm4, %xmm1
126+
MAXPS %xmm4, %xmm1
121127
subq $2, M
122128
addq $2 * SIZE, X
123129
ALIGN_3
@@ -137,25 +143,25 @@
137143
#ifdef USE_ABS
138144
andps %xmm15, %xmm4
139145
#endif
140-
maxps %xmm4, %xmm0
146+
MAXPS %xmm4, %xmm0
141147

142148
movaps 4 * SIZE(X), %xmm5
143149
#ifdef USE_ABS
144150
andps %xmm15, %xmm5
145151
#endif
146-
maxps %xmm5, %xmm1
152+
MAXPS %xmm5, %xmm1
147153

148154
movaps 8 * SIZE(X), %xmm6
149155
#ifdef USE_ABS
150156
andps %xmm15, %xmm6
151157
#endif
152-
maxps %xmm6, %xmm2
158+
MAXPS %xmm6, %xmm2
153159

154160
movaps 12 * SIZE(X), %xmm7
155161
#ifdef USE_ABS
156162
andps %xmm15, %xmm7
157163
#endif
158-
maxps %xmm7, %xmm3
164+
MAXPS %xmm7, %xmm3
159165

160166
addq $16 * SIZE, X
161167
decq I
@@ -173,13 +179,13 @@
173179
#ifdef USE_ABS
174180
andps %xmm15, %xmm4
175181
#endif
176-
maxps %xmm4, %xmm0
182+
MAXPS %xmm4, %xmm0
177183

178184
movaps 4 * SIZE(X), %xmm5
179185
#ifdef USE_ABS
180186
andps %xmm15, %xmm5
181187
#endif
182-
maxps %xmm5, %xmm1
188+
MAXPS %xmm5, %xmm1
183189
addq $8 * SIZE, X
184190
ALIGN_3
185191

@@ -191,7 +197,7 @@
191197
#ifdef USE_ABS
192198
andps %xmm15, %xmm6
193199
#endif
194-
maxps %xmm6, %xmm2
200+
MAXPS %xmm6, %xmm2
195201
addq $4 * SIZE, X
196202
ALIGN_3
197203

@@ -204,7 +210,7 @@
204210
#ifdef USE_ABS
205211
andps %xmm15, %xmm7
206212
#endif
207-
maxps %xmm7, %xmm3
213+
MAXPS %xmm7, %xmm3
208214
addq $2 * SIZE, X
209215

210216
.L18:
@@ -215,22 +221,22 @@
215221
#ifdef USE_ABS
216222
andps %xmm15, %xmm4
217223
#endif
218-
maxss %xmm4, %xmm0
224+
MAXSS %xmm4, %xmm0
219225
ALIGN_3
220226

221227
.L20:
222228
movq XX, X
223229
movq MM, M
224230

225-
maxps %xmm1, %xmm0
226-
maxps %xmm3, %xmm2
227-
maxps %xmm2, %xmm0
231+
MAXPS %xmm1, %xmm0
232+
MAXPS %xmm3, %xmm2
233+
MAXPS %xmm2, %xmm0
228234
movaps %xmm0, %xmm1
229235
movhlps %xmm0, %xmm0
230-
maxps %xmm1, %xmm0
236+
MAXPS %xmm1, %xmm0
231237
movaps %xmm0, %xmm1
232238
shufps $1, %xmm0, %xmm0
233-
maxss %xmm1, %xmm0
239+
MAXSS %xmm1, %xmm0
234240
shufps $0, %xmm0, %xmm0
235241

236242
testq $4, X
@@ -427,28 +433,28 @@
427433
#ifdef USE_ABS
428434
andps %xmm15, %xmm4
429435
#endif
430-
maxps %xmm4, %xmm0
436+
MAXPS %xmm4, %xmm0
431437

432438
movsd 4 * SIZE(X), %xmm5
433439
movhps 6 * SIZE(X), %xmm5
434440
#ifdef USE_ABS
435441
andps %xmm15, %xmm5
436442
#endif
437-
maxps %xmm5, %xmm1
443+
MAXPS %xmm5, %xmm1
438444

439445
movsd 8 * SIZE(X), %xmm6
440446
movhps 10 * SIZE(X), %xmm6
441447
#ifdef USE_ABS
442448
andps %xmm15, %xmm6
443449
#endif
444-
maxps %xmm6, %xmm2
450+
MAXPS %xmm6, %xmm2
445451

446452
movsd 12 * SIZE(X), %xmm7
447453
movhps 14 * SIZE(X), %xmm7
448454
#ifdef USE_ABS
449455
andps %xmm15, %xmm7
450456
#endif
451-
maxps %xmm7, %xmm3
457+
MAXPS %xmm7, %xmm3
452458

453459
addq $16 * SIZE, X
454460
decq I
@@ -467,14 +473,14 @@
467473
#ifdef USE_ABS
468474
andps %xmm15, %xmm4
469475
#endif
470-
maxps %xmm4, %xmm0
476+
MAXPS %xmm4, %xmm0
471477

472478
movsd 4 * SIZE(X), %xmm5
473479
movhps 6 * SIZE(X), %xmm5
474480
#ifdef USE_ABS
475481
andps %xmm15, %xmm5
476482
#endif
477-
maxps %xmm5, %xmm1
483+
MAXPS %xmm5, %xmm1
478484

479485
addq $8 * SIZE, X
480486
ALIGN_3
@@ -488,7 +494,7 @@
488494
#ifdef USE_ABS
489495
andps %xmm15, %xmm6
490496
#endif
491-
maxps %xmm6, %xmm2
497+
MAXPS %xmm6, %xmm2
492498
addq $4 * SIZE, X
493499
ALIGN_3
494500

@@ -501,7 +507,7 @@
501507
#ifdef USE_ABS
502508
andps %xmm15, %xmm7
503509
#endif
504-
maxps %xmm7, %xmm3
510+
MAXPS %xmm7, %xmm3
505511
addq $2 * SIZE, X
506512

507513
.L38:
@@ -512,23 +518,23 @@
512518
#ifdef USE_ABS
513519
andps %xmm15, %xmm4
514520
#endif
515-
maxss %xmm4, %xmm0
521+
MAXSS %xmm4, %xmm0
516522
jmp .L40
517523
ALIGN_4
518524

519525
.L40:
520526
movq XX, X
521527
movq MM, M
522528

523-
maxps %xmm1, %xmm0
524-
maxps %xmm3, %xmm2
525-
maxps %xmm2, %xmm0
529+
MAXPS %xmm1, %xmm0
530+
MAXPS %xmm3, %xmm2
531+
MAXPS %xmm2, %xmm0
526532
movaps %xmm0, %xmm1
527533
movhlps %xmm0, %xmm0
528-
maxps %xmm1, %xmm0
534+
MAXPS %xmm1, %xmm0
529535
movaps %xmm0, %xmm1
530536
shufps $1, %xmm0, %xmm0
531-
maxss %xmm1, %xmm0
537+
MAXSS %xmm1, %xmm0
532538
shufps $0, %xmm0, %xmm0
533539

534540
movq M, I
@@ -687,56 +693,56 @@
687693
#ifdef USE_ABS
688694
andps %xmm15, %xmm4
689695
#endif
690-
maxss %xmm4, %xmm0
696+
MAXSS %xmm4, %xmm0
691697

692698
movss 0 * SIZE(X), %xmm5
693699
addq INCX, X
694700
#ifdef USE_ABS
695701
andps %xmm15, %xmm5
696702
#endif
697-
maxss %xmm5, %xmm1
703+
MAXSS %xmm5, %xmm1
698704

699705
movss 0 * SIZE(X), %xmm6
700706
addq INCX, X
701707
#ifdef USE_ABS
702708
andps %xmm15, %xmm6
703709
#endif
704-
maxss %xmm6, %xmm2
710+
MAXSS %xmm6, %xmm2
705711

706712
movss 0 * SIZE(X), %xmm7
707713
addq INCX, X
708714
#ifdef USE_ABS
709715
andps %xmm15, %xmm7
710716
#endif
711-
maxss %xmm7, %xmm3
717+
MAXSS %xmm7, %xmm3
712718

713719
movss 0 * SIZE(X), %xmm4
714720
addq INCX, X
715721
#ifdef USE_ABS
716722
andps %xmm15, %xmm4
717723
#endif
718-
maxss %xmm4, %xmm0
724+
MAXSS %xmm4, %xmm0
719725

720726
movss 0 * SIZE(X), %xmm5
721727
addq INCX, X
722728
#ifdef USE_ABS
723729
andps %xmm15, %xmm5
724730
#endif
725-
maxss %xmm5, %xmm1
731+
MAXSS %xmm5, %xmm1
726732

727733
movss 0 * SIZE(X), %xmm6
728734
addq INCX, X
729735
#ifdef USE_ABS
730736
andps %xmm15, %xmm6
731737
#endif
732-
maxss %xmm6, %xmm2
738+
MAXSS %xmm6, %xmm2
733739

734740
movss 0 * SIZE(X), %xmm7
735741
addq INCX, X
736742
#ifdef USE_ABS
737743
andps %xmm15, %xmm7
738744
#endif
739-
maxss %xmm7, %xmm3
745+
MAXSS %xmm7, %xmm3
740746

741747
decq I
742748
jg .L81
@@ -754,28 +760,28 @@
754760
#ifdef USE_ABS
755761
andps %xmm15, %xmm4
756762
#endif
757-
maxss %xmm4, %xmm0
763+
MAXSS %xmm4, %xmm0
758764

759765
movss 0 * SIZE(X), %xmm5
760766
addq INCX, X
761767
#ifdef USE_ABS
762768
andps %xmm15, %xmm5
763769
#endif
764-
maxss %xmm5, %xmm1
770+
MAXSS %xmm5, %xmm1
765771

766772
movss 0 * SIZE(X), %xmm6
767773
addq INCX, X
768774
#ifdef USE_ABS
769775
andps %xmm15, %xmm6
770776
#endif
771-
maxss %xmm6, %xmm2
777+
MAXSS %xmm6, %xmm2
772778

773779
movss 0 * SIZE(X), %xmm7
774780
addq INCX, X
775781
#ifdef USE_ABS
776782
andps %xmm15, %xmm7
777783
#endif
778-
maxss %xmm7, %xmm3
784+
MAXSS %xmm7, %xmm3
779785
ALIGN_3
780786

781787
.L86:
@@ -787,14 +793,14 @@
787793
#ifdef USE_ABS
788794
andps %xmm15, %xmm4
789795
#endif
790-
maxss %xmm4, %xmm0
796+
MAXSS %xmm4, %xmm0
791797

792798
movss 0 * SIZE(X), %xmm5
793799
addq INCX, X
794800
#ifdef USE_ABS
795801
andps %xmm15, %xmm5
796802
#endif
797-
maxss %xmm5, %xmm1
803+
MAXSS %xmm5, %xmm1
798804
ALIGN_3
799805

800806
.L87:
@@ -806,16 +812,16 @@
806812
#ifdef USE_ABS
807813
andps %xmm15, %xmm6
808814
#endif
809-
maxss %xmm6, %xmm2
815+
MAXSS %xmm6, %xmm2
810816
ALIGN_4
811817

812818
.L90:
813819
movq XX, X
814820
movq MM, M
815821

816-
maxss %xmm1, %xmm0
817-
maxss %xmm3, %xmm2
818-
maxss %xmm2, %xmm0
822+
MAXSS %xmm1, %xmm0
823+
MAXSS %xmm3, %xmm2
824+
MAXSS %xmm2, %xmm0
819825
shufps $0, %xmm0, %xmm0
820826

821827
movq M, I

0 commit comments

Comments
 (0)
Please sign in to comment.