x86_64: Fix svml_d_atan22_core_sse4.S code formatting

This commit contains the following formatting changes (a before/after
sketch follows the list):

1. Instructions preceded by a tab.
2. Instructions less than 8 characters in length have a tab
   between the mnemonic and the first operand.
3. Instructions greater than 7 characters in length have a
   space between the mnemonic and the first operand.
4. Tabs between `#define`d names and their values.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.
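
For illustration, a hypothetical two-instruction fragment (not taken
verbatim from this file), first in the old style:

        movaps %xmm1,%xmm4   /*copy*/
        cmpnltpd %xmm9,%xmm4

and then with the rules above applied:

	movaps	%xmm1, %xmm4 /* copy */
	cmpnltpd %xmm9, %xmm4

`movaps` is shorter than 8 characters, so a tab separates the mnemonic
from its first operand; `cmpnltpd` is 8 characters long, so a single
space is used instead.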

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Author: Sunil K Pandey
Date:   2022-03-07 10:47:10 -08:00
Commit: 1447e84caf (parent f55b59764a)
1 changed file with 384 additions and 385 deletions

@@ -60,14 +60,13 @@
#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2vv_atan2_sse4)
	subq	$88, %rsp
	cfi_def_cfa_offset(96)
	movaps	%xmm1, %xmm11
	/*
	 * #define NO_VECTOR_ZERO_ATAN2_ARGS
	 * Declarations
	 * Variables
@@ -86,13 +85,13 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
	movaps	%xmm1, %xmm4
	cmpnltpd %xmm9, %xmm4
	/* Argument signs */
	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
	movaps	%xmm4, %xmm0
	movaps	%xmm5, %xmm8
	movaps	%xmm5, %xmm7
	/*
	 * 1) If y<x then a= y, b=x, PIO2=0
	 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
	 */
@@ -112,7 +111,7 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
	xorl	%edx, %edx
	/* Check if y and x are on main path. */
	pshufd	$221, %xmm9, %xmm3
	xorl	%eax, %eax
	pshufd	$221, %xmm1, %xmm13
@@ -126,96 +125,96 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
	pcmpgtd	%xmm12, %xmm14
	pcmpeqd	%xmm12, %xmm13
	/* Polynomial. */
	movaps	%xmm0, %xmm12
	por	%xmm3, %xmm4
	mulpd	%xmm0, %xmm12
	/* P = A19*R2 + A18 */
	movups	dA19+__svml_datan2_data_internal(%rip), %xmm15
	movaps	%xmm11, %xmm2
	mulpd	%xmm12, %xmm15
	addpd	dA18+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A17 */
	mulpd	%xmm12, %xmm15
	addpd	dA17+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A16 */
	mulpd	%xmm12, %xmm15
	addpd	dA16+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A15 */
	mulpd	%xmm12, %xmm15
	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A14 */
	mulpd	%xmm12, %xmm15
	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A13 */
	mulpd	%xmm12, %xmm15
	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A12 */
	mulpd	%xmm12, %xmm15
	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A11 */
	mulpd	%xmm12, %xmm15
	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A10 */
	mulpd	%xmm12, %xmm15
	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A09 */
	mulpd	%xmm12, %xmm15
	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A08 */
	mulpd	%xmm12, %xmm15
	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A07 */
	mulpd	%xmm12, %xmm15
	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A06 */
	mulpd	%xmm12, %xmm15
	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A05 */
	mulpd	%xmm12, %xmm15
	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A04 */
	mulpd	%xmm12, %xmm15
	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A03 */
	mulpd	%xmm12, %xmm15
	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A02 */
	mulpd	%xmm12, %xmm15
	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 + A01 */
	mulpd	%xmm12, %xmm15
	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm15
	/* P = P*R2 */
	mulpd	%xmm15, %xmm12
	/*
	 * Reconstruction.
	 * dP=(R+R*dP) + dPIO2
	 */
	mulpd	%xmm0, %xmm12
	addpd	%xmm12, %xmm0
	/* if x<0, dPI = Pi, else dPI =0 */
	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm3
	por	%xmm13, %xmm14
	cmplepd	%xmm3, %xmm2
@@ -227,29 +226,29 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
	orps	%xmm7, %xmm0
	movmskps %xmm4, %ecx
	/* Special branch for fast (vector) processing of zero arguments */
	testb	$3, %cl
	/* Go to auxilary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
	/* Return from auxilary branch
	 * for out of main path inputs
	 */
L(AUX_BRANCH_RETURN):
	/*
	 * Special branch for fast (vector) processing of zero arguments
	 * The end of implementation
	 */
	testl	%edx, %edx
	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
	/* Restore registers
	 * and exit the function
	 */
@@ -259,7 +258,7 @@ L(EXIT):
	ret
	cfi_def_cfa_offset(96)
	/* Branch to process
	 * special inputs
	 */
@@ -279,18 +278,18 @@ L(SPECIAL_VALUES_BRANCH):
	cfi_offset(14, -96)
	# LOE rbx rbp r15 r12d r13d
	/* Range mask
	 * bits check
	 */
L(RANGEMASK_CHECK):
	btl	%r12d, %r13d
	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d
	/* Special inputs
	 * processing loop
	 */
@@ -298,7 +297,7 @@ L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$2, %r12d
	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d
@@ -310,49 +309,49 @@ L(SPECIAL_VALUES_LOOP):
	cfi_restore(14)
	movups	64(%rsp), %xmm0
	/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -80)
	cfi_offset(13, -88)
	cfi_offset(14, -96)
	# LOE rbx rbp r12 r13 r14 r15 xmm0
	/* Scalar math fucntion call
	 * to process special input
	 */
L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movsd	32(%rsp, %r14, 8), %xmm0
	movsd	48(%rsp, %r14, 8), %xmm1
	call	atan2@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0
	movsd	%xmm0, 64(%rsp, %r14, 8)
	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx rbp r15 r12d r13d
	/* Auxilary branch
	 * for out of main path inputs
	 */
L(AUX_BRANCH):
	/* Check if both X & Y are not NaNs: iXYnotNAN */
	movaps	%xmm11, %xmm13
	movaps	%xmm10, %xmm12
	cmpordpd %xmm11, %xmm13
	cmpordpd %xmm10, %xmm12
	/* Check if at least on of Y or Y is zero: iAXAYZERO */
	cmpeqpd	%xmm3, %xmm9
	cmpeqpd	%xmm3, %xmm1
	/*
	 * Path for zero arguments (at least one of both)
	 * Check if both args are zeros (den. is zero)
	 */
@@ -362,19 +361,19 @@ L(AUX_BRANCH):
	pshufd	$221, %xmm9, %xmm1
	pshufd	$221, %xmm13, %xmm9
	/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
	pand	%xmm9, %xmm1
	/* Exclude from previous callout mask zero (and not NaN) arguments */
	movdqa	%xmm1, %xmm14
	pandn	%xmm4, %xmm14
	/* Set sPIO2 to zero if den. is zero */
	movaps	%xmm5, %xmm4
	andnps	%xmm6, %xmm4
	andps	%xmm3, %xmm5
	/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
	pshufd	$221, %xmm3, %xmm3
	orps	%xmm5, %xmm4
	pshufd	$221, %xmm11, %xmm5
@@ -384,10 +383,10 @@ L(AUX_BRANCH):
	andps	%xmm2, %xmm6
	addpd	%xmm6, %xmm4
	/* Go to callout */
	movmskps %xmm14, %edx
	/* Merge results from main and spec path */
	pshufd	$80, %xmm1, %xmm2
	orps	%xmm7, %xmm4
	movdqa	%xmm2, %xmm7
@@ -397,7 +396,7 @@ L(AUX_BRANCH):
	movaps	%xmm7, %xmm0
	orps	%xmm4, %xmm0
	/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
@@ -438,9 +437,9 @@ typedef struct {
} __svml_datan2_data_internal;
#endif
__svml_datan2_data_internal:
	.quad	0x400921FB54442D18, 0x400921FB54442D18 // dPI
	.align	16
	.quad	0x3FF921FB54442D18, 0x3FF921FB54442D18 // dPIO2
	.align	16
	.quad	0xBEF4FDB537ABC7A3, 0xBEF4FDB537ABC7A3 // dA19
	.align	16
@@ -482,15 +481,15 @@ __svml_datan2_data_internal:
	.align	16
	.quad	0x3FF0000000000000, 0x3FF0000000000000 // dA00
	.align	16
	.quad	0x8000000000000000, 0x8000000000000000 // dSIGN_MASK
	.align	16
	.long	0x80300000, 0x80300000, 0x80300000, 0x80300000 // iCHK_WORK_SUB
	.align	16
	.long	0xfdd00000, 0xfdd00000, 0xfdd00000, 0xfdd00000 // iCHK_WORK_CMP
	.align	16
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff // dABS_MASK
	.align	16
	.quad	0x0000000000000000, 0x0000000000000000 // dZERO
	.align	16
	.type	__svml_datan2_data_internal, @object
	.size	__svml_datan2_data_internal, .-__svml_datan2_data_internal
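
For reference, the long run of mulpd/addpd pairs in the main path
evaluates the polynomial by Horner's rule, as the inline
"P = P*R2 + Axx" comments indicate:

	P = (...((A19*R2 + A18)*R2 + A17)*R2 + ... + A01)*R2

and the result is then reconstructed as (R + R*P) + dPIO2, with dPI
added when x < 0, exactly as the "Reconstruction" and "if x<0, dPI = Pi"
comments describe.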