| extern _ps_am_inv_mant_mask
extern _ps_am_min_norm_pos
extern _ps_am_1
extern _ps_am_0p5
extern _pi32_0x7f
extern _epi32_0x7f
extern _epi32_1
extern _ps_log_p0
extern _ps_log_p1
extern _ps_log_p2
extern _ps_log_q0
extern _ps_log_q1
extern _ps_log_q2
extern _ps_log2_c0
extern _ps_exp2_hi
extern _ps_exp2_lo
extern _ps_exp2_p0
extern _ps_exp2_p1
extern _ps_exp2_p2
extern _ps_exp2_q0
extern _ps_exp2_q1
;-----------------------------------------------------------------------
; __m128 mm_powf_eps(__m128 x, __m128 y);
;
; Packed single-precision pow(x, y) approximation (SSE2), evaluated on
; all four lanes as exp2(y * log2(x)).
;
; In:    xmm0 = x, xmm1 = y (both are read directly from the registers)
; Out:   xmm0 = result. In lanes where (0 < x) is false the 2^n scale
;        factor is masked to zero before the final multiply.
; Clobb: xmm2-xmm7, flags (xmm1 is only read)
; Stack: one 16-byte aligned spill slot at [esp], reserved by the
;        prologue. Nothing is stored below esp: 32-bit code has no red
;        zone, so data there could be overwritten asynchronously
;        (e.g. by a signal frame).
; Note:  all coefficient tables are externs; their values are not
;        visible here, so the comments below describe data flow only.
;-----------------------------------------------------------------------
global mm_powf_eps:function ; sse2 packed
mm_powf_eps:
        push    ebp
        mov     ebp, esp                ; ebp remembers the caller's esp
        sub     esp, 16
        and     esp, 0FFFFFFF0h         ; [esp .. esp+15] = aligned spill slot

        ; ---- split x into mantissa m and exponent e -------------------
        xorps   xmm5, xmm5
        cmpltps xmm5, xmm0              ; xmm5 = lane mask (0 < x)
        maxps   xmm0, [_ps_am_min_norm_pos] ; clamp x from below
        movaps  xmm7, [_ps_am_1]
        movaps  xmm3, xmm0              ; xmm3 = clamped x (exponent source)
        andps   xmm0, [_ps_am_inv_mant_mask]
        orps    xmm0, xmm7              ; xmm0 = m (masked bits OR'ed with 1.0)
        movaps  [esp], xmm5             ; spill the (0 < x) mask
        movaps  xmm4, xmm0
        subps   xmm0, xmm7              ; m - 1
        addps   xmm4, xmm7              ; m + 1
        psrld   xmm3, 23                ; shift the exponent field down
        rcpps   xmm4, xmm4              ; ~1 / (m + 1)
        mulps   xmm0, xmm4
        psubd   xmm3, [_epi32_0x7f]     ; xmm3 = e (bias removed), integer
        addps   xmm0, xmm0              ; z = 2 * (m - 1) / (m + 1)
        movaps  xmm2, xmm0              ; xmm2 = z
        mulps   xmm0, xmm0              ; xmm0 = z^2

        ; ---- rational polynomial P(z^2) / Q(z^2) ----------------------
        movaps  xmm4, [_ps_log_p0]
        movaps  xmm6, [_ps_log_q0]
        mulps   xmm4, xmm0
        movaps  xmm5, [_ps_log_p1]
        mulps   xmm6, xmm0
        movaps  xmm7, [_ps_log_q1]
        addps   xmm4, xmm5
        addps   xmm6, xmm7
        movaps  xmm5, [_ps_log_p2]
        mulps   xmm4, xmm0
        movaps  xmm7, [_ps_log_q2]
        mulps   xmm6, xmm0
        addps   xmm4, xmm5              ; xmm4 = P(z^2)
        movaps  xmm5, [_ps_log2_c0]
        addps   xmm6, xmm7              ; xmm6 = Q(z^2)
        cvtdq2ps xmm7, xmm3             ; xmm7 = (float)e
        mulps   xmm0, xmm4              ; z^2 * P
        rcpps   xmm6, xmm6              ; ~1 / Q
        mulps   xmm0, xmm6              ; z^2 * P / Q
        movaps  xmm4, [_ps_exp2_hi]
        mulps   xmm0, xmm2              ; z^3 * P / Q
        movaps  xmm6, [_ps_exp2_lo]
        mulps   xmm2, xmm5              ; z * c0
        mulps   xmm0, xmm5              ; z^3 * P / Q * c0
        addps   xmm2, xmm7              ; e + z * c0
        movaps  xmm3, [_ps_am_0p5]
        addps   xmm0, xmm2              ; xmm0 = log2(x) estimate

        ; ---- t = clamp(y * log2(x)); split t = n + f ------------------
        xorps   xmm2, xmm2
        mulps   xmm0, xmm1              ; t = y * log2(x)
        minps   xmm0, xmm4              ; t = min(t, exp2_hi)
        movaps  xmm4, [_ps_exp2_p0]
        maxps   xmm0, xmm6              ; t = max(t, exp2_lo)
        movaps  xmm6, [_ps_exp2_q0]
        addps   xmm3, xmm0              ; t + 0.5
        cmpnltps xmm2, xmm3             ; mask where !(0 < t + 0.5)
        pand    xmm2, [_epi32_1]        ; -> integer 1 in those lanes
        cvttps2dq xmm3, xmm3            ; truncate toward zero
        psubd   xmm3, xmm2              ; n = trunc(t + 0.5) - correction
        movaps  xmm5, [_ps_exp2_p1]
        cvtdq2ps xmm2, xmm3             ; (float)n
        movaps  xmm7, [_ps_exp2_q1]
        subps   xmm0, xmm2              ; f = t - n
        movaps  xmm2, xmm0              ; xmm2 = f
        mulps   xmm0, xmm0              ; xmm0 = f^2

        ; ---- 2^f via rational polynomial, then scale by 2^n -----------
        paddd   xmm3, [_epi32_0x7f]     ; n + exponent bias
        mulps   xmm4, xmm0
        mulps   xmm6, xmm0
        addps   xmm4, xmm5
        addps   xmm6, xmm7              ; xmm6 = Q(f^2)
        mulps   xmm4, xmm0
        movaps  xmm5, [esp]             ; reload the (0 < x) mask
        pslld   xmm3, 23                ; bit pattern of 2^n
        addps   xmm4, [_ps_exp2_p2]     ; xmm4 = P(f^2)
        mulps   xmm2, xmm4              ; f * P
        movaps  xmm0, [_ps_am_1]
        subps   xmm6, xmm2              ; Q - f * P
        andps   xmm3, xmm5              ; zero 2^n where (0 < x) was false
        rcpps   xmm6, xmm6
        mulps   xmm2, xmm6              ; f * P / (Q - f * P)
        addps   xmm2, xmm2
        addps   xmm0, xmm2              ; 1 + 2 * f * P / (Q - f * P)
        mulps   xmm0, xmm3              ; result = 2^f estimate * 2^n

        mov     esp, ebp
        pop     ebp
        ret
;-----------------------------------------------------------------------
; __m128 mm_powf_ess(__m128 x, __m128 y);
;
; Single-precision pow(x, y) approximation (SSE2), evaluated as
; exp2(y * log2(x)).
;
; NOTE(review): the entry point is labelled "unpacked", but every
; instruction below is the packed (ps / epi32) form, so all four lanes
; of xmm0/xmm1 are processed. Confirm whether a true scalar
; implementation was intended.
;
; In:    xmm0 = x, xmm1 = y (both are read directly from the registers)
; Out:   xmm0 = result. In lanes where (0 < x) is false the 2^n scale
;        factor is masked to zero before the final multiply.
; Clobb: xmm2-xmm7, flags (xmm1 is only read)
; Stack: one 16-byte aligned spill slot at [esp], reserved by the
;        prologue. Nothing is stored below esp: 32-bit code has no red
;        zone, so data there could be overwritten asynchronously
;        (e.g. by a signal frame).
;-----------------------------------------------------------------------
global mm_powf_ess:function ; sse2 unpacked
mm_powf_ess:
        push    ebp
        mov     ebp, esp                ; ebp remembers the caller's esp
        sub     esp, 16
        and     esp, 0FFFFFFF0h         ; [esp .. esp+15] = aligned spill slot

        ; ---- split x into mantissa m and exponent e -------------------
        xorps   xmm5, xmm5
        cmpltps xmm5, xmm0              ; xmm5 = lane mask (0 < x)
        maxps   xmm0, [_ps_am_min_norm_pos] ; clamp x from below
        movaps  xmm7, [_ps_am_1]
        movaps  xmm3, xmm0              ; xmm3 = clamped x (exponent source)
        andps   xmm0, [_ps_am_inv_mant_mask]
        orps    xmm0, xmm7              ; xmm0 = m (masked bits OR'ed with 1.0)
        movaps  [esp], xmm5             ; spill the (0 < x) mask
        movaps  xmm4, xmm0
        subps   xmm0, xmm7              ; m - 1
        addps   xmm4, xmm7              ; m + 1
        psrld   xmm3, 23                ; shift the exponent field down
        rcpps   xmm4, xmm4              ; ~1 / (m + 1)
        mulps   xmm0, xmm4
        psubd   xmm3, [_epi32_0x7f]     ; xmm3 = e (bias removed), integer
        addps   xmm0, xmm0              ; z = 2 * (m - 1) / (m + 1)
        movaps  xmm2, xmm0              ; xmm2 = z
        mulps   xmm0, xmm0              ; xmm0 = z^2

        ; ---- rational polynomial P(z^2) / Q(z^2) ----------------------
        movaps  xmm4, [_ps_log_p0]
        movaps  xmm6, [_ps_log_q0]
        mulps   xmm4, xmm0
        movaps  xmm5, [_ps_log_p1]
        mulps   xmm6, xmm0
        movaps  xmm7, [_ps_log_q1]
        addps   xmm4, xmm5
        addps   xmm6, xmm7
        movaps  xmm5, [_ps_log_p2]
        mulps   xmm4, xmm0
        movaps  xmm7, [_ps_log_q2]
        mulps   xmm6, xmm0
        addps   xmm4, xmm5              ; xmm4 = P(z^2)
        movaps  xmm5, [_ps_log2_c0]
        addps   xmm6, xmm7              ; xmm6 = Q(z^2)
        cvtdq2ps xmm7, xmm3             ; xmm7 = (float)e
        mulps   xmm0, xmm4              ; z^2 * P
        rcpps   xmm6, xmm6              ; ~1 / Q
        mulps   xmm0, xmm6              ; z^2 * P / Q
        movaps  xmm4, [_ps_exp2_hi]
        mulps   xmm0, xmm2              ; z^3 * P / Q
        movaps  xmm6, [_ps_exp2_lo]
        mulps   xmm2, xmm5              ; z * c0
        mulps   xmm0, xmm5              ; z^3 * P / Q * c0
        addps   xmm2, xmm7              ; e + z * c0
        movaps  xmm3, [_ps_am_0p5]
        addps   xmm0, xmm2              ; xmm0 = log2(x) estimate

        ; ---- t = clamp(y * log2(x)); split t = n + f ------------------
        xorps   xmm2, xmm2
        mulps   xmm0, xmm1              ; t = y * log2(x)
        minps   xmm0, xmm4              ; t = min(t, exp2_hi)
        movaps  xmm4, [_ps_exp2_p0]
        maxps   xmm0, xmm6              ; t = max(t, exp2_lo)
        movaps  xmm6, [_ps_exp2_q0]
        addps   xmm3, xmm0              ; t + 0.5
        cmpnltps xmm2, xmm3             ; mask where !(0 < t + 0.5)
        pand    xmm2, [_epi32_1]        ; -> integer 1 in those lanes
        cvttps2dq xmm3, xmm3            ; truncate toward zero
        psubd   xmm3, xmm2              ; n = trunc(t + 0.5) - correction
        movaps  xmm5, [_ps_exp2_p1]
        cvtdq2ps xmm2, xmm3             ; (float)n
        movaps  xmm7, [_ps_exp2_q1]
        subps   xmm0, xmm2              ; f = t - n
        movaps  xmm2, xmm0              ; xmm2 = f
        mulps   xmm0, xmm0              ; xmm0 = f^2

        ; ---- 2^f via rational polynomial, then scale by 2^n -----------
        paddd   xmm3, [_epi32_0x7f]     ; n + exponent bias
        mulps   xmm4, xmm0
        mulps   xmm6, xmm0
        addps   xmm4, xmm5
        addps   xmm6, xmm7              ; xmm6 = Q(f^2)
        mulps   xmm4, xmm0
        movaps  xmm5, [esp]             ; reload the (0 < x) mask
        pslld   xmm3, 23                ; bit pattern of 2^n
        addps   xmm4, [_ps_exp2_p2]     ; xmm4 = P(f^2)
        mulps   xmm2, xmm4              ; f * P
        movaps  xmm0, [_ps_am_1]
        subps   xmm6, xmm2              ; Q - f * P
        andps   xmm3, xmm5              ; zero 2^n where (0 < x) was false
        rcpps   xmm6, xmm6
        mulps   xmm2, xmm6              ; f * P / (Q - f * P)
        addps   xmm2, xmm2
        addps   xmm0, xmm2              ; 1 + 2 * f * P / (Q - f * P)
        mulps   xmm0, xmm3              ; result = 2^f estimate * 2^n

        mov     esp, ebp
        pop     ebp
        ret
;-----------------------------------------------------------------------
; __m128 mm_powf_ps(__m128 x, __m128 y);
;
; Packed single-precision pow(x, y) approximation for SSE1-only CPUs,
; evaluated on all four lanes as exp2(y * log2(x)). Integer work on the
; exponent fields is done two lanes at a time in MMX registers, going
; through memory to move data between XMM and MMX.
;
; In:    xmm0 = x, xmm1 = y (both are read directly from the registers)
; Out:   xmm0 = result; lanes where (0 < x) is false are ANDed to zero.
; Clobb: xmm2-xmm7, mm0-mm3, mm5, mm7, flags (xmm1 is only read)
; Stack: two 16-byte aligned slots reserved by the prologue:
;          [esp + 16]  clamped x, later the bit pattern of 2^n
;          [esp]       the (0 < x) lane mask
;        Nothing is stored below esp: 32-bit code has no red zone, so
;        data there could be overwritten asynchronously.
; FPU:   emms is executed before returning so the x87 register file is
;        usable again by the caller (MMX aliases the x87 registers).
;-----------------------------------------------------------------------
global mm_powf_ps:function ; sse packed
mm_powf_ps:
        push    ebp
        mov     ebp, esp                ; ebp remembers the caller's esp
        sub     esp, 32
        and     esp, 0FFFFFFF0h         ; two aligned 16-byte slots at [esp]

        ; ---- split x into mantissa m and exponent e -------------------
        xorps   xmm4, xmm4
        cmpltps xmm4, xmm0              ; xmm4 = lane mask (0 < x)
        maxps   xmm0, [_ps_am_min_norm_pos] ; clamp x from below
        movaps  xmm7, [_ps_am_inv_mant_mask]
        movaps  xmm3, [_ps_am_1]
        movaps  [esp + 16], xmm0        ; spill clamped x for the MMX side
        andps   xmm0, xmm7
        orps    xmm0, xmm3              ; xmm0 = m (masked bits OR'ed with 1.0)
        movaps  xmm7, xmm0
        subps   xmm0, xmm3              ; m - 1
        addps   xmm7, xmm3              ; m + 1
        movq    mm0, [esp + 16]         ; raw bits of lanes 0-1
        rcpps   xmm7, xmm7              ; ~1 / (m + 1)
        mulps   xmm0, xmm7
        movq    mm1, [esp + 16 + 8]     ; raw bits of lanes 2-3
        addps   xmm0, xmm0              ; z = 2 * (m - 1) / (m + 1)
        movq    mm7, [_pi32_0x7f]
        psrld   mm0, 23                 ; shift the exponent fields down
        psrld   mm1, 23
        movaps  [esp], xmm4             ; spill the (0 < x) mask
        movaps  xmm2, xmm0
        psubd   mm0, mm7                ; e for lanes 0-1 (bias removed)
        mulps   xmm2, xmm2              ; xmm2 = z^2
        psubd   mm1, mm7                ; e for lanes 2-3

        ; ---- rational polynomial P(z^2) / Q(z^2) ----------------------
        movaps  xmm4, [_ps_log_p0]
        movaps  xmm6, [_ps_log_q0]
        mulps   xmm4, xmm2
        movaps  xmm5, [_ps_log_p1]
        mulps   xmm6, xmm2
        movaps  xmm7, [_ps_log_q1]
        addps   xmm4, xmm5
        addps   xmm6, xmm7
        movaps  xmm5, [_ps_log_p2]
        mulps   xmm4, xmm2
        cvtpi2ps xmm3, mm1              ; (float)e of lanes 2-3 -> low half
        movaps  xmm7, [_ps_log_q2]
        mulps   xmm6, xmm2
        movlhps xmm3, xmm3              ; ... moved up to the high half
        addps   xmm4, xmm5              ; xmm4 = P(z^2)
        addps   xmm6, xmm7              ; xmm6 = Q(z^2)
        movaps  xmm5, [_ps_log2_c0]
        mulps   xmm4, xmm2              ; z^2 * P
        cvtpi2ps xmm3, mm0              ; (float)e of lanes 0-1 -> low half
        rcpps   xmm6, xmm6              ; ~1 / Q
        mulps   xmm5, xmm1              ; c0 * y
        mulps   xmm4, xmm6              ; z^2 * P / Q
        movaps  xmm6, [_ps_exp2_hi]
        mulps   xmm4, xmm0              ; z^3 * P / Q
        addps   xmm0, xmm4              ; z + z^3 * P / Q
        movaps  xmm4, [_ps_exp2_lo]
        mulps   xmm3, xmm1              ; e * y
        mulps   xmm0, xmm5              ; (z + z^3 * P / Q) * c0 * y
        movaps  xmm5, [_ps_am_1]
        xorps   xmm7, xmm7
        addps   xmm0, xmm3              ; t = y * log2(x) estimate

        ; ---- clamp t and split t = n + f ------------------------------
        movaps  xmm3, [_ps_am_0p5]
        minps   xmm0, xmm6              ; t = min(t, exp2_hi)
        maxps   xmm0, xmm4              ; t = max(t, exp2_lo)
        addps   xmm3, xmm0              ; t + 0.5
        movaps  xmm2, xmm3
        cvttps2pi mm0, xmm3             ; trunc(t + 0.5), lanes 0-1
        cmpltps xmm2, xmm7              ; mask where t + 0.5 < 0
        movhlps xmm3, xmm3
        andps   xmm2, xmm5              ; 1.0f in those lanes, else 0
        cvttps2pi mm1, xmm3             ; trunc(t + 0.5), lanes 2-3
        movq    mm5, [_pi32_0x7f]
        cvtps2pi mm2, xmm2              ; correction as integer, lanes 0-1
        movhlps xmm2, xmm2
        cvtps2pi mm3, xmm2              ; correction as integer, lanes 2-3
        psubd   mm0, mm2                ; n, lanes 0-1
        psubd   mm1, mm3                ; n, lanes 2-3
        cvtpi2ps xmm3, mm1
        movlhps xmm3, xmm3
        paddd   mm1, mm5                ; n + exponent bias
        cvtpi2ps xmm3, mm0              ; xmm3 = (float)n, all four lanes
        paddd   mm0, mm5
        subps   xmm0, xmm3              ; f = t - n
        movaps  xmm2, xmm0
        mulps   xmm2, xmm2              ; xmm2 = f^2

        ; ---- 2^f via rational polynomial, then scale by 2^n -----------
        movaps  xmm6, [_ps_exp2_q0]
        movaps  xmm4, [_ps_exp2_p0]
        mulps   xmm6, xmm2
        movaps  xmm7, [_ps_exp2_q1]
        mulps   xmm4, xmm2
        movaps  xmm5, [_ps_exp2_p1]
        addps   xmm6, xmm7              ; xmm6 = Q(f^2)
        pslld   mm0, 23                 ; bit pattern of 2^n, lanes 0-1
        addps   xmm4, xmm5
        movaps  xmm5, [_ps_exp2_p2]
        mulps   xmm4, xmm2
        pslld   mm1, 23                 ; bit pattern of 2^n, lanes 2-3
        movaps  xmm3, [esp]             ; reload the (0 < x) mask
        addps   xmm4, xmm5              ; xmm4 = P(f^2)
        movq    [esp + 16], mm0         ; hand 2^n back to the XMM side
        mulps   xmm4, xmm0              ; f * P
        movq    [esp + 16 + 8], mm1
        subps   xmm6, xmm4              ; Q - f * P
        movaps  xmm7, [_ps_am_1]
        rcpps   xmm6, xmm6
        mulps   xmm4, xmm6              ; f * P / (Q - f * P)
        movaps  xmm0, [esp + 16]        ; xmm0 = 2^n
        addps   xmm4, xmm4
        addps   xmm4, xmm7              ; 1 + 2 * f * P / (Q - f * P)
        mulps   xmm0, xmm4              ; 2^n * 2^f estimate
        andps   xmm0, xmm3              ; zero lanes where (0 < x) was false

        emms                            ; leave MMX state: restore x87 tags
        mov     esp, ebp
        pop     ebp
        ret
;-----------------------------------------------------------------------
; __m128 mm_powf_ss(__m128 x, __m128 y);
;
; Scalar single-precision pow(x, y) approximation (SSE1), evaluated on
; lane 0 as exp2(y * log2(x)). Exponent extraction and reconstruction
; are done in integer registers via a stack scratch dword.
;
; In:    xmm0 = x, xmm1 = y (lane 0 of each is used)
; Out:   xmm0 lane 0 = result; all of xmm0 is zeroed when x <= 0.
; Clobb: eax, ecx, edx, xmm1-xmm7, flags
; Stack: one dword at [esp], reserved by the prologue. Nothing is
;        stored below esp: 32-bit code has no red zone, so data there
;        could be overwritten asynchronously (e.g. by a signal frame).
;-----------------------------------------------------------------------
global mm_powf_ss:function ; sse unpacked
mm_powf_ss:
        sub     esp, 4                  ; [esp] = float <-> int scratch dword

        xorps   xmm7, xmm7
        comiss  xmm7, xmm0              ; CF = 1 when 0 < x (or unordered)
        movss   xmm7, [_ps_am_inv_mant_mask]    ; (SSE moves keep EFLAGS)
        maxss   xmm0, [_ps_am_min_norm_pos]     ; clamp x from below
        jnc     .zerobase               ; x <= 0: return 0

        ; ---- split x into mantissa m and exponent e -------------------
        movss   xmm3, [_ps_am_1]
        movss   [esp], xmm0             ; raw bits of clamped x
        andps   xmm0, xmm7
        orps    xmm0, xmm3              ; xmm0 = m (masked bits OR'ed with 1.0)
        movss   xmm7, xmm0
        addss   xmm7, xmm3              ; m + 1
        subss   xmm0, xmm3              ; m - 1
        mov     edx, [esp]
        rcpss   xmm7, xmm7              ; ~1 / (m + 1)
        mulss   xmm0, xmm7
        addss   xmm0, xmm0              ; z = 2 * (m - 1) / (m + 1)
        shr     edx, 23                 ; shift the exponent field down

        ; ---- rational polynomial P(z^2) / Q(z^2) ----------------------
        movss   xmm4, [_ps_log_p0]
        movss   xmm6, [_ps_log_q0]
        sub     edx, 0x7f               ; edx = e (bias removed)
        movss   xmm2, xmm0
        mulss   xmm2, xmm2              ; xmm2 = z^2
        mulss   xmm4, xmm2
        movss   xmm5, [_ps_log_p1]
        mulss   xmm6, xmm2
        cvtsi2ss xmm3, edx              ; (float)e
        movss   xmm7, [_ps_log_q1]
        addss   xmm4, xmm5
        mulss   xmm3, xmm1              ; e * y
        addss   xmm6, xmm7
        movss   xmm5, [_ps_log_p2]
        mulss   xmm4, xmm2
        movss   xmm7, [_ps_log_q2]
        mulss   xmm6, xmm2
        addss   xmm4, xmm5              ; xmm4 = P(z^2)
        mulss   xmm1, [_ps_log2_c0]     ; y * c0 (overwrites y)
        addss   xmm6, xmm7              ; xmm6 = Q(z^2)
        mulss   xmm4, xmm2              ; z^2 * P
        rcpss   xmm6, xmm6              ; ~1 / Q
        mulss   xmm6, xmm0              ; z / Q
        mulss   xmm4, xmm6              ; z^3 * P / Q
        movss   xmm6, [_ps_exp2_hi]
        addss   xmm0, xmm4              ; z + z^3 * P / Q
        movss   xmm4, [_ps_exp2_lo]
        xorps   xmm7, xmm7
        movss   xmm5, [_ps_am_0p5]
        mulss   xmm0, xmm1              ; ... * y * c0
        addss   xmm0, xmm3              ; t = y * log2(x) estimate

        ; ---- clamp t and split t = n + f ------------------------------
        xor     ecx, ecx
        minss   xmm0, xmm6              ; t = min(t, exp2_hi)
        mov     edx, 1
        maxss   xmm0, xmm4              ; t = max(t, exp2_lo)
        addss   xmm5, xmm0              ; t + 0.5
        comiss  xmm5, xmm7              ; CF = 1 when t + 0.5 < 0
        cvttss2si eax, xmm5             ; truncate toward zero (keeps EFLAGS)
        cmovc   ecx, edx                ; correction = 1 for negative values
        sub     eax, ecx                ; eax = n
        cvtsi2ss xmm5, eax
        add     eax, 0x7f               ; n + exponent bias
        subss   xmm0, xmm5              ; f = t - n
        movss   xmm2, xmm0
        mulss   xmm2, xmm2              ; xmm2 = f^2

        ; ---- 2^f via rational polynomial, then scale by 2^n -----------
        movss   xmm6, [_ps_exp2_q0]
        movss   xmm4, [_ps_exp2_p0]
        mulss   xmm6, xmm2
        movss   xmm7, [_ps_exp2_q1]
        mulss   xmm4, xmm2
        movss   xmm5, [_ps_exp2_p1]
        shl     eax, 23                 ; bit pattern of 2^n
        addss   xmm6, xmm7              ; xmm6 = Q(f^2)
        addss   xmm4, xmm5
        movss   xmm5, [_ps_exp2_p2]
        mulss   xmm4, xmm2
        addss   xmm4, xmm5              ; xmm4 = P(f^2)
        mulss   xmm4, xmm0              ; f * P
        mov     [esp], eax
        subss   xmm6, xmm4              ; Q - f * P
        movss   xmm7, [_ps_am_1]
        rcpss   xmm6, xmm6
        mulss   xmm4, xmm6              ; f * P / (Q - f * P)
        movss   xmm0, [esp]             ; xmm0 = 2^n
        addss   xmm4, xmm4
        addss   xmm4, xmm7              ; 1 + 2 * f * P / (Q - f * P)
        mulss   xmm0, xmm4              ; result = 2^n * 2^f estimate
        jmp     .quit

.zerobase:
        xorps   xmm0, xmm0              ; non-positive base: return 0

.quit:
        add     esp, 4
        ret
|