[SSE1/SSE2] Возведение в степень
От: Muxa  
Дата: 22.09.10 12:35
Оценка: 16 (2)
Однажды мне понадобилось быстрое возведение в степень при помощи SSE под линукс.
Готового решения я не нашел. Взял виндовую интеловскую библиотеку AMaths, выдернул из нее нужный мне кусок и портировал под ассемблер nasm (вроде получилось кроссплатформенно, но под винду компилировать/запускать не пробовал).
Делюсь тем что из этого получилось.

Весь код это два файла: mm_powf.s — исходный код на ассемблере и mm_powf.h — заголовочный файл для использования в C/C++.
Заголовочный файл содержит объявления 4-х функций для поэлементного возведения в степень __m128-векторов (sse/sse2 и packed/unpacked версии).
Просто включите этот заголовок в любой C/C++ файл (например, code.c) и используйте соответствующую функцию.

Как собирать:
; компилируем ассемблерный код в объектный файл
     nasm mm_powf.s -o mm_powf.o -f elf
; компилируем С код и линкуем, не забывая взять объектник из предыдущего шага
     gcc -O3 -msse2 -mmmx -mfpmath=sse,387 -march=pentium4 -fomit-frame-pointer code.c mm_powf.o -o code.bin


Собственно, сами файлы:
  mm_powf.h
/*
 * mm_powf.h - C/C++ interface to the pow() approximations in mm_powf.s.
 *
 * Besides the prototypes, this header DEFINES the constant tables that
 * mm_powf.s imports through its `extern` lines.  They are definitions, not
 * declarations, so include this header from exactly one translation unit per
 * program; including it from a second one produces duplicate-symbol errors
 * at link time.
 */
#ifndef __MM_POWF_H
#define __MM_POWF_H

/* 16-byte alignment: the assembly reads the tables with movaps and with
 * memory operands of packed SSE instructions. */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/*
 * In C++ a namespace-scope `const` object has internal linkage and function
 * names are mangled; either one hides the symbol from the assembly file.
 * Prefixing a declaration with extern "C" gives it external, unmangled
 * linkage.  In C the prefix expands to nothing.
 */
#ifdef __cplusplus
# define MM_POWF_C_LINKAGE extern "C"
#else
# define MM_POWF_C_LINKAGE
#endif

#include <xmmintrin.h>

/* Each macro defines one aligned 4-element table; the invocation supplies the
 * terminating semicolon. */

/* _ps_<Name>: four floats, all equal to Val. */
#define _PS_CONST(Name, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* Same expansion as _PS_CONST. */
#define _PS_EXTERN_CONST(Name, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* _ps_<Name>: four floats with individual values. */
#define _PS_CONST4(Name, Val0, Val1, Val2, Val3) \
MM_POWF_C_LINKAGE ALIGN16_BEG const float _ps_##Name[4] ALIGN16_END = { Val0, Val1, Val2, Val3 }

/* _ps_<Name>: four elements of an explicit element type (used for bit masks). */
#define _PS_CONST_TYPE(Name, Type, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* Same expansion as _PS_CONST_TYPE. */
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* _epi32_<Name>: four ints, all equal to Val. */
#define _EPI32_CONST(Name, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const int _epi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* _pi32_<Name>: four ints, all equal to Val. */
#define _PI32_CONST(Name, Val) \
MM_POWF_C_LINKAGE ALIGN16_BEG const int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

/* General-purpose values and IEEE-754 single-precision bit patterns. */
_PS_EXTERN_CONST(am_1, 1.0f);
_PS_EXTERN_CONST(am_0p5, 0.5f);
_PS_EXTERN_CONST_TYPE(am_min_norm_pos, int, 0x00800000);  /* smallest positive normal float */
_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int, ~0x7f800000); /* every bit except the exponent field */

_EPI32_CONST(1, 1);
_EPI32_CONST(0x7f, 0x7f); /* exponent bias (127) */
_PI32_CONST(0x7f, 0x7f);  /* exponent bias (127) */

/* Numerator (p) and denominator (q) coefficients of the log2 stage. */
_PS_CONST(log_p0, -7.89580278884799154124e-1f);
_PS_CONST(log_p1, 1.63866645699558079767e1f);
_PS_CONST(log_p2, -6.41409952958715622951e1f);

_PS_CONST(log_q0, -3.56722798256324312549e1f);
_PS_CONST(log_q1, 3.12093766372244180303e2f);
_PS_CONST(log_q2, -7.69691943550460008604e2f);

_PS_CONST(log2_c0, 1.44269504088896340735992f); /* log2(e) */

/* Clamp range applied to y * log2(x) before the exp2 stage. */
_PS_CONST(exp2_hi, 127.4999961853f);
_PS_CONST(exp2_lo, -127.4999961853f);

/* Numerator (p) and denominator (q) coefficients of the exp2 stage. */
_PS_CONST(exp2_p0, 2.30933477057345225087e-2f);
_PS_CONST(exp2_p1, 2.02020656693165307700e1f);
_PS_CONST(exp2_p2, 1.51390680115615096133e3f);

_PS_CONST(exp2_q0, 2.33184211722314911771e2f);
_PS_CONST(exp2_q1, 4.36821166879210612817e3f);

/*
 * Element-wise approximation of x raised to the power y, implemented in
 * mm_powf.s.  The labels below are the ones used in that file:
 *   mm_powf_eps - sse2 packed
 *   mm_powf_ess - sse2 unpacked
 *   mm_powf_ps  - sse packed
 *   mm_powf_ss  - sse unpacked
 */
MM_POWF_C_LINKAGE __m128 mm_powf_eps(__m128 x, __m128 y);
MM_POWF_C_LINKAGE __m128 mm_powf_ess(__m128 x, __m128 y);
MM_POWF_C_LINKAGE __m128 mm_powf_ps(__m128 x, __m128 y);
MM_POWF_C_LINKAGE __m128 mm_powf_ss(__m128 x, __m128 y);

#endif // __MM_POWF_H

  mm_powf.s
extern _ps_am_inv_mant_mask
extern _ps_am_min_norm_pos
extern _ps_am_1
extern _ps_am_0p5
extern _pi32_0x7f
extern _epi32_0x7f
extern _epi32_1
extern _ps_log_p0
extern _ps_log_p1
extern _ps_log_p2
extern _ps_log_q0
extern _ps_log_q1
extern _ps_log_q2
extern _ps_log2_c0
extern _ps_exp2_hi
extern _ps_exp2_lo
extern _ps_exp2_p0
extern _ps_exp2_p1
extern _ps_exp2_p2
extern _ps_exp2_q0
extern _ps_exp2_q1

;-----------------------------------------------------------------------
; __m128 mm_powf_eps(__m128 x, __m128 y);
; SSE2, packed: approximates x^y in every lane as 2^(y * log2(x)).
;
; In:    xmm0 = x, xmm1 = y (the code reads both straight from registers)
; Out:   xmm0 = result; lanes for which (0 < x) is false are forced to 0
; Clobb: xmm0, xmm2-xmm7, flags (xmm1 is only read)
; Stack: one 16-byte aligned spill slot at [esp], allocated inside the
;        frame.  Nothing is stored below esp: 32-bit code has no red zone,
;        so data below the stack pointer may be overwritten asynchronously
;        (e.g. by a signal handler).
;-----------------------------------------------------------------------
global mm_powf_eps:function ; sse2 packed
mm_powf_eps:
        push            ebp
        mov             ebp, esp
        and             esp, 0FFFFFFF0h         ; movaps needs a 16-byte aligned slot
        sub             esp, 16                 ; [esp] = saved (0 < x) lane mask

        ; ---- split x into exponent e and mantissa m in [1, 2) ----
        xorps           xmm5, xmm5
        cmpltps         xmm5, xmm0              ; xmm5 = (0 < x) mask
        maxps           xmm0, [_ps_am_min_norm_pos]
        movaps          xmm7, [_ps_am_1]
        movaps          xmm3, xmm0              ; keep the raw bits for the exponent

        andps           xmm0, [_ps_am_inv_mant_mask]
        orps            xmm0, xmm7              ; xmm0 = m (exponent field replaced by that of 1.0)

        movaps          [esp], xmm5             ; spill the mask, xmm5 is reused below

        ; ---- z = 2 * (m - 1) / (m + 1), w = z * z ----
        movaps          xmm4, xmm0
        subps           xmm0, xmm7
        addps           xmm4, xmm7
        psrld           xmm3, 23
        rcpps           xmm4, xmm4
        mulps           xmm0, xmm4
        psubd           xmm3, [_epi32_0x7f]     ; xmm3 = e (unbiased exponent, integer)
        addps           xmm0, xmm0

        movaps          xmm2, xmm0              ; xmm2 = z
        mulps           xmm0, xmm0              ; xmm0 = w

        ; ---- P(w) = (p0*w + p1)*w + p2, Q(w) = (q0*w + q1)*w + q2 ----
        movaps          xmm4, [_ps_log_p0]
        movaps          xmm6, [_ps_log_q0]

        mulps           xmm4, xmm0
        movaps          xmm5, [_ps_log_p1]
        mulps           xmm6, xmm0
        movaps          xmm7, [_ps_log_q1]

        addps           xmm4, xmm5
        addps           xmm6, xmm7

        movaps          xmm5, [_ps_log_p2]
        mulps           xmm4, xmm0
        movaps          xmm7, [_ps_log_q2]
        mulps           xmm6, xmm0

        addps           xmm4, xmm5
        movaps          xmm5, [_ps_log2_c0]
        addps           xmm6, xmm7
        cvtdq2ps        xmm7, xmm3              ; xmm7 = (float)e

        ; ---- log2(x) ~= e + log2(e_const) * (z + z*w*P/Q) ----
        mulps           xmm0, xmm4
        rcpps           xmm6, xmm6

        mulps           xmm0, xmm6
        movaps          xmm4, [_ps_exp2_hi]
        mulps           xmm0, xmm2
        movaps          xmm6, [_ps_exp2_lo]
        mulps           xmm2, xmm5
        mulps           xmm0, xmm5
        addps           xmm2, xmm7
        movaps          xmm3, [_ps_am_0p5]
        addps           xmm0, xmm2
        xorps           xmm2, xmm2

        ; ---- t = y * log2(x), clamped to [exp2_lo, exp2_hi] ----
        mulps           xmm0, xmm1

        minps           xmm0, xmm4
        movaps          xmm4, [_ps_exp2_p0]
        maxps           xmm0, xmm6
        movaps          xmm6, [_ps_exp2_q0]

        ; ---- n = trunc(t + 0.5), minus 1 where (t + 0.5) is not > 0 ----
        addps           xmm3, xmm0

        cmpnltps        xmm2, xmm3              ; mask: !(0 < t + 0.5)
        pand            xmm2, [_epi32_1]

        cvttps2dq       xmm3, xmm3

        psubd           xmm3, xmm2              ; xmm3 = n
        movaps          xmm5, [_ps_exp2_p1]

        cvtdq2ps        xmm2, xmm3
        movaps          xmm7, [_ps_exp2_q1]

        subps           xmm0, xmm2              ; xmm0 = f = t - n

        movaps          xmm2, xmm0
        mulps           xmm0, xmm0              ; xmm0 = f * f

        paddd           xmm3, [_epi32_0x7f]     ; re-bias n

        ; ---- 2^f ~= 1 + 2*f*Pe / (Qe - f*Pe) ----
        mulps           xmm4, xmm0
        mulps           xmm6, xmm0
        addps           xmm4, xmm5
        addps           xmm6, xmm7

        mulps           xmm4, xmm0
        movaps          xmm5, [esp]             ; reload the (0 < x) mask
        pslld           xmm3, 23                ; xmm3 = bit pattern of 2^n
        addps           xmm4, [_ps_exp2_p2]

        mulps           xmm2, xmm4

        movaps          xmm0, [_ps_am_1]
        subps           xmm6, xmm2
        andps           xmm3, xmm5              ; zero 2^n where (0 < x) is false
        rcpps           xmm6, xmm6
        mulps           xmm2, xmm6
        addps           xmm2, xmm2
        addps           xmm0, xmm2

        mulps           xmm0, xmm3              ; result = 2^n * 2^f

        mov             esp, ebp
        pop             ebp
        ret

;-----------------------------------------------------------------------
; __m128 mm_powf_ess(__m128 x, __m128 y);
; SSE2 entry point labelled "unpacked".  The body consists of packed
; instructions throughout, so all four lanes are computed: each lane
; approximates x^y as 2^(y * log2(x)).
;
; In:    xmm0 = x, xmm1 = y (the code reads both straight from registers)
; Out:   xmm0 = result; lanes for which (0 < x) is false are forced to 0
; Clobb: xmm0, xmm2-xmm7, flags (xmm1 is only read)
; Stack: one 16-byte aligned spill slot at [esp], allocated inside the
;        frame.  Nothing is stored below esp: 32-bit code has no red zone,
;        so data below the stack pointer may be overwritten asynchronously
;        (e.g. by a signal handler).
;-----------------------------------------------------------------------
global mm_powf_ess:function ; sse2 unpacked
mm_powf_ess:
        push            ebp
        mov             ebp, esp
        and             esp, 0FFFFFFF0h         ; movaps needs a 16-byte aligned slot
        sub             esp, 16                 ; [esp] = saved (0 < x) lane mask

        ; ---- split x into exponent e and mantissa m in [1, 2) ----
        xorps           xmm5, xmm5
        cmpltps         xmm5, xmm0              ; xmm5 = (0 < x) mask
        maxps           xmm0, [_ps_am_min_norm_pos]
        movaps          xmm7, [_ps_am_1]
        movaps          xmm3, xmm0              ; keep the raw bits for the exponent

        andps           xmm0, [_ps_am_inv_mant_mask]
        orps            xmm0, xmm7              ; xmm0 = m (exponent field replaced by that of 1.0)

        movaps          [esp], xmm5             ; spill the mask, xmm5 is reused below

        ; ---- z = 2 * (m - 1) / (m + 1), w = z * z ----
        movaps          xmm4, xmm0
        subps           xmm0, xmm7
        addps           xmm4, xmm7
        psrld           xmm3, 23
        rcpps           xmm4, xmm4
        mulps           xmm0, xmm4
        psubd           xmm3, [_epi32_0x7f]     ; xmm3 = e (unbiased exponent, integer)
        addps           xmm0, xmm0

        movaps          xmm2, xmm0              ; xmm2 = z
        mulps           xmm0, xmm0              ; xmm0 = w

        ; ---- P(w) = (p0*w + p1)*w + p2, Q(w) = (q0*w + q1)*w + q2 ----
        movaps          xmm4, [_ps_log_p0]
        movaps          xmm6, [_ps_log_q0]

        mulps           xmm4, xmm0
        movaps          xmm5, [_ps_log_p1]
        mulps           xmm6, xmm0
        movaps          xmm7, [_ps_log_q1]

        addps           xmm4, xmm5
        addps           xmm6, xmm7

        movaps          xmm5, [_ps_log_p2]
        mulps           xmm4, xmm0
        movaps          xmm7, [_ps_log_q2]
        mulps           xmm6, xmm0

        addps           xmm4, xmm5
        movaps          xmm5, [_ps_log2_c0]
        addps           xmm6, xmm7
        cvtdq2ps        xmm7, xmm3              ; xmm7 = (float)e

        ; ---- log2(x) ~= e + log2(e_const) * (z + z*w*P/Q) ----
        mulps           xmm0, xmm4
        rcpps           xmm6, xmm6

        mulps           xmm0, xmm6
        movaps          xmm4, [_ps_exp2_hi]
        mulps           xmm0, xmm2
        movaps          xmm6, [_ps_exp2_lo]
        mulps           xmm2, xmm5
        mulps           xmm0, xmm5
        addps           xmm2, xmm7
        movaps          xmm3, [_ps_am_0p5]
        addps           xmm0, xmm2
        xorps           xmm2, xmm2

        ; ---- t = y * log2(x), clamped to [exp2_lo, exp2_hi] ----
        mulps           xmm0, xmm1

        minps           xmm0, xmm4
        movaps          xmm4, [_ps_exp2_p0]
        maxps           xmm0, xmm6
        movaps          xmm6, [_ps_exp2_q0]

        ; ---- n = trunc(t + 0.5), minus 1 where (t + 0.5) is not > 0 ----
        addps           xmm3, xmm0

        cmpnltps        xmm2, xmm3              ; mask: !(0 < t + 0.5)
        pand            xmm2, [_epi32_1]

        cvttps2dq       xmm3, xmm3

        psubd           xmm3, xmm2              ; xmm3 = n
        movaps          xmm5, [_ps_exp2_p1]

        cvtdq2ps        xmm2, xmm3
        movaps          xmm7, [_ps_exp2_q1]

        subps           xmm0, xmm2              ; xmm0 = f = t - n

        movaps          xmm2, xmm0
        mulps           xmm0, xmm0              ; xmm0 = f * f

        paddd           xmm3, [_epi32_0x7f]     ; re-bias n

        ; ---- 2^f ~= 1 + 2*f*Pe / (Qe - f*Pe) ----
        mulps           xmm4, xmm0
        mulps           xmm6, xmm0
        addps           xmm4, xmm5
        addps           xmm6, xmm7

        mulps           xmm4, xmm0
        movaps          xmm5, [esp]             ; reload the (0 < x) mask
        pslld           xmm3, 23                ; xmm3 = bit pattern of 2^n
        addps           xmm4, [_ps_exp2_p2]

        mulps           xmm2, xmm4

        movaps          xmm0, [_ps_am_1]
        subps           xmm6, xmm2
        andps           xmm3, xmm5              ; zero 2^n where (0 < x) is false
        rcpps           xmm6, xmm6
        mulps           xmm2, xmm6
        addps           xmm2, xmm2
        addps           xmm0, xmm2

        mulps           xmm0, xmm3              ; result = 2^n * 2^f

        mov             esp, ebp
        pop             ebp
        ret

;-----------------------------------------------------------------------
; __m128 mm_powf_ps(__m128 x, __m128 y);
; SSE (no SSE2), packed: approximates x^y in every lane as
; 2^(y * log2(x)).  Integer work on the exponents is done two lanes at a
; time in MMX registers.
;
; In:    xmm0 = x, xmm1 = y (the code reads both straight from registers)
; Out:   xmm0 = result; lanes for which (0 < x) is false are forced to 0
; Clobb: xmm0, xmm2-xmm7, mm0-mm3, mm5, mm7, flags (xmm1 is only read)
; Stack: two 16-byte aligned slots inside the frame:
;          [esp + 16]  bits of x on the way in, bits of 2^n on the way out
;          [esp]       (0 < x) lane mask
;        Nothing is stored below esp: 32-bit code has no red zone, so data
;        below the stack pointer may be overwritten asynchronously.
; Note:  emms is executed before returning.  MMX registers alias the x87
;        register file; without emms the x87 tag word stays marked in-use
;        and later x87 code in the caller would misbehave.
;-----------------------------------------------------------------------
global mm_powf_ps:function ; sse packed
mm_powf_ps:
        push            ebp
        mov             ebp, esp
        and             esp, 0FFFFFFF0h         ; movaps needs 16-byte aligned slots
        sub             esp, 32

        ; ---- split x into exponent e and mantissa m in [1, 2) ----
        xorps           xmm4, xmm4
        cmpltps         xmm4, xmm0              ; xmm4 = (0 < x) mask
        maxps           xmm0, [_ps_am_min_norm_pos]
        movaps          xmm7, [_ps_am_inv_mant_mask]
        movaps          xmm3, [_ps_am_1]
        movaps          [esp + 16], xmm0        ; raw bits of x, re-read through MMX

        andps           xmm0, xmm7
        orps            xmm0, xmm3              ; xmm0 = m
        movaps          xmm7, xmm0

        ; ---- z = 2 * (m - 1) / (m + 1) ----
        subps           xmm0, xmm3
        addps           xmm7, xmm3
        movq            mm0, [esp + 16]         ; lanes 0-1 of x
        rcpps           xmm7, xmm7
        mulps           xmm0, xmm7
        movq            mm1, [esp + 16 + 8]     ; lanes 2-3 of x
        addps           xmm0, xmm0              ; xmm0 = z

        movq            mm7, [_pi32_0x7f]
        psrld           mm0, 23
        psrld           mm1, 23
        movaps          [esp], xmm4             ; spill the mask, xmm4 is reused below

        movaps          xmm2, xmm0
        psubd           mm0, mm7                ; mm0 = e for lanes 0-1
        mulps           xmm2, xmm2              ; xmm2 = w = z * z
        psubd           mm1, mm7                ; mm1 = e for lanes 2-3

        ; ---- P(w) = (p0*w + p1)*w + p2, Q(w) = (q0*w + q1)*w + q2 ----
        movaps          xmm4, [_ps_log_p0]
        movaps          xmm6, [_ps_log_q0]

        mulps           xmm4, xmm2
        movaps          xmm5, [_ps_log_p1]
        mulps           xmm6, xmm2
        movaps          xmm7, [_ps_log_q1]

        addps           xmm4, xmm5
        addps           xmm6, xmm7

        movaps          xmm5, [_ps_log_p2]
        mulps           xmm4, xmm2
        cvtpi2ps        xmm3, mm1               ; (float)e of lanes 2-3 into the low half
        movaps          xmm7, [_ps_log_q2]
        mulps           xmm6, xmm2

        movlhps         xmm3, xmm3              ; ...and moved up to the high half
        addps           xmm4, xmm5
        addps           xmm6, xmm7

        movaps          xmm5, [_ps_log2_c0]
        mulps           xmm4, xmm2
        cvtpi2ps        xmm3, mm0               ; xmm3 = (float)e for all four lanes
        rcpps           xmm6, xmm6

        ; ---- t = y * (e + log2(e_const) * (z + z*w*P/Q)) ----
        mulps           xmm5, xmm1
        mulps           xmm4, xmm6
        movaps          xmm6, [_ps_exp2_hi]
        mulps           xmm4, xmm0
        addps           xmm0, xmm4
        movaps          xmm4, [_ps_exp2_lo]
        mulps           xmm3, xmm1
        mulps           xmm0, xmm5
        movaps          xmm5, [_ps_am_1]
        xorps           xmm7, xmm7

        addps           xmm0, xmm3
        movaps          xmm3, [_ps_am_0p5]

        ; ---- clamp t to [exp2_lo, exp2_hi] ----
        minps           xmm0, xmm6
        maxps           xmm0, xmm4

        ; ---- n = trunc(t + 0.5), minus 1 where (t + 0.5) < 0 ----
        addps           xmm3, xmm0

        movaps          xmm2, xmm3

        cvttps2pi       mm0, xmm3
        cmpltps         xmm2, xmm7
        movhlps         xmm3, xmm3
        andps           xmm2, xmm5              ; 1.0 where (t + 0.5) < 0, else 0.0
        cvttps2pi       mm1, xmm3
        movq            mm5, [_pi32_0x7f]

        cvtps2pi        mm2, xmm2
        movhlps         xmm2, xmm2
        cvtps2pi        mm3, xmm2

        psubd           mm0, mm2                ; mm0 = n for lanes 0-1
        psubd           mm1, mm3                ; mm1 = n for lanes 2-3

        cvtpi2ps        xmm3, mm1
        movlhps         xmm3, xmm3
        paddd           mm1, mm5                ; re-bias n
        cvtpi2ps        xmm3, mm0               ; xmm3 = (float)n for all four lanes
        paddd           mm0, mm5                ; re-bias n

        subps           xmm0, xmm3              ; xmm0 = f = t - n

        movaps          xmm2, xmm0
        mulps           xmm2, xmm2              ; xmm2 = f * f

        ; ---- 2^f ~= 1 + 2*f*Pe / (Qe - f*Pe) ----
        movaps          xmm6, [_ps_exp2_q0]
        movaps          xmm4, [_ps_exp2_p0]

        mulps           xmm6, xmm2
        movaps          xmm7, [_ps_exp2_q1]
        mulps           xmm4, xmm2
        movaps          xmm5, [_ps_exp2_p1]

        addps           xmm6, xmm7
        pslld           mm0, 23                 ; bit pattern of 2^n, lanes 0-1
        addps           xmm4, xmm5

        movaps          xmm5, [_ps_exp2_p2]
        mulps           xmm4, xmm2
        pslld           mm1, 23                 ; bit pattern of 2^n, lanes 2-3
        movaps          xmm3, [esp]             ; reload the (0 < x) mask

        addps           xmm4, xmm5
        movq            [esp + 16], mm0

        mulps           xmm4, xmm0
        movq            [esp + 16 + 8], mm1

        subps           xmm6, xmm4
        movaps          xmm7, [_ps_am_1]
        rcpps           xmm6, xmm6
        mulps           xmm4, xmm6
        movaps          xmm0, [esp + 16]        ; xmm0 = 2^n for all four lanes
        addps           xmm4, xmm4
        addps           xmm4, xmm7

        mulps           xmm0, xmm4              ; result = 2^n * 2^f
        andps           xmm0, xmm3              ; zero lanes where (0 < x) is false

        emms                                    ; hand the x87/MMX state back clean

        mov             esp, ebp
        pop             ebp
        ret

;-----------------------------------------------------------------------
; __m128 mm_powf_ss(__m128 x, __m128 y);
; SSE (no SSE2), scalar: approximates x^y for lane 0 only, computed as
; 2^(y * log2(x)).  Integer work on the exponent is done in eax/ecx/edx.
;
; In:    xmm0 = x, xmm1 = y (the code reads both straight from registers)
; Out:   xmm0 lane 0 = result.  When comiss reports x <= 0 the whole of
;        xmm0 is returned as zero.
; Clobb: eax, ecx, edx, xmm0-xmm7, flags (xmm1 lane 0 is overwritten)
; Stack: one scratch dword at [esp], allocated inside the frame and used to
;        move bit patterns between xmm0 and the integer registers.  Nothing
;        is stored below esp: 32-bit code has no red zone, so data below
;        the stack pointer may be overwritten asynchronously.
;-----------------------------------------------------------------------
global mm_powf_ss:function ; sse unpacked
mm_powf_ss:
        sub             esp, 4                  ; [esp] = scratch dword

        xorps           xmm7, xmm7
        comiss          xmm7, xmm0              ; CF = 1 when 0 < x (also when unordered)
        movss           xmm7, [_ps_am_inv_mant_mask]
        maxss           xmm0, [_ps_am_min_norm_pos]
        jnc             .zerobase               ; movss/maxss leave the flags from comiss intact
        movss           xmm3, [_ps_am_1]
        movss           [esp], xmm0             ; raw bits of x, re-read into edx below

        ; ---- split x into exponent e and mantissa m in [1, 2) ----
        andps           xmm0, xmm7
        orps            xmm0, xmm3              ; xmm0 = m
        movss           xmm7, xmm0

        ; ---- z = 2 * (m - 1) / (m + 1) ----
        addss           xmm7, xmm3
        subss           xmm0, xmm3
        mov             edx, [esp]
        rcpss           xmm7, xmm7
        mulss           xmm0, xmm7
        addss           xmm0, xmm0              ; xmm0 = z

        shr             edx, 23

        movss           xmm4, [_ps_log_p0]
        movss           xmm6, [_ps_log_q0]

        sub             edx, 0x7f               ; edx = e (unbiased exponent)
        movss           xmm2, xmm0
        mulss           xmm2, xmm2              ; xmm2 = w = z * z

        ; ---- P(w) = (p0*w + p1)*w + p2, Q(w) = (q0*w + q1)*w + q2 ----
        mulss           xmm4, xmm2
        movss           xmm5, [_ps_log_p1]
        mulss           xmm6, xmm2
        cvtsi2ss        xmm3, edx
        movss           xmm7, [_ps_log_q1]

        addss           xmm4, xmm5
        mulss           xmm3, xmm1              ; xmm3 = e * y
        addss           xmm6, xmm7

        movss           xmm5, [_ps_log_p2]
        mulss           xmm4, xmm2
        movss           xmm7, [_ps_log_q2]
        mulss           xmm6, xmm2

        addss           xmm4, xmm5
        mulss           xmm1, [_ps_log2_c0]     ; xmm1 = y * log2(e_const)
        addss           xmm6, xmm7

        mulss           xmm4, xmm2
        rcpss           xmm6, xmm6

        ; ---- t = y * (e + log2(e_const) * (z + z*w*P/Q)) ----
        mulss           xmm6, xmm0
        mulss           xmm4, xmm6
        movss           xmm6, [_ps_exp2_hi]
        addss           xmm0, xmm4
        movss           xmm4, [_ps_exp2_lo]
        xorps           xmm7, xmm7
        movss           xmm5, [_ps_am_0p5]
        mulss           xmm0, xmm1

        addss           xmm0, xmm3
        xor             ecx, ecx

        ; ---- clamp t to [exp2_lo, exp2_hi] ----
        minss           xmm0, xmm6
        mov             edx, 1
        maxss           xmm0, xmm4

        ; ---- n = trunc(t + 0.5), minus 1 where (t + 0.5) < 0 ----
        addss           xmm5, xmm0

        comiss          xmm5, xmm7
        cvttss2si       eax, xmm5
        cmovc           ecx, edx
        sub             eax, ecx                ; eax = n

        cvtsi2ss        xmm5, eax
        add             eax, 0x7f               ; re-bias n

        subss           xmm0, xmm5              ; xmm0 = f = t - n

        movss           xmm2, xmm0
        mulss           xmm2, xmm2              ; xmm2 = f * f

        ; ---- 2^f ~= 1 + 2*f*Pe / (Qe - f*Pe) ----
        movss           xmm6, [_ps_exp2_q0]
        movss           xmm4, [_ps_exp2_p0]

        mulss           xmm6, xmm2
        movss           xmm7, [_ps_exp2_q1]
        mulss           xmm4, xmm2
        movss           xmm5, [_ps_exp2_p1]

        shl             eax, 23                 ; eax = bit pattern of 2^n
        addss           xmm6, xmm7
        addss           xmm4, xmm5

        movss           xmm5, [_ps_exp2_p2]
        mulss           xmm4, xmm2

        addss           xmm4, xmm5

        mulss           xmm4, xmm0

        mov             [esp], eax
        subss           xmm6, xmm4
        movss           xmm7, [_ps_am_1]
        rcpss           xmm6, xmm6
        mulss           xmm4, xmm6
        movss           xmm0, [esp]             ; xmm0 = 2^n
        addss           xmm4, xmm4
        addss           xmm4, xmm7

        mulss           xmm0, xmm4              ; result = 2^n * 2^f

        jmp             .quit

.zerobase:
        xorps           xmm0, xmm0              ; x <= 0: return 0

.quit:
        add             esp, 4
        ret
sse fast pow linux
 
Подождите ...
Wait...
Пока на собственное сообщение не было ответов, его можно удалить.