Diffstat (limited to 'distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm')
-rw-r--r-- | distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm | 405
1 file changed, 405 insertions, 0 deletions
diff --git a/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm b/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm
new file mode 100644
index 0000000..d2d31ec
--- /dev/null
+++ b/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm
@@ -0,0 +1,405 @@
+;
+; pII-optimised MMX format converters for HERMES
+; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
+; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
+; This source code is licensed under the GNU LGPL
+;
+; Please refer to the file COPYING.LIB contained in the distribution for
+; licensing conditions
+;
+; COPYRIGHT NOTICE
+;
+; This file partly contains code that is (c) Intel Corporation, specifically
+; the mode detection routine, and the converter to 15 bit (8 pixel
+; conversion routine from the mmx programming tutorial pages).
+;
+;
+; These routines aren't exactly pII optimised - it's just that as they
+; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
+; optimise them for p5 MMXs..
+
+BITS 32
+
+%include "common.inc"
+
+SDL_FUNC _ConvertMMXpII32_24RGB888
+SDL_FUNC _ConvertMMXpII32_16RGB565
+SDL_FUNC _ConvertMMXpII32_16BGR565
+SDL_FUNC _ConvertMMXpII32_16RGB555
+SDL_FUNC _ConvertMMXpII32_16BGR555
+
+;; Macros for conversion routines
+
+%macro _push_immq_mask 1
+        push dword %1
+        push dword %1
+%endmacro
+
+%macro load_immq 2
+        _push_immq_mask %2
+        movq %1, [esp]
+%endmacro
+
+%macro pand_immq 2
+        _push_immq_mask %2
+        pand %1, [esp]
+%endmacro
+
+%define CLEANUP_IMMQ_LOADS(num) \
+        add esp, byte 8 * num
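+
+; (The macros above build a 64-bit constant in place: pushing the same
+; 32-bit immediate twice leaves the value replicated in both halves of
+; a quadword at [esp], which movq/pand can then use directly, so no
+; data section is needed. Each use leaves 8 bytes on the stack, and
+; CLEANUP_IMMQ_LOADS(num) pops them again for 'num' such loads.)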
+
+%define mmx32_rgb888_mask 00ffffffh
+
+%define mmx32_rgb565_b 000000f8h
+%define mmx32_rgb565_g 0000fc00h
+%define mmx32_rgb565_r 00f80000h
+
+%define mmx32_rgb555_rb 00f800f8h
+%define mmx32_rgb555_g 0000f800h
+%define mmx32_rgb555_mul 20000008h
+%define mmx32_bgr555_mul 00082000h
+
+SECTION .text
+
+_ConvertMMXpII32_24RGB888:
+
+        ; set up mm6 as the mask, mm7 as zero
+        load_immq mm6, mmx32_rgb888_mask
+        CLEANUP_IMMQ_LOADS(1)
+        pxor mm7, mm7
+
+        mov edx, ecx              ; save ecx
+        and ecx, 0fffffffch       ; clear lower two bits
+        jnz .L1
+        jmp .L2
+
+.L1:
+
+        movq mm0, [esi]           ; A R G B a r g b
+        pand mm0, mm6             ; 0 R G B 0 r g b
+        movq mm1, [esi+8]         ; A R G B a r g b
+        pand mm1, mm6             ; 0 R G B 0 r g b
+
+        movq mm2, mm0             ; 0 R G B 0 r g b
+        punpckhdq mm2, mm7        ; 0 0 0 0 0 R G B
+        punpckldq mm0, mm7        ; 0 0 0 0 0 r g b
+        psllq mm2, 24             ; 0 0 R G B 0 0 0
+        por mm0, mm2              ; 0 0 R G B r g b
+
+        movq mm3, mm1             ; 0 R G B 0 r g b
+        psllq mm3, 48             ; g b 0 0 0 0 0 0
+        por mm0, mm3              ; g b R G B r g b
+
+        movq mm4, mm1             ; 0 R G B 0 r g b
+        punpckhdq mm4, mm7        ; 0 0 0 0 0 R G B
+        punpckldq mm1, mm7        ; 0 0 0 0 0 r g b
+        psrlq mm1, 16             ; 0 0 0 R G B 0 r
+        psllq mm4, 8              ; 0 0 0 0 R G B 0
+        por mm1, mm4              ; 0 0 0 0 R G B r
+
+        movq [edi], mm0
+        add esi, BYTE 16
+        movd [edi+8], mm1
+        add edi, BYTE 12
+        sub ecx, BYTE 4
+        jnz .L1
+
+.L2:
+        mov ecx, edx
+        and ecx, BYTE 3
+        jz .L4
+.L3:
+        mov al, [esi]
+        mov bl, [esi+1]
+        mov dl, [esi+2]
+        mov [edi], al
+        mov [edi+1], bl
+        mov [edi+2], dl
+        add esi, BYTE 4
+        add edi, BYTE 3
+        dec ecx
+        jnz .L3
+.L4:
+        retn
+
+
+
+_ConvertMMXpII32_16RGB565:
+
+        ; set up masks
+        load_immq mm5, mmx32_rgb565_b
+        load_immq mm6, mmx32_rgb565_g
+        load_immq mm7, mmx32_rgb565_r
+        CLEANUP_IMMQ_LOADS(3)
+
+        mov edx, ecx
+        shr ecx, 2
+        jnz .L1
+        jmp .L2                   ; not necessary at the moment, but doesn't hurt (much)
+
+.L1:
+        movq mm0, [esi]           ; argb
+        movq mm1, mm0             ; argb
+        pand mm0, mm6             ; 00g0
+        movq mm3, mm1             ; argb
+        pand mm1, mm5             ; 000b
+        pand mm3, mm7             ; 0r00
+        pslld mm1, 2              ; 0 0 000000bb bbb00000
+        por mm0, mm1              ; 0 0 ggggggbb bbb00000
+        psrld mm0, 5              ; 0 0 00000ggg gggbbbbb
+
+        movq mm4, [esi+8]         ; argb
+        movq mm2, mm4             ; argb
+        pand mm4, mm6             ; 00g0
+        movq mm1, mm2             ; argb
+        pand mm2, mm5             ; 000b
+        pand mm1, mm7             ; 0r00
+        pslld mm2, 2              ; 0 0 000000bb bbb00000
+        por mm4, mm2              ; 0 0 ggggggbb bbb00000
+        psrld mm4, 5              ; 0 0 00000ggg gggbbbbb
+
+        packuswb mm3, mm1         ; R 0 r 0
+        packssdw mm0, mm4         ; as above.. ish
+        por mm0, mm3              ; done.
+        movq [edi], mm0
+
+        add esi, 16
+        add edi, 8
+        dec ecx
+        jnz .L1
+
+.L2:
+        mov ecx, edx
+        and ecx, BYTE 3
+        jz .L4
+.L3:
+        mov al, [esi]
+        mov bh, [esi+1]
+        mov ah, [esi+2]
+        shr al, 3
+        and eax, 0F81Fh           ; BYTE?
+        shr ebx, 5
+        and ebx, 07E0h            ; BYTE?
+        add eax, ebx
+        mov [edi], al
+        mov [edi+1], ah
+        add esi, BYTE 4
+        add edi, BYTE 2
+        dec ecx
+        jnz .L3
+
+.L4:
+        retn
+
+
+_ConvertMMXpII32_16BGR565:
+
+        load_immq mm5, mmx32_rgb565_r
+        load_immq mm6, mmx32_rgb565_g
+        load_immq mm7, mmx32_rgb565_b
+        CLEANUP_IMMQ_LOADS(3)
+
+        mov edx, ecx
+        shr ecx, 2
+        jnz .L1
+        jmp .L2
+
+.L1:
+        movq mm0, [esi]           ; a r g b
+        movq mm1, mm0             ; a r g b
+        pand mm0, mm6             ; 0 0 g 0
+        movq mm3, mm1             ; a r g b
+        pand mm1, mm5             ; 0 r 0 0
+        pand mm3, mm7             ; 0 0 0 b
+
+        psllq mm3, 16             ; 0 b 0 0
+        psrld mm1, 14             ; 0 0 000000rr rrr00000
+        por mm0, mm1              ; 0 0 ggggggrr rrr00000
+        psrld mm0, 5              ; 0 0 00000ggg gggrrrrr
+
+        movq mm4, [esi+8]         ; a r g b
+        movq mm2, mm4             ; a r g b
+        pand mm4, mm6             ; 0 0 g 0
+        movq mm1, mm2             ; a r g b
+        pand mm2, mm5             ; 0 r 0 0
+        pand mm1, mm7             ; 0 0 0 b
+
+        psllq mm1, 16             ; 0 b 0 0
+        psrld mm2, 14             ; 0 0 000000rr rrr00000
+        por mm4, mm2              ; 0 0 ggggggrr rrr00000
+        psrld mm4, 5              ; 0 0 00000ggg gggrrrrr
+
+        packuswb mm3, mm1         ; BBBBB000 00000000 bbbbb000 00000000
+        packssdw mm0, mm4         ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
+        por mm0, mm3              ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
+        movq [edi], mm0
+
+        add esi, BYTE 16
+        add edi, BYTE 8
+        dec ecx
+        jnz .L1
+
+.L2:
+        and edx, BYTE 3
+        jz .L4
+.L3:
+        mov al, [esi+2]
+        mov bh, [esi+1]
+        mov ah, [esi]
+        shr al, 3
+        and eax, 0F81Fh           ; BYTE ?
+        shr ebx, 5
+        and ebx, 07E0h            ; BYTE ?
+        add eax, ebx
+        mov [edi], al
+        mov [edi+1], ah
+        add esi, BYTE 4
+        add edi, BYTE 2
+        dec edx
+        jnz .L3
+
+.L4:
+        retn
+
+_ConvertMMXpII32_16BGR555:
+
+        ; the 16BGR555 converter is identical to the RGB555 one,
+        ; except it uses a different multiplier for the pmaddwd
+        ; instruction. cool huh.
+
+        load_immq mm7, mmx32_bgr555_mul
+        jmp _convert_bgr555_cheat
+
+; This is the same as the Intel version.. they obviously went to
+; much more trouble to expand/coil the loop than I did, so theirs
+; would almost certainly be faster, even if only a little.
+; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
+; (I think) a more accurate name..
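+;
+; (How the multiplier works: pand_immq with mmx32_rgb555_rb keeps red
+; (bits 19-23) and blue (bits 3-7) of each pixel; pmaddwd then computes
+; highword*2000h + lowword*0008h per dword, moving red to bits 16-20
+; and blue to bits 6-10 in a single instruction. Green (bits 11-15) is
+; or-ed in, and psrld by 6 leaves 0RRRRRGGGGGBBBBB. mmx32_bgr555_mul
+; just swaps the two word multipliers, so blue lands in bits 10-14 and
+; red in bits 0-4 instead.)
+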
+_ConvertMMXpII32_16RGB555:
+
+        load_immq mm7, mmx32_rgb555_mul
+_convert_bgr555_cheat:
+        load_immq mm6, mmx32_rgb555_g
+        CLEANUP_IMMQ_LOADS(2)
+
+        mov edx, ecx              ; save ecx
+
+        and ecx, DWORD 0fffffff8h ; clear lower three bits
+        jnz .L_OK
+        jmp near .L2
+
+.L_OK:
+
+        movq mm2, [esi+8]
+
+        movq mm0, [esi]
+        movq mm3, mm2
+
+        pand_immq mm3, mmx32_rgb555_rb
+        movq mm1, mm0
+
+        pand_immq mm1, mmx32_rgb555_rb
+        pmaddwd mm3, mm7
+
+        CLEANUP_IMMQ_LOADS(2)
+
+        pmaddwd mm1, mm7
+        pand mm2, mm6
+
+.L1:
+        movq mm4, [esi+24]
+        pand mm0, mm6
+
+        movq mm5, [esi+16]
+        por mm3, mm2
+
+        psrld mm3, 6
+        por mm1, mm0
+
+        movq mm0, mm4
+        psrld mm1, 6
+
+        pand_immq mm0, mmx32_rgb555_rb
+        packssdw mm1, mm3
+
+        movq mm3, mm5
+        pmaddwd mm0, mm7
+
+        pand_immq mm3, mmx32_rgb555_rb
+        pand mm4, mm6
+
+        movq [edi], mm1
+        pmaddwd mm3, mm7
+
+        add esi, BYTE 32
+        por mm4, mm0
+
+        pand mm5, mm6
+        psrld mm4, 6
+
+        movq mm2, [esi+8]
+        por mm5, mm3
+
+        movq mm0, [esi]
+        psrld mm5, 6
+
+        movq mm3, mm2
+        movq mm1, mm0
+
+        pand_immq mm3, mmx32_rgb555_rb
+        packssdw mm5, mm4
+
+        pand_immq mm1, mmx32_rgb555_rb
+        pand mm2, mm6
+
+        CLEANUP_IMMQ_LOADS(4)
+
+        movq [edi+8], mm5
+        pmaddwd mm3, mm7
+
+        pmaddwd mm1, mm7
+        add edi, BYTE 16
+
+        sub ecx, BYTE 8
+        jz .L2
+        jmp .L1
+
+.L2:
+        mov ecx, edx
+
+        and ecx, BYTE 7
+        jz .L4
+
+.L3:
+        mov ebx, [esi]
+        add esi, BYTE 4
+
+        mov eax, ebx
+        mov edx, ebx
+
+        shr eax, 3
+        shr edx, 6
+
+        and eax, BYTE 0000000000011111b
+        and edx, 0000001111100000b
+
+        shr ebx, 9
+
+        or eax, edx
+
+        and ebx, 0111110000000000b
+
+        or eax, ebx
+
+        mov [edi], ax
+        add edi, BYTE 2
+
+        dec ecx
+        jnz .L3
+
+.L4:
+        retn
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif