diff options
Diffstat (limited to 'distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm')
-rw-r--r-- | distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm | 405 |
1 files changed, 0 insertions, 405 deletions
diff --git a/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm b/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm
deleted file mode 100644
index d2d31ec..0000000
--- a/distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm
+++ /dev/null
@@ -1,405 +0,0 @@
-;
-; pII-optimised MMX format converters for HERMES
-; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
-; This source code is licensed under the GNU LGPL
-;
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions
-;
-; COPYRIGHT NOTICE
-;
-; This file partly contains code that is (c) Intel Corporation, specifically
-; the mode detection routine, and the converter to 15 bit (8 pixel
-; conversion routine from the mmx programming tutorial pages).
-;
-;
-; These routines aren't exactly pII optimised - it's just that as they
-; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
-; optimise them for p5 MMXs..
-;
-; Syntax: NASM, 32-bit code.
-;
-; Register roles, as used by every routine in this file:
-;   esi = read pointer, 4 bytes per source pixel (bytes in memory: b, g, r, a)
-;   edi = write pointer (3 or 2 bytes per destination pixel)
-;   ecx = number of pixels to convert
-; The routines return with a plain near return and do not execute EMMS.
-; NOTE(review): the caller's calling convention is not visible in this file;
-; confirm that the caller preserves ebx/edx and issues EMMS where needed.
-
-BITS 32
-
-%include "common.inc"
-
-SDL_FUNC _ConvertMMXpII32_24RGB888
-SDL_FUNC _ConvertMMXpII32_16RGB565
-SDL_FUNC _ConvertMMXpII32_16BGR565
-SDL_FUNC _ConvertMMXpII32_16RGB555
-SDL_FUNC _ConvertMMXpII32_16BGR555
-
-;; Macros for conversion routines
-
-; Push the 32-bit immediate %1 twice, leaving a 64-bit constant at [esp].
-%macro _push_immq_mask 1
-        push dword %1
-        push dword %1
-%endmacro
-
-; load_immq mmreg, imm32: mmreg = imm32 replicated into both dwords.
-; Leaves 8 bytes on the stack; pair with CLEANUP_IMMQ_LOADS.
-%macro load_immq 2
-        _push_immq_mask %2
-        movq %1, [esp]
-%endmacro
-
-; pand_immq mmreg, imm32: mmreg &= imm32 replicated into both dwords.
-; Leaves 8 bytes on the stack; pair with CLEANUP_IMMQ_LOADS.
-%macro pand_immq 2
-        _push_immq_mask %2
-        pand %1, [esp]
-%endmacro
-
-; Release the stack space of the last `num` load_immq/pand_immq uses.
-%define CLEANUP_IMMQ_LOADS(num) \
-        add esp, byte 8 * num
-
-%define mmx32_rgb888_mask 00ffffffh
-%define mmx32_rgb565_b 000000f8h
-%define mmx32_rgb565_g 0000fc00h
-%define mmx32_rgb565_r 00f80000h
-
-%define mmx32_rgb555_rb 00f800f8h
-%define mmx32_rgb555_g 0000f800h
-%define mmx32_rgb555_mul 20000008h
-%define mmx32_bgr555_mul 00082000h
-
-SECTION .text
-
-;-----------------------------------------------------------------------
-; _ConvertMMXpII32_24RGB888
-; Drops the top byte of every 4-byte source pixel and stores the
-; remaining 3 bytes, packed.  Four pixels per MMX iteration, then a
-; byte-wise loop for the remaining (count & 3) pixels.
-; In:    esi = source, edi = destination, ecx = pixel count
-; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
-;-----------------------------------------------------------------------
-_ConvertMMXpII32_24RGB888:
-
-        ; set up mm6 as the mask, mm7 as zero
-        load_immq mm6, mmx32_rgb888_mask
-        CLEANUP_IMMQ_LOADS(1)
-        pxor mm7, mm7
-
-        mov edx, ecx                    ; save ecx
-        and ecx, 0fffffffch             ; clear lower two bits
-        jnz .L1
-        jmp .L2
-
-.L1:
-
-        movq mm0, [esi]                 ; A R G B a r g b
-        pand mm0, mm6                   ; 0 R G B 0 r g b
-        movq mm1, [esi+8]               ; A R G B a r g b
-        pand mm1, mm6                   ; 0 R G B 0 r g b
-
-        movq mm2, mm0                   ; 0 R G B 0 r g b
-        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
-        psllq mm2, 24                   ; 0 0 R G B 0 0 0
-        por mm0, mm2                    ; 0 0 R G B r g b
-
-        movq mm3, mm1                   ; 0 R G B 0 r g b
-        psllq mm3, 48                   ; g b 0 0 0 0 0 0
-        por mm0, mm3                    ; g b R G B r g b
-
-        movq mm4, mm1                   ; 0 R G B 0 r g b
-        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
-        psrlq mm1, 16                   ; 0 0 0 0 0 0 0 r
-        psllq mm4, 8                    ; 0 0 0 0 R G B 0
-        por mm1, mm4                    ; 0 0 0 0 R G B r
-
-        movq [edi], mm0                 ; first 8 of the 12 output bytes
-        add esi, BYTE 16
-        movd [edi+8], mm1               ; last 4 of the 12 output bytes
-        add edi, BYTE 12
-        sub ecx, BYTE 4
-        jnz .L1
-
-.L2:
-        mov ecx, edx                    ; restore count before dl is reused below
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi]
-        mov bl, [esi+1]
-        mov dl, [esi+2]
-        mov [edi], al
-        mov [edi+1], bl
-        mov [edi+2], dl
-        add esi, BYTE 4
-        add edi, BYTE 3
-        dec ecx
-        jnz .L3
-.L4:
-        ; Was "return", which is not an instruction; every other routine in
-        ; this file ends with retn.
-        retn
-
-
-
-;-----------------------------------------------------------------------
-; _ConvertMMXpII32_16RGB565
-; Packs each 4-byte source pixel into 16 bits: r[7:3] -> bits 15-11,
-; g[7:2] -> bits 10-5, b[7:3] -> bits 4-0.  Four pixels per MMX
-; iteration, then a scalar loop for the remaining (count & 3) pixels.
-; In:    esi = source, edi = destination, ecx = pixel count
-; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
-;-----------------------------------------------------------------------
-_ConvertMMXpII32_16RGB565:
-
-        ; set up masks
-        load_immq mm5, mmx32_rgb565_b
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_r
-        CLEANUP_IMMQ_LOADS(3)
-
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2                 ; not necessary at the moment, but doesn't hurt (much)
-
-.L1:
-        movq mm0, [esi]         ; argb
-        movq mm1, mm0           ; argb
-        pand mm0, mm6           ; 00g0
-        movq mm3, mm1           ; argb
-        pand mm1, mm5           ; 000b
-        pand mm3, mm7           ; 0r00
-        pslld mm1, 2            ; 0 0 000000bb bbb00000
-        por mm0, mm1            ; 0 0 ggggggbb bbb00000
-        psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
-
-        movq mm4, [esi+8]       ; argb
-        movq mm2, mm4           ; argb
-        pand mm4, mm6           ; 00g0
-        movq mm1, mm2           ; argb
-        pand mm2, mm5           ; 000b
-        pand mm1, mm7           ; 0r00
-        pslld mm2, 2            ; 0 0 000000bb bbb00000
-        por mm4, mm2            ; 0 0 ggggggbb bbb00000
-        psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
-
-        packuswb mm3, mm1       ; R 0 r 0
-        packssdw mm0, mm4       ; as above.. ish
-        por mm0, mm3            ; done.
-        movq [edi], mm0
-
-        add esi, 16
-        add edi, 8
-        dec ecx
-        jnz .L1
-
-.L2:
-        mov ecx, edx
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        ; Stale bits in eax/ebx are removed by the two masks below.
-        mov al, [esi]
-        mov bh, [esi+1]
-        mov ah, [esi+2]
-        shr al, 3
-        and eax, 0F81Fh         ; BYTE?
-        shr ebx, 5
-        and ebx, 07E0h          ; BYTE?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec ecx
-        jnz .L3
-
-.L4:
-        retn
-
-
-;-----------------------------------------------------------------------
-; _ConvertMMXpII32_16BGR565
-; Same as _ConvertMMXpII32_16RGB565 but with red and blue exchanged:
-; b[7:3] -> bits 15-11, g[7:2] -> bits 10-5, r[7:3] -> bits 4-0.
-; In:    esi = source, edi = destination, ecx = pixel count
-; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
-;-----------------------------------------------------------------------
-_ConvertMMXpII32_16BGR565:
-
-        load_immq mm5, mmx32_rgb565_r
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_b
-        CLEANUP_IMMQ_LOADS(3)
-
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2
-
-.L1:
-        movq mm0, [esi]         ; a r g b
-        movq mm1, mm0           ; a r g b
-        pand mm0, mm6           ; 0 0 g 0
-        movq mm3, mm1           ; a r g b
-        pand mm1, mm5           ; 0 r 0 0
-        pand mm3, mm7           ; 0 0 0 b
-
-        psllq mm3, 16           ; 0 b 0 0
-        psrld mm1, 14           ; 0 0 000000rr rrr00000
-        por mm0, mm1            ; 0 0 ggggggrr rrr00000
-        psrld mm0, 5            ; 0 0 00000ggg gggrrrrr
-
-        movq mm4, [esi+8]       ; a r g b
-        movq mm2, mm4           ; a r g b
-        pand mm4, mm6           ; 0 0 g 0
-        movq mm1, mm2           ; a r g b
-        pand mm2, mm5           ; 0 r 0 0
-        pand mm1, mm7           ; 0 0 0 b
-
-        psllq mm1, 16           ; 0 b 0 0
-        psrld mm2, 14           ; 0 0 000000rr rrr00000
-        por mm4, mm2            ; 0 0 ggggggrr rrr00000
-        psrld mm4, 5            ; 0 0 00000ggg gggrrrrr
-
-        packuswb mm3, mm1       ; BBBBB000 00000000 bbbbb000 00000000
-        packssdw mm0, mm4       ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
-        por mm0, mm3            ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
-        movq [edi], mm0
-
-        add esi, BYTE 16
-        add edi, BYTE 8
-        dec ecx
-        jnz .L1
-
-.L2:
-        and edx, BYTE 3         ; edx itself is the tail counter here
-        jz .L4
-.L3:
-        ; Stale bits in eax/ebx are removed by the two masks below.
-        mov al, [esi+2]
-        mov bh, [esi+1]
-        mov ah, [esi]
-        shr al, 3
-        and eax, 0F81Fh         ; BYTE ?
-        shr ebx, 5
-        and ebx, 07E0h          ; BYTE ?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec edx
-        jnz .L3
-
-.L4:
-        retn
-
-;-----------------------------------------------------------------------
-; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
-; Packs each 4-byte source pixel into 15 bits using the top five bits
-; of each channel.  RGB555: r -> bits 14-10, g -> bits 9-5, b -> bits
-; 4-0.  BGR555: red and blue exchanged.  Both entry points share one
-; body; only the pmaddwd multiplier held in mm7 differs.
-; Eight pixels per iteration of the unrolled loop, then one pixel at a
-; time for the remaining (count & 7) pixels.
-; In:    esi = source, edi = destination, ecx = pixel count
-; Clobb: eax, ecx, edx, mm0-mm7, flags
-;-----------------------------------------------------------------------
-_ConvertMMXpII32_16BGR555:
-
-        ; the 16BGR555 converter is identical to the RGB555 one,
-        ; except it uses a different multiplier for the pmaddwd
-        ; instruction. cool huh.
-
-        load_immq mm7, mmx32_bgr555_mul
-        jmp _convert_bgr555_cheat
-
-; This is the same as the Intel version.. they obviously went to
-; much more trouble to expand/coil the loop than I did, so theirs
-; would almost certainly be faster, even if only a little.
-; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
-; (I think) a more accurate name..
-_ConvertMMXpII32_16RGB555:
-
-        load_immq mm7, mmx32_rgb555_mul
-_convert_bgr555_cheat:
-        load_immq mm6, mmx32_rgb555_g
-        CLEANUP_IMMQ_LOADS(2)           ; releases the mm7 and mm6 constants
-
-        mov edx,ecx                     ; Save ecx
-
-        and ecx,DWORD 0fffffff8h        ; clear lower three bits
-        jnz .L_OK
-        jmp near .L2
-
-.L_OK:
-        ; Loop prologue: start work on the first four pixels so that each
-        ; pass of .L1 finishes one group while starting the next.
-
-        movq mm2,[esi+8]
-
-        movq mm0,[esi]
-        movq mm3,mm2
-
-        pand_immq mm3, mmx32_rgb555_rb
-        movq mm1,mm0
-
-        pand_immq mm1, mmx32_rgb555_rb
-        pmaddwd mm3,mm7
-
-        CLEANUP_IMMQ_LOADS(2)
-
-        pmaddwd mm1,mm7
-        pand mm2,mm6
-
-.L1:
-        movq mm4,[esi+24]
-        pand mm0,mm6
-
-        movq mm5,[esi+16]
-        por mm3,mm2
-
-        psrld mm3,6
-        por mm1,mm0
-
-        movq mm0,mm4
-        psrld mm1,6
-
-        pand_immq mm0, mmx32_rgb555_rb
-        packssdw mm1,mm3
-
-        movq mm3,mm5
-        pmaddwd mm0,mm7
-
-        pand_immq mm3, mmx32_rgb555_rb
-        pand mm4,mm6
-
-        movq [edi],mm1
-        pmaddwd mm3,mm7
-
-        add esi,BYTE 32
-        por mm4,mm0
-
-        pand mm5,mm6
-        psrld mm4,6
-
-        ; NOTE(review): these two loads fetch the next group even on the
-        ; final pass, i.e. they read 16 bytes beyond the last pixel this
-        ; loop converts.  Confirm the source buffer tolerates that.
-        movq mm2,[esi+8]
-        por mm5,mm3
-
-        movq mm0,[esi]
-        psrld mm5,6
-
-        movq mm3,mm2
-        movq mm1,mm0
-
-        pand_immq mm3, mmx32_rgb555_rb
-        packssdw mm5,mm4
-
-        pand_immq mm1, mmx32_rgb555_rb
-        pand mm2,mm6
-
-        CLEANUP_IMMQ_LOADS(4)           ; four pand_immq uses per pass
-
-        movq [edi+8],mm5
-        pmaddwd mm3,mm7
-
-        pmaddwd mm1,mm7
-        add edi,BYTE 16
-
-        sub ecx,BYTE 8
-        jz .L2
-        jmp .L1
-
-
-.L2:
-        mov ecx,edx
-
-        and ecx,BYTE 7
-        jz .L4
-
-        ; Leftover pixels use the same mask/multiply/shift recipe as the
-        ; unrolled loop, so the channel order follows the multiplier in mm7
-        ; for both entry points.  (The previous scalar loop always produced
-        ; RGB555 order, even when entered through the BGR555 entry point.)
-        ; mm5 is only scratch in the unrolled loop, so it can hold the mask.
-        load_immq mm5, mmx32_rgb555_rb
-        CLEANUP_IMMQ_LOADS(1)
-
-.L3:
-        movd mm0,[esi]                  ; one source pixel, upper dword zero
-        add esi,BYTE 4
-
-        movq mm1,mm0
-        pand mm0,mm6                    ; green, bits 15-11
-
-        pand mm1,mm5                    ; red and blue, one per 16-bit word
-        pmaddwd mm1,mm7                 ; move both to their target bits (<< 6)
-
-        por mm0,mm1
-        psrld mm0,6                     ; 15-bit pixel in bits 14-0
-
-        movd eax,mm0
-
-        mov [edi],ax
-        add edi,BYTE 2
-
-        dec ecx
-        jnz .L3
-
-.L4:
-        retn
-
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif