From 9682c8870b8ff5e4ac2e4c70b759f791c6f38c1f Mon Sep 17 00:00:00 2001
From: Jesse Hall <jessehall@google.com>
Date: Mon, 9 Jul 2012 11:27:07 -0700
Subject: Import SDL release-1.2.15

Change-Id: I505c4aea24325cad475f217db5589814b4c75dbf
---
 distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm | 405 +++++++++++++++++++++++++++++
 1 file changed, 405 insertions(+)
 create mode 100644 distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm

(limited to 'distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm')

diff --git a/distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm b/distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm
new file mode 100644
index 0000000..20c3277
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/hermes/mmxp2_32.asm
@@ -0,0 +1,405 @@
+;
+; pII-optimised MMX format converters for HERMES
+; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
+;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
+; This source code is licensed under the GNU LGPL
+; 
+; Please refer to the file COPYING.LIB contained in the distribution for
+; licensing conditions		
+;
+; COPYRIGHT NOTICE
+; 
+; This file partly contains code that is (c) Intel Corporation, specifically
+; the mode detection routine, and the converter to 15 bit (8 pixel
+; conversion routine from the mmx programming tutorial pages).
+;
+;
+; These routines aren't exactly pII optimised - it's just that as they
+; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
+; optimise them for p5 MMXs..
+
+BITS 32				; 32-bit code
+; common.inc is not part of this file; the SDL_FUNC macro used below is expected to come from it.
+%include "common.inc"
+	; One SDL_FUNC line per converter label defined further down (presumably this exports the symbol -- confirm in common.inc).
+SDL_FUNC _ConvertMMXpII32_24RGB888
+SDL_FUNC _ConvertMMXpII32_16RGB565
+SDL_FUNC _ConvertMMXpII32_16BGR565
+SDL_FUNC _ConvertMMXpII32_16RGB555
+SDL_FUNC _ConvertMMXpII32_16BGR555
+
+;; Macros for conversion routines
+;; _push_immq_mask imm32: push imm32 twice so that an 8-byte {imm32, imm32} pair sits at [esp].
+%macro _push_immq_mask 1
+	push dword %1		; becomes the upper dword of the pair
+	push dword %1		; becomes the lower dword; esp is now 8 lower than before
+%endmacro
+;; load_immq mmreg, imm32: mmreg = imm32 replicated into both dwords.  The 8 pushed bytes stay on the stack.
+%macro load_immq 2
+	_push_immq_mask %2
+	movq %1, [esp]		; read the pair back; release it later with CLEANUP_IMMQ_LOADS
+%endmacro
+;; pand_immq mmreg, imm32: AND mmreg with imm32 replicated into both dwords.  The 8 pushed bytes stay on the stack.
+%macro pand_immq 2
+	_push_immq_mask %2
+	pand %1, [esp]		; the memory operand is the pair just pushed
+%endmacro
+;; CLEANUP_IMMQ_LOADS(n): release the 8-byte pairs left by n load_immq / pand_immq uses.  Expands to an add, so EFLAGS change.
+%define CLEANUP_IMMQ_LOADS(num) \
+	add esp, byte 8 * (num)
+;; Per-pixel (dword) masks; load_immq / pand_immq replicate them into both halves of an MMX register.
+%define mmx32_rgb888_mask 00ffffffh	; low three bytes of the pixel
+%define mmx32_rgb565_b 000000f8h	; top 5 bits of byte 0
+%define mmx32_rgb565_g 0000fc00h	; top 6 bits of byte 1
+%define mmx32_rgb565_r 00f80000h	; top 5 bits of byte 2
+;; 555 conversion: combined r/b mask, g mask, and the two pmaddwd word-multiplier pairs.
+%define mmx32_rgb555_rb 00f800f8h	; top 5 bits of bytes 0 and 2
+%define mmx32_rgb555_g 0000f800h	; top 5 bits of byte 1
+%define mmx32_rgb555_mul 20000008h	; low word x 0008h, high word x 2000h
+%define mmx32_bgr555_mul 00082000h	; low word x 2000h, high word x 0008h
+
+SECTION .text
+
+_ConvertMMXpII32_24RGB888:
+        ; 32-bit pixels -> packed 24-bit pixels: the top byte of each source dword is dropped.
+        ; Registers as this code reads them (caller not in this file): esi = source, edi = destination, ecx = pixel count.
+        load_immq       mm6, mmx32_rgb888_mask  ; mm6 = 00ffffffh in both dwords
+        CLEANUP_IMMQ_LOADS(1)
+        pxor            mm7, mm7                ; mm7 = 0, unpack partner for zero-extension
+
+        mov             edx, ecx                ; keep the full count for the tail
+        and             ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
+        jz              .tail                   ; no complete group of 4
+
+        ; Per pass: 4 pixels, 16 source bytes -> 12 destination bytes.
+        ; Byte diagrams run from the high byte to the low byte; the digit is the pixel index.
+.quad:
+        movq            mm0, [esi]              ; x1 r1 g1 b1 x0 r0 g0 b0
+        pand            mm0, mm6                ;  0 r1 g1 b1  0 r0 g0 b0
+        movq            mm1, [esi+8]            ; x3 r3 g3 b3 x2 r2 g2 b2
+        pand            mm1, mm6                ;  0 r3 g3 b3  0 r2 g2 b2
+
+        movq            mm2, mm0
+        punpckhdq       mm2, mm7                ;  0  0  0  0  0 r1 g1 b1
+        punpckldq       mm0, mm7                ;  0  0  0  0  0 r0 g0 b0
+        psllq           mm2, 24                 ;  0  0 r1 g1 b1  0  0  0
+        por             mm0, mm2                ;  0  0 r1 g1 b1 r0 g0 b0
+
+        movq            mm3, mm1
+        psllq           mm3, 48                 ; g2 b2  0  0  0  0  0  0
+        por             mm0, mm3                ; g2 b2 r1 g1 b1 r0 g0 b0  = output bytes 0-7
+
+        movq            mm4, mm1
+        punpckhdq       mm4, mm7                ;  0  0  0  0  0 r3 g3 b3
+        punpckldq       mm1, mm7                ;  0  0  0  0  0 r2 g2 b2
+        psrlq           mm1, 16                 ;  0  0  0  0  0  0  0 r2
+        psllq           mm4, 8                  ;  0  0  0  0 r3 g3 b3  0
+        por             mm1, mm4                ;  0  0  0  0 r3 g3 b3 r2  = output bytes 8-11
+
+        movq            [edi], mm0
+        add             esi, BYTE 16
+        movd            [edi+8], mm1            ; only the low dword is stored
+        add             edi, BYTE 12
+        sub             ecx, BYTE 4
+        jnz             .quad
+
+.tail:
+        mov             ecx, edx
+        and             ecx, BYTE 3             ; 0-3 pixels remain
+        jz              .done
+.single:
+        mov             al, [esi]               ; copy the three low bytes of the pixel
+        mov             bl, [esi+1]
+        mov             dl, [esi+2]             ; edx is free: the count already moved to ecx
+        mov             [edi], al
+        mov             [edi+1], bl
+        mov             [edi+2], dl
+        add             esi, BYTE 4
+        add             edi, BYTE 3
+        dec             ecx
+        jnz             .single
+.done:
+        retn
+
+
+
+_ConvertMMXpII32_16RGB565:
+        ; 32-bit pixels -> 16-bit pixels laid out rrrrrggg gggbbbbb.
+        ; Registers as this code reads them (caller not in this file): esi = source, edi = destination, ecx = pixel count.
+        load_immq       mm5, mmx32_rgb565_b     ; mm5 = blue field mask
+        load_immq       mm6, mmx32_rgb565_g     ; mm6 = green field mask
+        load_immq       mm7, mmx32_rgb565_r     ; mm7 = red field mask
+        CLEANUP_IMMQ_LOADS(3)
+
+        mov             edx, ecx                ; keep the full count for the tail
+        shr             ecx, 2                  ; ecx = number of complete 4-pixel groups
+        jz              .tail
+
+        ; Per pass: 4 pixels, 16 source bytes -> 8 destination bytes.
+.quad:
+        movq            mm0, [esi]              ; pixels 0 and 1
+        movq            mm1, mm0
+        pand            mm0, mm6                ; green: gggggg00 00000000
+        movq            mm3, mm1
+        pand            mm1, mm5                ; blue:  00000000 bbbbb000
+        pand            mm3, mm7                ; red stays in the third byte for now
+        pslld           mm1, 2                  ; 000000bb bbb00000
+        por             mm0, mm1                ; ggggggbb bbb00000
+        psrld           mm0, 5                  ; 00000ggg gggbbbbb
+
+        movq            mm4, [esi+8]            ; pixels 2 and 3, same steps
+        movq            mm2, mm4
+        pand            mm4, mm6
+        movq            mm1, mm2
+        pand            mm2, mm5
+        pand            mm1, mm7
+        pslld           mm2, 2
+        por             mm4, mm2
+        psrld           mm4, 5                  ; 00000ggg gggbbbbb
+
+        packuswb        mm3, mm1                ; four words of rrrrr000 00000000
+        packssdw        mm0, mm4                ; four words of 00000ggg gggbbbbb
+        por             mm0, mm3                ; four finished 565 pixels
+        movq            [edi], mm0
+
+        add             esi, 16
+        add             edi, 8
+        dec             ecx
+        jnz             .quad
+
+.tail:
+        mov             ecx, edx
+        and             ecx, BYTE 3             ; 0-3 pixels remain
+        jz              .done
+.single:
+        mov             al, [esi]               ; blue
+        mov             bh, [esi+1]             ; green, placed in bits 8-15 of ebx
+        mov             ah, [esi+2]             ; red
+        shr             al, 3
+        and             eax, 0F81Fh             ; rrrrr000 000bbbbb
+        shr             ebx, 5
+        and             ebx, 07E0h              ; 00000ggg ggg00000; anything else ebx held is masked off
+        add             eax, ebx                ; the fields do not overlap
+        ; al lands at [edi] and ah at [edi+1]
+        mov             [edi], ax
+        add             esi, BYTE 4
+        add             edi, BYTE 2
+        dec             ecx
+        jnz             .single
+
+.done:
+        retn
+
+	
+_ConvertMMXpII32_16BGR565:
+        ; 32-bit pixels -> 16-bit bbbbbggg gggrrrrr.  Reads esi = source, edi = destination, ecx = pixel count (caller not in this file).
+        load_immq       mm5, mmx32_rgb565_r     ; mm5 = red field mask
+        load_immq       mm6, mmx32_rgb565_g     ; mm6 = green field mask
+        load_immq       mm7, mmx32_rgb565_b     ; mm7 = blue field mask
+        CLEANUP_IMMQ_LOADS(3)
+
+        mov             edx, ecx                ; keep the full count for the tail
+        shr             ecx, 2                  ; ecx = number of complete 4-pixel groups
+        jz              .tail
+
+        ; Per pass: 4 pixels, 16 source bytes -> 8 destination bytes.
+.quad:
+        movq            mm0, [esi]              ; pixels 0 and 1
+        movq            mm1, mm0
+        pand            mm0, mm6                ; green: gggggg00 00000000
+        movq            mm3, mm1
+        pand            mm1, mm5                ; red, still in the third byte
+        pand            mm3, mm7                ; blue, still in the low byte
+
+        psllq           mm3, 16                 ; blue moved up to the third byte
+        psrld           mm1, 14                 ; 000000rr rrr00000
+        por             mm0, mm1                ; ggggggrr rrr00000
+        psrld           mm0, 5                  ; 00000ggg gggrrrrr
+
+        movq            mm4, [esi+8]            ; pixels 2 and 3, same steps
+        movq            mm2, mm4
+        pand            mm4, mm6
+        movq            mm1, mm2
+        pand            mm2, mm5
+        pand            mm1, mm7
+
+        psllq           mm1, 16
+        psrld           mm2, 14
+        por             mm4, mm2
+        psrld           mm4, 5                  ; 00000ggg gggrrrrr
+
+        packuswb        mm3, mm1                ; four words of bbbbb000 00000000
+        packssdw        mm0, mm4                ; four words of 00000ggg gggrrrrr
+        por             mm0, mm3                ; four finished pixels
+        movq            [edi], mm0
+
+        add             esi, BYTE 16
+        add             edi, BYTE 8
+        dec             ecx
+        jnz             .quad
+
+.tail:
+        and             edx, BYTE 3             ; 0-3 pixels remain; edx itself counts the tail
+        jz              .done
+.single:
+        mov             al, [esi+2]             ; red
+        mov             bh, [esi+1]             ; green, placed in bits 8-15 of ebx
+        mov             ah, [esi]               ; blue
+        shr             al, 3
+        and             eax, 0F81Fh             ; bbbbb000 000rrrrr
+        shr             ebx, 5
+        and             ebx, 07E0h              ; 00000ggg ggg00000; anything else ebx held is masked off
+        add             eax, ebx                ; the fields do not overlap
+        ; al lands at [edi] and ah at [edi+1]
+        mov             [edi], ax
+        add             esi, BYTE 4
+        add             edi, BYTE 2
+        dec             edx
+        jnz             .single
+
+.done:
+        retn
+
+_ConvertMMXpII32_16BGR555:
+        ; Entry stub: selects the BGR multiplier, then continues in the body that
+        ; follows _ConvertMMXpII32_16RGB555, at the label _convert_bgr555_cheat.
+        ; mm7 = mmx32_bgr555_mul in both dwords (the two word multipliers of
+        ; mmx32_rgb555_mul in swapped order).  The 8 bytes load_immq leaves on the
+        ; stack are not released here: the CLEANUP_IMMQ_LOADS in that body covers them.
+        load_immq mm7, mmx32_bgr555_mul
+        jmp _convert_bgr555_cheat
+
+; 32-bit pixels -> 16-bit 555.  Two entry points share this body: the RGB555
+; one below, and _ConvertMMXpII32_16BGR555, which enters at
+; _convert_bgr555_cheat with a different pmaddwd multiplier already in mm7.
+; The 8-pixel loop is the Intel version (they unrolled and interleaved it);
+; 'mmx32_rgb555_add' was renamed 'mmx32_rgb555_mul' as the more accurate name.
+_ConvertMMXpII32_16RGB555:
+	; Reads esi = source, edi = destination, ecx = pixel count (caller not in this file).
+	load_immq mm7, mmx32_rgb555_mul
+_convert_bgr555_cheat:
+	load_immq mm6, mmx32_rgb555_g
+	CLEANUP_IMMQ_LOADS(2)		   ; releases the mm7 pair and the mm6 pair
+	; The loop preloads 4 pixels ahead, so it may only run while 4 pixels stay behind for the tail.
+	mov edx,ecx		           ; edx = total pixel count
+	sub ecx,BYTE 4
+	jb near .L2			   ; fewer than 4 pixels: tail only
+        and ecx,DWORD 0fffffff8h            ; ecx = pixels handled by the 8-pixel loop
+	jnz .L_OK
+        jmp near .L2 
+.L_OK:
+	sub edx,ecx			   ; edx = pixels left for the tail (4 to 11)
+	movq mm2,[esi+8]		   ; prologue: start work on the first 4 pixels
+
+	movq mm0,[esi]
+	movq mm3,mm2
+
+	pand_immq mm3, mmx32_rgb555_rb
+	movq mm1,mm0
+
+	pand_immq mm1, mmx32_rgb555_rb
+	pmaddwd mm3,mm7
+
+	CLEANUP_IMMQ_LOADS(2)
+
+	pmaddwd mm1,mm7
+	pand mm2,mm6
+
+.L1:
+	movq mm4,[esi+24]
+	pand mm0,mm6
+
+	movq mm5,[esi+16]
+	por mm3,mm2
+
+	psrld mm3,6
+	por mm1,mm0
+
+	movq mm0,mm4
+	psrld mm1,6
+
+	pand_immq mm0, mmx32_rgb555_rb
+	packssdw mm1,mm3
+
+	movq mm3,mm5
+	pmaddwd mm0,mm7
+
+	pand_immq mm3, mmx32_rgb555_rb
+	pand mm4,mm6
+
+	movq [edi],mm1			
+	pmaddwd mm3,mm7
+
+        add esi,BYTE 32
+	por mm4,mm0
+
+	pand mm5,mm6
+	psrld mm4,6
+
+	movq mm2,[esi+8]		   ; preload for the next pass; these 16 bytes belong to
+	por mm5,mm3
+
+	movq mm0,[esi]			   ; the 4 pixels that were held back, so they are in range
+	psrld mm5,6
+
+	movq mm3,mm2
+	movq mm1,mm0
+
+	pand_immq mm3, mmx32_rgb555_rb
+	packssdw mm5,mm4
+
+	pand_immq mm1, mmx32_rgb555_rb
+	pand mm2,mm6
+
+	CLEANUP_IMMQ_LOADS(4)
+
+	movq [edi+8],mm5
+	pmaddwd mm3,mm7
+
+	pmaddwd mm1,mm7
+        add edi,BYTE 16
+	
+        sub ecx,BYTE 8
+	jz .L2
+        jmp .L1
+
+
+.L2:	
+	test edx,edx			   ; edx = pixels not handled by the loop above
+	jz .L4
+
+	; Tail: one pixel per pass with the same mask / pmaddwd / shift steps as
+	; the loop, so the multiplier in mm7 decides the channel order here as
+	; well (a fixed RGB shift sequence would be wrong for the BGR entry point).
+	load_immq mm5, mmx32_rgb555_rb
+	CLEANUP_IMMQ_LOADS(1)
+
+.L3:	
+	movd mm0,[esi]			   ; one pixel; the upper dword of mm0 is zeroed
+        add esi,BYTE 4
+
+	movq mm1,mm0
+	pand mm1,mm5			   ; top 5 bits of bytes 0 and 2
+
+	pmaddwd mm1,mm7			   ; both fields end up 6 bits above their final place
+	pand mm0,mm6			   ; top 5 bits of byte 1, also 6 bits too high
+
+	por mm1,mm0
+	psrld mm1,6			   ; 15-bit result in the low word
+
+	movd eax,mm1
+
+        mov [edi],ax
+        add edi,BYTE 2
+
+	dec edx
+	jnz .L3	
+
+.L4:		
+	; NOTE(review): no emms is issued anywhere in this file; presumably the
+	; caller clears the MMX state -- confirm.
+	retn
+; For elf32 output only: add an empty .note.GNU-stack section (the usual GNU marker that no executable stack is needed).
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
-- 
cgit v1.1