diff options
Diffstat (limited to 'sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc')
8 files changed, 3861 insertions, 0 deletions
diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk new file mode 100644 index 0000000..4106a68 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk @@ -0,0 +1,36 @@ + +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +LOCAL_MODULE_TAGS := optional + +ifeq ($(ARCH_ARM_HAVE_NEON),true) +LOCAL_SRC_FILES := \ + csc_yuv420_nv12t_y_neon.s \ + csc_yuv420_nv12t_uv_neon.s \ + csc_nv12t_yuv420_y_neon.s \ + csc_nv12t_yuv420_uv_neon.s \ + csc_interleave_memcpy.s \ + csc_deinterleave_memcpy.s + +else +LOCAL_SRC_FILES := \ + color_space_convertor.c + +endif + +LOCAL_MODULE := libseccsc + +LOCAL_CFLAGS := + +LOCAL_ARM_MODE := arm + +LOCAL_STATIC_LIBRARIES := + +LOCAL_SHARED_LIBRARIES := liblog + +LOCAL_C_INCLUDES := \ + $(SEC_CODECS)/video/mfc_c110/include + +include $(BUILD_STATIC_LIBRARY) + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/color_space_convertor.c b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/color_space_convertor.c new file mode 100644 index 0000000..c1ac638 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/color_space_convertor.c @@ -0,0 +1,1092 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @file color_space_convertor.c + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +#include "stdlib.h" +#include <string.h> +#include "color_space_convertor.h" + +#define TILED_SIZE 64*32 + +/* + * De-interleaves src to dest1, dest2 + * + * @param dest1 + * Address of de-interleaved data[out] + * + * @param dest2 + * Address of de-interleaved data[out] + * + * @param src + * Address of interleaved data[in] + * + * @param src_size + * Size of interleaved data[in] + */ +void csc_deinterleave_memcpy(char *dest1, char *dest2, char *src, int src_size) +{ + int i = 0; + for(i=0; i<src_size/2; i++) { + dest1[i] = src[i*2]; + dest2[i] = src[i*2+1]; + } +} + +/* + * Interleaves src1, src2 to dest + * + * @param dest + * Address of interleaved data[out] + * + * @param src1 + * Address of de-interleaved data[in] + * + * @param src2 + * Address of de-interleaved data[in] + * + * @param src_size + * Size of de-interleaved data[in] + */ +void csc_interleave_memcpy(char *dest, char *src1, char *src2, int src_size) +{ + int i = 0; + for(i=0; i<src_size; i++) { + dest[i*2] = src1[i]; + dest[i*2+1] = src2[i]; + } +} + +/* + * Converts tiled data to linear. + * 1. Y of NV12T to Y of YUV420P + * 2. Y of NV12T to Y of YUV420S + * 3. 
/*
 * NV12T <-> linear YUV420 plane converters.
 *
 * An NV12T plane is addressed here as tiles of 64 bytes x 32 rows
 * (2048 bytes per tile).  Inside a tile the rows are stored one after
 * another, 64 bytes each.  Tiles themselves are not stored in raster
 * order; csc_nv12t_tile_offset() implements the placement rule.
 *
 * All converters below walk the plane tile by tile and clamp every copy
 * to the real width and height, so no byte outside the width x height
 * area of the linear buffer is read or written, whatever the dimensions
 * are (they do not have to be multiples of 2, 4, 32 or 64).
 */

enum {
    CSC_NV12T_TILE_WIDTH  = 64,     /* bytes in one row of a tile */
    CSC_NV12T_TILE_HEIGHT = 32,     /* rows in one tile */
    CSC_NV12T_TILE_BYTES  = CSC_NV12T_TILE_WIDTH * CSC_NV12T_TILE_HEIGHT
};

/*
 * Number of tile columns of a plane: the width is rounded up to a
 * multiple of 128 bytes, i.e. to an even number of 64-byte tiles.
 */
static unsigned int csc_nv12t_tiles_per_row(unsigned int width)
{
    return ((width + 127) >> 7) << 1;
}

/* Number of tile rows of a plane (height rounded up to 32 rows). */
static unsigned int csc_nv12t_tile_rows(unsigned int height)
{
    return (height + CSC_NV12T_TILE_HEIGHT - 1) / CSC_NV12T_TILE_HEIGHT;
}

/* Returns min(limit, total - start); callers guarantee start < total. */
static unsigned int csc_nv12t_span(unsigned int total, unsigned int start, unsigned int limit)
{
    unsigned int left = total - start;

    return (left < limit) ? left : limit;
}

/*
 * Byte offset of tile (tile_x, tile_y) from the start of an NV12T plane.
 *
 *   odd tile row          : 2 + x + ((x >> 2) << 2) + tiles_per_row * (y - 1)
 *   even row, not the last: x + (((x + 2) >> 2) << 2) + tiles_per_row * y
 *   even row, the last one: x + tiles_per_row * y
 *
 * The result of the formula is a tile index; it is scaled by the tile
 * size (2048 bytes) before being returned.
 */
static size_t csc_nv12t_tile_offset(unsigned int tile_x, unsigned int tile_y,
                                    unsigned int tiles_per_row, unsigned int tile_rows)
{
    unsigned int tile_index;

    if (tile_y & 0x1) {
        tile_index = 2 + tile_x + ((tile_x >> 2) << 2) + tiles_per_row * (tile_y - 1);
    } else if (tile_y + 1 < tile_rows) {
        tile_index = tile_x + (((tile_x + 2) >> 2) << 2) + tiles_per_row * tile_y;
    } else {
        tile_index = tile_x + tiles_per_row * tile_y;
    }

    return (size_t)tile_index * CSC_NV12T_TILE_BYTES;
}

/*
 * Converts tiled data to linear.
 * 1. Y of NV12T to Y of YUV420P
 * 2. Y of NV12T to Y of YUV420S
 * 3. UV of NV12T to UV of YUV420S
 *
 * Nothing is done when the width or the height is not positive.
 *
 * @param yuv420_dest
 *   Y or UV plane address of YUV420[out]
 *   (yuv420_width * yuv420_height bytes are written)
 *
 * @param nv12t_src
 *   Y or UV plane address of NV12T[in]
 *
 * @param yuv420_width
 *   Width of YUV420[in]
 *
 * @param yuv420_height
 *   Y: Height of YUV420, UV: Height/2 of YUV420[in]
 */
void csc_tiled_to_linear(char *yuv420_dest, char *nv12t_src, int yuv420_width, int yuv420_height)
{
    unsigned int width, height, tiles_per_row, tile_rows;
    unsigned int tile_x, tile_y, row, row_count, chunk;
    const char *src_row;
    char *dest_row;

    if (yuv420_width <= 0 || yuv420_height <= 0) {
        return;
    }

    width = (unsigned int)yuv420_width;
    height = (unsigned int)yuv420_height;
    tiles_per_row = csc_nv12t_tiles_per_row(width);
    tile_rows = csc_nv12t_tile_rows(height);

    for (tile_y = 0; tile_y < tile_rows; tile_y++) {
        /* The last tile row may hold fewer than 32 valid rows. */
        row_count = csc_nv12t_span(height, tile_y * CSC_NV12T_TILE_HEIGHT, CSC_NV12T_TILE_HEIGHT);

        for (tile_x = 0; tile_x * CSC_NV12T_TILE_WIDTH < width; tile_x++) {
            /* The last tile column may hold fewer than 64 valid bytes. */
            chunk = csc_nv12t_span(width, tile_x * CSC_NV12T_TILE_WIDTH, CSC_NV12T_TILE_WIDTH);

            src_row = nv12t_src + csc_nv12t_tile_offset(tile_x, tile_y, tiles_per_row, tile_rows);
            dest_row = yuv420_dest
                     + (size_t)tile_y * CSC_NV12T_TILE_HEIGHT * width
                     + (size_t)tile_x * CSC_NV12T_TILE_WIDTH;

            for (row = 0; row < row_count; row++) {
                memcpy(dest_row, src_row, chunk);
                src_row += CSC_NV12T_TILE_WIDTH;
                dest_row += width;
            }
        }
    }
}

/*
 * Converts and Deinterleaves tiled data to linear
 * 1. UV of NV12T to UV of YUV420P
 *
 * Even source bytes of every row go to the U plane, odd source bytes to
 * the V plane.  Both destination planes use a row stride of
 * yuv420_width/2 bytes; when yuv420_width is odd the last source byte of
 * each row has no partner and is skipped.
 * Nothing is done when the width or the height is not positive.
 *
 * @param yuv420_u_dest
 *   U plane address of YUV420P[out]
 *
 * @param yuv420_v_dest
 *   V plane address of YUV420P[out]
 *
 * @param nv12t_uv_src
 *   UV plane address of NV12T[in]
 *
 * @param yuv420_width
 *   Width of YUV420[in]
 *
 * @param yuv420_uv_height
 *   Height/2 of YUV420[in]
 */
void csc_tiled_to_linear_deinterleave(char *yuv420_u_dest, char *yuv420_v_dest, char *nv12t_uv_src, int yuv420_width, int yuv420_uv_height)
{
    unsigned int width, height, chroma_stride, tiles_per_row, tile_rows;
    unsigned int tile_x, tile_y, row, row_count, pairs, k;
    const char *src_row;
    char *u_row;
    char *v_row;
    size_t dest_offset;

    if (yuv420_width <= 0 || yuv420_uv_height <= 0) {
        return;
    }

    width = (unsigned int)yuv420_width;
    height = (unsigned int)yuv420_uv_height;
    chroma_stride = width / 2;
    tiles_per_row = csc_nv12t_tiles_per_row(width);
    tile_rows = csc_nv12t_tile_rows(height);

    for (tile_y = 0; tile_y < tile_rows; tile_y++) {
        row_count = csc_nv12t_span(height, tile_y * CSC_NV12T_TILE_HEIGHT, CSC_NV12T_TILE_HEIGHT);

        for (tile_x = 0; tile_x * CSC_NV12T_TILE_WIDTH < width; tile_x++) {
            /* 64 interleaved bytes yield at most 32 U and 32 V samples. */
            pairs = csc_nv12t_span(width, tile_x * CSC_NV12T_TILE_WIDTH, CSC_NV12T_TILE_WIDTH) / 2;

            src_row = nv12t_uv_src + csc_nv12t_tile_offset(tile_x, tile_y, tiles_per_row, tile_rows);
            dest_offset = (size_t)tile_y * CSC_NV12T_TILE_HEIGHT * chroma_stride
                        + (size_t)tile_x * (CSC_NV12T_TILE_WIDTH / 2);
            u_row = yuv420_u_dest + dest_offset;
            v_row = yuv420_v_dest + dest_offset;

            for (row = 0; row < row_count; row++) {
                for (k = 0; k < pairs; k++) {
                    u_row[k] = src_row[2 * k];
                    v_row[k] = src_row[2 * k + 1];
                }
                src_row += CSC_NV12T_TILE_WIDTH;
                u_row += chroma_stride;
                v_row += chroma_stride;
            }
        }
    }
}

/*
 * Converts linear data to tiled.
 * 1. Y of YUV420P to Y of NV12T
 * 2. Y of YUV420S to Y of NV12T
 * 3. UV of YUV420S to UV of NV12T
 *
 * Only the tile bytes that correspond to pixels of the source plane are
 * written; padding inside partially filled tiles is left untouched.
 * Nothing is done when the width or the height is not positive.
 *
 * @param nv12t_dest
 *   Y or UV plane address of NV12T[out]
 *
 * @param yuv420_src
 *   Y or UV plane address of YUV420P(S)[in]
 *   (yuv420_width * yuv420_height bytes are read)
 *
 * @param yuv420_width
 *   Width of YUV420[in]
 *
 * @param yuv420_height
 *   Y: Height of YUV420, UV: Height/2 of YUV420[in]
 */
void csc_linear_to_tiled(char *nv12t_dest, char *yuv420_src, int yuv420_width, int yuv420_height)
{
    unsigned int width, height, tiles_per_row, tile_rows;
    unsigned int tile_x, tile_y, row, row_count, chunk;
    const char *src_row;
    char *dest_row;

    if (yuv420_width <= 0 || yuv420_height <= 0) {
        return;
    }

    width = (unsigned int)yuv420_width;
    height = (unsigned int)yuv420_height;
    tiles_per_row = csc_nv12t_tiles_per_row(width);
    tile_rows = csc_nv12t_tile_rows(height);

    for (tile_y = 0; tile_y < tile_rows; tile_y++) {
        row_count = csc_nv12t_span(height, tile_y * CSC_NV12T_TILE_HEIGHT, CSC_NV12T_TILE_HEIGHT);

        for (tile_x = 0; tile_x * CSC_NV12T_TILE_WIDTH < width; tile_x++) {
            chunk = csc_nv12t_span(width, tile_x * CSC_NV12T_TILE_WIDTH, CSC_NV12T_TILE_WIDTH);

            dest_row = nv12t_dest + csc_nv12t_tile_offset(tile_x, tile_y, tiles_per_row, tile_rows);
            src_row = yuv420_src
                    + (size_t)tile_y * CSC_NV12T_TILE_HEIGHT * width
                    + (size_t)tile_x * CSC_NV12T_TILE_WIDTH;

            for (row = 0; row < row_count; row++) {
                memcpy(dest_row, src_row, chunk);
                dest_row += CSC_NV12T_TILE_WIDTH;
                src_row += width;
            }
        }
    }
}
/*
 * Returns the byte offset of the 64x32 tile (tile_x, tile_y) inside the
 * NV12T UV plane.
 *
 * Tiles are 2048 bytes each. The number of tile columns is derived from the
 * width rounded up to a multiple of 128. Three index formulas are used:
 *   odd tile row          : 2 + x + ((x>>2)<<2) + x_block_num*(y-1)
 *   even row, not the last: x + (((x+2)>>2)<<2) + x_block_num*y
 *   even row, last one    : x + x_block_num*y
 *
 * @param tile_x
 *   Tile column index[in]
 *
 * @param tile_y
 *   Tile row index[in]
 *
 * @param width
 *   Width of YUV420[in]
 *
 * @param height
 *   Number of rows of the UV plane[in]
 */
static unsigned int csc_interleave_tiled_offset(unsigned int tile_x, unsigned int tile_y,
                                                unsigned int width, unsigned int height)
{
    unsigned int x_block_num = (((width + 127) >> 7) << 7) >> 6;
    unsigned int tile_rows = (height + 31) >> 5;
    unsigned int tile_index;

    if (tile_y & 0x1) {
        tile_index = (tile_y - 1) * x_block_num + tile_x + 2 + ((tile_x >> 2) << 2);
    } else if (tile_y + 1 < tile_rows) {
        tile_index = tile_x + (((tile_x + 2) >> 2) << 2) + tile_y * x_block_num;
    } else {
        tile_index = tile_y * x_block_num + tile_x;
    }

    return tile_index << 11;
}

/*
 * Converts and Interleaves linear to tiled
 * 1. UV of YUV420P to UV of NV12T
 *
 * Each U/V row (yuv420_width/2 samples) is interleaved as U,V,U,V... into the
 * 64-byte line of the matching 64x32 tile, 32 samples of each plane per tile
 * line. Partial tiles at the right and bottom edges interleave only the
 * samples and rows that exist in the sources, so the U and V planes are never
 * read out of bounds; the remaining bytes of such tiles are left untouched.
 * Nothing is done when a pointer is NULL or a dimension is not positive.
 *
 * @param nv12t_uv_dest
 *   UV plane address of NV12T[out]
 *
 * @param yuv420p_u_src
 *   U plane address of YUV420P[in]
 *
 * @param yuv420p_v_src
 *   V plane address of YUV420P[in]
 *
 * @param yuv420_width
 *   Width of YUV420[in]
 *
 * @param yuv420_uv_height
 *   Height/2 of YUV420[in]
 */
void csc_linear_to_tiled_interleave(char *nv12t_uv_dest, char *yuv420p_u_src, char *yuv420p_v_src, int yuv420_width, int yuv420_uv_height)
{
    unsigned int width, height, chroma_width;
    unsigned int tile_cols, tile_rows;
    unsigned int tile_x, tile_y;

    /* The loops below use unsigned counters: reject sizes that would wrap. */
    if (nv12t_uv_dest == NULL || yuv420p_u_src == NULL || yuv420p_v_src == NULL ||
        yuv420_width <= 0 || yuv420_uv_height <= 0) {
        return;
    }

    width = (unsigned int)yuv420_width;
    height = (unsigned int)yuv420_uv_height;
    chroma_width = width >> 1;          /* samples per U (or V) row */
    tile_cols = (width + 63) >> 6;
    tile_rows = (height + 31) >> 5;

    for (tile_y = 0; tile_y < tile_rows; tile_y++) {
        unsigned int first_row = tile_y << 5;
        unsigned int rows = height - first_row;

        if (rows > 32) {
            rows = 32;
        }

        for (tile_x = 0; tile_x < tile_cols; tile_x++) {
            /* a 64-byte tile line holds 32 U and 32 V samples */
            unsigned int first_sample = tile_x << 5;
            unsigned int samples;
            unsigned int row;
            char *tile;
            char *u_line;
            char *v_line;

            if (first_sample >= chroma_width) {
                continue;
            }
            samples = chroma_width - first_sample;
            if (samples > 32) {
                samples = 32;
            }

            tile = nv12t_uv_dest + csc_interleave_tiled_offset(tile_x, tile_y, width, height);
            u_line = yuv420p_u_src + first_sample + chroma_width * first_row;
            v_line = yuv420p_v_src + first_sample + chroma_width * first_row;

            for (row = 0; row < rows; row++) {
                csc_interleave_memcpy(tile + (row << 6), u_line, v_line, (int)samples);
                u_line += chroma_width;
                v_line += chroma_width;
            }
        }
    }
}
/*
 *
 * Copyright 2011 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_deinterleave_memcpy.s
 * @brief   SEC_OMX specific define
 * @author  ShinWon Lee (shinwon.lee@samsung.com)
 * @version 1.0
 * @history
 *   2011.7.01 : Create
 */

/*
 * De-interleaves src to dest1, dest2
 *
 * Even bytes of src go to dest1 and odd bytes go to dest2. Exactly
 * src_size/2 byte pairs are processed: 256 source bytes per NEON iteration,
 * then at most one 128-byte NEON block, then one pair at a time. A trailing
 * odd byte is ignored and nothing is copied when src_size is below 2.
 *
 * @param dest1
 *   Address of de-interleaved data[out]
 *
 * @param dest2
 *   Address of de-interleaved data[out]
 *
 * @param src
 *   Address of interleaved data[in]
 *
 * @param src_size
 *   Size of interleaved data[in]
 */
    .arch armv7-a
    .text
    .global csc_deinterleave_memcpy
    .type   csc_deinterleave_memcpy, %function
csc_deinterleave_memcpy:
    .fnstart

    @r0     dest1
    @r1     dest2
    @r2     src
    @r3     src_size
    @r4     i (source bytes consumed so far)
    @r5     temp1 (loop bound / bytes remaining)
    @r6     temp2
    @r7     temp3

    stmfd       sp!, {r4-r12,r14}       @ backup registers
    vpush       {d8-d15}                @ q4-q7 are callee-saved and used below

    mov         r4, #0
    cmp         r3, #256
    blt         LINEAR_SIZE_128

    bic         r5, r3, #0xFF           @ r5 = src_size rounded down to 256
LINEAR_SIZE_256_LOOP:
    pld         [r2, #64]
    vld2.8      {q0, q1}, [r2]!
    pld         [r2, #64]
    vld2.8      {q2, q3}, [r2]!
    pld         [r2, #64]
    vld2.8      {q4, q5}, [r2]!
    pld         [r2, #64]
    vld2.8      {q6, q7}, [r2]!
    pld         [r2, #64]
    vld2.8      {q8, q9}, [r2]!
    pld         [r2, #64]
    vld2.8      {q10, q11}, [r2]!
    vld2.8      {q12, q13}, [r2]!
    vld2.8      {q14, q15}, [r2]!

    vst1.8      {q0}, [r0]!
    vst1.8      {q2}, [r0]!
    vst1.8      {q4}, [r0]!
    vst1.8      {q6}, [r0]!
    vst1.8      {q8}, [r0]!
    vst1.8      {q10}, [r0]!
    vst1.8      {q12}, [r0]!
    vst1.8      {q14}, [r0]!

    vst1.8      {q1}, [r1]!
    vst1.8      {q3}, [r1]!
    vst1.8      {q5}, [r1]!
    vst1.8      {q7}, [r1]!
    vst1.8      {q9}, [r1]!
    vst1.8      {q11}, [r1]!
    vst1.8      {q13}, [r1]!
    vst1.8      {q15}, [r1]!

    add         r4, #256
    cmp         r4, r5
    blt         LINEAR_SIZE_256_LOOP

LINEAR_SIZE_128:
    sub         r5, r3, r4              @ bytes remaining (< 256 here)
    cmp         r5, #128                @ the block below consumes 128 bytes
    blt         LINEAR_SIZE_2
    pld         [r2, #64]
    vld2.8      {q0, q1}, [r2]!
    pld         [r2, #64]
    vld2.8      {q2, q3}, [r2]!
    vld2.8      {q4, q5}, [r2]!
    vld2.8      {q6, q7}, [r2]!

    vst1.8      {q0}, [r0]!             @ even bytes, in source order
    vst1.8      {q2}, [r0]!
    vst1.8      {q4}, [r0]!
    vst1.8      {q6}, [r0]!

    vst1.8      {q1}, [r1]!             @ odd bytes, in source order
    vst1.8      {q3}, [r1]!
    vst1.8      {q5}, [r1]!
    vst1.8      {q7}, [r1]!

    add         r4, #128

LINEAR_SIZE_2:
    sub         r5, r3, r4              @ bytes remaining
    cmp         r5, #2                  @ need a whole pair to continue
    blt         RESTORE_REG
LINEAR_SIZE_2_LOOP:
    ldrb        r6, [r2], #1
    ldrb        r7, [r2], #1

    strb        r6, [r0], #1
    strb        r7, [r1], #1

    add         r4, #2
    sub         r5, r3, r4
    cmp         r5, #2
    bge         LINEAR_SIZE_2_LOOP

RESTORE_REG:
    vpop        {d8-d15}
    ldmfd       sp!, {r4-r12,r15}       @ restore registers
    .fnend
/*
 *
 * Copyright 2011 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_interleave_memcpy.s
 * @brief   SEC_OMX specific define
 * @author  ShinWon Lee (shinwon.lee@samsung.com)
 * @version 1.0
 * @history
 *   2011.7.01 : Create
 */

/*
 * Interleaves src1, src2 to dest
 *
 * dest receives src1[0], src2[0], src1[1], src2[1], ... for exactly src_size
 * elements of each source (2*src_size bytes written): 128 elements per NEON
 * iteration, then at most one 64-element NEON block, then one element pair at
 * a time. Nothing is copied when src_size is not positive.
 *
 * @param dest
 *   Address of interleaved data[out]
 *
 * @param src1
 *   Address of de-interleaved data[in]
 *
 * @param src2
 *   Address of de-interleaved data[in]
 *
 * @param src_size
 *   Size of de-interleaved data[in]
 */
    .arch armv7-a
    .text
    .global csc_interleave_memcpy
    .type   csc_interleave_memcpy, %function
csc_interleave_memcpy:
    .fnstart

    @r0     dest
    @r1     src1
    @r2     src2
    @r3     src_size
    @r4     i (elements of each source consumed so far)
    @r5     temp1 (loop bound / elements remaining)
    @r6     temp2
    @r7     temp3

    stmfd       sp!, {r4-r12,r14}       @ backup registers
    vpush       {d8-d15}                @ q4-q7 are callee-saved and used below

    mov         r4, #0
    cmp         r3, #128
    blt         LINEAR_SIZE_64

    bic         r5, r3, #0x7F           @ r5 = src_size rounded down to 128
LINEAR_SIZE_128_LOOP:
    pld         [r1, #64]
    vld1.8      {q0}, [r1]!
    vld1.8      {q2}, [r1]!
    vld1.8      {q4}, [r1]!
    vld1.8      {q6}, [r1]!
    pld         [r2]
    vld1.8      {q8}, [r1]!
    vld1.8      {q10}, [r1]!
    vld1.8      {q12}, [r1]!
    vld1.8      {q14}, [r1]!
    pld         [r2, #64]
    vld1.8      {q1}, [r2]!
    vld1.8      {q3}, [r2]!
    vld1.8      {q5}, [r2]!
    vld1.8      {q7}, [r2]!
    vld1.8      {q9}, [r2]!
    vld1.8      {q11}, [r2]!
    vld1.8      {q13}, [r2]!
    vld1.8      {q15}, [r2]!

    vst2.8      {q0, q1}, [r0]!
    vst2.8      {q2, q3}, [r0]!
    vst2.8      {q4, q5}, [r0]!
    vst2.8      {q6, q7}, [r0]!
    vst2.8      {q8, q9}, [r0]!
    vst2.8      {q10, q11}, [r0]!
    pld         [r1]
    vst2.8      {q12, q13}, [r0]!
    vst2.8      {q14, q15}, [r0]!

    add         r4, #128
    cmp         r4, r5
    blt         LINEAR_SIZE_128_LOOP

LINEAR_SIZE_64:
    sub         r5, r3, r4              @ elements remaining (< 128 here)
    cmp         r5, #64                 @ the block below consumes 64 elements
    blt         LINEAR_SIZE_1
    pld         [r2]
    vld1.8      {q0}, [r1]!
    vld1.8      {q2}, [r1]!
    vld1.8      {q4}, [r1]!
    vld1.8      {q6}, [r1]!
    vld1.8      {q1}, [r2]!
    vld1.8      {q3}, [r2]!
    vld1.8      {q5}, [r2]!
    vld1.8      {q7}, [r2]!

    vst2.8      {q0, q1}, [r0]!
    vst2.8      {q2, q3}, [r0]!
    pld         [r1]
    vst2.8      {q4, q5}, [r0]!
    vst2.8      {q6, q7}, [r0]!

    add         r4, #64

LINEAR_SIZE_1:
    cmp         r4, r3                  @ anything left?
    bge         RESTORE_REG
LINEAR_SIZE_1_LOOP:
    ldrb        r6, [r1], #1
    ldrb        r7, [r2], #1

    strb        r6, [r0], #1
    strb        r7, [r0], #1

    add         r4, #1
    cmp         r4, r3
    blt         LINEAR_SIZE_1_LOOP

RESTORE_REG:
    vpop        {d8-d15}
    ldmfd       sp!, {r4-r12,r15}       @ restore registers
    .fnend
UV of NV12T to UV of YUV420P + * + * @param yuv420_u_dest + * U plane address of YUV420P[out] + * + * @param yuv420_v_dest + * V plane address of YUV420P[out] + * + * @param nv12t_src + * UV plane address of NV12T[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_tiled_to_linear_deinterleave + .type csc_tiled_to_linear_deinterleave, %function +csc_tiled_to_linear_deinterleave: + .fnstart + + @r0 linear_u_dest + @r1 linear_v_dest + @r2 tiled_uv_src + @r3 linear_x_size + @r4 linear_y_size + @r5 j + @r6 i + @r7 tiled_addr + @r8 linear_addr + @r9 aligned_x_size + @r10 temp1 + @r11 temp2 + @r12 temp3 + @r14 temp4 + + stmfd sp!, {r4-r12,r14} @ backup registers + + ldr r4, [sp, #40] @ load linear_y_size to r4 + + mov r9, #0 + +LINEAR_X_SIZE_1024: + cmp r3, #1024 + blt LINEAR_X_SIZE_512 + + mov r6, #0 +LINEAR_X_SIZE_1024_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_1024_LOOP_EVEN +LINEAR_X_SIZE_1024_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + mov r5, #8 + mov r5, r5, lsl #11 + sub r5, r5, #32 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + b LINEAR_X_SIZE_1024_LOOP_MEMCPY + +LINEAR_X_SIZE_1024_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + add r12, r6, #32 + cmp r12, r11 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + movlt r5, #8 + addlt r12, 
r7, #12288 + addlt r14, r7, #14336 + movge r5, #4 + addge r12, r7, #2048 + addge r14, r7, #2048 + mov r5, r5, lsl #11 + sub r5, r5, #32 + +LINEAR_X_SIZE_1024_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + mov r10, r3, asr #1 + vld2.8 {q10, q11}, [r12], r5 + mul r10, r10, r6 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + add r8, r0, r10 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12], r5 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! 
+ pld [r11, #32] + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12], r5 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12] + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14] + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! 
+ + blt LINEAR_X_SIZE_1024_LOOP + + mov r9, #1024 + +LINEAR_X_SIZE_512: + sub r10, r3, r9 + cmp r10, #512 + blt LINEAR_X_SIZE_256 + + mov r6, #0 +LINEAR_X_SIZE_512_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_512_LOOP_EVEN +LINEAR_X_SIZE_512_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + mov r5, #8 + mov r5, r5, lsl #11 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + sub r5, r5, #32 + b LINEAR_X_SIZE_512_LOOP_MEMCPY + +LINEAR_X_SIZE_512_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + add r12, r6, #32 + cmp r12, r11 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + movlt r5, #8 + movlt r10, r9, asr #5 + movge r10, r9, asr #6 + add r7, r7, r10, lsl #11 + add r11, r7, #2048 + addlt r12, r7, #12288 + addlt r14, r7, #14336 + movge r5, #4 + addge r12, r7, #4096 + addge r14, r7, #6144 + mov r5, r5, lsl #11 + sub r5, r5, #32 + +LINEAR_X_SIZE_512_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! 
+ pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + mov r10, r3, asr #1 + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r10, r10, r6 + vld2.8 {q10, q11}, [r12], r5 + add r8, r0, r10 + vld2.8 {q12, q13}, [r14]! + add r8, r8, r9, asr #1 + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + add r10, r1, r10 + vst1.8 {q14}, [r8]! + + add r10, r10, r9, asr #1 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12] + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14] + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! 
+ + blt LINEAR_X_SIZE_512_LOOP + + add r9, r9, #512 + +LINEAR_X_SIZE_256: + sub r10, r3, r9 + cmp r10, #256 + blt LINEAR_X_SIZE_128 + + mov r6, #0 +LINEAR_X_SIZE_256_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_256_LOOP_EVEN +LINEAR_X_SIZE_256_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + b LINEAR_X_SIZE_256_LOOP_MEMCPY + +LINEAR_X_SIZE_256_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r6, #32 + cmp r12, r11 + movlt r10, r9, asr #5 + addlt r7, r7, r10, lsl #11 + addlt r11, r7, #2048 + addlt r12, r7, #12288 + addlt r14, r7, #14336 + movge r10, r9, asr #6 + addge r7, r7, r10, lsl #11 + addge r11, r7, #2048 + addge r12, r7, #4096 + addge r14, r7, #6144 + +LINEAR_X_SIZE_256_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! 
+ pld [r14, #32] + mov r10, r3, asr #1 + vld2.8 {q10, q11}, [r12] + mul r10, r10, r6 + vld2.8 {q12, q13}, [r14]! + add r8, r0, r10 + vld2.8 {q14, q15}, [r14] + + add r8, r8, r9, asr #1 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + add r10, r1, r10 + vst1.8 {q14}, [r8]! + + add r10, r10, r9, asr #1 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + blt LINEAR_X_SIZE_256_LOOP + + add r9, r9, #256 + +LINEAR_X_SIZE_128: + sub r10, r3, r9 + cmp r10, #128 + blt LINEAR_X_SIZE_64 + + mov r6, #0 +LINEAR_X_SIZE_128_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_128_LOOP_EVEN +LINEAR_X_SIZE_128_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + b LINEAR_X_SIZE_128_LOOP_MEMCPY + +LINEAR_X_SIZE_128_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r6, #32 + cmp r12, r11 + movlt r10, r9, asr #5 + movge r10, r9, asr #6 + add r7, r7, r10, lsl #11 + add r11, r7, #2048 + +LINEAR_X_SIZE_128_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ 
tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7]! + pld [r7] + vld2.8 {q4, q5}, [r11]! + mov r10, r3, asr #1 + pld [r7, #32] + vld2.8 {q6, q7}, [r11]! + mul r10, r10, r6 + pld [r11] + vld2.8 {q8, q9}, [r7]! + add r10, r10, r9, asr #1 + pld [r11, #32] + vld2.8 {q10, q11}, [r7]! + add r8, r0, r10 + vld2.8 {q12, q13}, [r11]! + mov r14, r3, asr #1 + vld2.8 {q14, q15}, [r11]! + + sub r14, r14, #48 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8], r14 + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10], r14 + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #2 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + + blt LINEAR_X_SIZE_128_LOOP + + add r9, r9, #128 + +LINEAR_X_SIZE_64: + sub r10, r3, r9 + cmp r10, #64 + blt LINEAR_X_SIZE_4 + + mov r5, r9 + mov r6, #0 + +LINEAR_X_SIZE_64_LOOP: + bl GET_TILED_OFFSET + +LINEAR_X_SIZE_64_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r14, r3, asr #1 + mov r10, r10, lsl #6 + sub r14, r14, #16 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + pld [r7, #64] + vld2.8 {q0, q1}, [r7]! + mov r10, r3, asr #1 + pld [r7, #64] + vld2.8 {q2, q3}, [r7]! + mul r10, r10, r6 + vld2.8 {q4, q5}, [r7]! + add r10, r10, r9, asr #1 + vld2.8 {q6, q7}, [r7]! + add r8, r0, r10 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8], r14 + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8], r14 + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10], r14 + add r6, #2 + vst1.8 {q5}, [r10]! 
+ cmp r6, r4 + vst1.8 {q7}, [r10], r14 + + blt LINEAR_X_SIZE_64_LOOP + + add r9, r9, #64 + +LINEAR_X_SIZE_4: + cmp r9, r3 + beq RESTORE_REG + + mov r6, #0 @ i = 0 +LINEAR_Y_SIZE_4_LOOP: + + mov r5, r9 @ j = aligned_x_size +LINEAR_X_SIZE_4_LOOP: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #1 @ temp2 = linear_x_size-1 + + add r8, r0, r11 @ linear_addr = linear_dest_u+temp1 + add r11, r1, r11 @ temp1 = linear_dest_v+temp1 + add r7, r2, r7 @ tiled_addr = tiled_src+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + and r14, r5, #0x3F @ temp3 = j&0x3F + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + ldrh r10, [r7], #2 + ldrh r14, [r7], #62 + strb r10, [r8], #1 + mov r10, r10, asr #8 + strb r10, [r11], #1 + strb r14, [r8], r12 + mov r14, r14, asr #8 + strb r14, [r11], r12 + + ldrh r10, [r7], #2 + ldrh r14, [r7], #62 + strb r10, [r8], #1 + mov r10, r10, asr #8 + strb r10, [r11], #1 + strb r14, [r8], r12 + mov r14, r14, asr #8 + strb r14, [r11], r12 + + add r5, r5, #4 @ j = j+4 + cmp r5, r3 @ j<linear_x_size + blt LINEAR_X_SIZE_4_LOOP + + add r6, r6, #2 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + blt LINEAR_Y_SIZE_4_LOOP + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + stmfd sp!, {r14} + + mov r12, r6, asr #5 @ temp2 = i>>5 + mov r11, r5, asr #6 @ temp1 = j>>6 + + and r14, r12, #0x1 @ if (temp2 & 0x1) + cmp r14, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r7, r12, #1 @ tiled_addr = temp2-1 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r7, r14 @ tiled_addr = tiled_addr*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp1 + add 
r7, r7, #2 @ tiled_addr = tiled_addr+2 + bic r14, r11, #0x3 @ temp3 = (temp1>>2)<<2 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r14, r4, #31 @ temp3 = linear_y_size+31 + bic r14, r14, #0x1F @ temp3 = (temp3>>5)<<5 + sub r14, r14, #32 @ temp3 = temp3 - 32 + cmp r6, r14 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r14, r11, #2 @ temp3 = temp1+2 + bic r14, r14, #3 @ temp3 = (temp3>>2)<<2 + add r7, r11, r14 @ tiled_addr = temp1+temp3 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r12, r12, r14 @ tiled_y_index = tiled_y_index*temp3 + add r7, r7, r12 @ tiled_addr = tiled_addr+tiled_y_index + mov r7, r7, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r12, r14 @ tiled_addr = temp2*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + ldmfd sp!, {r15} @ restore registers + .fnend diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s new file mode 100644 index 0000000..d71ee17 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s @@ -0,0 +1,680 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_nv12t_yuv420_y_neon.s + * @brief NEON copy of one NV12T (tiled) plane into a linear YUV420 plane + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts tiled data to linear. + * 1. Y of NV12T to Y of YUV420P + * 2. Y of NV12T to Y of YUV420S + * 3. UV of NV12T to UV of YUV420S + * Copies in vertical bands: one 1024-byte band if the width is at least 1024, then at most one band each of 512, 256, 128 and 64 bytes, then the remaining width 4 bytes at a time. Tiles are addressed as 64 bytes x 32 rows (2048 bytes). + * @param yuv420_dest + * Y or UV plane address of YUV420[out] (r0, linear_dest) + * + * @param nv12t_src + * Y or UV plane address of NV12T[in] (r1, tiled_src) + * + * @param yuv420_width + * Width of YUV420[in] (r2, linear_x_size). NOTE(review): the tail loop copies 4 bytes per step, so a multiple of 4 is presumably expected - confirm with callers. + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] (r3, linear_y_size). NOTE(review): the 128 loop steps 2 rows and the 64 and 4-byte loops step 4 rows per iteration with no tail check, so a multiple of 4 is presumably expected - confirm with callers. + */ + + .arch armv7-a + .text + .global csc_tiled_to_linear + .type csc_tiled_to_linear, %function +csc_tiled_to_linear: + .fnstart + + @r0 linear_dest + @r1 tiled_src + @r2 linear_x_size + @r3 linear_y_size + @r4 j (also tiled_y_index and tile-group load step in the 1024..128 loops) + @r5 i + @r6 tiled_addr + @r7 linear_addr + @r8 aligned_x_size (x offset already copied) + @r9 temp (store step in the 128 loop, row offset in the 4-byte loop) + @r10 temp1 + @r11 temp2 (tiled_addr1 in the 1024..128 loops) + @r12 temp3 (tiled_addr2 in the 1024..256 loops) + @r14 temp4 (tiled_addr3 in the 1024..256 loops) + + stmfd sp!, {r4-r12,r14} @ backup registers + + mov r8, #0 @ aligned_x_size = 0 + cmp r2, #1024 @ width below 1024: skip the 1024-byte band + blt LINEAR_X_SIZE_512 + +LINEAR_X_SIZE_1024: + + mov r5, #0 @ i = 0 +LINEAR_X_SIZE_1024_LOOP: + mov r6, #0 @ tiled_offset = 0@ + mov r4, r5, asr #5 @ tiled_y_index = i>>5@ + and r10, r4, #0x1 @ odd or even tile row? + cmp r10, #0x1 + bne LINEAR_X_SIZE_1024_LOOP_EVEN +LINEAR_X_SIZE_1024_LOOP_ODD: + sub r6, r4, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + mov r4, #8 @ r4 = (8<<11)-32: together with the 32 of the first load it steps 8 tiles + mov r4, r4, lsl #11 + sub r4, r4, #32 + add r6,
r6, #2 @ tiled_offset = tiled_offset+2@ + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r6, #2048 @ tiled_offset1 = tiled_offset+1 tile + add r12, r6, #4096 @ tiled_offset2 = tiled_offset+2 tiles + add r14, r6, #6144 @ tiled_offset3 = tiled_offset+3 tiles + b LINEAR_X_SIZE_1024_LOOP_MEMCPY + +LINEAR_X_SIZE_1024_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + add r12, r5, #32 + cmp r12, r11 @ lt: (i+32)<temp2, i.e. not the last tile row; flags are used by the movlt/addlt/movge/addge below + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r6, #2048 @ tiled_offset1 = tiled_offset+1 tile + movlt r4, #8 + addlt r12, r6, #12288 @ not the last tile row: tiled_offset2 = tiled_offset+6 tiles + addlt r14, r6, #14336 @ tiled_offset3 = tiled_offset+7 tiles + movge r4, #4 + addge r12, r6, #4096 @ last tile row: tiled_offset2 = tiled_offset+2 tiles + addge r14, r6, #6144 @ tiled_offset3 = tiled_offset+3 tiles + mov r4, r4, lsl #11 @ r4 = (8 or 4 tiles)*2048-32 + sub r4, r4, #32 + +LINEAR_X_SIZE_1024_LOOP_MEMCPY: + and r10, r5, #0x1F @ temp1 = i&0x1F (row inside the tile) + mov r10, r10, lsl #6 @ temp1 = temp1*64 + add r10, r1, r10 @ temp1 = tiled_src+temp1 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! @ bytes 0..255 of the row: 64 bytes from each of 4 tiles + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r7, r2, r5 @ linear_addr = linear_x_size*i + vld1.8 {q10, q11}, [r12], r4 + add r7, r7, r0 @ linear_addr = linear_addr+linear_dest + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! @ store bytes 0..255 of the row + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! @ bytes 256..511 of the row + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! @ store bytes 256..511 of the row
+ vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! @ bytes 512..767 of the row + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! @ store bytes 512..767 of the row + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! @ bytes 768..1023 of the row (last group, no step to a further group) + pld [r11, #32] + vld1.8 {q2, q3}, [r6] + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11] + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12] + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14] + + vst1.8 {q0, q1}, [r7]! @ store bytes 768..1023 of the row + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 @ i = i+1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 @ i<linear_y_size + vst1.8 {q14, q15}, [r7]!
+ + blt LINEAR_X_SIZE_1024_LOOP + + mov r8, #1024 @ aligned_x_size = 1024 + +LINEAR_X_SIZE_512: + + sub r14, r2, r8 @ remaining width = linear_x_size-aligned_x_size + cmp r14, #512 + blt LINEAR_X_SIZE_256 + + mov r5, #0 @ i = 0 +LINEAR_X_SIZE_512_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 @ odd or even tile row? + cmp r10, #0x1 + bne LINEAR_X_SIZE_512_LOOP_EVEN + +LINEAR_X_SIZE_512_LOOP_ODD: + sub r6, r4, #1 @ tiled_offset = tiled_y_index-1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + mov r4, #8 @ r4 = (8<<11)-32: together with the 32 of the first load it steps 8 tiles + mov r4, r4, lsl #11 + sub r4, r4, #32 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11 + add r11, r6, #2048 + add r12, r6, #4096 + add r14, r6, #6144 + b LINEAR_X_SIZE_512_LOOP_MEMCPY + +LINEAR_X_SIZE_512_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + add r12, r5, #32 + cmp r12, r11 @ lt: (i+32)<temp2, i.e. not the last tile row + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + movlt r4, #8 + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + addlt r12, r6, #12288 + addlt r14, r6, #14336 + movge r4, #4 + addge r12, r6, #4096 + addge r14, r6, #6144 + mov r4, r4, lsl #11 @ r4 = (8 or 4 tiles)*2048-32 + sub r4, r4, #32 + +LINEAR_X_SIZE_512_LOOP_MEMCPY: + and r10, r5, #0x1F @ temp1 = i&0x1F (row inside the tile) + mov r10, r10, lsl #6 @ temp1 = temp1*64 + add r10, r1, r10 @ temp1 = tiled_src+temp1 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! @ first 256 bytes of the band row + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]!
+ pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r7, r2, r5 @ linear_addr = linear_x_size*i + vld1.8 {q10, q11}, [r12], r4 + add r7, r7, r8 @ linear_addr = linear_addr+aligned_x_size + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + add r7, r7, r0 @ linear_addr = linear_addr+linear_dest + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! @ second 256 bytes of the band row + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 @ i = i+1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 @ i<linear_y_size + vst1.8 {q14, q15}, [r7]!
+ + blt LINEAR_X_SIZE_512_LOOP + + add r8, r8, #512 @ aligned_x_size = aligned_x_size+512 + +LINEAR_X_SIZE_256: + + sub r14, r2, r8 @ remaining width = linear_x_size-aligned_x_size + cmp r14, #256 + blt LINEAR_X_SIZE_128 + + mov r5, #0 @ i = 0 +LINEAR_X_SIZE_256_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 @ odd or even tile row? + cmp r10, #0x1 + bne LINEAR_X_SIZE_256_LOOP_EVEN + +LINEAR_X_SIZE_256_LOOP_ODD: + sub r6, r4, #1 @ tiled_offset = tiled_y_index-1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11 + add r11, r6, #2048 + add r12, r6, #4096 + add r14, r6, #6144 + b LINEAR_X_SIZE_256_LOOP_MEMCPY + +LINEAR_X_SIZE_256_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r5, #32 + cmp r12, r11 @ lt: (i+32)<temp2, i.e. not the last tile row + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + addlt r12, r6, #12288 + addlt r14, r6, #14336 + addge r12, r6, #4096 + addge r14, r6, #6144 + +LINEAR_X_SIZE_256_LOOP_MEMCPY: + and r10, r5, #0x1F @ temp1 = i&0x1F (row inside the tile) + mov r10, r10, lsl #6 @ temp1 = temp1*64 + add r10, r1, r10 @ temp1 = tiled_src+temp1 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! @ 64 bytes from each of 4 tiles + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6] + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11] + pld [r14] + mul r7, r2, r5 @ linear_addr = linear_x_size*i + vld1.8 {q8, q9}, [r12]!
+ pld [r14, #32] + add r7, r7, r8 @ linear_addr = linear_addr+aligned_x_size + vld1.8 {q10, q11}, [r12] + add r7, r7, r0 @ linear_addr = linear_addr+linear_dest + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14] + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 @ i = i+1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 @ i<linear_y_size + vst1.8 {q14, q15}, [r7]! + + blt LINEAR_X_SIZE_256_LOOP + + add r8, r8, #256 @ aligned_x_size = aligned_x_size+256 + +LINEAR_X_SIZE_128: + + sub r14, r2, r8 @ remaining width = linear_x_size-aligned_x_size + cmp r14, #128 + blt LINEAR_X_SIZE_64 + + mov r5, #0 @ i = 0 +LINEAR_X_SIZE_128_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 @ odd or even tile row? + cmp r10, #0x1 + bne LINEAR_X_SIZE_128_LOOP_EVEN + +LINEAR_X_SIZE_128_LOOP_ODD: + sub r6, r4, #1 @ tiled_offset = tiled_y_index-1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11 + add r11, r6, #2048 + b LINEAR_X_SIZE_128_LOOP_MEMCPY + +LINEAR_X_SIZE_128_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r5, #32 + cmp r12, r11 @ lt: (i+32)<temp2, i.e. not the last tile row + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + +LINEAR_X_SIZE_128_LOOP_MEMCPY: + and r10, r5, #0x1F @ temp1 = i&0x1F (row inside the tile) + mov r10, r10, lsl #6 @ temp1 = temp1*64 + add r10, r1, r10 @ temp1 = tiled_src+temp1 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r6, #64] + vld1.8 {q0, q1}, [r6]! @ first tile: q0-q3 = row i, q4-q7 = row i+1 + pld [r6, #64] + vld1.8 {q2, q3}, [r6]!
+ mul r7, r2, r5 @ linear_addr = linear_x_size*i + pld [r11] + vld1.8 {q4, q5}, [r6]! + add r7, r7, r8 @ linear_addr = linear_addr+aligned_x_size + pld [r11, #32] + vld1.8 {q6, q7}, [r6] + add r7, r7, r0 @ linear_addr = linear_addr+linear_dest + pld [r11, #64] + vld1.8 {q8, q9}, [r11]! @ second tile: q8-q11 = row i, q12-q15 = row i+1 + pld [r11, #64] + vld1.8 {q10, q11}, [r11]! + vld1.8 {q12, q13}, [r11]! + vld1.8 {q14, q15}, [r11] + + sub r9, r2, #96 @ step from byte 96 of row i to the start of row i+1 + vst1.8 {q0, q1}, [r7]! @ row i: 64 bytes of the first tile, then 64 bytes of the second + vst1.8 {q2, q3}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7], r9 + vst1.8 {q4, q5}, [r7]! @ row i+1 + vst1.8 {q6, q7}, [r7]! + add r5, #2 @ i = i+2 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 @ i<linear_y_size + vst1.8 {q14, q15}, [r7] + + blt LINEAR_X_SIZE_128_LOOP + + add r8, r8, #128 @ aligned_x_size = aligned_x_size+128 + +LINEAR_X_SIZE_64: + + sub r14, r2, r8 @ remaining width = linear_x_size-aligned_x_size + cmp r14, #64 + blt LINEAR_X_SIZE_4 + + mov r5, #0 @ i = 0 + mov r4, r8 @ j = aligned_x_size + +LINEAR_X_SIZE_64_LOOP: + + bl GET_TILED_OFFSET + + add r6, r1, r6 @ tiled_addr = tiled_src+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + pld [r6, #64] + vld1.8 {q0, q1}, [r6]! @ load {tiled_addr} + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + pld [r6, #64] + vld1.8 {q2, q3}, [r6]! + pld [r6, #64] + vld1.8 {q4, q5}, [r6]! @ load {tiled_addr+64*1} + pld [r6, #64] + vld1.8 {q6, q7}, [r6]! + pld [r6, #64] + vld1.8 {q8, q9}, [r6]! @ load {tiled_addr+64*2} + pld [r6, #64] + vld1.8 {q10, q11}, [r6]! + add r7, r0, r4 @ linear_addr = linear_dest+j + vld1.8 {q12, q13}, [r6]! @ load {tiled_addr+64*3} + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + vld1.8 {q14, q15}, [r6]! + sub r10, r2, #32 @ temp1 = linear_x_size-32 + + vst1.8 {q0, q1}, [r7]! @ store {linear_addr, 64} + vst1.8 {q2, q3}, [r7], r10 + vst1.8 {q4, q5}, [r7]! @ store {linear_addr+linear_x_size*1, 64} + vst1.8 {q6, q7}, [r7], r10 + vst1.8 {q8, q9}, [r7]! @ store {linear_addr+linear_x_size*2, 64} + vst1.8 {q10, q11}, [r7], r10 + add r5, #4 @ i = i+4 + vst1.8 {q12, q13}, [r7]!
@ store {linear_addr+linear_x_size*3, 64} + cmp r5, r3 @ i<linear_y_size + vst1.8 {q14, q15}, [r7], r10 + + blt LINEAR_X_SIZE_64_LOOP + + add r8, r8, #64 @ aligned_x_size = aligned_x_size+64 + +LINEAR_X_SIZE_4: + cmp r8, r2 @ nothing left when aligned_x_size == linear_x_size + beq RESTORE_REG + + mov r5, #0 @ i = 0 +LINEAR_Y_SIZE_4_LOOP: + + mov r4, r8 @ j = aligned_x_size +LINEAR_X_SIZE_4_LOOP: + + bl GET_TILED_OFFSET + + and r10, r5, #0x1F @ temp1 = i&0x1F + and r11, r4, #0x3F @ temp2 = j&0x3F + + add r6, r6, r1 @ tiled_addr = tiled_addr+tiled_src + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + add r6, r6, r10, lsl #6 @ tiled_addr = tiled_addr+64*temp1 + + ldr r10, [r6], #64 @ 4 bytes from each of 4 consecutive tile rows + add r7, r0, r4 @ linear_addr = linear_dest+j + ldr r11, [r6], #64 + mul r9, r2, r5 @ r9 = linear_x_size*i + ldr r12, [r6], #64 + add r7, r7, r9 @ linear_addr = linear_addr+r9 + ldr r14, [r6], #64 + + str r10, [r7], r2 @ store them to 4 consecutive linear rows + str r11, [r7], r2 + str r12, [r7], r2 + str r14, [r7], r2 + + add r4, r4, #4 @ j = j+4 + cmp r4, r2 @ j<linear_x_size + blt LINEAR_X_SIZE_4_LOOP + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LINEAR_Y_SIZE_4_LOOP + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers and return (the saved r14 is popped into pc) + +GET_TILED_OFFSET: @ in: r4 = j, r5 = i, r2, r3; out: r6 = byte offset of the tile; overwrites r10-r12 + + mov r11, r5, asr #5 @ temp2 = i>>5 + mov r10, r4, asr #6 @ temp1 = j>>6 + + and r12, r11, #0x1 @ if (temp2 & 0x1) + cmp r12, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r6, r11, #1 @ tiled_addr = temp2-1 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r6, r12 @ tiled_addr = tiled_addr*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + add r6, r6, #2 @ tiled_addr = tiled_addr+2 + bic r12, r10, #0x3 @ temp3 = (temp1>>2)<<2 + add r6, r6, r12 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r12, r3, #31 @ temp3 = linear_y_size+31 + bic r12, r12, #0x1F @ temp3 = (temp3>>5)<<5 + sub r12, r12, #32 @ temp3 = temp3 - 32 + cmp r5, r12 @ if (i<temp3) { (temp3 already has the 32 subtracted) + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r12, r10, #2 @ temp3 = temp1+2 + bic r12, r12, #3 @ temp3 = (temp3>>2)<<2 + add r6, r10, r12 @ tiled_addr =
temp1+temp3 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r11, r11, r12 @ tiled_y_index = tiled_y_index*temp3 + add r6, r6, r11 @ tiled_addr = tiled_addr+tiled_y_index + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r11, r12 @ tiled_addr = temp2*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + mov pc, lr @ return to the bl call site + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s new file mode 100644 index 0000000..dd2c879 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s @@ -0,0 +1,573 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_yuv420_nv12t_uv_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts and Interleaves linear to tiled + * 1.
UV of YUV420P to UV of NV12T + * + * @param nv12t_uv_dest + * UV plane address of NV12T[out] + * + * @param yuv420p_u_src + * U plane address of YUV420P[in] + * + * @param yuv420p_v_src + * V plane address of YUV420P[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_linear_to_tiled_interleave + .type csc_linear_to_tiled_interleave, %function +csc_linear_to_tiled_interleave: + .fnstart + + @r0 tiled_dest + @r1 linear_src_u + @r2 linear_src_v + @r3 linear_x_size + @r4 linear_y_size + @r5 j + @r6 i + @r7 tiled_addr + @r8 linear_addr + @r9 aligned_x_size + @r10 aligned_y_size + @r11 temp1 + @r12 temp2 + @r14 temp3 + + stmfd sp!, {r4-r12,r14} @ backup registers + + ldr r4, [sp, #40] @ load linear_y_size to r4 + + bic r10, r4, #0x1F @ aligned_y_size = (linear_y_size>>5)<<5 + bic r9, r3, #0x3F @ aligned_x_size = (linear_x_size>>6)<<6 + + mov r6, #0 @ i = 0 +LOOP_ALIGNED_Y_SIZE: + + mov r5, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #16 @ temp2 = linear_x_size-16 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! 
+ vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! 
+ vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! 
+ vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! 
+ + add r5, r5, #64 @ j = j+64 + cmp r5, r9 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE + + add r6, r6, #32 @ i = i+32 + cmp r6, r10 @ i<aligned_y_size + blt LOOP_ALIGNED_Y_SIZE + + ldr r4, [sp, #40] @ load linear_y_size to r4 + cmp r6, r4 + beq LOOP_LINEAR_Y_SIZE_2_START + +LOOP_LINEAR_Y_SIZE_1: + + mov r5, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE_1: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #16 @ temp2 = linear_x_size-16 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! @ store {tiled_addr} + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! @ store {tiled_addr+64*1} + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! @ store {tiled_addr+64*2} + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! @ store {tiled_addr+64*3} + vst2.8 {q14, q15}, [r7]! 
+ + add r5, r5, #64 @ j = j+64 + cmp r5, r9 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE_1 + + add r6, r6, #4 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_1 + +LOOP_LINEAR_Y_SIZE_2_START: + cmp r5, r3 + beq RESTORE_REG + + mov r6, #0 @ i = 0 +LOOP_LINEAR_Y_SIZE_2: + + mov r5, r9 @ j = aligned_x_size +LOOP_LINEAR_X_SIZE_2: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #1 @ temp2 = linear_x_size-1 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + and r14, r5, #0x3F @ temp3 = j&0x3F + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + add r5, r5, #4 @ j = j+4 + cmp r5, r3 @ j<linear_x_size + blt LOOP_LINEAR_X_SIZE_2 + + add r6, r6, #4 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + 
blt LOOP_LINEAR_Y_SIZE_2 + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + stmfd sp!, {r14} + + mov r12, r6, asr #5 @ temp2 = i>>5 + mov r11, r5, asr #6 @ temp1 = j>>6 + + and r14, r12, #0x1 @ if (temp2 & 0x1) + cmp r14, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r7, r12, #1 @ tiled_addr = temp2-1 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r7, r14 @ tiled_addr = tiled_addr*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp1 + add r7, r7, #2 @ tiled_addr = tiled_addr+2 + bic r14, r11, #0x3 @ temp3 = (temp1>>2)<<2 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r14, r4, #31 @ temp3 = linear_y_size+31 + bic r14, r14, #0x1F @ temp3 = (temp3>>5)<<5 + sub r14, r14, #32 @ temp3 = temp3 - 32 + cmp r6, r14 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r14, r11, #2 @ temp3 = temp1+2 + bic r14, r14, #3 @ temp3 = (temp3>>2)<<2 + add r7, r11, r14 @ tiled_addr = temp1+temp3 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r12, r12, r14 @ tiled_y_index = tiled_y_index*temp3 + add r7, r7, r12 @ tiled_addr = tiled_addr+tiled_y_index + mov r7, r7, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r12, r14 @ tiled_addr = temp2*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + ldmfd sp!, {r15} @ restore registers + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s 
b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s new file mode 100644 index 0000000..3f8932a --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s @@ -0,0 +1,451 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_yuv420_nv12t_y_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts linear data to tiled. + * 1. Y of YUV420P to Y of NV12T + * 2. Y of YUV420S to Y of NV12T + * 3. 
UV of YUV420S to UV of NV12T + * + * @param nv12t_dest + * Y or UV plane address of NV12T[out] + * + * @param yuv420_src + * Y or UV plane address of YUV420P(S)[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_linear_to_tiled + .type csc_linear_to_tiled, %function +csc_linear_to_tiled: + .fnstart + + @r0 tiled_dest + @r1 linear_src + @r2 linear_x_size + @r3 linear_y_size + @r4 j + @r5 i + @r6 nn(tiled_addr) + @r7 mm(linear_addr) + @r8 aligned_x_size + @r9 aligned_y_size + @r10 temp1 + @r11 temp2 + @r12 temp3 + @r14 temp4 (r14 is also lr, so every bl GET_TILED_OFFSET overwrites it) + + stmfd sp!, {r4-r12,r14} @ backup registers (r14 holds the return address) + + bic r9, r3, #0x1F @ aligned_y_size = (linear_y_size>>5)<<5 + bic r8, r2, #0x3F @ aligned_x_size = (linear_x_size>>6)<<6 + + mov r5, #0 @ i = 0 +LOOP_ALIGNED_Y_SIZE: + + mov r4, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE: + + bl GET_TILED_OFFSET @ r6 = byte offset of the tile that holds (j, i) + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + sub r10, r2, #32 @ temp1 = linear_x_size-32 + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! 
+ pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! 
+ pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! 
+ pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + add r4, r4, #64 @ j = j+64 + cmp r4, r8 @ j<aligned_x_size (tested after the body, so the body runs once even when aligned_x_size is 0) + blt LOOP_ALIGNED_X_SIZE + + add r5, r5, #32 @ i = i+32 + cmp r5, r9 @ i<aligned_y_size (tested after the body, so the body runs once even when aligned_y_size is 0) + blt LOOP_ALIGNED_Y_SIZE + + cmp r5, r3 @ i == linear_y_size: no rows are left below the 32-row aligned part + beq LOOP_LINEAR_Y_SIZE_2_START + +LOOP_LINEAR_Y_SIZE_1: + + mov r4, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE_1: + + bl GET_TILED_OFFSET @ r6 = byte offset of the tile that holds (j, i) + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + sub r10, r2, #32 @ temp1 = linear_x_size-32 + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F (row inside the tile) + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! 
+ + add r4, r4, #64 @ j = j+64 + cmp r4, r8 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE_1 + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_1 + +LOOP_LINEAR_Y_SIZE_2_START: + cmp r4, r2 @ j (as left by the loops above) == linear_x_size: no columns are left to copy + beq RESTORE_REG + + mov r5, #0 @ i = 0 +LOOP_LINEAR_Y_SIZE_2: + + mov r4, r8 @ j = aligned_x_size +LOOP_LINEAR_X_SIZE_2: + + bl GET_TILED_OFFSET @ r6 = byte offset of the tile that holds (j, i) + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + and r11, r4, #0x3F @ temp2 = j&0x3F + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + ldr r10, [r7], r2 @ load 4 bytes, then advance linear_addr by linear_x_size (next row) + ldr r11, [r7], r2 + ldr r12, [r7], r2 + ldr r14, [r7], r2 + str r10, [r6], #64 @ store 4 bytes, then advance tiled_addr by 64 + str r11, [r6], #64 + str r12, [r6], #64 + str r14, [r6], #64 + + add r4, r4, #4 @ j = j+4 + cmp r4, r2 @ j<linear_x_size + blt LOOP_LINEAR_X_SIZE_2 + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_2 + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers; the saved r14 is popped into pc, which returns to the caller + +GET_TILED_OFFSET: @ in: r2, r3, r4 (j), r5 (i); out: r6 (tiled_addr); overwrites r10, r11, r12 + + mov r11, r5, asr #5 @ temp2 = i>>5 + mov r10, r4, asr #6 @ temp1 = j>>6 + + and r12, r11, #0x1 @ if (temp2 & 0x1) + cmp r12, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r6, r11, #1 @ tiled_addr = temp2-1 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r6, r12 @ tiled_addr = tiled_addr*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + add r6, r6, #2 @ tiled_addr = tiled_addr+2 + bic r12, r10, #0x3 @ temp3 = (temp1>>2)<<2 + add r6, r6, r12 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r12, r3, #31 @ temp3 = linear_y_size+31 + 
bic r12, r12, #0x1F @ temp3 = (temp3>>5)<<5 + sub r12, r12, #32 @ temp3 = temp3 - 32 + cmp r5, r12 @ if (i<temp3) { (the 32 was already subtracted from temp3 above) + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r12, r10, #2 @ temp3 = temp1+2 + bic r12, r12, #3 @ temp3 = (temp3>>2)<<2 + add r6, r10, r12 @ tiled_addr = temp1+temp3 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r11, r11, r12 @ temp2 = temp2*temp3 (temp2 holds i>>5) + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r11, r12 @ tiled_addr = temp2*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + +GET_TILED_OFFSET_RETURN: + mov pc, lr @ return to the instruction after the bl + .fnend + |