diff options
author | SeungBeom Kim <sbcrux.kim@samsung.com> | 2011-07-18 11:18:27 +0900 |
---|---|---|
committer | James Dong <jdong@google.com> | 2011-07-21 17:54:41 -0700 |
commit | 7cf106ba5ff2dac2be536d8c84c715ca87d0a2d9 (patch) | |
tree | 46a83e49213a9869d145da4e001d0083ac5fd8cf /sec_mm | |
parent | d49d3345bc416c94bdf7aa4495631d6fab775ad9 (diff) | |
download | device_samsung_crespo-7cf106ba5ff2dac2be536d8c84c715ca87d0a2d9.zip device_samsung_crespo-7cf106ba5ff2dac2be536d8c84c715ca87d0a2d9.tar.gz device_samsung_crespo-7cf106ba5ff2dac2be536d8c84c715ca87d0a2d9.tar.bz2 |
Add color space convertor in SEC_OMX
NV12T to YUV420P
NV12T to YUV420SP
YUV420P to NV12T
YUV420SP to NV12T
Change-Id: I991777f56a9988c8aa0a1b94bf917ffeb148345f
Signed-off-by: SeungBeom Kim <sbcrux.kim@samsung.com>
Diffstat (limited to 'sec_mm')
17 files changed, 2972 insertions, 197 deletions
diff --git a/sec_mm/sec_omx/sec_codecs/Android.mk b/sec_mm/sec_omx/sec_codecs/Android.mk index a51a075..3c163a4 100644 --- a/sec_mm/sec_omx/sec_codecs/Android.mk +++ b/sec_mm/sec_omx/sec_codecs/Android.mk @@ -4,4 +4,4 @@ include $(CLEAR_VARS) include $(SEC_CODECS)/video/mfc_c110/dec/Android.mk include $(SEC_CODECS)/video/mfc_c110/enc/Android.mk - +include $(SEC_CODECS)/video/mfc_c110/csc/Android.mk
\ No newline at end of file diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk new file mode 100644 index 0000000..fee8529 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/Android.mk @@ -0,0 +1,31 @@ + +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +LOCAL_MODULE_TAGS := optional + +LOCAL_SRC_FILES := \ + csc_yuv420_nv12t_y_neon.s \ + csc_yuv420_nv12t_uv_neon.s \ + csc_nv12t_yuv420_y_neon.s \ + csc_nv12t_yuv420_uv_neon.s \ + csc_interleave_memcpy.s \ + csc_deinterleave_memcpy.s + +LOCAL_MODULE := libseccsc + + + +LOCAL_CFLAGS := + +LOCAL_ARM_MODE := arm + +LOCAL_STATIC_LIBRARIES := + +LOCAL_SHARED_LIBRARIES := liblog + +LOCAL_C_INCLUDES := \ + $(SEC_CODECS)/video/mfc_c110/include + +include $(BUILD_STATIC_LIBRARY) + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_deinterleave_memcpy.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_deinterleave_memcpy.s new file mode 100644 index 0000000..dc95bfa --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_deinterleave_memcpy.s @@ -0,0 +1,128 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_deinterleave_memcpy.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + .arch armv7-a + .text + .global csc_deinterleave_memcpy_neon + .type csc_deinterleave_memcpy_neon, %function +csc_deinterleave_memcpy_neon: + .fnstart + + @r0 dest1 + @r1 dest2 + @r2 src + @r3 src_size + @r4 i + @r5 temp1 + @r6 temp2 + @r7 temp3 + + stmfd sp!, {r4-r12,r14} @ backup registers + + mov r4, #0 + cmp r3, #256 + blt LINEAR_SIZE_128 + + bic r5, r3, #0xFF +LINEAR_SIZE_256_LOOP: + pld [r2, #64] + vld2.8 {q0, q1}, [r2]! + pld [r2, #64] + vld2.8 {q2, q3}, [r2]! + pld [r2, #64] + vld2.8 {q4, q5}, [r2]! + pld [r2, #64] + vld2.8 {q6, q7}, [r2]! + pld [r2, #64] + vld2.8 {q8, q9}, [r2]! + pld [r2, #64] + vld2.8 {q10, q11}, [r2]! + vld2.8 {q12, q13}, [r2]! + vld2.8 {q14, q15}, [r2]! + + vst1.8 {q0}, [r0]! + vst1.8 {q2}, [r0]! + vst1.8 {q4}, [r0]! + vst1.8 {q6}, [r0]! + vst1.8 {q8}, [r0]! + vst1.8 {q10}, [r0]! + vst1.8 {q12}, [r0]! + vst1.8 {q14}, [r0]! + + vst1.8 {q1}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q9}, [r1]! + vst1.8 {q11}, [r1]! + vst1.8 {q13}, [r1]! + vst1.8 {q15}, [r1]! + + add r4, #256 + cmp r4, r5 + blt LINEAR_SIZE_256_LOOP + +LINEAR_SIZE_128: + sub r5, r3, r4 + cmp r5, #64 + blt LINEAR_SIZE_4 + pld [r2, #64] + vld2.8 {q0, q1}, [r2]! + pld [r2, #64] + vld2.8 {q2, q3}, [r2]! + vld2.8 {q4, q5}, [r2]! + vld2.8 {q6, q7}, [r2]! + + vst1.8 {q0}, [r0]! + vst1.8 {q4}, [r0]! + vst1.8 {q2}, [r0]! + vst1.8 {q6}, [r0]! + + vst1.8 {q1}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + vst1.8 {q7}, [r1]! + + add r4, #128 + +LINEAR_SIZE_4: + ldrb r6, [r2], #1 + ldrb r7, [r2], #1 + ldrb r8, [r2], #1 + ldrb r9, [r2], #1 + + strb r6, [r0], #1 + strb r8, [r0], #1 + strb r7, [r1], #1 + strb r9, [r1], #1 + + add r4, #4 + cmp r4, r3 + blt LINEAR_SIZE_4 + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_interleave_memcpy.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_interleave_memcpy.s new file mode 100644 index 0000000..63fb88d --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_interleave_memcpy.s @@ -0,0 +1,133 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_interleave_memcpy.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + .arch armv7-a + .text + .global csc_interleave_memcpy_neon + .type csc_interleave_memcpy_neon, %function +csc_interleave_memcpy_neon: + .fnstart + + @r0 dest + @r1 src1 + @r2 src2 + @r3 src_size + @r4 i + @r5 temp1 + @r6 temp2 + @r7 temp3 + @r8 temp2 + @r9 temp3 + + stmfd sp!, {r4-r12,r14} @ backup registers + + mov r4, #0 + cmp r3, #128 + blt LINEAR_SIZE_64 + + bic r5, r3, #0x2F +LINEAR_SIZE_128_LOOP: + pld [r1, #64] + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r1]! + vld1.8 {q4}, [r1]! + vld1.8 {q6}, [r1]! + pld [r2] + vld1.8 {q8}, [r1]! + vld1.8 {q10}, [r1]! + vld1.8 {q12}, [r1]! + vld1.8 {q14}, [r1]! + pld [r2, #64] + vld1.8 {q1}, [r2]! + vld1.8 {q3}, [r2]! + vld1.8 {q5}, [r2]! + vld1.8 {q7}, [r2]! + vld1.8 {q9}, [r2]! + vld1.8 {q11}, [r2]! + vld1.8 {q13}, [r2]! + vld1.8 {q15}, [r2]! + + vst2.8 {q0, q1}, [r0]! + vst2.8 {q2, q3}, [r0]! + vst2.8 {q4, q5}, [r0]! + vst2.8 {q6, q7}, [r0]! + vst2.8 {q8, q9}, [r0]! + vst2.8 {q10, q11}, [r0]! + pld [r1] + vst2.8 {q12, q13}, [r0]! + vst2.8 {q14, q15}, [r0]! + + add r4, #128 + cmp r4, r5 + blt LINEAR_SIZE_128_LOOP + +LINEAR_SIZE_64: + sub r5, r3, r4 + cmp r5, #64 + blt LINEAR_SIZE_2 +LINEAR_SIZE_64_LOOP: + pld [r2] + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r1]! + vld1.8 {q4}, [r1]! + vld1.8 {q6}, [r1]! + vld1.8 {q1}, [r2]! + vld1.8 {q3}, [r2]! + vld1.8 {q5}, [r2]! + vld1.8 {q7}, [r2]! + + vst2.8 {q0, q1}, [r0]! + vst2.8 {q2, q3}, [r0]! + pld [r1] + vst2.8 {q4, q5}, [r0]! + vst2.8 {q6, q7}, [r0]! + + add r4, #64 + cmp r4, r3 + blt LINEAR_SIZE_64_LOOP + +LINEAR_SIZE_2: + sub r5, r3, r4 + cmp r5, #2 + blt RESTORE_REG +LINEAR_SIZE_2_LOOP: + ldrb r6, [r1], #1 + ldrb r7, [r2], #1 + ldrb r8, [r1], #1 + ldrb r9, [r2], #1 + + strb r6, [r0], #1 + strb r7, [r0], #1 + strb r8, [r0], #1 + strb r9, [r0], #1 + + add r4, #2 + cmp r4, r3 + blt LINEAR_SIZE_2_LOOP + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_uv_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_uv_neon.s new file mode 100644 index 0000000..d1e0d2f --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_uv_neon.s @@ -0,0 +1,768 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_nv12t_yuv420_uv_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts and Deinterleaves tiled data to linear + * 1. UV of NV12T to UV of YUV420P + * + * @param yuv420_u_dest + * U plane address of YUV420P[out] + * + * @param yuv420_v_dest + * V plane address of YUV420P[out] + * + * @param nv12t_src + * UV plane address of NV12T[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_tiled_to_linear_deinterleave_neon + .type csc_tiled_to_linear_deinterleave_neon, %function +csc_tiled_to_linear_deinterleave_neon: + .fnstart + + @r0 linear_u_dest + @r1 linear_v_dest + @r2 tiled_uv_src + @r3 linear_x_size + @r4 linear_y_size + @r5 j + @r6 i + @r7 tiled_addr + @r8 linear_addr + @r9 aligned_x_size + @r10 temp1 + @r11 temp2 + @r12 temp3 + @r14 temp4 + + stmfd sp!, {r4-r12,r14} @ backup registers + + ldr r4, [sp, #40] @ load linear_y_size to r4 + + mov r9, #0 + +LINEAR_X_SIZE_1024: + cmp r3, #1024 + blt LINEAR_X_SIZE_512 + + mov r6, #0 +LINEAR_X_SIZE_1024_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_1024_LOOP_EVEN +LINEAR_X_SIZE_1024_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + mov r5, #8 + mov r5, r5, lsl #11 + sub r5, r5, #32 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + b LINEAR_X_SIZE_1024_LOOP_MEMCPY + +LINEAR_X_SIZE_1024_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + add r12, r6, #32 + cmp r12, r11 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + movlt r5, #8 + addlt r12, r7, #12288 + addlt r14, r7, #14336 + movge r5, #4 + addge r12, r7, #2048 + addge r14, r7, #2048 + mov r5, r5, lsl #11 + sub r5, r5, #32 + +LINEAR_X_SIZE_1024_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + mov r10, r3, asr #1 + vld2.8 {q10, q11}, [r12], r5 + mul r10, r10, r6 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + add r8, r0, r10 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12], r5 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12], r5 + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12] + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14] + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + + blt LINEAR_X_SIZE_1024_LOOP + + mov r9, #1024 + +LINEAR_X_SIZE_512: + sub r10, r3, r9 + cmp r10, #512 + blt LINEAR_X_SIZE_256 + + mov r6, #0 +LINEAR_X_SIZE_512_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_512_LOOP_EVEN +LINEAR_X_SIZE_512_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + mov r5, #8 + mov r5, r5, lsl #11 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + sub r5, r5, #32 + b LINEAR_X_SIZE_512_LOOP_MEMCPY + +LINEAR_X_SIZE_512_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + add r12, r6, #32 + cmp r12, r11 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + movlt r5, #8 + movlt r10, r9, asr #5 + movge r10, r9, asr #6 + add r7, r7, r10, lsl #11 + add r11, r7, #2048 + addlt r12, r7, #12288 + addlt r14, r7, #14336 + movge r5, #4 + addge r12, r7, #4096 + addge r14, r7, #6144 + mov r5, r5, lsl #11 + sub r5, r5, #32 + +LINEAR_X_SIZE_512_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7], r5 + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11], r5 + pld [r14] + mov r10, r3, asr #1 + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r10, r10, r6 + vld2.8 {q10, q11}, [r12], r5 + add r8, r0, r10 + vld2.8 {q12, q13}, [r14]! + add r8, r8, r9, asr #1 + vld2.8 {q14, q15}, [r14], r5 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + add r10, r1, r10 + vst1.8 {q14}, [r8]! + + add r10, r10, r9, asr #1 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + pld [r7] + vst1.8 {q13}, [r10]! + pld [r7, #32] + vst1.8 {q15}, [r10]! + + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + vld2.8 {q10, q11}, [r12] + vld2.8 {q12, q13}, [r14]! + vld2.8 {q14, q15}, [r14] + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + + blt LINEAR_X_SIZE_512_LOOP + + add r9, r9, #512 + +LINEAR_X_SIZE_256: + sub r10, r3, r9 + cmp r10, #256 + blt LINEAR_X_SIZE_128 + + mov r6, #0 +LINEAR_X_SIZE_256_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_256_LOOP_EVEN +LINEAR_X_SIZE_256_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + add r12, r7, #4096 + add r14, r7, #6144 + b LINEAR_X_SIZE_256_LOOP_MEMCPY + +LINEAR_X_SIZE_256_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r6, #32 + cmp r12, r11 + movlt r10, r9, asr #5 + addlt r7, r7, r10, lsl #11 + addlt r11, r7, #2048 + addlt r12, r7, #12288 + addlt r14, r7, #14336 + movge r10, r9, asr #6 + addge r7, r7, r10, lsl #11 + addge r11, r7, #2048 + addge r12, r7, #4096 + addge r14, r7, #6144 + +LINEAR_X_SIZE_256_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld2.8 {q2, q3}, [r7] + pld [r12] + vld2.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld2.8 {q6, q7}, [r11] + pld [r14] + vld2.8 {q8, q9}, [r12]! + pld [r14, #32] + mov r10, r3, asr #1 + vld2.8 {q10, q11}, [r12] + mul r10, r10, r6 + vld2.8 {q12, q13}, [r14]! + add r8, r0, r10 + vld2.8 {q14, q15}, [r14] + + add r8, r8, r9, asr #1 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8]! + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + add r10, r1, r10 + vst1.8 {q14}, [r8]! + + add r10, r10, r9, asr #1 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10]! + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #1 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + blt LINEAR_X_SIZE_256_LOOP + + add r9, r9, #256 + +LINEAR_X_SIZE_128: + sub r10, r3, r9 + cmp r10, #128 + blt LINEAR_X_SIZE_64 + + mov r6, #0 +LINEAR_X_SIZE_128_LOOP: + mov r7, #0 @ tiled_offset = 0@ + mov r5, r6, asr #5 @ tiled_y_index = i>>5@ + and r10, r5, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_128_LOOP_EVEN +LINEAR_X_SIZE_128_LOOP_ODD: + sub r7, r5, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r7, r7, r10 + add r7, r7, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r9, asr #5 + add r7, r7, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r7, #2048 + b LINEAR_X_SIZE_128_LOOP_MEMCPY + +LINEAR_X_SIZE_128_LOOP_EVEN: + add r11, r4, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r3, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r7, r5, r10 + mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r6, #32 + cmp r12, r11 + movlt r10, r9, asr #5 + movge r10, r9, asr #6 + add r7, r7, r10, lsl #11 + add r11, r7, #2048 + +LINEAR_X_SIZE_128_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r10, r10, lsl #6 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld2.8 {q0, q1}, [r7]! + pld [r11, #32] + vld2.8 {q2, q3}, [r7]! + pld [r7] + vld2.8 {q4, q5}, [r11]! + mov r10, r3, asr #1 + pld [r7, #32] + vld2.8 {q6, q7}, [r11]! + mul r10, r10, r6 + pld [r11] + vld2.8 {q8, q9}, [r7]! + add r10, r10, r9, asr #1 + pld [r11, #32] + vld2.8 {q10, q11}, [r7]! + add r8, r0, r10 + vld2.8 {q12, q13}, [r11]! + mov r14, r3, asr #1 + vld2.8 {q14, q15}, [r11]! + + sub r14, r14, #48 + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8]! + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8], r14 + vst1.8 {q8}, [r8]! + vst1.8 {q10}, [r8]! + vst1.8 {q12}, [r8]! + vst1.8 {q14}, [r8]! + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10]! + vst1.8 {q5}, [r10]! + vst1.8 {q7}, [r10], r14 + vst1.8 {q9}, [r10]! + vst1.8 {q11}, [r10]! + add r6, #2 + vst1.8 {q13}, [r10]! + cmp r6, r4 + vst1.8 {q15}, [r10]! + + blt LINEAR_X_SIZE_128_LOOP + + add r9, r9, #128 + +LINEAR_X_SIZE_64: + sub r10, r3, r9 + cmp r10, #64 + blt LINEAR_X_SIZE_4 + + mov r5, r9 + mov r6, #0 + +LINEAR_X_SIZE_64_LOOP: + bl GET_TILED_OFFSET + +LINEAR_X_SIZE_64_LOOP_MEMCPY: + and r10, r6, #0x1F + mov r14, r3, asr #1 + mov r10, r10, lsl #6 + sub r14, r14, #16 + add r10, r2, r10 + + add r7, r7, r10 @ tiled_addr = tiled_src+64*(temp1) + pld [r7, #64] + vld2.8 {q0, q1}, [r7]! + mov r10, r3, asr #1 + pld [r7, #64] + vld2.8 {q2, q3}, [r7]! + mul r10, r10, r6 + vld2.8 {q4, q5}, [r7]! + add r10, r10, r9, asr #1 + vld2.8 {q6, q7}, [r7]! + add r8, r0, r10 + + vst1.8 {q0}, [r8]! + vst1.8 {q2}, [r8], r14 + vst1.8 {q4}, [r8]! + vst1.8 {q6}, [r8], r14 + + add r10, r1, r10 + vst1.8 {q1}, [r10]! + vst1.8 {q3}, [r10], r14 + add r6, #2 + vst1.8 {q5}, [r10]! + cmp r6, r4 + vst1.8 {q7}, [r10], r14 + + blt LINEAR_X_SIZE_64_LOOP + + add r9, r9, #64 + +LINEAR_X_SIZE_4: + cmp r9, r3 + beq RESTORE_REG + + mov r6, #0 @ i = 0 +LINEAR_Y_SIZE_4_LOOP: + + mov r5, r9 @ j = aligned_x_size +LINEAR_X_SIZE_4_LOOP: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #1 @ temp2 = linear_x_size-1 + + add r8, r0, r11 @ linear_addr = linear_dest_u+temp1 + add r11, r1, r11 @ temp1 = linear_dest_v+temp1 + add r7, r2, r7 @ tiled_addr = tiled_src+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + and r14, r5, #0x3F @ temp3 = j&0x3F + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + ldrh r10, [r7], #2 + ldrh r14, [r7], #62 + strb r10, [r8], #1 + mov r10, r10, asr #8 + strb r10, [r11], #1 + strb r14, [r8], r12 + mov r14, r14, asr #8 + strb r14, [r11], r12 + + ldrh r10, [r7], #2 + ldrh r14, [r7], #62 + strb r10, [r8], #1 + mov r10, r10, asr #8 + strb r10, [r11], #1 + strb r14, [r8], r12 + mov r14, r14, asr #8 + strb r14, [r11], r12 + + add r5, r5, #4 @ j = j+4 + cmp r5, r3 @ j<linear_x_size + blt LINEAR_X_SIZE_4_LOOP + + add r6, r6, #2 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + blt LINEAR_Y_SIZE_4_LOOP + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + stmfd sp!, {r14} + + mov r12, r6, asr #5 @ temp2 = i>>5 + mov r11, r5, asr #6 @ temp1 = j>>6 + + and r14, r12, #0x1 @ if (temp2 & 0x1) + cmp r14, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r7, r12, #1 @ tiled_addr = temp2-1 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r7, r14 @ tiled_addr = tiled_addr*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp1 + add r7, r7, #2 @ tiled_addr = tiled_addr+2 + bic r14, r11, #0x3 @ temp3 = (temp1>>2)<<2 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r14, r4, #31 @ temp3 = linear_y_size+31 + bic r14, r14, #0x1F @ temp3 = (temp3>>5)<<5 + sub r14, r14, #32 @ temp3 = temp3 - 32 + cmp r6, r14 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r14, r11, #2 @ temp3 = temp1+2 + bic r14, r14, #3 @ temp3 = (temp3>>2)<<2 + add r7, r11, r14 @ tiled_addr = temp1+temp3 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r12, r12, r14 @ tiled_y_index = tiled_y_index*temp3 + add r7, r7, r12 @ tiled_addr = tiled_addr+tiled_y_index + mov r7, r7, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r12, r14 @ tiled_addr = temp2*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + ldmfd sp!, {r15} @ restore registers + .fnend diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s new file mode 100644 index 0000000..a378579 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_nv12t_yuv420_y_neon.s @@ -0,0 +1,680 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_nv12t_yuv420_y_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts tiled data to linear. + * 1. Y of NV12T to Y of YUV420P + * 2. Y of NV12T to Y of YUV420S + * 3. UV of NV12T to UV of YUV420S + * + * @param yuv420_dest + * Y or UV plane address of YUV420[out] + * + * @param nv12t_src + * Y or UV plane address of NV12T[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_tiled_to_linear_neon + .type csc_tiled_to_linear_neon, %function +csc_tiled_to_linear_neon: + .fnstart + + @r0 linear_dest + @r1 tiled_src + @r2 linear_x_size + @r3 linear_y_size + @r4 j + @r5 i + @r6 tiled_addr + @r7 linear_addr + @r8 aligned_x_size + @r9 aligned_y_size + @r10 temp1 + @r11 temp2 + @r12 temp3 + @r14 temp4 + + stmfd sp!, {r4-r12,r14} @ backup registers + + mov r8, #0 + cmp r2, #1024 + blt LINEAR_X_SIZE_512 + +LINEAR_X_SIZE_1024: + + mov r5, #0 +LINEAR_X_SIZE_1024_LOOP: + mov r6, #0 @ tiled_offset = 0@ + mov r4, r5, asr #5 @ tiled_y_index = i>>5@ + and r10, r4, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_1024_LOOP_EVEN +LINEAR_X_SIZE_1024_LOOP_ODD: + sub r6, r4, #1 @ tiled_offset = tiled_y_index-1@ + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + mov r4, #8 + mov r4, r4, lsl #11 + sub r4, r4, #32 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r6, #2048 + add r12, r6, #4096 + add r14, r6, #6144 + b LINEAR_X_SIZE_1024_LOOP_MEMCPY + +LINEAR_X_SIZE_1024_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + add r12, r5, #32 + cmp r12, r11 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r11, r6, #2048 + movlt r4, #8 + addlt r12, r6, #12288 + addlt r14, r6, #14336 + movge r4, #4 + addge r12, r6, #4096 + addge r14, r6, #6144 + mov r4, r4, lsl #11 + sub r4, r4, #32 + +LINEAR_X_SIZE_1024_LOOP_MEMCPY: + and r10, r5, #0x1F + mov r10, r10, lsl #6 + add r10, r1, r10 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r7, r2, r5 + vld1.8 {q10, q11}, [r12], r4 + add r7, r7, r0 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + vld1.8 {q2, q3}, [r6] + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11] + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12] + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14] + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 + vst1.8 {q14, q15}, [r7]! + + blt LINEAR_X_SIZE_1024_LOOP + + mov r8, #1024 + +LINEAR_X_SIZE_512: + + sub r14, r2, r8 + cmp r14, #512 + blt LINEAR_X_SIZE_256 + + mov r5, #0 +LINEAR_X_SIZE_512_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_512_LOOP_EVEN + +LINEAR_X_SIZE_512_LOOP_ODD: + sub r6, r4, #1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + mov r4, #8 + mov r4, r4, lsl #11 + sub r4, r4, #32 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 + add r11, r6, #2048 + add r12, r6, #4096 + add r14, r6, #6144 + b LINEAR_X_SIZE_512_LOOP_MEMCPY + +LINEAR_X_SIZE_512_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + add r12, r5, #32 + cmp r12, r11 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + movlt r4, #8 + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + addlt r12, r6, #12288 + addlt r14, r6, #14336 + movge r4, #4 + addge r12, r6, #4096 + addge r14, r6, #6144 + mov r4, r4, lsl #11 + sub r4, r4, #32 + +LINEAR_X_SIZE_512_LOOP_MEMCPY: + and r10, r5, #0x1F + mov r10, r10, lsl #6 + add r10, r1, r10 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + mul r7, r2, r5 + vld1.8 {q10, q11}, [r12], r4 + add r7, r7, r8 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + add r7, r7, r0 + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + pld [r6] + vst1.8 {q12, q13}, [r7]! + pld [r6, #32] + vst1.8 {q14, q15}, [r7]! + + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + vld1.8 {q2, q3}, [r6], r4 + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + vld1.8 {q6, q7}, [r11], r4 + pld [r14] + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + vld1.8 {q10, q11}, [r12], r4 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14], r4 + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 + vst1.8 {q14, q15}, [r7]! + + blt LINEAR_X_SIZE_512_LOOP + + add r8, r8, #512 + +LINEAR_X_SIZE_256: + + sub r14, r2, r8 + cmp r14, #256 + blt LINEAR_X_SIZE_128 + + mov r5, #0 +LINEAR_X_SIZE_256_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_256_LOOP_EVEN + +LINEAR_X_SIZE_256_LOOP_ODD: + sub r6, r4, #1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 + add r11, r6, #2048 + add r12, r6, #4096 + add r14, r6, #6144 + b LINEAR_X_SIZE_256_LOOP_MEMCPY + +LINEAR_X_SIZE_256_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r5, #32 + cmp r12, r11 + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + addlt r12, r6, #12288 + addlt r14, r6, #14336 + addge r12, r6, #4096 + addge r14, r6, #6144 + +LINEAR_X_SIZE_256_LOOP_MEMCPY: + and r10, r5, #0x1F + mov r10, r10, lsl #6 + add r10, r1, r10 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r11] + vld1.8 {q0, q1}, [r6]! + pld [r11, #32] + add r12, r12, r10 @ tiled_addr2 = tiled_src+64*(temp1) + vld1.8 {q2, q3}, [r6] + pld [r12] + vld1.8 {q4, q5}, [r11]! + pld [r12, #32] + add r14, r14, r10 @ tiled_addr3 = tiled_src+64*(temp1) + vld1.8 {q6, q7}, [r11] + pld [r14] + mul r7, r2, r5 + vld1.8 {q8, q9}, [r12]! + pld [r14, #32] + add r7, r7, r8 + vld1.8 {q10, q11}, [r12] + add r7, r7, r0 + vld1.8 {q12, q13}, [r14]! + vld1.8 {q14, q15}, [r14] + + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7]! + add r5, #1 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 + vst1.8 {q14, q15}, [r7]! + + blt LINEAR_X_SIZE_256_LOOP + + add r8, r8, #256 + +LINEAR_X_SIZE_128: + + sub r14, r2, r8 + cmp r14, #128 + blt LINEAR_X_SIZE_64 + + mov r5, #0 +LINEAR_X_SIZE_128_LOOP: + mov r6, #0 + mov r4, r5, asr #5 @ tiled_y_index = i>>5 + and r10, r4, #0x1 + cmp r10, #0x1 + bne LINEAR_X_SIZE_128_LOOP_EVEN + +LINEAR_X_SIZE_128_LOOP_ODD: + sub r6, r4, #1 + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)@ + mul r6, r6, r10 + add r6, r6, #2 @ tiled_offset = tiled_offset+2@ + mov r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + add r6, r6, r10 @ tiled_offset = tiled_offset+temp1@ + mov r6, r6, lsl #11 + add r11, r6, #2048 + b LINEAR_X_SIZE_128_LOOP_MEMCPY + +LINEAR_X_SIZE_128_LOOP_EVEN: + add r11, r3, #31 @ temp2 = ((linear_y_size+31)>>5)<<5@ + bic r11, r11, #0x1F + add r10, r2, #127 @ temp1 = ((linear_x_size+127)>>7)<<7@ + bic r10, #0x7F + mov r10, r10, asr #6 @ tiled_offset = tiled_y_index*(temp1>>6)@ + mul r6, r4, r10 + mov r6, r6, lsl #11 @ tiled_offset = tiled_offset<<11@ + add r12, r5, #32 + cmp r12, r11 + movlt r10, r8, asr #5 @ temp1 = aligned_x_size>>5@ + movge r10, r8, asr #6 @ temp1 = aligned_x_size>>6@ + add r6, r6, r10, lsl #11 @ tiled_offset = tiled_offset+(temp1<<11)@ + add r11, r6, #2048 + +LINEAR_X_SIZE_128_LOOP_MEMCPY: + and r10, r5, #0x1F + mov r10, r10, lsl #6 + add r10, r1, r10 + + add r6, r6, r10 @ tiled_addr = tiled_src+64*(temp1) + add r11, r11, r10 @ tiled_addr1 = tiled_src+64*(temp1) + pld [r6, #64] + vld1.8 {q0, q1}, [r6]! + pld [r6, #64] + vld1.8 {q2, q3}, [r6]! + mul r7, r2, r5 + pld [r11] + vld1.8 {q4, q5}, [r6]! + add r7, r7, r8 + pld [r11, #32] + vld1.8 {q6, q7}, [r6] + add r7, r7, r0 + pld [r11, #64] + vld1.8 {q8, q9}, [r11]! + pld [r11, #64] + vld1.8 {q10, q11}, [r11]! + vld1.8 {q12, q13}, [r11]! + vld1.8 {q14, q15}, [r11] + + sub r9, r2, #96 + vst1.8 {q0, q1}, [r7]! + vst1.8 {q2, q3}, [r7]! + vst1.8 {q8, q9}, [r7]! + vst1.8 {q10, q11}, [r7], r9 + vst1.8 {q4, q5}, [r7]! + vst1.8 {q6, q7}, [r7]! + add r5, #2 + vst1.8 {q12, q13}, [r7]! + cmp r5, r3 + vst1.8 {q14, q15}, [r7] + + blt LINEAR_X_SIZE_128_LOOP + + add r8, r8, #128 + +LINEAR_X_SIZE_64: + + sub r14, r2, r8 + cmp r14, #64 + blt LINEAR_X_SIZE_4 + + mov r5, #0 + mov r4, r8 + +LINEAR_X_SIZE_64_LOOP: + + bl GET_TILED_OFFSET + + add r6, r1, r6 @ tiled_addr = tiled_src+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + pld [r6, #64] + vld1.8 {q0, q1}, [r6]! @ store {tiled_addr} + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + pld [r6, #64] + vld1.8 {q2, q3}, [r6]! + pld [r6, #64] + vld1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*1} + pld [r6, #64] + vld1.8 {q6, q7}, [r6]! + pld [r6, #64] + vld1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*2} + pld [r6, #64] + vld1.8 {q10, q11}, [r6]! + add r7, r0, r4 @ linear_addr = linear_dest+j + vld1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*3} + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + vld1.8 {q14, q15}, [r6]! + sub r10, r2, #32 @ temp1 = linear_x_size-32 + + vst1.8 {q0, q1}, [r7]! @ load {linear_src, 64} + vst1.8 {q2, q3}, [r7], r10 + vst1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*1, 64} + vst1.8 {q6, q7}, [r7], r10 + vst1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*2, 64} + vst1.8 {q10, q11}, [r7], r10 + add r5, #4 + vst1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*3, 64} + cmp r5, r3 + vst1.8 {q14, q15}, [r7], r10 + + blt LINEAR_X_SIZE_64_LOOP + + add r8, r8, #64 + +LINEAR_X_SIZE_4: + cmp r8, r2 + beq RESTORE_REG + + mov r5, #0 @ i = 0 +LINEAR_Y_SIZE_4_LOOP: + + mov r4, r8 @ j = aligned_x_size +LINEAR_X_SIZE_4_LOOP: + + bl GET_TILED_OFFSET + + and r10, r5, #0x1F @ temp1 = i&0x1F + and r11, r4, #0x3F @ temp2 = j&0x3F + + add r6, r6, r1 + add r6, r6, r11 + add r6, r6, r10, lsl #6 + + ldr r10, [r6], #64 + add r7, r0, r4 + ldr r11, [r6], #64 + mul r9, r2, r5 + ldr r12, [r6], #64 + add r7, r7, r9 + ldr r14, [r6], #64 + + str r10, [r7], r2 + str r11, [r7], r2 + str r12, [r7], r2 + str r14, [r7], r2 + + add r4, r4, #4 @ j = j+4 + cmp r4, r2 @ j<linear_x_size + blt LINEAR_X_SIZE_4_LOOP + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LINEAR_Y_SIZE_4_LOOP + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + + mov r11, r5, asr #5 @ temp2 = i>>5 + mov r10, r4, asr #6 @ temp1 = j>>6 + + and r12, r11, #0x1 @ if (temp2 & 0x1) + cmp r12, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r6, r11, #1 @ tiled_addr = temp2-1 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r6, r12 @ tiled_addr = tiled_addr*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + add r6, r6, #2 @ tiled_addr = tiled_addr+2 + bic r12, r10, #0x3 @ temp3 = (temp1>>2)<<2 + add r6, r6, r12 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r12, r3, #31 @ temp3 = linear_y_size+31 + bic r12, r12, #0x1F @ temp3 = (temp3>>5)<<5 + sub r12, r12, #32 @ temp3 = temp3 - 32 + cmp r5, r12 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r12, r10, #2 @ temp3 = temp1+2 + bic r12, r12, #3 @ temp3 = (temp3>>2)<<2 + add r6, r10, r12 @ tiled_addr = temp1+temp3 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r11, r11, r12 @ tiled_y_index = tiled_y_index*temp3 + add r6, r6, r11 @ tiled_addr = tiled_addr+tiled_y_index + mov r6, r6, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r11, r12 @ tiled_addr = temp2*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + mov pc, lr + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s new file mode 100644 index 0000000..114e9bb --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_uv_neon.s @@ -0,0 +1,573 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_yuv420_nv12t_uv_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts and Interleaves linear to tiled + * 1. UV of YUV420P to UV of NV12T + * + * @param nv12t_uv_dest + * UV plane address of NV12T[out] + * + * @param yuv420p_u_src + * U plane address of YUV420P[in] + * + * @param yuv420p_v_src + * V plane address of YUV420P[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_linear_to_tiled_interleave_neon + .type csc_linear_to_tiled_interleave_neon, %function +csc_linear_to_tiled_interleave_neon: + .fnstart + + @r0 tiled_dest + @r1 linear_src_u + @r2 linear_src_v + @r3 linear_x_size + @r4 linear_y_size + @r5 j + @r6 i + @r7 tiled_addr + @r8 linear_addr + @r9 aligned_x_size + @r10 aligned_y_size + @r11 temp1 + @r12 temp2 + @r14 temp3 + + stmfd sp!, {r4-r12,r14} @ backup registers + + ldr r4, [sp, #40] @ load linear_y_size to r4 + + bic r10, r4, #0x1F @ aligned_y_size = (linear_y_size>>5)<<5 + bic r9, r3, #0x3F @ aligned_x_size = (linear_x_size>>6)<<6 + + mov r6, #0 @ i = 0 +LOOP_ALIGNED_Y_SIZE: + + mov r5, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #16 @ temp2 = linear_x_size-16 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! + vst2.8 {q14, q15}, [r7]! + + add r5, r5, #64 @ j = j+64 + cmp r5, r9 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE + + add r6, r6, #32 @ i = i+32 + cmp r6, r10 @ i<aligned_y_size + blt LOOP_ALIGNED_Y_SIZE + + ldr r4, [sp, #40] @ load linear_y_size to r4 + cmp r6, r4 + beq LOOP_LINEAR_Y_SIZE_2_START + +LOOP_LINEAR_Y_SIZE_1: + + mov r5, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE_1: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #16 @ temp2 = linear_x_size-16 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + pld [r8, r3] + vld1.8 {q0}, [r8]! + vld1.8 {q2}, [r8], r12 + pld [r11, r3] + vld1.8 {q1}, [r11]! + vld1.8 {q3}, [r11], r12 + pld [r8, r3] + vld1.8 {q4}, [r8]! + vld1.8 {q6}, [r8], r12 + pld [r11, r3] + vld1.8 {q5}, [r11]! + vld1.8 {q7}, [r11], r12 + pld [r8, r3] + vld1.8 {q8}, [r8]! + vld1.8 {q10}, [r8], r12 + pld [r11, r3] + vld1.8 {q9}, [r11]! + vld1.8 {q11}, [r11], r12 + pld [r8, r3] + vld1.8 {q12}, [r8]! + vld1.8 {q14}, [r8], r12 + pld [r11, r3] + vld1.8 {q13}, [r11]! + vld1.8 {q15}, [r11], r12 + + vst2.8 {q0, q1}, [r7]! @ store {tiled_addr} + vst2.8 {q2, q3}, [r7]! + vst2.8 {q4, q5}, [r7]! @ store {tiled_addr+64*1} + vst2.8 {q6, q7}, [r7]! + vst2.8 {q8, q9}, [r7]! @ store {tiled_addr+64*2} + vst2.8 {q10, q11}, [r7]! + vst2.8 {q12, q13}, [r7]! @ store {tiled_addr+64*3} + vst2.8 {q14, q15}, [r7]! + + add r5, r5, #64 @ j = j+64 + cmp r5, r9 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE_1 + + add r6, r6, #4 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_1 + +LOOP_LINEAR_Y_SIZE_2_START: + cmp r5, r3 + beq RESTORE_REG + + mov r6, #0 @ i = 0 +LOOP_LINEAR_Y_SIZE_2: + + mov r5, r9 @ j = aligned_x_size +LOOP_LINEAR_X_SIZE_2: + + bl GET_TILED_OFFSET + + mov r11, r3, asr #1 @ temp1 = linear_x_size/2 + mul r11, r11, r6 @ temp1 = temp1*(i) + add r11, r11, r5, asr #1 @ temp1 = temp1+j/2 + mov r12, r3, asr #1 @ temp2 = linear_x_size/2 + sub r12, r12, #1 @ temp2 = linear_x_size-1 + + add r8, r1, r11 @ linear_addr = linear_src_u+temp1 + add r11, r2, r11 @ temp1 = linear_src_v+temp1 + add r7, r0, r7 @ tiled_addr = tiled_dest+tiled_addr + and r14, r6, #0x1F @ temp3 = i&0x1F@ + mov r14, r14, lsl #6 @ temp3 = temp3*64 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + and r14, r5, #0x3F @ temp3 = j&0x3F + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + ldrb r10, [r8], #1 + ldrb r14, [r11], #1 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #2 + ldrb r10, [r8], r12 + ldrb r14, [r11], r12 + mov r14, r14, lsl #8 + orr r10, r10, r14 + strh r10, [r7], #62 + + add r5, r5, #4 @ j = j+4 + cmp r5, r3 @ j<linear_x_size + blt LOOP_LINEAR_X_SIZE_2 + + add r6, r6, #4 @ i = i+4 + cmp r6, r4 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_2 + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + stmfd sp!, {r14} + + mov r12, r6, asr #5 @ temp2 = i>>5 + mov r11, r5, asr #6 @ temp1 = j>>6 + + and r14, r12, #0x1 @ if (temp2 & 0x1) + cmp r14, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r7, r12, #1 @ tiled_addr = temp2-1 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r7, r14 @ tiled_addr = tiled_addr*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp1 + add r7, r7, #2 @ tiled_addr = tiled_addr+2 + bic r14, r11, #0x3 @ temp3 = (temp1>>2)<<2 + add r7, r7, r14 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r14, r4, #31 @ temp3 = linear_y_size+31 + bic r14, r14, #0x1F @ temp3 = (temp3>>5)<<5 + sub r14, r14, #32 @ temp3 = temp3 - 32 + cmp r6, r14 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r14, r11, #2 @ temp3 = temp1+2 + bic r14, r14, #3 @ temp3 = (temp3>>2)<<2 + add r7, r11, r14 @ tiled_addr = temp1+temp3 + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r12, r12, r14 @ tiled_y_index = tiled_y_index*temp3 + add r7, r7, r12 @ tiled_addr = tiled_addr+tiled_y_index + mov r7, r7, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r14, r3, #127 @ temp3 = linear_x_size+127 + bic r14, r14, #0x7F @ temp3 = (temp3>>7)<<7 + mov r14, r14, asr #6 @ temp3 = temp3>>6 + mul r7, r12, r14 @ tiled_addr = temp2*temp3 + add r7, r7, r11 @ tiled_addr = tiled_addr+temp3 + mov r7, r7, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + ldmfd sp!, {r15} @ restore registers + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s new file mode 100644 index 0000000..fb69a02 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/csc/csc_yuv420_nv12t_y_neon.s @@ -0,0 +1,451 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file csc_yuv420_nv12t_y_neon.s + * @brief SEC_OMX specific define + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +/* + * Converts linear data to tiled. + * 1. Y of YUV420P to Y of NV12T + * 2. Y of YUV420S to Y of NV12T + * 3. UV of YUV420S to UV of NV12T + * + * @param nv12t_dest + * Y or UV plane address of NV12T[out] + * + * @param yuv420_src + * Y or UV plane address of YUV420P(S)[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] + */ + + .arch armv7-a + .text + .global csc_linear_to_tiled_neon + .type csc_linear_to_tiled_neon, %function +csc_linear_to_tiled_neon: + .fnstart + + @r0 tiled_dest + @r1 linear_src + @r2 linear_x_size + @r3 linear_y_size + @r4 j + @r5 i + @r6 nn(tiled_addr) + @r7 mm(linear_addr) + @r8 aligned_x_size + @r9 aligned_y_size + @r10 temp1 + @r11 temp2 + @r12 temp3 + @r14 temp4 + + stmfd sp!, {r4-r12,r14} @ backup registers + + bic r9, r3, #0x1F @ aligned_y_size = (linear_y_size>>5)<<5 + bic r8, r2, #0x3F @ aligned_x_size = (linear_x_size>>6)<<6 + + mov r5, #0 @ i = 0 +LOOP_ALIGNED_Y_SIZE: + + mov r4, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE: + + bl GET_TILED_OFFSET + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + sub r10, r2, #32 + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + add r4, r4, #64 @ j = j+64 + cmp r4, r8 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE + + add r5, r5, #32 @ i = i+32 + cmp r5, r9 @ i<aligned_y_size + blt LOOP_ALIGNED_Y_SIZE + + cmp r5, r3 + beq LOOP_LINEAR_Y_SIZE_2_START + +LOOP_LINEAR_Y_SIZE_1: + + mov r4, #0 @ j = 0 +LOOP_ALIGNED_X_SIZE_1: + + bl GET_TILED_OFFSET + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + sub r10, r2, #32 @ temp1 = linear_x_size-32 + + pld [r7, r2, lsl #1] + vld1.8 {q0, q1}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q2, q3}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q4, q5}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q6, q7}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q8, q9}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q10, q11}, [r7], r10 + pld [r7, r2, lsl #1] + vld1.8 {q12, q13}, [r7]! + pld [r7, r2, lsl #1] + vld1.8 {q14, q15}, [r7], r10 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + vst1.8 {q0, q1}, [r6]! + vst1.8 {q2, q3}, [r6]! + vst1.8 {q4, q5}, [r6]! + vst1.8 {q6, q7}, [r6]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q14, q15}, [r6]! + + add r4, r4, #64 @ j = j+64 + cmp r4, r8 @ j<aligned_x_size + blt LOOP_ALIGNED_X_SIZE_1 + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_1 + +LOOP_LINEAR_Y_SIZE_2_START: + cmp r4, r2 + beq RESTORE_REG + + mov r5, #0 @ i = 0 +LOOP_LINEAR_Y_SIZE_2: + + mov r4, r8 @ j = aligned_x_size +LOOP_LINEAR_X_SIZE_2: + + bl GET_TILED_OFFSET + + mul r10, r2, r5 @ temp1 = linear_x_size*(i) + add r7, r1, r4 @ linear_addr = linear_src+j + add r7, r7, r10 @ linear_addr = linear_addr+temp1 + + add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr + and r11, r5, #0x1F @ temp2 = i&0x1F + mov r11, r11, lsl #6 @ temp2 = 64*temp2 + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + and r11, r4, #0x3F @ temp2 = j&0x3F + add r6, r6, r11 @ tiled_addr = tiled_addr+temp2 + + ldr r10, [r7], r2 + ldr r11, [r7], r2 + ldr r12, [r7], r2 + ldr r14, [r7], r2 + str r10, [r6], #64 + str r11, [r6], #64 + str r12, [r6], #64 + str r14, [r6], #64 + + add r4, r4, #4 @ j = j+4 + cmp r4, r2 @ j<linear_x_size + blt LOOP_LINEAR_X_SIZE_2 + + add r5, r5, #4 @ i = i+4 + cmp r5, r3 @ i<linear_y_size + blt LOOP_LINEAR_Y_SIZE_2 + +RESTORE_REG: + ldmfd sp!, {r4-r12,r15} @ restore registers + +GET_TILED_OFFSET: + + mov r11, r5, asr #5 @ temp2 = i>>5 + mov r10, r4, asr #6 @ temp1 = j>>6 + + and r12, r11, #0x1 @ if (temp2 & 0x1) + cmp r12, #0x1 + bne GET_TILED_OFFSET_EVEN_FORMULA_1 + +GET_TILED_OFFSET_ODD_FORMULA: + sub r6, r11, #1 @ tiled_addr = temp2-1 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3 >>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r6, r12 @ tiled_addr = tiled_addr*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp1 + add r6, r6, #2 @ tiled_addr = tiled_addr+2 + bic r12, r10, #0x3 @ temp3 = (temp1>>2)<<2 + add r6, r6, r12 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_1: + add r12, r3, #31 @ temp3 = linear_y_size+31 + bic r12, r12, #0x1F @ temp3 = (temp3>>5)<<5 + sub r12, r12, #32 @ temp3 = temp3 - 32 + cmp r5, r12 @ if (i<(temp3-32)) { + bge GET_TILED_OFFSET_EVEN_FORMULA_2 + add r12, r10, #2 @ temp3 = temp1+2 + bic r12, r12, #3 @ temp3 = (temp3>>2)<<2 + add r6, r10, r12 @ tiled_addr = temp1+temp3 + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r11, r11, r12 @ tiled_y_index = tiled_y_index*temp3 + add r6, r6, r11 @ tiled_addr = tiled_addr+tiled_y_index + mov r6, r6, lsl #11 @ + b GET_TILED_OFFSET_RETURN + +GET_TILED_OFFSET_EVEN_FORMULA_2: + add r12, r2, #127 @ temp3 = linear_x_size+127 + bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 + mov r12, r12, asr #6 @ temp3 = temp3>>6 + mul r6, r11, r12 @ tiled_addr = temp2*temp3 + add r6, r6, r10 @ tiled_addr = tiled_addr+temp3 + mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11@ + +GET_TILED_OFFSET_RETURN: + mov pc, lr + .fnend + diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/dec/src/SsbSipMfcDecAPI.c b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/dec/src/SsbSipMfcDecAPI.c index 508f290..19b63b0 100644 --- a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/dec/src/SsbSipMfcDecAPI.c +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/dec/src/SsbSipMfcDecAPI.c @@ -503,173 +503,3 @@ SSBSIP_MFC_ERROR_CODE SsbSipMfcDecGetConfig(void *openHandle, SSBSIP_MFC_DEC_CON return MFC_RET_OK; } - -int tile_4x2_read(int x_size, int y_size, int x_pos, int y_pos) -{ - int pixel_x_m1, pixel_y_m1; - int roundup_x, roundup_y; - int linear_addr0, linear_addr1, bank_addr ; - int x_addr; - int trans_addr; - - pixel_x_m1 = x_size -1; - pixel_y_m1 = y_size -1; - - roundup_x = ((pixel_x_m1 >> 7) + 1); - roundup_y = ((pixel_x_m1 >> 6) + 1); - - x_addr = x_pos >> 2; - - if ((y_size <= y_pos+32) && ( y_pos < y_size) && - (((pixel_y_m1 >> 5) & 0x1) == 0) && (((y_pos >> 5) & 0x1) == 0)) { - linear_addr0 = (((y_pos & 0x1f) <<4) | (x_addr & 0xf)); - linear_addr1 = (((y_pos >> 6) & 0xff) * roundup_x + ((x_addr >> 6) & 0x3f)); - - if (((x_addr >> 5) & 0x1) == ((y_pos >> 5) & 0x1)) - bank_addr = ((x_addr >> 4) & 0x1); - else - bank_addr = 0x2 | ((x_addr >> 4) & 0x1); - } else { - linear_addr0 = (((y_pos & 0x1f) << 4) | (x_addr & 0xf)); - linear_addr1 = (((y_pos >> 6) & 0xff) * roundup_x + ((x_addr >> 5) & 0x7f)); - - if (((x_addr >> 5) & 0x1) == ((y_pos >> 5) & 0x1)) - bank_addr = ((x_addr >> 4) & 0x1); - else - bank_addr = 0x2 | ((x_addr >> 4) & 0x1); - } - - linear_addr0 = linear_addr0 << 2; - trans_addr = (linear_addr1 <<13) | (bank_addr << 11) | linear_addr0; - - return trans_addr; -} - -void Y_tile_to_linear_4x2(unsigned char *p_linear_addr, unsigned char *p_tiled_addr, unsigned int x_size, unsigned int y_size) -{ - int trans_addr; - unsigned int i, j, k, index; - unsigned char data8[4]; - unsigned int max_index = x_size * y_size; - - for (i = 0; i < y_size; i = i + 16) { - for (j = 0; j < x_size; j = j + 16) { - trans_addr = tile_4x2_read(x_size, y_size, j, i); - for (k = 0; k < 16; k++) { - /* limit check - prohibit segmentation fault */ - index = (i * x_size) + (x_size * k) + j; - /* remove equal condition to solve thumbnail bug */ - if (index + 16 > max_index) { - continue; - } - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 0]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 1]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 2]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 3]; - - p_linear_addr[index] = data8[0]; - p_linear_addr[index + 1] = data8[1]; - p_linear_addr[index + 2] = data8[2]; - p_linear_addr[index + 3] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 4]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 5]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 6]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 7]; - - p_linear_addr[index + 4] = data8[0]; - p_linear_addr[index + 5] = data8[1]; - p_linear_addr[index + 6] = data8[2]; - p_linear_addr[index + 7] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 8]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 9]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 10]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 11]; - - p_linear_addr[index + 8] = data8[0]; - p_linear_addr[index + 9] = data8[1]; - p_linear_addr[index + 10] = data8[2]; - p_linear_addr[index + 11] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 12]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 13]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 14]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 15]; - - p_linear_addr[index + 12] = data8[0]; - p_linear_addr[index + 13] = data8[1]; - p_linear_addr[index + 14] = data8[2]; - p_linear_addr[index + 15] = data8[3]; - } - } - } -} - -void CbCr_tile_to_linear_4x2(unsigned char *p_linear_addr, unsigned char *p_tiled_addr, unsigned int x_size, unsigned int y_size) -{ - int trans_addr; - unsigned int i, j, k, index; - unsigned char data8[4]; - unsigned int half_y_size = y_size / 2; - unsigned int max_index = x_size * half_y_size; - unsigned char *pUVAddr[2]; - - pUVAddr[0] = p_linear_addr; - pUVAddr[1] = p_linear_addr + ((x_size * half_y_size) / 2); - - for (i = 0; i < half_y_size; i = i + 16) { - for (j = 0; j < x_size; j = j + 16) { - trans_addr = tile_4x2_read(x_size, half_y_size, j, i); - for (k = 0; k < 16; k++) { - /* limit check - prohibit segmentation fault */ - index = (i * x_size) + (x_size * k) + j; - /* remove equal condition to solve thumbnail bug */ - if (index + 16 > max_index) { - continue; - } - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 0]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 1]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 2]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 3]; - - pUVAddr[index%2][index/2] = data8[0]; - pUVAddr[(index+1)%2][(index+1)/2] = data8[1]; - pUVAddr[(index+2)%2][(index+2)/2] = data8[2]; - pUVAddr[(index+3)%2][(index+3)/2] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 4]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 5]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 6]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 7]; - - pUVAddr[(index+4)%2][(index+4)/2] = data8[0]; - pUVAddr[(index+5)%2][(index+5)/2] = data8[1]; - pUVAddr[(index+6)%2][(index+6)/2] = data8[2]; - pUVAddr[(index+7)%2][(index+7)/2] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 8]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 9]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 10]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 11]; - - pUVAddr[(index+8)%2][(index+8)/2] = data8[0]; - pUVAddr[(index+9)%2][(index+9)/2] = data8[1]; - pUVAddr[(index+10)%2][(index+10)/2] = data8[2]; - pUVAddr[(index+11)%2][(index+11)/2] = data8[3]; - - data8[0] = p_tiled_addr[trans_addr + 64 * k + 12]; - data8[1] = p_tiled_addr[trans_addr + 64 * k + 13]; - data8[2] = p_tiled_addr[trans_addr + 64 * k + 14]; - data8[3] = p_tiled_addr[trans_addr + 64 * k + 15]; - - pUVAddr[(index+12)%2][(index+12)/2] = data8[0]; - pUVAddr[(index+13)%2][(index+13)/2] = data8[1]; - pUVAddr[(index+14)%2][(index+14)/2] = data8[2]; - pUVAddr[(index+15)%2][(index+15)/2] = data8[3]; - } - } - } -} diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/enc/src/SsbSipMfcEncAPI.c b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/enc/src/SsbSipMfcEncAPI.c index 2c33c5b..c31e522 100644 --- a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/enc/src/SsbSipMfcEncAPI.c +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/enc/src/SsbSipMfcEncAPI.c @@ -467,6 +467,9 @@ SSBSIP_MFC_ERROR_CODE SsbSipMfcEncGetInBuf(void *openHandle, SSBSIP_MFC_ENC_INPU input_info->YVirAddr = (void*)pCTX->virFrmBuf.luma; input_info->CVirAddr = (void*)pCTX->virFrmBuf.chroma; + input_info->YSize = aligned_y_size; + input_info->CSize = aligned_c_size; + return MFC_RET_OK; } diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/SsbSipMfcApi.h b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/SsbSipMfcApi.h index e083998..87e9b2d 100644 --- a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/SsbSipMfcApi.h +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/SsbSipMfcApi.h @@ -285,12 +285,6 @@ extern "C" { #endif /*--------------------------------------------------------------------------------*/ -/* Format Conversion API */ -/*--------------------------------------------------------------------------------*/ -void Y_tile_to_linear_4x2(unsigned char *p_linear_addr, unsigned char *p_tiled_addr, unsigned int x_size, unsigned int y_size); -void CbCr_tile_to_linear_4x2(unsigned char *p_linear_addr, unsigned char *p_tiled_addr, unsigned int x_size, unsigned int y_size); - -/*--------------------------------------------------------------------------------*/ /* Decoding APIs */ /*--------------------------------------------------------------------------------*/ void *SsbSipMfcDecOpen(void); diff --git a/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/color_space_convertor.h b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/color_space_convertor.h new file mode 100644 index 0000000..c5cef08 --- /dev/null +++ b/sec_mm/sec_omx/sec_codecs/video/mfc_c110/include/color_space_convertor.h @@ -0,0 +1,176 @@ +/* + * + * Copyright 2011 Samsung Electronics S.LSI Co. LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @file color_space_convertor.h + * @brief SEC_OMX specific define. + * NV12T(tiled) layout: + * Each element is not pixel. It is 64x32 pixel block. + * uv pixel block is interleaved as u v u v u v ... + * y1 y2 y7 y8 y9 y10 y15 y16 + * y3 y4 y5 y6 y11 y12 y13 y14 + * y17 y18 y23 y24 y25 y26 y31 y32 + * y19 y20 y21 y22 y27 y28 y29 y30 + * uv1 uv2 uv7 uv8 uv9 uv10 uv15 uv16 + * uv3 uv4 uv5 uv6 uv11 uv12 uv13 uv14 + * YUV420Planar(linear) layout: + * Each element is not pixel. It is 64x32 pixel block. + * y1 y2 y3 y4 y5 y6 y7 y8 + * y9 y10 y11 y12 y13 y14 y15 y16 + * y17 y18 y19 y20 y21 y22 y23 y24 + * y25 y26 y27 y28 y29 y30 y31 y32 + * u1 u2 u3 u4 u5 u6 u7 u8 + * v1 v2 v3 v4 v5 v6 v7 v8 + * YUV420Semiplanar(linear) layout: + * Each element is not pixel. It is 64x32 pixel block. + * uv pixel block is interleaved as u v u v u v ... + * y1 y2 y3 y4 y5 y6 y7 y8 + * y9 y10 y11 y12 y13 y14 y15 y16 + * y17 y18 y19 y20 y21 y22 y23 y24 + * y25 y26 y27 y28 y29 y30 y31 y32 + * uv1 uv2 uv3 uv4 uv5 uv6 uv7 uv8 + * uv9 uv10 uv11 uv12 uv13 uv14 uv15 uv16 + * @author ShinWon Lee (shinwon.lee@samsung.com) + * @version 1.0 + * @history + * 2011.7.01 : Create + */ + +#ifndef COLOR_SPACE_CONVERTOR_H_ +#define COLOR_SPACE_CONVERTOR_H_ + +/*--------------------------------------------------------------------------------*/ +/* Format Conversion API */ +/*--------------------------------------------------------------------------------*/ +/* Neon Code */ +/* + * De-interleaves src to dest1, dest2 + * + * @param dest1 + * Address of de-interleaved data[out] + * + * @param dest2 + * Address of de-interleaved data[out] + * + * @param src + * Address of interleaved data[in] + * + * @param src_size + * Size of interleaved data[in] + */ +void csc_deinterleave_memcpy_neon(char *dest1, char *dest2, char *src, int src_size); + +/* + * Interleaves src1, src2 to dest + * + * @param dest + * Address of interleaved data[out] + * + * @param src1 + * Address of de-interleaved data[in] + * + * @param src2 + * Address of de-interleaved data[in] + * + * @param src_size + * Size of de-interleaved data[in] + */ +void csc_interleave_memcpy_neon(char *dest, char *src1, char *src2, int src_size); + +/* + * Converts tiled data to linear. + * 1. Y of NV12T to Y of YUV420P + * 2. Y of NV12T to Y of YUV420S + * 3. UV of NV12T to UV of YUV420S + * + * @param yuv420_dest + * Y or UV plane address of YUV420[out] + * + * @param nv12t_src + * Y or UV plane address of NV12T[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] + */ +void csc_tiled_to_linear_neon(char *yuv420p_y_dest, char *nv12t_y_src, int yuv420p_width, int yuv420p_y_height); + +/* + * Converts and Deinterleaves tiled data to linear + * 1. UV of NV12T to UV of YUV420P + * + * @param yuv420_u_dest + * U plane address of YUV420P[out] + * + * @param yuv420_v_dest + * V plane address of YUV420P[out] + * + * @param nv12t_src + * UV plane address of NV12T[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ +void csc_tiled_to_linear_deinterleave_neon(char *yuv420p_u_dest, char *yuv420p_v_dest, char *nv12t_uv_src, int yuv420p_width, int yuv420p_uv_height); + +/* + * Converts linear data to tiled. + * 1. Y of YUV420P to Y of NV12T + * 2. Y of YUV420S to Y of NV12T + * 3. UV of YUV420S to UV of NV12T + * + * @param nv12t_dest + * Y or UV plane address of NV12T[out] + * + * @param yuv420_src + * Y or UV plane address of YUV420P(S)[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_height + * Y: Height of YUV420, UV: Height/2 of YUV420[in] + */ +void csc_linear_to_tiled_neon(char *nv12t_dest, char *yuv420p_src, int yuv420p_width, int yuv420p_y_height); + +/* + * Converts and Interleaves linear to tiled + * 1. UV of YUV420P to UV of NV12T + * + * @param nv12t_uv_dest + * UV plane address of NV12T[out] + * + * @param yuv420p_u_src + * U plane address of YUV420P[in] + * + * @param yuv420p_v_src + * V plane address of YUV420P[in] + * + * @param yuv420_width + * Width of YUV420[in] + * + * @param yuv420_uv_height + * Height/2 of YUV420[in] + */ +void csc_linear_to_tiled_interleave_neon(char *nv12t_uv_dest, char *yuv420p_u_src, char *yuv420p_v_src, int yuv420p_width, int yuv420p_uv_height); + +#endif /*COLOR_SPACE_CONVERTOR_H_*/ diff --git a/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/Android.mk b/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/Android.mk index 08e9874..cdf345c 100644 --- a/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/Android.mk +++ b/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/Android.mk @@ -14,7 +14,7 @@ LOCAL_CFLAGS := LOCAL_ARM_MODE := arm -LOCAL_STATIC_LIBRARIES := libSEC_OMX_Vdec libsecosal libsecbasecomponent libsecmfcdecapi +LOCAL_STATIC_LIBRARIES := libSEC_OMX_Vdec libsecosal libsecbasecomponent libsecmfcdecapi libseccsc LOCAL_SHARED_LIBRARIES := libc libdl libcutils libutils LOCAL_C_INCLUDES := $(SEC_OMX_INC)/khronos \ @@ -22,7 +22,7 @@ LOCAL_C_INCLUDES := $(SEC_OMX_INC)/khronos \ $(SEC_OMX_TOP)/sec_osal \ $(SEC_OMX_TOP)/sec_omx_core \ $(SEC_OMX_COMPONENT)/common \ - $(SEC_OMX_COMPONENT)/video/dec \ + $(SEC_OMX_COMPONENT)/video/dec LOCAL_C_INCLUDES += $(SEC_OMX_TOP)/sec_codecs/video/mfc_c110/include diff --git a/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/SEC_OMX_H264dec.c b/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/SEC_OMX_H264dec.c index 9acae4e..e1edf95 100644 --- a/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/SEC_OMX_H264dec.c +++ b/sec_mm/sec_omx/sec_omx_component/video/dec/h264dec/SEC_OMX_H264dec.c @@ -35,6 +35,7 @@ #include "library_register.h" #include "SEC_OMX_H264dec.h" #include "SsbSipMfcApi.h" +#include "color_space_convertor.h" #undef SEC_LOG_TAG #define SEC_LOG_TAG "SEC_H264_DEC" @@ -955,14 +956,17 @@ OMX_ERRORTYPE SEC_MFC_H264_Decode(OMX_COMPONENTTYPE *pOMXComponent, SEC_OMX_DATA SEC_OSAL_Memcpy(pOutBuf + sizeof(frameSize) + (sizeof(void *) * 3), &(outputInfo.CVirAddr), sizeof(outputInfo.CVirAddr)); } else { SEC_OSAL_Log(SEC_LOG_TRACE, "YUV420 out for ThumbnailMode"); - Y_tile_to_linear_4x2( - (unsigned char *)pOutBuf, - (unsigned char *)outputInfo.YVirAddr, - bufWidth, bufHeight); - CbCr_tile_to_linear_4x2( - ((unsigned char *)pOutBuf) + frameSize, - (unsigned char *)outputInfo.CVirAddr, - bufWidth, bufHeight); + csc_tiled_to_linear_neon( + (unsigned char *)pOutBuf, + (unsigned char *)outputInfo.YVirAddr, + bufWidth, + bufHeight); + csc_tiled_to_linear_deinterleave_neon( + (unsigned char *)pOutBuf + frameSize, + (unsigned char *)pOutBuf + (frameSize * 5) / 4, + (unsigned char *)outputInfo.CVirAddr, + bufWidth, + bufHeight >> 1); } } diff --git a/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/Android.mk b/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/Android.mk index 92891a7..66353d6 100644 --- a/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/Android.mk +++ b/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/Android.mk @@ -14,7 +14,7 @@ LOCAL_CFLAGS := LOCAL_ARM_MODE := arm -LOCAL_STATIC_LIBRARIES := libSEC_OMX_Vdec libsecosal libsecbasecomponent libsecmfcdecapi +LOCAL_STATIC_LIBRARIES := libSEC_OMX_Vdec libsecosal libsecbasecomponent libsecmfcdecapi libseccsc LOCAL_SHARED_LIBRARIES := libc libdl libcutils libutils LOCAL_C_INCLUDES := $(SEC_OMX_INC)/khronos \ diff --git a/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/SEC_OMX_Mpeg4dec.c b/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/SEC_OMX_Mpeg4dec.c index 7396a2c..d7ac10a 100644 --- a/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/SEC_OMX_Mpeg4dec.c +++ b/sec_mm/sec_omx/sec_omx_component/video/dec/mpeg4dec/SEC_OMX_Mpeg4dec.c @@ -35,6 +35,7 @@ #include "library_register.h" #include "SEC_OMX_Mpeg4dec.h" #include "SsbSipMfcApi.h" +#include "color_space_convertor.h" #undef SEC_LOG_TAG #define SEC_LOG_TAG "SEC_MPEG4_DEC" @@ -1135,14 +1136,17 @@ OMX_ERRORTYPE SEC_MFC_Mpeg4_Decode(OMX_COMPONENTTYPE *pOMXComponent, SEC_OMX_DAT SEC_OSAL_Memcpy(pOutputBuf + sizeof(frameSize) + (sizeof(void *) * 3), &(outputInfo.CVirAddr), sizeof(outputInfo.CVirAddr)); } else { SEC_OSAL_Log(SEC_LOG_TRACE, "YUV420 out for ThumbnailMode"); - Y_tile_to_linear_4x2( - (unsigned char *)pOutputBuf, - (unsigned char *)outputInfo.YVirAddr, - bufWidth, bufHeight); - CbCr_tile_to_linear_4x2( - ((unsigned char *)pOutputBuf) + frameSize, - (unsigned char *)outputInfo.CVirAddr, - bufWidth, bufHeight); + csc_tiled_to_linear_neon( + (unsigned char *)pOutputBuf, + (unsigned char *)outputInfo.YVirAddr, + bufWidth, + bufHeight); + csc_tiled_to_linear_deinterleave_neon( + (unsigned char *)pOutputBuf + frameSize, + (unsigned char *)pOutputBuf + (frameSize * 5) / 4, + (unsigned char *)outputInfo.CVirAddr, + bufWidth, + bufHeight >> 1); } } diff --git a/sec_mm/sec_omx/sec_omx_component/video/enc/h264enc/Android.mk b/sec_mm/sec_omx/sec_omx_component/video/enc/h264enc/Android.mk index 3edcb58..cf91356 100644 --- a/sec_mm/sec_omx/sec_omx_component/video/enc/h264enc/Android.mk +++ b/sec_mm/sec_omx/sec_omx_component/video/enc/h264enc/Android.mk @@ -22,7 +22,7 @@ LOCAL_C_INCLUDES := $(SEC_OMX_INC)/khronos \ $(SEC_OMX_TOP)/sec_osal \ $(SEC_OMX_TOP)/sec_omx_core \ $(SEC_OMX_COMPONENT)/common \ - $(SEC_OMX_COMPONENT)/video/enc \ + $(SEC_OMX_COMPONENT)/video/enc LOCAL_C_INCLUDES += $(SEC_OMX_TOP)/sec_codecs/video/mfc_c110/include |