summaryrefslogtreecommitdiffstats
path: root/libcutils
diff options
context:
space:
mode:
authorDuane Sand <duane.sand@imgtec.com>2015-04-16 18:10:37 -0700
committerDuane Sand <duane.sand@imgtec.com>2015-04-16 18:26:22 -0700
commit1ef9ccd7726cfa0af88f7dcff4cf9106059807ea (patch)
treed99d09da903010982b32f393fc736d9b91ab0db6 /libcutils
parentc47b0d5278fc403e6a30ea6f4225699b16613879 (diff)
downloadsystem_core-1ef9ccd7726cfa0af88f7dcff4cf9106059807ea.zip
system_core-1ef9ccd7726cfa0af88f7dcff4cf9106059807ea.tar.gz
system_core-1ef9ccd7726cfa0af88f7dcff4cf9106059807ea.tar.bz2
[MIPS] Fast android_memset for Mips64, Mipsr6
Fix broken mips64 build by replacing mips32r2-only android_memset.S. Use HW-bonded pairs of 64-bit stores to fill 128 bits/cycle. Rely on HW automatic cache prefetch optimizations. Software cache prefetching is counterproductive on next mips cores. New method is coded in C, and also works okay on non-Mips architectures. Change-Id: Id7153a8fe11538fe25287e101375661b0e99e2a2
Diffstat (limited to 'libcutils')
-rw-r--r--libcutils/Android.mk5
-rw-r--r--libcutils/arch-mips/android_memset.S323
-rw-r--r--libcutils/arch-mips/android_memset.c93
3 files changed, 96 insertions, 325 deletions
diff --git a/libcutils/Android.mk b/libcutils/Android.mk
index c636196..9dc15d1 100644
--- a/libcutils/Android.mk
+++ b/libcutils/Android.mk
@@ -111,8 +111,9 @@ LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
LOCAL_SRC_FILES_arm += arch-arm/memset32.S
LOCAL_SRC_FILES_arm64 += arch-arm64/android_memset.S
-LOCAL_SRC_FILES_mips += arch-mips/android_memset.S
-LOCAL_SRC_FILES_mips64 += arch-mips/android_memset.S
+
+LOCAL_SRC_FILES_mips += arch-mips/android_memset.c
+LOCAL_SRC_FILES_mips64 += arch-mips/android_memset.c
LOCAL_SRC_FILES_x86 += \
arch-x86/android_memset16.S \
diff --git a/libcutils/arch-mips/android_memset.S b/libcutils/arch-mips/android_memset.S
deleted file mode 100644
index 6811de0..0000000
--- a/libcutils/arch-mips/android_memset.S
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2009
- * MIPS Technologies, Inc., California.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/************************************************************************
- *
- * memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
- * Version: "043009"
- *
- ************************************************************************/
-
-
-/************************************************************************
- * Include files
- ************************************************************************/
-
-#include <machine/asm.h>
-#define END(f) .cfi_endproc; .size f, .-f; .end f
-
-/*
- * This routine could be optimized for MIPS64. The current code only
- * uses MIPS32 instructions.
- */
-
-#if defined(__MIPSEB__)
-# define SWHI swl /* high part is left in big-endian */
-# define SWLO swr /* low part is right in big-endian */
-#endif
-
-#if defined(__MIPSEL__)
-# define SWHI swr /* high part is right in little-endian */
-# define SWLO swl /* low part is left in little-endian */
-#endif
-
-#if !(defined(XGPROF) || defined(XPROF))
-#undef SETUP_GP
-#define SETUP_GP
-#endif
-
-#ifdef NDEBUG
-#define DBG #
-#else
-#define DBG
-#endif
-
-/*
- * void android_memset16(uint16_t* dst, uint16_t value, size_t size);
- */
-
-LEAF(android_memset16,0)
- .set noreorder
-DBG /* Check parameters */
-DBG andi t0,a0,1 # a0 must be halfword aligned
-DBG tne t0,zero
-DBG andi t2,a2,1 # a2 must be even
-DBG tne t2,zero
-
-#ifdef FIXARGS
- # ensure count is even
-#if (__mips==32) && (__mips_isa_rev>=2)
- ins a2,zero,0,1
-#else
- ori a2,1
- xori a2,1
-#endif
-#endif
-
-#if (__mips==32) && (__mips_isa_rev>=2)
- ins a1,a1,16,16
-#else
- andi a1,0xffff
- sll t3,a1,16
- or a1,t3
-#endif
-
- beqz a2,.Ldone
- andi t1,a0,2
- beqz t1,.Lalignok
- addu t0,a0,a2 # t0 is the "past the end" address
- sh a1,0(a0) # store one halfword to get aligned
- addu a0,2
- subu a2,2
-.Lalignok:
- slti t1,a2,4 # .Laligned for 4 or more bytes
- beqz t1,.Laligned
- sne t1,a2,2 # one more halfword?
- bnez t1,.Ldone
- nop
- sh a1,0(a0)
-.Ldone:
- j ra
- nop
- .set reorder
-END(android_memset16)
-
-/*
- * void android_memset32(uint32_t* dst, uint32_t value, size_t size);
- */
-
-LEAF(android_memset32,0)
- .set noreorder
-DBG /* Check parameters */
-DBG andi t0,a0,3 # a0 must be word aligned
-DBG tne t0,zero
-DBG andi t2,a2,3 # a2 must be a multiple of 4 bytes
-DBG tne t2,zero
-
-#ifdef FIXARGS
- # ensure count is a multiple of 4
-#if (__mips==32) && (__mips_isa_rev>=2)
- ins $a2,$0,0,2
-#else
- ori a2,3
- xori a2,3
-#endif
-#endif
-
- bnez a2,.Laligned # any work to do?
- addu t0,a0,a2 # t0 is the "past the end" address
-
- j ra
- nop
- .set reorder
-END(android_memset32)
-
-LEAF(memset,0)
-
- .set noreorder
- .set noat
-
- addu t0,a0,a2 # t0 is the "past the end" address
- slti AT,a2,4 # is a2 less than 4?
- bne AT,zero,.Llast4 # if yes, go to last4
- move v0,a0 # memset returns the dst pointer
-
- beq a1,zero,.Lset0
- subu v1,zero,a0
-
- # smear byte into 32 bit word
-#if (__mips==32) && (__mips_isa_rev>=2)
- ins a1, a1, 8, 8 # Replicate fill byte into half-word.
- ins a1, a1, 16, 16 # Replicate fill byte into word.
-#else
- and a1,0xff
- sll AT,a1,8
- or a1,AT
- sll AT,a1,16
- or a1,AT
-#endif
-
-.Lset0:
- andi v1,v1,0x3 # word-unaligned address?
- beq v1,zero,.Laligned # v1 is the unalignment count
- subu a2,a2,v1
- SWHI a1,0(a0)
- addu a0,a0,v1
-
-# Here we have the "word-aligned" a0 (until the "last4")
-.Laligned:
- andi t8,a2,0x3f # any 64-byte chunks?
- # t8 is the byte count past 64-byte chunks
- beq a2,t8,.Lchk8w # when a2==t8, no 64-byte chunks
- # There will be at most 1 32-byte chunk then
- subu a3,a2,t8 # subtract from a2 the reminder
- # Here a3 counts bytes in 16w chunks
- addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
-
-# Find out, if there are any 64-byte chunks after which will be still at least
-# 96 bytes left. The value "96" is calculated as needed buffer for
-# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
-# incrementing "a0" by 64.
-# For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
-#
- sltiu v1,a2,160
- bgtz v1,.Lloop16w_nopref30 # skip "pref 30,0(a0)"
- subu t7,a2,96 # subtract "pref 30 unsafe" region
- # below we have at least 1 64-byte chunk which is "pref 30 safe"
- andi t6,t7,0x3f # t6 is past "64-byte safe chunks" reminder
- subu t5,t7,t6 # subtract from t7 the reminder
- # Here t5 counts bytes in 16w "safe" chunks
- addu t4,a0,t5 # Now t4 is the dst after 64-byte "safe" chunks
-
-# Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line
-# pref 30,0(a0)
-# Here we are in the region, where it is safe to use "pref 30,64(a0)"
-.Lloop16w:
- addiu a0,a0,64
- pref 30,-32(a0) # continue setting up the dest, addr 64-32
- sw a1,-64(a0)
- sw a1,-60(a0)
- sw a1,-56(a0)
- sw a1,-52(a0)
- sw a1,-48(a0)
- sw a1,-44(a0)
- sw a1,-40(a0)
- sw a1,-36(a0)
- nop
- nop # the extra nop instructions help to balance
- nop # cycles needed for "store" + "fill" + "evict"
- nop # For 64byte store there are needed 8 fill
- nop # and 8 evict cycles, i.e. at least 32 instr.
- nop
- nop
- pref 30,0(a0) # continue setting up the dest, addr 64-0
- sw a1,-32(a0)
- sw a1,-28(a0)
- sw a1,-24(a0)
- sw a1,-20(a0)
- sw a1,-16(a0)
- sw a1,-12(a0)
- sw a1,-8(a0)
- sw a1,-4(a0)
- nop
- nop
- nop
- nop # NOTE: adding 14 nop-s instead of 12 nop-s
- nop # gives better results for "fast" memory
- nop
- bne a0,t4,.Lloop16w
- nop
-
- beq a0,a3,.Lchk8w # maybe no more 64-byte chunks?
- nop # this "delayed slot" is useless ...
-
-.Lloop16w_nopref30: # there could be up to 3 "64-byte nopref30" chunks
- addiu a0,a0,64
- sw a1,-64(a0)
- sw a1,-60(a0)
- sw a1,-56(a0)
- sw a1,-52(a0)
- sw a1,-48(a0)
- sw a1,-44(a0)
- sw a1,-40(a0)
- sw a1,-36(a0)
- sw a1,-32(a0)
- sw a1,-28(a0)
- sw a1,-24(a0)
- sw a1,-20(a0)
- sw a1,-16(a0)
- sw a1,-12(a0)
- sw a1,-8(a0)
- bne a0,a3,.Lloop16w_nopref30
- sw a1,-4(a0)
-
-.Lchk8w: # t8 here is the byte count past 64-byte chunks
-
- andi t7,t8,0x1f # is there a 32-byte chunk?
- # the t7 is the reminder count past 32-bytes
- beq t8,t7,.Lchk1w # when t8==t7, no 32-byte chunk
- move a2,t7
-
- sw a1,0(a0)
- sw a1,4(a0)
- sw a1,8(a0)
- sw a1,12(a0)
- sw a1,16(a0)
- sw a1,20(a0)
- sw a1,24(a0)
- sw a1,28(a0)
- addiu a0,a0,32
-
-.Lchk1w:
- andi t8,a2,0x3 # now t8 is the reminder past 1w chunks
- beq a2,t8,.Llast4aligned
- subu a3,a2,t8 # a3 is the count of bytes in 1w chunks
- addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
-
-# copying in words (4-byte chunks)
-.LwordCopy_loop:
- addiu a0,a0,4
- bne a0,a3,.LwordCopy_loop
- sw a1,-4(a0)
-
-# store last 0-3 bytes
-# this will repeat the last store if the memset finishes on a word boundary
-.Llast4aligned:
- j ra
- SWLO a1,-1(t0)
-
-.Llast4:
- beq a0,t0,.Llast4e
-.Llast4l:
- addiu a0,a0,1
- bne a0,t0,.Llast4l
- sb a1,-1(a0)
-.Llast4e:
- j ra
- nop
-
- .set at
- .set reorder
-
-END(memset)
-
-
-/************************************************************************
- * Implementation : Static functions
- ************************************************************************/
diff --git a/libcutils/arch-mips/android_memset.c b/libcutils/arch-mips/android_memset.c
new file mode 100644
index 0000000..a6b7496
--- /dev/null
+++ b/libcutils/arch-mips/android_memset.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* generic C version for any machine */
+
+#include <cutils/memory.h>
+
+/*
+ * Fills the first size/2 16-bit elements at dst with value. `size` is a byte
+ * count; an odd trailing byte is left untouched. dst is assumed to be 16-bit
+ * aligned. Equivalent to: size >>= 1; while (size--) *dst++ = value;
+ */
+void android_memset16(uint16_t* dst, uint16_t value, size_t size)
+{
+    size >>= 1;                       /* bytes -> element count */
+    if (((uintptr_t)dst & 2) && size) {
+        *dst++ = value;               /* unpaired first elem: reach 32-bit alignment */
+        size--;
+    }
+    /*
+     * Widen before shifting: `value` would otherwise promote to int, and
+     * shifting a set bit 15 into the sign bit is undefined behaviour.
+     */
+    uint32_t value32 = ((uint32_t)value << 16) | value;
+    android_memset32((uint32_t*) dst, value32, size<<1);   /* whole pairs; takes bytes */
+    if (size & 1) {
+        dst[size-1] = value;          /* unpaired last elem */
+    }
+}
+
+
+/*
+ * Fills the first size/4 32-bit elements at dst with value. `size` is a byte
+ * count; up to three trailing bytes are left untouched. dst is assumed to be
+ * 32-bit aligned. Equivalent to: size >>= 2; while (size--) *dst++ = value;
+ */
+void android_memset32(uint32_t* dst, uint32_t value, size_t size)
+{
+    /*
+     * All stores go through may_alias types: the body is written 64 bits at
+     * a time, and android_memset16 hands this routine a uint16_t buffer, so
+     * plain uint32_t/uint64_t stores would break C's effective-type (strict
+     * aliasing) rule. Compilers without the GNU attribute fall back to the
+     * plain types.
+     */
+#if defined(__GNUC__)
+    typedef uint32_t __attribute__((__may_alias__)) fill32_t;
+    typedef uint64_t __attribute__((__may_alias__)) fill64_t;
+#else
+    typedef uint32_t fill32_t;
+    typedef uint64_t fill64_t;
+#endif
+    fill32_t* dst32 = (fill32_t*)dst;
+
+    size >>= 2;                       /* bytes -> element count */
+    if (((uintptr_t)dst32 & 4) && size) {
+        *dst32++ = value;             /* unpaired first elem: reach 64-bit alignment */
+        size--;
+    }
+
+    /* Both halves hold the same pattern, so byte order does not matter. */
+    uint64_t value64 = (((uint64_t)value)<<32) | value;
+    fill64_t* dst64 = (fill64_t*)dst32;
+
+    /* Body: six 64-bit stores (12 elements) per pass. */
+    while (size >= 12) {
+        dst64[0] = value64;
+        dst64[1] = value64;
+        dst64[2] = value64;
+        dst64[3] = value64;
+        dst64[4] = value64;
+        dst64[5] = value64;
+        size -= 12;
+        dst64 += 6;
+    }
+
+    /* Remainder (fewer than 12 elements): one 32-bit store at a time. */
+    dst32 = (fill32_t*)dst64;
+    while (size--) {
+        *dst32++ = value;
+    }
+}