diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2008-10-21 07:00:00 -0700 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2008-10-21 07:00:00 -0700 |
commit | 4f6e8d7a00cbeda1e70cc15be9c4af1018bdad53 (patch) | |
tree | 54fd1b2695a591d2306d41264df67c53077b752c /libpixelflinger/codeflinger | |
download | system_core-4f6e8d7a00cbeda1e70cc15be9c4af1018bdad53.zip system_core-4f6e8d7a00cbeda1e70cc15be9c4af1018bdad53.tar.gz system_core-4f6e8d7a00cbeda1e70cc15be9c4af1018bdad53.tar.bz2 |
Initial Contribution
Diffstat (limited to 'libpixelflinger/codeflinger')
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssembler.cpp | 428 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssembler.h | 155 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssemblerInterface.cpp | 173 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssemblerInterface.h | 324 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssemblerProxy.cpp | 200 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/ARMAssemblerProxy.h | 123 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/CodeCache.cpp | 151 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/CodeCache.h | 134 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/GGLAssembler.cpp | 1135 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/GGLAssembler.h | 549 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/armreg.h | 300 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/blending.cpp | 676 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/disassem.c | 702 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/disassem.h | 65 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/load_store.cpp | 378 | ||||
-rw-r--r-- | libpixelflinger/codeflinger/texturing.cpp | 1208 |
16 files changed, 6701 insertions, 0 deletions
diff --git a/libpixelflinger/codeflinger/ARMAssembler.cpp b/libpixelflinger/codeflinger/ARMAssembler.cpp new file mode 100644 index 0000000..c5edadf --- /dev/null +++ b/libpixelflinger/codeflinger/ARMAssembler.cpp @@ -0,0 +1,428 @@ +/* libs/pixelflinger/codeflinger/ARMAssembler.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#define LOG_TAG "ARMAssembler" + +#include <stdio.h> +#include <stdlib.h> +#include <cutils/log.h> +#include <cutils/properties.h> + +#if defined(WITH_LIB_HARDWARE) +#include <hardware/qemu_tracing.h> +#endif + +#include <private/pixelflinger/ggl_context.h> + +#include "codeflinger/ARMAssembler.h" +#include "codeflinger/CodeCache.h" +#include "codeflinger/disassem.h" + +// ---------------------------------------------------------------------------- + +namespace android { + +// ---------------------------------------------------------------------------- +#if 0 +#pragma mark - +#pragma mark ARMAssembler... 
+#endif + +ARMAssembler::ARMAssembler(const sp<Assembly>& assembly) + : ARMAssemblerInterface(), + mAssembly(assembly) +{ + mBase = mPC = (uint32_t *)assembly->base(); + mDuration = ggl_system_time(); +#if defined(WITH_LIB_HARDWARE) + mQemuTracing = true; +#endif +} + +ARMAssembler::~ARMAssembler() +{ +} + +uint32_t* ARMAssembler::pc() const +{ + return mPC; +} + +uint32_t* ARMAssembler::base() const +{ + return mBase; +} + +void ARMAssembler::reset() +{ + mBase = mPC = (uint32_t *)mAssembly->base(); + mBranchTargets.clear(); + mLabels.clear(); + mLabelsInverseMapping.clear(); + mComments.clear(); +} + +// ---------------------------------------------------------------------------- + +void ARMAssembler::disassemble(const char* name) +{ + if (name) { + printf("%s:\n", name); + } + size_t count = pc()-base(); + uint32_t* i = base(); + while (count--) { + ssize_t label = mLabelsInverseMapping.indexOfKey(i); + if (label >= 0) { + printf("%s:\n", mLabelsInverseMapping.valueAt(label)); + } + ssize_t comment = mComments.indexOfKey(i); + if (comment >= 0) { + printf("; %s\n", mComments.valueAt(comment)); + } + printf("%08x: %08x ", int(i), int(i[0])); + ::disassemble((u_int)i); + i++; + } +} + +void ARMAssembler::comment(const char* string) +{ + mComments.add(mPC, string); +} + +void ARMAssembler::label(const char* theLabel) +{ + mLabels.add(theLabel, mPC); + mLabelsInverseMapping.add(mPC, theLabel); +} + +void ARMAssembler::B(int cc, const char* label) +{ + mBranchTargets.add(branch_target_t(label, mPC)); + *mPC++ = (cc<<28) | (0xA<<24) | 0; +} + +void ARMAssembler::BL(int cc, const char* label) +{ + mBranchTargets.add(branch_target_t(label, mPC)); + *mPC++ = (cc<<28) | (0xB<<24) | 0; +} + +#if 0 +#pragma mark - +#pragma mark Prolog/Epilog & Generate... 
+#endif + + +void ARMAssembler::prolog() +{ + // write dummy prolog code + mPrologPC = mPC; + STM(AL, FD, SP, 1, LSAVED); +} + +void ARMAssembler::epilog(uint32_t touched) +{ + touched &= LSAVED; + if (touched) { + // write prolog code + uint32_t* pc = mPC; + mPC = mPrologPC; + STM(AL, FD, SP, 1, touched | LLR); + mPC = pc; + // write epilog code + LDM(AL, FD, SP, 1, touched | LLR); + BX(AL, LR); + } else { // heh, no registers to save! + // write prolog code + uint32_t* pc = mPC; + mPC = mPrologPC; + MOV(AL, 0, R0, R0); // NOP + mPC = pc; + // write epilog code + BX(AL, LR); + } +} + +int ARMAssembler::generate(const char* name) +{ + // fixup all the branches + size_t count = mBranchTargets.size(); + while (count--) { + const branch_target_t& bt = mBranchTargets[count]; + uint32_t* target_pc = mLabels.valueFor(bt.label); + LOG_ALWAYS_FATAL_IF(!target_pc, + "error resolving branch targets, target_pc is null"); + int32_t offset = int32_t(target_pc - (bt.pc+2)); + *bt.pc |= offset & 0xFFFFFF; + } + + mAssembly->resize( int(pc()-base())*4 ); + + // the instruction cache is flushed by CodeCache + const int64_t duration = ggl_system_time() - mDuration; + const char * const format = "generated %s (%d ins) at [%p:%p] in %lld ns\n"; + LOGI(format, name, int(pc()-base()), base(), pc(), duration); + +#if defined(WITH_LIB_HARDWARE) + if (__builtin_expect(mQemuTracing, 0)) { + int err = qemu_add_mapping(int(base()), name); + mQemuTracing = (err >= 0); + } +#endif + + char value[PROPERTY_VALUE_MAX]; + property_get("debug.pf.disasm", value, "0"); + if (atoi(value) != 0) { + printf(format, name, int(pc()-base()), base(), pc(), duration); + disassemble(name); + } + + return NO_ERROR; +} + +uint32_t* ARMAssembler::pcForLabel(const char* label) +{ + return mLabels.valueFor(label); +} + +// ---------------------------------------------------------------------------- + +#if 0 +#pragma mark - +#pragma mark Data Processing... 
+#endif + +void ARMAssembler::dataProcessing(int opcode, int cc, + int s, int Rd, int Rn, uint32_t Op2) +{ + *mPC++ = (cc<<28) | (opcode<<21) | (s<<20) | (Rn<<16) | (Rd<<12) | Op2; +} + +#if 0 +#pragma mark - +#pragma mark Multiply... +#endif + +// multiply... +void ARMAssembler::MLA(int cc, int s, + int Rd, int Rm, int Rs, int Rn) { + if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; } + LOG_FATAL_IF(Rd==Rm, "MLA(r%u,r%u,r%u,r%u)", Rd,Rm,Rs,Rn); + *mPC++ = (cc<<28) | (1<<21) | (s<<20) | + (Rd<<16) | (Rn<<12) | (Rs<<8) | 0x90 | Rm; +} +void ARMAssembler::MUL(int cc, int s, + int Rd, int Rm, int Rs) { + if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; } + LOG_FATAL_IF(Rd==Rm, "MUL(r%u,r%u,r%u)", Rd,Rm,Rs); + *mPC++ = (cc<<28) | (s<<20) | (Rd<<16) | (Rs<<8) | 0x90 | Rm; +} +void ARMAssembler::UMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi, + "UMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs); + *mPC++ = (cc<<28) | (1<<23) | (s<<20) | + (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm; +} +void ARMAssembler::UMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi, + "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs); + *mPC++ = (cc<<28) | (1<<23) | (1<<21) | (s<<20) | + (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm; +} +void ARMAssembler::SMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi, + "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs); + *mPC++ = (cc<<28) | (1<<23) | (1<<22) | (s<<20) | + (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm; +} +void ARMAssembler::SMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi, + "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs); + *mPC++ = (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) | + (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm; +} + +#if 0 +#pragma mark - +#pragma mark Branches... +#endif + +// branches... 
+void ARMAssembler::B(int cc, uint32_t* pc) +{ + int32_t offset = int32_t(pc - (mPC+2)); + *mPC++ = (cc<<28) | (0xA<<24) | (offset & 0xFFFFFF); +} + +void ARMAssembler::BL(int cc, uint32_t* pc) +{ + int32_t offset = int32_t(pc - (mPC+2)); + *mPC++ = (cc<<28) | (0xB<<24) | (offset & 0xFFFFFF); +} + +void ARMAssembler::BX(int cc, int Rn) +{ + *mPC++ = (cc<<28) | 0x12FFF10 | Rn; +} + +#if 0 +#pragma mark - +#pragma mark Data Transfer... +#endif + +// data transfert... +void ARMAssembler::LDR(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<26) | (1<<20) | (Rn<<16) | (Rd<<12) | offset; +} +void ARMAssembler::LDRB(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (1<<20) | (Rn<<16) | (Rd<<12) | offset; +} +void ARMAssembler::STR(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<26) | (Rn<<16) | (Rd<<12) | offset; +} +void ARMAssembler::STRB(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (Rn<<16) | (Rd<<12) | offset; +} + +void ARMAssembler::LDRH(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xB0 | offset; +} +void ARMAssembler::LDRSB(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xD0 | offset; +} +void ARMAssembler::LDRSH(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xF0 | offset; +} +void ARMAssembler::STRH(int cc, int Rd, int Rn, uint32_t offset) { + *mPC++ = (cc<<28) | (Rn<<16) | (Rd<<12) | 0xB0 | offset; +} + +#if 0 +#pragma mark - +#pragma mark Block Data Transfer... +#endif + +// block data transfer... 
+void ARMAssembler::LDM(int cc, int dir, + int Rn, int W, uint32_t reg_list) +{ // ED FD EA FA IB IA DB DA + const uint8_t P[8] = { 1, 0, 1, 0, 1, 0, 1, 0 }; + const uint8_t U[8] = { 1, 1, 0, 0, 1, 1, 0, 0 }; + *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) | + (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list; +} + +void ARMAssembler::STM(int cc, int dir, + int Rn, int W, uint32_t reg_list) +{ // FA EA FD ED IB IA DB DA + const uint8_t P[8] = { 0, 1, 0, 1, 1, 0, 1, 0 }; + const uint8_t U[8] = { 0, 0, 1, 1, 1, 1, 0, 0 }; + *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) | + (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list; +} + +#if 0 +#pragma mark - +#pragma mark Special... +#endif + +// special... +void ARMAssembler::SWP(int cc, int Rn, int Rd, int Rm) { + *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm; +} +void ARMAssembler::SWPB(int cc, int Rn, int Rd, int Rm) { + *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm; +} +void ARMAssembler::SWI(int cc, uint32_t comment) { + *mPC++ = (cc<<28) | (0xF<<24) | comment; +} + +#if 0 +#pragma mark - +#pragma mark DSP instructions... +#endif + +// DSP instructions... 
+void ARMAssembler::PLD(int Rn, uint32_t offset) { + LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))), + "PLD only P=1, W=0"); + *mPC++ = 0xF550F000 | (Rn<<16) | offset; +} + +void ARMAssembler::CLZ(int cc, int Rd, int Rm) +{ + *mPC++ = (cc<<28) | 0x16F0F10| (Rd<<12) | Rm; +} + +void ARMAssembler::QADD(int cc, int Rd, int Rm, int Rn) +{ + *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm; +} + +void ARMAssembler::QDADD(int cc, int Rd, int Rm, int Rn) +{ + *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm; +} + +void ARMAssembler::QSUB(int cc, int Rd, int Rm, int Rn) +{ + *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm; +} + +void ARMAssembler::QDSUB(int cc, int Rd, int Rm, int Rn) +{ + *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm; +} + +void ARMAssembler::SMUL(int cc, int xy, + int Rd, int Rm, int Rs) +{ + *mPC++ = (cc<<28) | 0x1600080 | (Rd<<16) | (Rs<<8) | (xy<<4) | Rm; +} + +void ARMAssembler::SMULW(int cc, int y, + int Rd, int Rm, int Rs) +{ + *mPC++ = (cc<<28) | 0x12000A0 | (Rd<<16) | (Rs<<8) | (y<<4) | Rm; +} + +void ARMAssembler::SMLA(int cc, int xy, + int Rd, int Rm, int Rs, int Rn) +{ + *mPC++ = (cc<<28) | 0x1000080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (xy<<4) | Rm; +} + +void ARMAssembler::SMLAL(int cc, int xy, + int RdHi, int RdLo, int Rs, int Rm) +{ + *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm; +} + +void ARMAssembler::SMLAW(int cc, int y, + int Rd, int Rm, int Rs, int Rn) +{ + *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm; +} + +}; // namespace android + diff --git a/libpixelflinger/codeflinger/ARMAssembler.h b/libpixelflinger/codeflinger/ARMAssembler.h new file mode 100644 index 0000000..8837e07 --- /dev/null +++ b/libpixelflinger/codeflinger/ARMAssembler.h @@ -0,0 +1,155 @@ +/* libs/pixelflinger/codeflinger/ARMAssembler.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the 
"License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#ifndef ANDROID_ARMASSEMBLER_H +#define ANDROID_ARMASSEMBLER_H + +#include <stdint.h> +#include <sys/types.h> + +#include <utils/Vector.h> +#include <utils/KeyedVector.h> + +#include "tinyutils/smartpointer.h" +#include "codeflinger/ARMAssemblerInterface.h" +#include "codeflinger/CodeCache.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +class ARMAssembler : public ARMAssemblerInterface +{ +public: + ARMAssembler(const sp<Assembly>& assembly); + virtual ~ARMAssembler(); + + uint32_t* base() const; + uint32_t* pc() const; + + + void disassemble(const char* name); + + // ------------------------------------------------------------------------ + // ARMAssemblerInterface... 
+ // ------------------------------------------------------------------------ + + virtual void reset(); + + virtual int generate(const char* name); + + virtual void prolog(); + virtual void epilog(uint32_t touched); + virtual void comment(const char* string); + + virtual void dataProcessing(int opcode, int cc, int s, + int Rd, int Rn, + uint32_t Op2); + virtual void MLA(int cc, int s, + int Rd, int Rm, int Rs, int Rn); + virtual void MUL(int cc, int s, + int Rd, int Rm, int Rs); + virtual void UMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs); + virtual void UMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs); + virtual void SMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs); + virtual void SMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs); + + virtual void B(int cc, uint32_t* pc); + virtual void BL(int cc, uint32_t* pc); + virtual void BX(int cc, int Rn); + virtual void label(const char* theLabel); + virtual void B(int cc, const char* label); + virtual void BL(int cc, const char* label); + + virtual uint32_t* pcForLabel(const char* label); + + virtual void LDR (int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)); + virtual void LDRB(int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)); + virtual void STR (int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)); + virtual void STRB(int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)); + virtual void LDRH (int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)); + virtual void LDRSB(int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)); + virtual void LDRSH(int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)); + virtual void STRH (int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)); + virtual void LDM(int cc, int dir, + int Rn, int W, uint32_t reg_list); + virtual void STM(int cc, int dir, + int Rn, int W, uint32_t reg_list); + + virtual void SWP(int cc, int Rn, int Rd, int Rm); + virtual void SWPB(int cc, int Rn, int Rd, int Rm); + 
virtual void SWI(int cc, uint32_t comment); + + virtual void PLD(int Rn, uint32_t offset); + virtual void CLZ(int cc, int Rd, int Rm); + virtual void QADD(int cc, int Rd, int Rm, int Rn); + virtual void QDADD(int cc, int Rd, int Rm, int Rn); + virtual void QSUB(int cc, int Rd, int Rm, int Rn); + virtual void QDSUB(int cc, int Rd, int Rm, int Rn); + virtual void SMUL(int cc, int xy, + int Rd, int Rm, int Rs); + virtual void SMULW(int cc, int y, + int Rd, int Rm, int Rs); + virtual void SMLA(int cc, int xy, + int Rd, int Rm, int Rs, int Rn); + virtual void SMLAL(int cc, int xy, + int RdHi, int RdLo, int Rs, int Rm); + virtual void SMLAW(int cc, int y, + int Rd, int Rm, int Rs, int Rn); + +private: + ARMAssembler(const ARMAssembler& rhs); + ARMAssembler& operator = (const ARMAssembler& rhs); + + sp<Assembly> mAssembly; + uint32_t* mBase; + uint32_t* mPC; + uint32_t* mPrologPC; + int64_t mDuration; +#if defined(WITH_LIB_HARDWARE) + bool mQemuTracing; +#endif + + struct branch_target_t { + inline branch_target_t() : label(0), pc(0) { } + inline branch_target_t(const char* l, uint32_t* p) + : label(l), pc(p) { } + const char* label; + uint32_t* pc; + }; + + Vector<branch_target_t> mBranchTargets; + KeyedVector< const char*, uint32_t* > mLabels; + KeyedVector< uint32_t*, const char* > mLabelsInverseMapping; + KeyedVector< uint32_t*, const char* > mComments; +}; + +}; // namespace android + +#endif //ANDROID_ARMASSEMBLER_H diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp new file mode 100644 index 0000000..7fa0de0 --- /dev/null +++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp @@ -0,0 +1,173 @@ +/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + + +#include <errno.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/types.h> + +#include <cutils/log.h> +#include "codeflinger/ARMAssemblerInterface.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +ARMAssemblerInterface::~ARMAssemblerInterface() +{ +} + +int ARMAssemblerInterface::buildImmediate( + uint32_t immediate, uint32_t& rot, uint32_t& imm) +{ + rot = 0; + imm = immediate; + if (imm > 0x7F) { // skip the easy cases + while (!(imm&3) || (imm&0xFC000000)) { + uint32_t newval; + newval = imm >> 2; + newval |= (imm&3) << 30; + imm = newval; + rot += 2; + if (rot == 32) { + rot = 0; + break; + } + } + } + rot = (16 - (rot>>1)) & 0xF; + + if (imm>=0x100) + return -EINVAL; + + if (((imm>>(rot<<1)) | (imm<<(32-(rot<<1)))) != immediate) + return -1; + + return 0; +} + +// shifters... 
+ +bool ARMAssemblerInterface::isValidImmediate(uint32_t immediate) +{ + uint32_t rot, imm; + return buildImmediate(immediate, rot, imm) == 0; +} + +uint32_t ARMAssemblerInterface::imm(uint32_t immediate) +{ + uint32_t rot, imm; + int err = buildImmediate(immediate, rot, imm); + + LOG_ALWAYS_FATAL_IF(err==-EINVAL, + "immediate %08x cannot be encoded", + immediate); + + LOG_ALWAYS_FATAL_IF(err, + "immediate (%08x) encoding bogus!", + immediate); + + return (1<<25) | (rot<<8) | imm; +} + +uint32_t ARMAssemblerInterface::reg_imm(int Rm, int type, uint32_t shift) +{ + return ((shift&0x1F)<<7) | ((type&0x3)<<5) | (Rm&0xF); +} + +uint32_t ARMAssemblerInterface::reg_rrx(int Rm) +{ + return (ROR<<5) | (Rm&0xF); +} + +uint32_t ARMAssemblerInterface::reg_reg(int Rm, int type, int Rs) +{ + return ((Rs&0xF)<<8) | ((type&0x3)<<5) | (1<<4) | (Rm&0xF); +} + +// addressing modes... +// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicate U=0) +uint32_t ARMAssemblerInterface::immed12_pre(int32_t immed12, int W) +{ + LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800, + "LDR(B)/STR(B)/PLD immediate too big (%08x)", + immed12); + return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) | + ((W&1)<<21) | (abs(immed12)&0x7FF); +} + +uint32_t ARMAssemblerInterface::immed12_post(int32_t immed12) +{ + LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800, + "LDR(B)/STR(B)/PLD immediate too big (%08x)", + immed12); + + return (((uint32_t(immed12)>>31)^1)<<23) | (abs(immed12)&0x7FF); +} + +uint32_t ARMAssemblerInterface::reg_scale_pre(int Rm, int type, + uint32_t shift, int W) +{ + return (1<<25) | (1<<24) | + (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) | + reg_imm(abs(Rm), type, shift); +} + +uint32_t ARMAssemblerInterface::reg_scale_post(int Rm, int type, uint32_t shift) +{ + return (1<<25) | (((uint32_t(Rm)>>31)^1)<<23) | reg_imm(abs(Rm), type, shift); +} + +// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicate U=0) +uint32_t ARMAssemblerInterface::immed8_pre(int32_t immed8, int 
W) +{ + uint32_t offset = abs(immed8); + + LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100, + "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)", + immed8); + + return (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) | + ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF)); +} + +uint32_t ARMAssemblerInterface::immed8_post(int32_t immed8) +{ + uint32_t offset = abs(immed8); + + LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100, + "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)", + immed8); + + return (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) | + (((offset&0xF0)<<4) | (offset&0xF)); +} + +uint32_t ARMAssemblerInterface::reg_pre(int Rm, int W) +{ + return (1<<24) | (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) | (abs(Rm)&0xF); +} + +uint32_t ARMAssemblerInterface::reg_post(int Rm) +{ + return (((uint32_t(Rm)>>31)^1)<<23) | (abs(Rm)&0xF); +} + + +}; // namespace android + diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.h b/libpixelflinger/codeflinger/ARMAssemblerInterface.h new file mode 100644 index 0000000..465b3bd --- /dev/null +++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.h @@ -0,0 +1,324 @@ +/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + + +#ifndef ANDROID_ARMASSEMBLER_INTERFACE_H +#define ANDROID_ARMASSEMBLER_INTERFACE_H + +#include <stdint.h> +#include <sys/types.h> + +namespace android { + +// ---------------------------------------------------------------------------- + +class ARMAssemblerInterface +{ +public: + virtual ~ARMAssemblerInterface(); + + enum { + EQ, NE, CS, CC, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV, + HS = CS, + LO = CC + }; + enum { + S = 1 + }; + enum { + LSL, LSR, ASR, ROR + }; + enum { + ED, FD, EA, FA, + IB, IA, DB, DA + }; + enum { + R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, + SP = R13, + LR = R14, + PC = R15 + }; + enum { + #define LIST(rr) L##rr=1<<rr + LIST(R0), LIST(R1), LIST(R2), LIST(R3), LIST(R4), LIST(R5), LIST(R6), + LIST(R7), LIST(R8), LIST(R9), LIST(R10), LIST(R11), LIST(R12), + LIST(R13), LIST(R14), LIST(R15), + LIST(SP), LIST(LR), LIST(PC), + #undef LIST + LSAVED = LR4|LR5|LR6|LR7|LR8|LR9|LR10|LR11 | LLR + }; + + // ----------------------------------------------------------------------- + // shifters and addressing modes + // ----------------------------------------------------------------------- + + // shifters... + static bool isValidImmediate(uint32_t immed); + static int buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm); + + static uint32_t imm(uint32_t immediate); + static uint32_t reg_imm(int Rm, int type, uint32_t shift); + static uint32_t reg_rrx(int Rm); + static uint32_t reg_reg(int Rm, int type, int Rs); + + // addressing modes... 
+ // LDR(B)/STR(B)/PLD + // (immediate and Rm can be negative, which indicates U=0) + static uint32_t immed12_pre(int32_t immed12, int W=0); + static uint32_t immed12_post(int32_t immed12); + static uint32_t reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0); + static uint32_t reg_scale_post(int Rm, int type=0, uint32_t shift=0); + + // LDRH/LDRSB/LDRSH/STRH + // (immediate and Rm can be negative, which indicates U=0) + static uint32_t immed8_pre(int32_t immed8, int W=0); + static uint32_t immed8_post(int32_t immed8); + static uint32_t reg_pre(int Rm, int W=0); + static uint32_t reg_post(int Rm); + + // ----------------------------------------------------------------------- + // basic instructions & code generation + // ----------------------------------------------------------------------- + + // generate the code + virtual void reset() = 0; + virtual int generate(const char* name) = 0; + virtual void disassemble(const char* name) = 0; + + // construct prolog and epilog + virtual void prolog() = 0; + virtual void epilog(uint32_t touched) = 0; + virtual void comment(const char* string) = 0; + + // data processing... + enum { + opAND, opEOR, opSUB, opRSB, opADD, opADC, opSBC, opRSC, + opTST, opTEQ, opCMP, opCMN, opORR, opMOV, opBIC, opMVN + }; + + virtual void + dataProcessing( int opcode, int cc, int s, + int Rd, int Rn, + uint32_t Op2) = 0; + + // multiply... + virtual void MLA(int cc, int s, + int Rd, int Rm, int Rs, int Rn) = 0; + virtual void MUL(int cc, int s, + int Rd, int Rm, int Rs) = 0; + virtual void UMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) = 0; + virtual void UMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) = 0; + virtual void SMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) = 0; + virtual void SMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) = 0; + + // branches... 
+ virtual void B(int cc, uint32_t* pc) = 0; + virtual void BL(int cc, uint32_t* pc) = 0; + virtual void BX(int cc, int Rn) = 0; + + virtual void label(const char* theLabel) = 0; + virtual void B(int cc, const char* label) = 0; + virtual void BL(int cc, const char* label) = 0; + + // valid only after generate() has been called + virtual uint32_t* pcForLabel(const char* label) = 0; + + // data transfer... + virtual void LDR (int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)) = 0; + virtual void LDRB(int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)) = 0; + virtual void STR (int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)) = 0; + virtual void STRB(int cc, int Rd, + int Rn, uint32_t offset = immed12_pre(0)) = 0; + + virtual void LDRH (int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)) = 0; + virtual void LDRSB(int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)) = 0; + virtual void LDRSH(int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)) = 0; + virtual void STRH (int cc, int Rd, + int Rn, uint32_t offset = immed8_pre(0)) = 0; + + // block data transfer... + virtual void LDM(int cc, int dir, + int Rn, int W, uint32_t reg_list) = 0; + virtual void STM(int cc, int dir, + int Rn, int W, uint32_t reg_list) = 0; + + // special... + virtual void SWP(int cc, int Rn, int Rd, int Rm) = 0; + virtual void SWPB(int cc, int Rn, int Rd, int Rm) = 0; + virtual void SWI(int cc, uint32_t comment) = 0; + + // DSP instructions... 
+ enum { + // B=0, T=1 + // yx + xyBB = 0, // 0000, + xyTB = 2, // 0010, + xyBT = 4, // 0100, + xyTT = 6, // 0110, + yB = 0, // 0000, + yT = 4, // 0100 + }; + + virtual void PLD(int Rn, uint32_t offset) = 0; + + virtual void CLZ(int cc, int Rd, int Rm) = 0; + + virtual void QADD(int cc, int Rd, int Rm, int Rn) = 0; + virtual void QDADD(int cc, int Rd, int Rm, int Rn) = 0; + virtual void QSUB(int cc, int Rd, int Rm, int Rn) = 0; + virtual void QDSUB(int cc, int Rd, int Rm, int Rn) = 0; + + virtual void SMUL(int cc, int xy, + int Rd, int Rm, int Rs) = 0; + virtual void SMULW(int cc, int y, + int Rd, int Rm, int Rs) = 0; + virtual void SMLA(int cc, int xy, + int Rd, int Rm, int Rs, int Rn) = 0; + virtual void SMLAL(int cc, int xy, + int RdHi, int RdLo, int Rs, int Rm) = 0; + virtual void SMLAW(int cc, int y, + int Rd, int Rm, int Rs, int Rn) = 0; + + // ----------------------------------------------------------------------- + // convenience... + // ----------------------------------------------------------------------- + inline void + ADC(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opADC, cc, s, Rd, Rn, Op2); + } + inline void + ADD(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opADD, cc, s, Rd, Rn, Op2); + } + inline void + AND(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opAND, cc, s, Rd, Rn, Op2); + } + inline void + BIC(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opBIC, cc, s, Rd, Rn, Op2); + } + inline void + EOR(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opEOR, cc, s, Rd, Rn, Op2); + } + inline void + MOV(int cc, int s, int Rd, uint32_t Op2) { + dataProcessing(opMOV, cc, s, Rd, 0, Op2); + } + inline void + MVN(int cc, int s, int Rd, uint32_t Op2) { + dataProcessing(opMVN, cc, s, Rd, 0, Op2); + } + inline void + ORR(int cc, int s, int Rd, int Rn, uint32_t Op2) { + dataProcessing(opORR, cc, s, Rd, Rn, Op2); + } + inline void + RSB(int cc, int s, int Rd, int Rn, 
uint32_t Op2) {
        dataProcessing(opRSB, cc, s, Rd, Rn, Op2);
    }
    // RSC / SBC / SUB: each forwards to dataProcessing() with the matching
    // opcode; cc, s, Rd, Rn and Op2 are passed through untouched.
    inline void
    RSC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
        dataProcessing(opRSC, cc, s, Rd, Rn, Op2);
    }
    inline void
    SBC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
        dataProcessing(opSBC, cc, s, Rd, Rn, Op2);
    }
    inline void
    SUB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
        dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
    }
    // TEQ / TST / CMP / CMN take no destination register: they always call
    // dataProcessing() with s=1 and Rd=0.
    inline void
    TEQ(int cc, int Rn, uint32_t Op2) {
        dataProcessing(opTEQ, cc, 1, 0, Rn, Op2);
    }
    inline void
    TST(int cc, int Rn, uint32_t Op2) {
        dataProcessing(opTST, cc, 1, 0, Rn, Op2);
    }
    inline void
    CMP(int cc, int Rn, uint32_t Op2) {
        dataProcessing(opCMP, cc, 1, 0, Rn, Op2);
    }
    inline void
    CMN(int cc, int Rn, uint32_t Op2) {
        dataProcessing(opCMN, cc, 1, 0, Rn, Op2);
    }

    // SMULxy: SMUL() with the xy selector fixed to xyBB / xyTB / xyBT / xyTT.
    // NOTE(review): presumably B/T pick the bottom/top halfword of each
    // operand -- confirm against the xy* constants' definition.
    inline void SMULBB(int cc, int Rd, int Rm, int Rs) {
        SMUL(cc, xyBB, Rd, Rm, Rs);    }
    inline void SMULTB(int cc, int Rd, int Rm, int Rs) {
        SMUL(cc, xyTB, Rd, Rm, Rs);    }
    inline void SMULBT(int cc, int Rd, int Rm, int Rs) {
        SMUL(cc, xyBT, Rd, Rm, Rs);    }
    inline void SMULTT(int cc, int Rd, int Rm, int Rs) {
        SMUL(cc, xyTT, Rd, Rm, Rs);    }

    // SMULWy: SMULW() with the y selector fixed to yB / yT.
    inline void SMULWB(int cc, int Rd, int Rm, int Rs) {
        SMULW(cc, yB, Rd, Rm, Rs);    }
    inline void SMULWT(int cc, int Rd, int Rm, int Rs) {
        SMULW(cc, yT, Rd, Rm, Rs);    }

    // SMLAxy: SMLA() with the xy selector fixed.
    inline void
    SMLABB(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLA(cc, xyBB, Rd, Rm, Rs, Rn);    }
    inline void
    SMLATB(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLA(cc, xyTB, Rd, Rm, Rs, Rn);    }
    inline void
    SMLABT(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLA(cc, xyBT, Rd, Rm, Rs, Rn);    }
    inline void
    SMLATT(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLA(cc, xyTT, Rd, Rm, Rs, Rn);    }

    // SMLALxy: SMLAL() with the xy selector fixed.
    inline void
    SMLALBB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
        SMLAL(cc, xyBB, RdHi, RdLo, Rs, Rm);    }
    inline void
    SMLALTB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
        SMLAL(cc, xyTB, RdHi, RdLo, Rs, Rm);    }
    inline void
    SMLALBT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
        SMLAL(cc, xyBT, RdHi, RdLo, Rs, Rm);    }
    inline void
    SMLALTT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
        SMLAL(cc, xyTT, RdHi, RdLo, Rs, Rm);    }

    // SMLAWy: SMLAW() with the y selector fixed.
    inline void
    SMLAWB(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLAW(cc, yB, Rd, Rm, Rs, Rn);    }
    inline void
    SMLAWT(int cc, int Rd, int Rm, int Rs, int Rn) {
        SMLAW(cc, yT, Rd, Rm, Rs, Rn);    }
};

}; // namespace android

#endif //ANDROID_ARMASSEMBLER_INTERFACE_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
new file mode 100644
index 0000000..18c4618
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
@@ -0,0 +1,200 @@
/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
+*/ + + +#include <stdint.h> +#include <sys/types.h> + +#include "codeflinger/ARMAssemblerProxy.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +ARMAssemblerProxy::ARMAssemblerProxy() + : mTarget(0) +{ +} + +ARMAssemblerProxy::ARMAssemblerProxy(ARMAssemblerInterface* target) + : mTarget(target) +{ +} + +ARMAssemblerProxy::~ARMAssemblerProxy() +{ + delete mTarget; +} + +void ARMAssemblerProxy::setTarget(ARMAssemblerInterface* target) +{ + delete mTarget; + mTarget = target; +} + +void ARMAssemblerProxy::reset() { + mTarget->reset(); +} +int ARMAssemblerProxy::generate(const char* name) { + return mTarget->generate(name); +} +void ARMAssemblerProxy::disassemble(const char* name) { + return mTarget->disassemble(name); +} +void ARMAssemblerProxy::prolog() { + mTarget->prolog(); +} +void ARMAssemblerProxy::epilog(uint32_t touched) { + mTarget->epilog(touched); +} +void ARMAssemblerProxy::comment(const char* string) { + mTarget->comment(string); +} + + +void ARMAssemblerProxy::dataProcessing( int opcode, int cc, int s, + int Rd, int Rn, uint32_t Op2) +{ + mTarget->dataProcessing(opcode, cc, s, Rd, Rn, Op2); +} + +void ARMAssemblerProxy::MLA(int cc, int s, int Rd, int Rm, int Rs, int Rn) { + mTarget->MLA(cc, s, Rd, Rm, Rs, Rn); +} +void ARMAssemblerProxy::MUL(int cc, int s, int Rd, int Rm, int Rs) { + mTarget->MUL(cc, s, Rd, Rm, Rs); +} +void ARMAssemblerProxy::UMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + mTarget->UMULL(cc, s, RdLo, RdHi, Rm, Rs); +} +void ARMAssemblerProxy::UMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + mTarget->UMUAL(cc, s, RdLo, RdHi, Rm, Rs); +} +void ARMAssemblerProxy::SMULL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + mTarget->SMULL(cc, s, RdLo, RdHi, Rm, Rs); +} +void ARMAssemblerProxy::SMUAL(int cc, int s, + int RdLo, int RdHi, int Rm, int Rs) { + mTarget->SMUAL(cc, s, RdLo, RdHi, Rm, Rs); +} + +void ARMAssemblerProxy::B(int cc, uint32_t* 
pc) { + mTarget->B(cc, pc); +} +void ARMAssemblerProxy::BL(int cc, uint32_t* pc) { + mTarget->BL(cc, pc); +} +void ARMAssemblerProxy::BX(int cc, int Rn) { + mTarget->BX(cc, Rn); +} +void ARMAssemblerProxy::label(const char* theLabel) { + mTarget->label(theLabel); +} +void ARMAssemblerProxy::B(int cc, const char* label) { + mTarget->B(cc, label); +} +void ARMAssemblerProxy::BL(int cc, const char* label) { + mTarget->BL(cc, label); +} + +uint32_t* ARMAssemblerProxy::pcForLabel(const char* label) { + return mTarget->pcForLabel(label); +} + +void ARMAssemblerProxy::LDR(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->LDR(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::LDRB(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->LDRB(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::STR(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->STR(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::STRB(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->STRB(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::LDRH(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->LDRH(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::LDRSB(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->LDRSB(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::LDRSH(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->LDRSH(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::STRH(int cc, int Rd, int Rn, uint32_t offset) { + mTarget->STRH(cc, Rd, Rn, offset); +} +void ARMAssemblerProxy::LDM(int cc, int dir, int Rn, int W, uint32_t reg_list) { + mTarget->LDM(cc, dir, Rn, W, reg_list); +} +void ARMAssemblerProxy::STM(int cc, int dir, int Rn, int W, uint32_t reg_list) { + mTarget->STM(cc, dir, Rn, W, reg_list); +} + +void ARMAssemblerProxy::SWP(int cc, int Rn, int Rd, int Rm) { + mTarget->SWP(cc, Rn, Rd, Rm); +} +void ARMAssemblerProxy::SWPB(int cc, int Rn, int Rd, int Rm) { + mTarget->SWPB(cc, Rn, Rd, Rm); +} +void ARMAssemblerProxy::SWI(int cc, uint32_t comment) { + mTarget->SWI(cc, comment); +} + + +void 
ARMAssemblerProxy::PLD(int Rn, uint32_t offset) { + mTarget->PLD(Rn, offset); +} +void ARMAssemblerProxy::CLZ(int cc, int Rd, int Rm) { + mTarget->CLZ(cc, Rd, Rm); +} +void ARMAssemblerProxy::QADD(int cc, int Rd, int Rm, int Rn) { + mTarget->QADD(cc, Rd, Rm, Rn); +} +void ARMAssemblerProxy::QDADD(int cc, int Rd, int Rm, int Rn) { + mTarget->QDADD(cc, Rd, Rm, Rn); +} +void ARMAssemblerProxy::QSUB(int cc, int Rd, int Rm, int Rn) { + mTarget->QSUB(cc, Rd, Rm, Rn); +} +void ARMAssemblerProxy::QDSUB(int cc, int Rd, int Rm, int Rn) { + mTarget->QDSUB(cc, Rd, Rm, Rn); +} +void ARMAssemblerProxy::SMUL(int cc, int xy, int Rd, int Rm, int Rs) { + mTarget->SMUL(cc, xy, Rd, Rm, Rs); +} +void ARMAssemblerProxy::SMULW(int cc, int y, int Rd, int Rm, int Rs) { + mTarget->SMULW(cc, y, Rd, Rm, Rs); +} +void ARMAssemblerProxy::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn) { + mTarget->SMLA(cc, xy, Rd, Rm, Rs, Rn); +} +void ARMAssemblerProxy::SMLAL( int cc, int xy, + int RdHi, int RdLo, int Rs, int Rm) { + mTarget->SMLAL(cc, xy, RdHi, RdLo, Rs, Rm); +} +void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) { + mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn); +} + + +}; // namespace android + diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.h b/libpixelflinger/codeflinger/ARMAssemblerProxy.h new file mode 100644 index 0000000..4bdca9c --- /dev/null +++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.h @@ -0,0 +1,123 @@ +/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


#ifndef ANDROID_ARMASSEMBLER_PROXY_H
#define ANDROID_ARMASSEMBLER_PROXY_H

#include <stdint.h>
#include <sys/types.h>

#include "codeflinger/ARMAssemblerInterface.h"

namespace android {

// ----------------------------------------------------------------------------

// An ARMAssemblerInterface implementation that forwards every call to another
// ARMAssemblerInterface (the "target"). Subclasses can override individual
// methods to intercept them while everything else reaches the target as-is.
//
// The class holds the target through a raw owning pointer and declares no
// copy constructor / assignment operator.
// NOTE(review): copying a proxy would presumably make two objects delete the
// same target -- confirm that instances are never copied.
class ARMAssemblerProxy : public ARMAssemblerInterface
{
public:
    // ARMAssemblerProxy take ownership of the target

    // Creates a proxy without a target (mTarget is 0); setTarget() has to be
    // called before any forwarding method is used.
                ARMAssemblerProxy();
    // Creates a proxy that forwards to, and owns, `target`.
                ARMAssemblerProxy(ARMAssemblerInterface* target);
    // Deletes the owned target.
    virtual     ~ARMAssemblerProxy();

    // Deletes the current target and takes ownership of `target`.
    void setTarget(ARMAssemblerInterface* target);

    // Everything below forwards to the same-named method of the target.

    virtual void    reset();
    virtual int     generate(const char* name);
    virtual void    disassemble(const char* name);

    virtual void    prolog();
    virtual void    epilog(uint32_t touched);
    virtual void    comment(const char* string);

    virtual void    dataProcessing(int opcode, int cc, int s,
                                int Rd, int Rn,
                                uint32_t Op2);
    virtual void MLA(int cc, int s,
                int Rd, int Rm, int Rs, int Rn);
    virtual void MUL(int cc, int s,
                int Rd, int Rm, int Rs);
    virtual void UMULL(int cc, int s,
                int RdLo, int RdHi, int Rm, int Rs);
    virtual void UMUAL(int cc, int s,
                int RdLo, int RdHi, int Rm, int Rs);
    virtual void SMULL(int cc, int s,
                int RdLo, int RdHi, int Rm, int Rs);
    virtual void SMUAL(int cc, int s,
                int RdLo, int RdHi, int Rm, int Rs);

    virtual void B(int cc, uint32_t* pc);
    virtual void BL(int cc, uint32_t* pc);
    virtual void BX(int cc, int Rn);
    virtual void label(const char* theLabel);
    virtual void B(int cc, const char* label);
    virtual void BL(int cc, const char* label);

    // Not declared virtual here, unlike its neighbours.
    // NOTE(review): whether it overrides a virtual depends on the base class
    // declaration, which is not visible here -- confirm.
    uint32_t* pcForLabel(const char* label);

    // The default offsets below are immed12_pre(0) / immed8_pre(0).
    virtual void LDR (int cc, int Rd,
                int Rn, uint32_t offset = immed12_pre(0));
    virtual void LDRB(int cc, int Rd,
                int Rn, uint32_t offset = immed12_pre(0));
    virtual void STR (int cc, int Rd,
                int Rn, uint32_t offset = immed12_pre(0));
    virtual void STRB(int cc, int Rd,
                int Rn, uint32_t offset = immed12_pre(0));
    virtual void LDRH (int cc, int Rd,
                int Rn, uint32_t offset = immed8_pre(0));
    virtual void LDRSB(int cc, int Rd,
                int Rn, uint32_t offset = immed8_pre(0));
    virtual void LDRSH(int cc, int Rd,
                int Rn, uint32_t offset = immed8_pre(0));
    virtual void STRH (int cc, int Rd,
                int Rn, uint32_t offset = immed8_pre(0));
    virtual void LDM(int cc, int dir,
                int Rn, int W, uint32_t reg_list);
    virtual void STM(int cc, int dir,
                int Rn, int W, uint32_t reg_list);

    virtual void SWP(int cc, int Rn, int Rd, int Rm);
    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
    virtual void SWI(int cc, uint32_t comment);

    virtual void PLD(int Rn, uint32_t offset);
    virtual void CLZ(int cc, int Rd, int Rm);
    virtual void QADD(int cc, int Rd, int Rm, int Rn);
    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
    virtual void SMUL(int cc, int xy,
                int Rd, int Rm, int Rs);
    virtual void SMULW(int cc, int y,
                int Rd, int Rm, int Rs);
    virtual void SMLA(int cc, int xy,
                int Rd, int Rm, int Rs, int Rn);
    virtual void SMLAL(int cc, int xy,
                int RdHi, int RdLo, int Rs, int Rm);
    virtual void SMLAW(int cc, int y,
                int Rd, int Rm, int Rs, int Rn);

private:
    // Owned forwarding target; 0 until one is installed.
    ARMAssemblerInterface*  mTarget;
};

}; // namespace android

#endif //ANDROID_ARMASSEMBLER_PROXY_H
diff --git a/libpixelflinger/codeflinger/CodeCache.cpp b/libpixelflinger/codeflinger/CodeCache.cpp
new file mode 100644
index 0000000..29410c8
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.cpp
@@ -0,0 +1,151 @@
/*
libs/pixelflinger/codeflinger/CodeCache.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> + +#include <cutils/log.h> +#include <cutils/atomic.h> + +#include "codeflinger/CodeCache.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +#if defined(__arm__) +#include <unistd.h> +#include <errno.h> +#endif + +// ---------------------------------------------------------------------------- + +Assembly::Assembly(size_t size) + : mCount(1), mSize(0) +{ + mBase = (uint32_t*)malloc(size); + if (mBase) { + mSize = size; + } +} + +Assembly::~Assembly() +{ + free(mBase); +} + +void Assembly::incStrong(const void*) const +{ + android_atomic_inc(&mCount); +} + +void Assembly::decStrong(const void*) const +{ + if (android_atomic_dec(&mCount) == 1) { + delete this; + } +} + +ssize_t Assembly::size() const +{ + if (!mBase) return NO_MEMORY; + return mSize; +} + +uint32_t* Assembly::base() const +{ + return mBase; +} + +ssize_t Assembly::resize(size_t newSize) +{ + mBase = (uint32_t*)realloc(mBase, newSize); + mSize = newSize; + return size(); +} + +// ---------------------------------------------------------------------------- + +CodeCache::CodeCache(size_t size) + : mCacheSize(size), mCacheInUse(0) +{ + pthread_mutex_init(&mLock, 0); +} + +CodeCache::~CodeCache() +{ + 
pthread_mutex_destroy(&mLock); +} + +sp<Assembly> CodeCache::lookup(const AssemblyKeyBase& keyBase) const +{ + pthread_mutex_lock(&mLock); + sp<Assembly> r; + ssize_t index = mCacheData.indexOfKey(key_t(keyBase)); + if (index >= 0) { + const cache_entry_t& e = mCacheData.valueAt(index); + e.when = mWhen++; + r = e.entry; + } + pthread_mutex_unlock(&mLock); + return r; +} + +int CodeCache::cache( const AssemblyKeyBase& keyBase, + const sp<Assembly>& assembly) +{ + pthread_mutex_lock(&mLock); + + const ssize_t assemblySize = assembly->size(); + while (mCacheInUse + assemblySize > mCacheSize) { + // evict the LRU + size_t lru = 0; + size_t count = mCacheData.size(); + for (size_t i=0 ; i<count ; i++) { + const cache_entry_t& e = mCacheData.valueAt(i); + if (e.when < mCacheData.valueAt(lru).when) { + lru = i; + } + } + const cache_entry_t& e = mCacheData.valueAt(lru); + mCacheInUse -= e.entry->size(); + mCacheData.removeItemsAt(lru); + } + + ssize_t err = mCacheData.add(key_t(keyBase), cache_entry_t(assembly, mWhen)); + if (err >= 0) { + mCacheInUse += assemblySize; + mWhen++; + // synchronize caches... +#if defined(__arm__) + const long base = long(assembly->base()); + const long curr = base + long(assembly->size()); + err = cacheflush(base, curr, 0); + LOGE_IF(err, "__ARM_NR_cacheflush error %s\n", + strerror(errno)); +#endif + } + + pthread_mutex_unlock(&mLock); + return err; +} + +// ---------------------------------------------------------------------------- + +}; // namespace android diff --git a/libpixelflinger/codeflinger/CodeCache.h b/libpixelflinger/codeflinger/CodeCache.h new file mode 100644 index 0000000..370ce17 --- /dev/null +++ b/libpixelflinger/codeflinger/CodeCache.h @@ -0,0 +1,134 @@ +/* libs/pixelflinger/codeflinger/CodeCache.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + + +#ifndef ANDROID_CODECACHE_H +#define ANDROID_CODECACHE_H + +#include <stdint.h> +#include <pthread.h> +#include <sys/types.h> + +#include <utils/KeyedVector.h> + +#include "tinyutils/smartpointer.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +class AssemblyKeyBase { +public: + virtual ~AssemblyKeyBase() { } + virtual int compare_type(const AssemblyKeyBase& key) const = 0; +}; + +template <typename T> +class AssemblyKey : public AssemblyKeyBase +{ +public: + AssemblyKey(const T& rhs) : mKey(rhs) { } + virtual int compare_type(const AssemblyKeyBase& key) const { + const T& rhs = static_cast<const AssemblyKey&>(key).mKey; + return android::compare_type(mKey, rhs); + } +private: + T mKey; +}; + +// ---------------------------------------------------------------------------- + +class Assembly +{ +public: + Assembly(size_t size); + virtual ~Assembly(); + + ssize_t size() const; + uint32_t* base() const; + ssize_t resize(size_t size); + + // protocol for sp<> + void incStrong(const void* id) const; + void decStrong(const void* id) const; + typedef void weakref_type; + +private: + mutable int32_t mCount; + uint32_t* mBase; + ssize_t mSize; +}; + +// ---------------------------------------------------------------------------- + +class CodeCache +{ +public: +// pretty simple cache API... 
+ CodeCache(size_t size); + ~CodeCache(); + + sp<Assembly> lookup(const AssemblyKeyBase& key) const; + + int cache( const AssemblyKeyBase& key, + const sp<Assembly>& assembly); + +private: + // nothing to see here... + struct cache_entry_t { + inline cache_entry_t() { } + inline cache_entry_t(const sp<Assembly>& a, int64_t w) + : entry(a), when(w) { } + sp<Assembly> entry; + mutable int64_t when; + }; + + class key_t { + friend int compare_type( + const key_value_pair_t<key_t, cache_entry_t>&, + const key_value_pair_t<key_t, cache_entry_t>&); + const AssemblyKeyBase* mKey; + public: + key_t() { }; + key_t(const AssemblyKeyBase& k) : mKey(&k) { } + }; + + mutable pthread_mutex_t mLock; + mutable int64_t mWhen; + size_t mCacheSize; + size_t mCacheInUse; + KeyedVector<key_t, cache_entry_t> mCacheData; + + friend int compare_type( + const key_value_pair_t<key_t, cache_entry_t>&, + const key_value_pair_t<key_t, cache_entry_t>&); +}; + +// KeyedVector uses compare_type(), which is more efficient, than +// just using operator < () +inline int compare_type( + const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& lhs, + const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& rhs) +{ + return lhs.key.mKey->compare_type(*(rhs.key.mKey)); +} + +// ---------------------------------------------------------------------------- + +}; // namespace android + +#endif //ANDROID_CODECACHE_H diff --git a/libpixelflinger/codeflinger/GGLAssembler.cpp b/libpixelflinger/codeflinger/GGLAssembler.cpp new file mode 100644 index 0000000..90c275e --- /dev/null +++ b/libpixelflinger/codeflinger/GGLAssembler.cpp @@ -0,0 +1,1135 @@ +/* libs/pixelflinger/codeflinger/GGLAssembler.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#define LOG_TAG "GGLAssembler" + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <cutils/log.h> + +#include "codeflinger/GGLAssembler.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +GGLAssembler::GGLAssembler(ARMAssemblerInterface* target) + : ARMAssemblerProxy(target), RegisterAllocator(), mOptLevel(7) +{ +} + +GGLAssembler::~GGLAssembler() +{ +} + +void GGLAssembler::prolog() +{ + ARMAssemblerProxy::prolog(); +} + +void GGLAssembler::epilog(uint32_t touched) +{ + ARMAssemblerProxy::epilog(touched); +} + +void GGLAssembler::reset(int opt_level) +{ + ARMAssemblerProxy::reset(); + RegisterAllocator::reset(); + mOptLevel = opt_level; +} + +// --------------------------------------------------------------------------- + +int GGLAssembler::scanline(const needs_t& needs, context_t const* c) +{ + int err = 0; + int opt_level = mOptLevel; + while (opt_level >= 0) { + reset(opt_level); + err = scanline_core(needs, c); + if (err == 0) + break; + opt_level--; + } + + // XXX: in theory, pcForLabel is not valid before generate() + uint32_t* fragment_start_pc = pcForLabel("fragment_loop"); + uint32_t* fragment_end_pc = pcForLabel("epilog"); + const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc); + + // build a name for our pipeline + char name[64]; + sprintf(name, + "scanline__%08X:%08X_%08X_%08X [%3d ipp]", + needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops); + + if (err) { + LOGE("Error while 
generating ""%s""\n", name); + disassemble(name); + return -1; + } + + return generate(name); +} + +int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c) +{ + int64_t duration = ggl_system_time(); + + mBlendFactorCached = 0; + mBlending = 0; + mMasking = 0; + mAA = GGL_READ_NEEDS(P_AA, needs.p); + mDithering = GGL_READ_NEEDS(P_DITHER, needs.p); + mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER; + mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER; + mFog = GGL_READ_NEEDS(P_FOG, needs.p) != 0; + mSmooth = GGL_READ_NEEDS(SHADE, needs.n) != 0; + mBuilderContext.needs = needs; + mBuilderContext.c = c; + mBuilderContext.Rctx = reserveReg(R0); // context always in R0 + mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ]; + + // ------------------------------------------------------------------------ + + decodeLogicOpNeeds(needs); + + decodeTMUNeeds(needs, c); + + mBlendSrc = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n)); + mBlendDst = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n)); + mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n)); + mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n)); + + if (!mCbFormat.c[GGLFormat::ALPHA].h) { + if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendSrc == GGL_DST_ALPHA)) { + mBlendSrc = GGL_ONE; + } + if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendSrcA == GGL_DST_ALPHA)) { + mBlendSrcA = GGL_ONE; + } + if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendDst == GGL_DST_ALPHA)) { + mBlendDst = GGL_ONE; + } + if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendDstA == GGL_DST_ALPHA)) { + mBlendDstA = GGL_ONE; + } + } + + // if we need the framebuffer, read it now + const int blending = blending_codes(mBlendSrc, mBlendDst) | + blending_codes(mBlendSrcA, mBlendDstA); + + // XXX: handle special cases, destination not modified... 
    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
        // Destination unmodified (beware of logic ops)
    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
        // Destination is zero (beware of logic ops)
    }

    // Per-component bookkeeping: decide, for each of the four components,
    // whether it is masked, written to the destination, needed as an
    // intermediate, replaced by a texture, iterated, smooth-shaded, fogged
    // and/or blended. mBlending and mMasking collect the per-component bits.
    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
    for (int i=0 ; i<4 ; i++) {
        const int mask = 1<<i;
        component_info_t& info = mInfo[i];
        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
            fs = GGL_ONE;
        info.masked =   !!(masking & mask);
        info.inDest =   !info.masked && mCbFormat.c[i].h &&
                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
        if (mCbFormat.components >= GGL_LUMINANCE &&
                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
            info.inDest = false;
        }
        info.needed =   (i==GGLFormat::ALPHA) &&
                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
        info.replaced = !!(mTextureMachine.replaced & mask);
        info.iterated = (!info.replaced && (info.inDest || info.needed));
        info.smooth =   mSmooth && info.iterated;
        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));

        mBlending |= (info.blend ? mask : 0);
        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
    }


    fragment_parts_t parts;

    // ------------------------------------------------------------------------
    prolog();
    // ------------------------------------------------------------------------

    build_scanline_prolog(parts, needs);

    // a non-zero register file status aborts this attempt; the value is
    // returned to the caller as the error
    if (registerFile().status())
        return registerFile().status();

    // ------------------------------------------------------------------------
    label("fragment_loop");
    // ------------------------------------------------------------------------
    {
        Scratch regs(registerFile());

        if (mDithering) {
            // update the dither index.
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
            ADD(AL, 0, parts.count.reg, parts.count.reg,
                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
        }

        // XXX: could we do an early alpha-test here in some cases?
        // It would probably be used only with smooth-alpha and no texture
        // (or no alpha component in the texture).

        // Early z-test
        if (mAlphaTest==GGL_ALWAYS) {
            build_depth_test(parts, Z_TEST|Z_WRITE);
        } else {
            // we cannot do the z-write here, because
            // it might be killed by the alpha-test later
            build_depth_test(parts, Z_TEST);
        }

        { // texture coordinates
            Scratch scratches(registerFile());

            // texel generation
            build_textures(parts, regs);
        }

        if ((blending & (FACTOR_DST|BLEND_DST)) || mMasking ||
                (mLogicOp & LOGIC_OP_DST)) {
            // blending / logic_op / masking need the framebuffer
            mDstPixel.setTo(regs.obtain(), &mCbFormat);

            // load the framebuffer pixel
            comment("fetch color-buffer");
            load(parts.cbPtr, mDstPixel);
        }

        if (registerFile().status())
            return registerFile().status();

        pixel_t pixel;
        int directTex = mTextureMachine.directTexture;
        if (directTex | parts.packed) {
            // note: we can't have both here
            // iterated color or direct texture
            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
            pixel.flags &= ~CORRUPTIBLE;
        } else {
            if (mDithering) {
                // fetch the dither value: a byte of the context's
                // ditherMatrix, indexed by the low bits of parts.count
                const int ctxtReg = mBuilderContext.Rctx;
                const int mask = GGL_DITHER_SIZE-1;
                parts.dither = reg_t(regs.obtain());
                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
                ADD(AL, 0, parts.dither.reg, parts.dither.reg, ctxtReg);
                LDRB(AL, parts.dither.reg, parts.dither.reg,
                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
            }

            // allocate a register for the resulting pixel
            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);

            // alpha is built first: the alpha test lives in its path and the
            // z-write below relies on it having run
            build_component(pixel, parts, GGLFormat::ALPHA, regs);

            if (mAlphaTest!=GGL_ALWAYS) {
                // only handle the z-write part here. We know z-test
                // was successful, as well as alpha-test.
                build_depth_test(parts, Z_WRITE);
            }

            build_component(pixel, parts, GGLFormat::RED, regs);
            build_component(pixel, parts, GGLFormat::GREEN, regs);
            build_component(pixel, parts, GGLFormat::BLUE, regs);

            pixel.flags |= CORRUPTIBLE;
        }

        if (registerFile().status())
            return registerFile().status();

        if (pixel.reg == -1) {
            // be defensive here. if we're here it's probably
            // that this whole fragment is a no-op.
            pixel = mDstPixel;
        }

        // logic operation
        build_logic_op(pixel, regs);

        // masking
        build_masking(pixel, regs);

        comment("store");
        store(parts.cbPtr, pixel, WRITE_BACK);
    }

    if (registerFile().status())
        return registerFile().status();

    // update the iterated color...
    if (parts.reload != 3) {
        build_smooth_shade(parts);
    }

    // update iterated z
    build_iterate_z(parts);

    // update iterated fog
    build_iterate_f(parts);

    // the pixel counter lives in the upper 16 bits of parts.count (see
    // build_scanline_prolog): subtract one from it, setting the flags, and
    // branch back on the PL condition
    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
    B(PL, "fragment_loop");
    label("epilog");
    epilog(registerFile().touched());

    // Out-of-line path for discarded fragments, only emitted when an alpha or
    // depth test exists: it advances the iterators and the color-buffer
    // pointer, then decrements the counter and branches back to the loop.
    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
        if (mDepthTest!=GGL_ALWAYS) {
            label("discard_before_textures");
            build_iterate_texture_coordinates(parts);
        }
        label("discard_after_textures");
        build_smooth_shade(parts);
        build_iterate_z(parts);
        build_iterate_f(parts);
        ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
        B(PL, "fragment_loop");
        epilog(registerFile().touched());
    }

    return registerFile().status();
}

// ---------------------------------------------------------------------------

// Emits the per-scanline setup code and fills in `parts`:
//  - parts.count : pixel count minus one in the upper 16 bits, plus the
//                  dither offset in the low bits when dithering is enabled
//  - parts.cbPtr : pointer to the first color-buffer pixel
//  - initial fog and Z values (stored back into the context), the texture
//    coordinates, the iterated color and, with anti-aliasing, parts.covPtr.
void GGLAssembler::build_scanline_prolog(
    fragment_parts_t& parts, const needs_t& needs)
{
    Scratch scratches(registerFile());
    int Rctx = mBuilderContext.Rctx;

    // compute count
    comment("compute ct (# of pixels to process)");
    parts.count.setTo(obtainReg());
    int Rx = scratches.obtain();
    int Ry = scratches.obtain();
    CONTEXT_LOAD(Rx, iterators.xl);
    CONTEXT_LOAD(parts.count.reg, iterators.xr);
    CONTEXT_LOAD(Ry, iterators.y);

    // parts.count = iterators.xr - Rx
    SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
    SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));

    if (mDithering) {
        // parts.count.reg = 0xNNNNXXDD
        // NNNN = count-1
        // DD   = dither offset
        // XX   = 0xxxxxxx (x = garbage)
        Scratch scratches(registerFile());
        int tx = scratches.obtain();
        int ty = scratches.obtain();
        AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
        AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
        ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
        ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
    } else {
        // parts.count.reg = 0xNNNN0000
        // NNNN = count-1
        MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
    }

    // compute dst ptr
    comment("compute color-buffer pointer");
    const int cb_bits = mCbFormat.size*8;
    int Rs = scratches.obtain();
    parts.cbPtr.setTo(obtainReg(), cb_bits);
    CONTEXT_LOAD(Rs, state.buffers.color.stride);
    CONTEXT_LOAD(parts.cbPtr.reg, state.buffers.color.data);
    SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
    base_offset(parts.cbPtr, parts.cbPtr, Rs);
    scratches.recycle(Rs);

    // init fog
    const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
    if (need_fog) {
        comment("compute initial fog coordinate");
        Scratch scratches(registerFile());
        int dfdx = scratches.obtain();
        int ydfdy = scratches.obtain();
        int f = ydfdy;      // f reuses ydfdy's register
        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
        CONTEXT_LOAD(ydfdy, iterators.ydfdy);
        MLA(AL, 0, f, Rx, dfdx, ydfdy);
        CONTEXT_STORE(f, generated_vars.f);
    }

    // init Z coordinate
    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
        parts.z = reg_t(obtainReg());
        comment("compute initial Z coordinate");
        Scratch scratches(registerFile());
        int dzdx = scratches.obtain();
        int ydzdy = parts.z.reg;    // loaded straight into the Z register
        CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
        CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
        MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);

        // we're going to index zbase of parts.count
        // zbase = base + (xl-count + stride*y)*2
        int Rs = dzdx;      // dzdx is dead from here on, reuse its register
        int zbase = scratches.obtain();
        CONTEXT_LOAD(Rs, state.buffers.depth.stride);
        CONTEXT_LOAD(zbase, state.buffers.depth.data);
        SMLABB(AL, Rs, Ry, Rs, Rx);
        ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
        ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
        CONTEXT_STORE(zbase, generated_vars.zbase);
    }

    // init texture coordinates
    init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
    scratches.recycle(Ry);

    // iterated color
    init_iterated_color(parts, reg_t(Rx));

    // init coverage factor application (anti-aliasing)
    if (mAA) {
        parts.covPtr.setTo(obtainReg(), 16);
        CONTEXT_LOAD(parts.covPtr.reg, state.buffers.coverage);
        ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
    }
}

// ---------------------------------------------------------------------------

// Emits the code producing one component of the output pixel.
// The incoming value is always built (this is also where the alpha-specific
// work happens); it is blended and merged into `pixel` only when the
// component is written to the destination (mInfo[component].inDest).
// `component` indexes comments[] below, so it must be in the range 0..3.
void GGLAssembler::build_component( pixel_t& pixel,
                                    const fragment_parts_t& parts,
                                    int component,
                                    Scratch& regs)
{
    static char const * comments[] = {"alpha", "red", "green", "blue"};
    comment(comments[component]);

    // local register file
    Scratch scratches(registerFile());
    const int dst_component_size = pixel.component_size(component);

    component_t temp(-1);
    build_incoming_component( temp, dst_component_size,
            parts, component, scratches, regs);

    if (mInfo[component].inDest) {

        // blending...
        build_blending( temp, mDstPixel, component, scratches );

        // downshift component and rebuild pixel...
        downshift(pixel, component, temp, parts.dither);
    }
}

void GGLAssembler::build_incoming_component(
        component_t& temp,
        int dst_size,
        const fragment_parts_t& parts,
        int component,
        Scratch& scratches,
        Scratch& global_regs)
{
    const uint32_t component_mask = 1<<component;

    // Figure out what we need for the blending stage...
    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
        fs = GGL_ONE;
    }

    // Figure out what we need to extract and for what reason
    const int blending = blending_codes(fs, fd);

    // Are we actually going to blend?
+ const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO)); + + // expand the source if the destination has more bits + int need_expander = false; + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) { + texture_unit_t& tmu = mTextureMachine.tmu[i]; + if ((tmu.format_idx) && + (parts.texel[i].component_size(component) < dst_size)) { + need_expander = true; + } + } + + // do we need to extract this component? + const bool multiTexture = mTextureMachine.activeUnits > 1; + const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) && + (isAlphaSourceNeeded()); + int need_extract = mInfo[component].needed; + if (mInfo[component].inDest) + { + need_extract |= ((need_blending ? + (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander)); + need_extract |= (mTextureMachine.mask != mTextureMachine.replaced); + need_extract |= mInfo[component].smooth; + need_extract |= mInfo[component].fog; + need_extract |= mDithering; + need_extract |= multiTexture; + } + + if (need_extract) { + Scratch& regs = blend_needs_alpha_source ? 
global_regs : scratches; + component_t fragment; + + // iterated color + build_iterated_color(fragment, parts, component, regs); + + // texture environement (decal, modulate, replace) + build_texture_environment(fragment, parts, component, regs); + + // expand the source if the destination has more bits + if (need_expander && (fragment.size() < dst_size)) { + // we're here only if we fetched a texel + // (so we know for sure fragment is CORRUPTIBLE) + expand(fragment, fragment, dst_size); + } + + // We have a few specific things to do for the alpha-channel + if ((component==GGLFormat::ALPHA) && + (mInfo[component].needed || fragment.size()<dst_size)) + { + // convert to integer_t first and make sure + // we don't corrupt a needed register + if (fragment.l) { + component_t incoming(fragment); + modify(fragment, regs); + MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l)); + fragment.h -= fragment.l; + fragment.l = 0; + } + + // coverage factor application + build_coverage_application(fragment, parts, regs); + + // alpha-test + build_alpha_test(fragment, parts); + + if (blend_needs_alpha_source) { + // We keep only 8 bits for the blending stage + const int shift = fragment.h <= 8 ? 0 : fragment.h-8; + if (fragment.flags & CORRUPTIBLE) { + fragment.flags &= ~CORRUPTIBLE; + mAlphaSource.setTo(fragment.reg, + fragment.size(), fragment.flags); + if (shift) { + MOV(AL, 0, mAlphaSource.reg, + reg_imm(mAlphaSource.reg, LSR, shift)); + } + } else { + // XXX: it would better to do this in build_blend_factor() + // so we can avoid the extra MOV below. + mAlphaSource.setTo(regs.obtain(), + fragment.size(), CORRUPTIBLE); + if (shift) { + MOV(AL, 0, mAlphaSource.reg, + reg_imm(fragment.reg, LSR, shift)); + } else { + MOV(AL, 0, mAlphaSource.reg, fragment.reg); + } + } + mAlphaSource.s -= shift; + } + } + + // fog... 
+ build_fog( fragment, component, regs ); + + temp = fragment; + } else { + if (mInfo[component].inDest) { + // extraction not needed and replace + // we just select the right component + if ((mTextureMachine.replaced & component_mask) == 0) { + // component wasn't replaced, so use it! + temp = component_t(parts.iterated, component); + } + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) { + const texture_unit_t& tmu = mTextureMachine.tmu[i]; + if ((tmu.mask & component_mask) && + ((tmu.replaced & component_mask) == 0)) { + temp = component_t(parts.texel[i], component); + } + } + } + } +} + +bool GGLAssembler::isAlphaSourceNeeded() const +{ + // XXX: also needed for alpha-test + const int bs = mBlendSrc; + const int bd = mBlendDst; + return bs==GGL_SRC_ALPHA_SATURATE || + bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA || + bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts) +{ + if (mSmooth && !parts.iterated_packed) { + // update the iterated color in a pipelined way... 
+ comment("update iterated color"); + Scratch scratches(registerFile()); + + const int reload = parts.reload; + for (int i=0 ; i<4 ; i++) { + if (!mInfo[i].iterated) + continue; + + int c = parts.argb[i].reg; + int dx = parts.argb_dx[i].reg; + + if (reload & 1) { + c = scratches.obtain(); + CONTEXT_LOAD(c, generated_vars.argb[i].c); + } + if (reload & 2) { + dx = scratches.obtain(); + CONTEXT_LOAD(dx, generated_vars.argb[i].dx); + } + + if (mSmooth) { + ADD(AL, 0, c, c, dx); + } + + if (reload & 1) { + CONTEXT_STORE(c, generated_vars.argb[i].c); + scratches.recycle(c); + } + if (reload & 2) { + scratches.recycle(dx); + } + } + } +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::build_coverage_application(component_t& fragment, + const fragment_parts_t& parts, Scratch& regs) +{ + // here fragment.l is guarenteed to be 0 + if (mAA) { + // coverages are 1.15 fixed-point numbers + comment("coverage application"); + + component_t incoming(fragment); + modify(fragment, regs); + + Scratch scratches(registerFile()); + int cf = scratches.obtain(); + LDRH(AL, cf, parts.covPtr.reg, immed8_post(2)); + if (fragment.h > 31) { + fragment.h--; + SMULWB(AL, fragment.reg, incoming.reg, cf); + } else { + MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1)); + SMULWB(AL, fragment.reg, fragment.reg, cf); + } + } +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::build_alpha_test(component_t& fragment, + const fragment_parts_t& parts) +{ + if (mAlphaTest != GGL_ALWAYS) { + comment("Alpha Test"); + Scratch scratches(registerFile()); + int ref = scratches.obtain(); + const int shift = GGL_COLOR_BITS-fragment.size(); + CONTEXT_LOAD(ref, state.alpha_test.ref); + if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift)); + else CMP(AL, fragment.reg, ref); + int cc = NV; + switch (mAlphaTest) { + case GGL_NEVER: cc = NV; break; + case GGL_LESS: cc = LT; break; + case GGL_EQUAL: 
cc = EQ; break; + case GGL_LEQUAL: cc = LS; break; + case GGL_GREATER: cc = HI; break; + case GGL_NOTEQUAL: cc = NE; break; + case GGL_GEQUAL: cc = HS; break; + } + B(cc^1, "discard_after_textures"); + } +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::build_depth_test( + const fragment_parts_t& parts, uint32_t mask) +{ + mask &= Z_TEST|Z_WRITE; + const needs_t& needs = mBuilderContext.needs; + const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p); + Scratch scratches(registerFile()); + + if (mDepthTest != GGL_ALWAYS || zmask) { + int cc=AL, ic=AL; + switch (mDepthTest) { + case GGL_LESS: ic = HI; break; + case GGL_EQUAL: ic = EQ; break; + case GGL_LEQUAL: ic = HS; break; + case GGL_GREATER: ic = LT; break; + case GGL_NOTEQUAL: ic = NE; break; + case GGL_GEQUAL: ic = LS; break; + case GGL_NEVER: + // this never happens, because it's taken care of when + // computing the needs. but we keep it for completness. + comment("Depth Test (NEVER)"); + B(AL, "discard_before_textures"); + return; + case GGL_ALWAYS: + // we're here because zmask is enabled + mask &= ~Z_TEST; // test always passes. 
+ break; + } + + // inverse the condition + cc = ic^1; + + if ((mask & Z_WRITE) && !zmask) { + mask &= ~Z_WRITE; + } + + if (!mask) + return; + + comment("Depth Test"); + + int zbase = scratches.obtain(); + int depth = scratches.obtain(); + int z = parts.z.reg; + + CONTEXT_LOAD(zbase, generated_vars.zbase); // stall + SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15)); + // above does zbase = zbase - ((count >> 16) << 1) + + if (mask & Z_TEST) { + LDRH(AL, depth, zbase); // stall + CMP(AL, depth, reg_imm(z, LSR, 16)); + B(cc, "discard_before_textures"); + } + if (mask & Z_WRITE) { + if (mask == Z_WRITE) { + // only z-write asked, cc is meaningless + ic = AL; + } + MOV(AL, 0, depth, reg_imm(z, LSR, 16)); + STRH(ic, depth, zbase); + } + } +} + +void GGLAssembler::build_iterate_z(const fragment_parts_t& parts) +{ + const needs_t& needs = mBuilderContext.needs; + if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) { + Scratch scratches(registerFile()); + int dzdx = scratches.obtain(); + CONTEXT_LOAD(dzdx, generated_vars.dzdx); // stall + ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx); + } +} + +void GGLAssembler::build_iterate_f(const fragment_parts_t& parts) +{ + const needs_t& needs = mBuilderContext.needs; + if (GGL_READ_NEEDS(P_FOG, needs.p)) { + Scratch scratches(registerFile()); + int dfdx = scratches.obtain(); + int f = scratches.obtain(); + CONTEXT_LOAD(f, generated_vars.f); + CONTEXT_LOAD(dfdx, generated_vars.dfdx); // stall + ADD(AL, 0, f, f, dfdx); + CONTEXT_STORE(f, generated_vars.f); + } +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs) +{ + const needs_t& needs = mBuilderContext.needs; + const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; + if (opcode == GGL_COPY) + return; + + comment("logic operation"); + + pixel_t s(pixel); + if (!(pixel.flags & CORRUPTIBLE)) { + pixel.reg = regs.obtain(); + pixel.flags |= 
CORRUPTIBLE; + } + + pixel_t d(mDstPixel); + switch(opcode) { + case GGL_CLEAR: MOV(AL, 0, pixel.reg, imm(0)); break; + case GGL_AND: AND(AL, 0, pixel.reg, s.reg, d.reg); break; + case GGL_AND_REVERSE: BIC(AL, 0, pixel.reg, s.reg, d.reg); break; + case GGL_COPY: break; + case GGL_AND_INVERTED: BIC(AL, 0, pixel.reg, d.reg, s.reg); break; + case GGL_NOOP: MOV(AL, 0, pixel.reg, d.reg); break; + case GGL_XOR: EOR(AL, 0, pixel.reg, s.reg, d.reg); break; + case GGL_OR: ORR(AL, 0, pixel.reg, s.reg, d.reg); break; + case GGL_NOR: ORR(AL, 0, pixel.reg, s.reg, d.reg); + MVN(AL, 0, pixel.reg, pixel.reg); break; + case GGL_EQUIV: EOR(AL, 0, pixel.reg, s.reg, d.reg); + MVN(AL, 0, pixel.reg, pixel.reg); break; + case GGL_INVERT: MVN(AL, 0, pixel.reg, d.reg); break; + case GGL_OR_REVERSE: // s | ~d == ~(~s & d) + BIC(AL, 0, pixel.reg, d.reg, s.reg); + MVN(AL, 0, pixel.reg, pixel.reg); break; + case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg); break; + case GGL_OR_INVERTED: // ~s | d == ~(s & ~d) + BIC(AL, 0, pixel.reg, s.reg, d.reg); + MVN(AL, 0, pixel.reg, pixel.reg); break; + case GGL_NAND: AND(AL, 0, pixel.reg, s.reg, d.reg); + MVN(AL, 0, pixel.reg, pixel.reg); break; + case GGL_SET: MVN(AL, 0, pixel.reg, imm(0)); break; + }; +} + +// --------------------------------------------------------------------------- + +static uint32_t find_bottom(uint32_t val) +{ + uint32_t i = 0; + while (!(val & (3<<i))) + i+= 2; + return i; +} + +static void normalize(uint32_t& val, uint32_t& rot) +{ + rot = 0; + while (!(val&3) || (val & 0xFC000000)) { + uint32_t newval; + newval = val >> 2; + newval |= (val&3) << 30; + val = newval; + rot += 2; + if (rot == 32) { + rot = 0; + break; + } + } +} + +void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits) +{ + uint32_t rot; + uint32_t size = ((bits>=32) ? 
0 : (1LU << bits)) - 1; + mask &= size; + + if (mask == size) { + if (d != s) + MOV( AL, 0, d, s); + return; + } + + int negative_logic = !isValidImmediate(mask); + if (negative_logic) { + mask = ~mask & size; + } + normalize(mask, rot); + + if (mask) { + while (mask) { + uint32_t bitpos = find_bottom(mask); + int shift = rot + bitpos; + uint32_t m = mask & (0xff << bitpos); + mask &= ~m; + m >>= bitpos; + int32_t newMask = (m<<shift) | (m>>(32-shift)); + if (!negative_logic) { + AND( AL, 0, d, s, imm(newMask) ); + } else { + BIC( AL, 0, d, s, imm(newMask) ); + } + s = d; + } + } else { + MOV( AL, 0, d, imm(0)); + } +} + +void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs) +{ + if (!mMasking) + return; + + comment("color mask"); + + pixel_t fb(mDstPixel); + pixel_t s(pixel); + if (!(pixel.flags & CORRUPTIBLE)) { + pixel.reg = regs.obtain(); + pixel.flags |= CORRUPTIBLE; + } + + int mask = 0; + for (int i=0 ; i<4 ; i++) { + const int component_mask = 1<<i; + const int h = fb.format.c[i].h; + const int l = fb.format.c[i].l; + if (h && (!(mMasking & component_mask))) { + mask |= ((1<<(h-l))-1) << l; + } + } + + // There is no need to clear the masked components of the source + // (unless we applied a logic op), because they're already zeroed + // by contruction (masked components are not computed) + + if (mLogicOp) { + const needs_t& needs = mBuilderContext.needs; + const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; + if (opcode != GGL_CLEAR) { + // clear masked component of source + build_and_immediate(pixel.reg, s.reg, mask, fb.size()); + s = pixel; + } + } + + // clear non masked components of destination + build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); + + // or back the channels that were masked + if (s.reg == fb.reg) { + // this is in fact a MOV + if (s.reg == pixel.reg) { + // ugh. 
this in in fact a nop + } else { + MOV(AL, 0, pixel.reg, fb.reg); + } + } else { + ORR(AL, 0, pixel.reg, s.reg, fb.reg); + } +} + +// --------------------------------------------------------------------------- + +void GGLAssembler::base_offset( + const pointer_t& d, const pointer_t& b, const reg_t& o) +{ + switch (b.size) { + case 32: + ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2)); + break; + case 24: + if (d.reg == b.reg) { + ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1)); + ADD(AL, 0, d.reg, d.reg, o.reg); + } else { + ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1)); + ADD(AL, 0, d.reg, d.reg, b.reg); + } + break; + case 16: + ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1)); + break; + case 8: + ADD(AL, 0, d.reg, b.reg, o.reg); + break; + } +} + +// ---------------------------------------------------------------------------- +// cheezy register allocator... +// ---------------------------------------------------------------------------- + +void RegisterAllocator::reset() +{ + mRegs.reset(); +} + +int RegisterAllocator::reserveReg(int reg) +{ + return mRegs.reserve(reg); +} + +int RegisterAllocator::obtainReg() +{ + return mRegs.obtain(); +} + +void RegisterAllocator::recycleReg(int reg) +{ + mRegs.recycle(reg); +} + +RegisterAllocator::RegisterFile& RegisterAllocator::registerFile() +{ + return mRegs; +} + +// ---------------------------------------------------------------------------- + +RegisterAllocator::RegisterFile::RegisterFile() + : mRegs(0), mTouched(0), mStatus(0) +{ + reserve(ARMAssemblerInterface::SP); + reserve(ARMAssemblerInterface::PC); +} + +RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs) + : mRegs(rhs.mRegs), mTouched(rhs.mTouched) +{ +} + +RegisterAllocator::RegisterFile::~RegisterFile() +{ +} + +bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const +{ + return (mRegs == rhs.mRegs); +} + +void RegisterAllocator::RegisterFile::reset() +{ + mRegs = mTouched = mStatus = 0; + 
reserve(ARMAssemblerInterface::SP); + reserve(ARMAssemblerInterface::PC); +} + +int RegisterAllocator::RegisterFile::reserve(int reg) +{ + LOG_ALWAYS_FATAL_IF(isUsed(reg), + "reserving register %d, but already in use", + reg); + mRegs |= (1<<reg); + mTouched |= mRegs; + return reg; +} + +void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask) +{ + mRegs |= regMask; + mTouched |= regMask; +} + +int RegisterAllocator::RegisterFile::isUsed(int reg) const +{ + LOG_ALWAYS_FATAL_IF(reg>=16, "invalid register %d", reg); + return mRegs & (1<<reg); +} + +int RegisterAllocator::RegisterFile::obtain() +{ + const char priorityList[14] = { 0, 1, 2, 3, + 12, 14, 4, 5, + 6, 7, 8, 9, + 10, 11 }; + const int nbreg = sizeof(priorityList); + int i, r; + for (i=0 ; i<nbreg ; i++) { + r = priorityList[i]; + if (!isUsed(r)) { + break; + } + } + // this is not an error anymore because, we'll try again with + // a lower optimization level. + //LOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n"); + if (i >= nbreg) { + mStatus |= OUT_OF_REGISTERS; + // we return SP so we can more easily debug things + // the code will never be run anyway. + return ARMAssemblerInterface::SP; + } + reserve(r); + return r; +} + +bool RegisterAllocator::RegisterFile::hasFreeRegs() const +{ + return ((mRegs & 0xFFFF) == 0xFFFF) ? 
false : true; +} + +int RegisterAllocator::RegisterFile::countFreeRegs() const +{ + int f = ~mRegs & 0xFFFF; + // now count number of 1 + f = (f & 0x5555) + ((f>>1) & 0x5555); + f = (f & 0x3333) + ((f>>2) & 0x3333); + f = (f & 0x0F0F) + ((f>>4) & 0x0F0F); + f = (f & 0x00FF) + ((f>>8) & 0x00FF); + return f; +} + +void RegisterAllocator::RegisterFile::recycle(int reg) +{ + LOG_FATAL_IF(!isUsed(reg), + "recycling unallocated register %d", + reg); + mRegs &= ~(1<<reg); +} + +void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask) +{ + LOG_FATAL_IF((mRegs & regMask)!=regMask, + "recycling unallocated registers " + "(recycle=%08x, allocated=%08x, unallocated=%08x)", + regMask, mRegs, mRegs&regMask); + mRegs &= ~regMask; +} + +uint32_t RegisterAllocator::RegisterFile::touched() const +{ + return mTouched; +} + +// ---------------------------------------------------------------------------- + +}; // namespace android + diff --git a/libpixelflinger/codeflinger/GGLAssembler.h b/libpixelflinger/codeflinger/GGLAssembler.h new file mode 100644 index 0000000..ccaf43d --- /dev/null +++ b/libpixelflinger/codeflinger/GGLAssembler.h @@ -0,0 +1,549 @@ +/* libs/pixelflinger/codeflinger/GGLAssembler.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + + +#ifndef ANDROID_GGLASSEMBLER_H +#define ANDROID_GGLASSEMBLER_H + +#include <stdint.h> +#include <sys/types.h> + +#include <private/pixelflinger/ggl_context.h> + +#include "codeflinger/ARMAssemblerProxy.h" + + +namespace android { + +// ---------------------------------------------------------------------------- + +#define CONTEXT_LOAD(REG, FIELD) \ + LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD))) + +#define CONTEXT_STORE(REG, FIELD) \ + STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD))) + + +class RegisterAllocator +{ +public: + class RegisterFile; + + RegisterFile& registerFile(); + int reserveReg(int reg); + int obtainReg(); + void recycleReg(int reg); + void reset(); + + class RegisterFile + { + public: + RegisterFile(); + RegisterFile(const RegisterFile& rhs); + ~RegisterFile(); + + void reset(); + + bool operator == (const RegisterFile& rhs) const; + bool operator != (const RegisterFile& rhs) const { + return !operator == (rhs); + } + + int reserve(int reg); + void reserveSeveral(uint32_t regMask); + + void recycle(int reg); + void recycleSeveral(uint32_t regMask); + + int obtain(); + inline int isUsed(int reg) const; + + bool hasFreeRegs() const; + int countFreeRegs() const; + + uint32_t touched() const; + inline uint32_t status() const { return mStatus; } + + enum { + OUT_OF_REGISTERS = 0x1 + }; + + private: + uint32_t mRegs; + uint32_t mTouched; + uint32_t mStatus; + }; + + class Scratch + { + public: + Scratch(RegisterFile& regFile) + : mRegFile(regFile), mScratch(0) { + } + ~Scratch() { + mRegFile.recycleSeveral(mScratch); + } + int obtain() { + int reg = mRegFile.obtain(); + mScratch |= 1<<reg; + return reg; + } + void recycle(int reg) { + mRegFile.recycle(reg); + mScratch &= ~(1<<reg); + } + bool isUsed(int reg) { + return (mScratch & (1<<reg)); + } + int countFreeRegs() { + return mRegFile.countFreeRegs(); + } + private: + RegisterFile& mRegFile; + uint32_t mScratch; + }; + + class Spill + { + public: + 
Spill(RegisterFile& regFile, ARMAssemblerInterface& gen, uint32_t reglist) + : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0) + { + if (reglist) { + int count = 0; + while (reglist) { + count++; + reglist &= ~(1 << (31 - __builtin_clz(reglist))); + } + if (count == 1) { + int reg = 31 - __builtin_clz(mRegList); + mGen.STR(mGen.AL, reg, mGen.SP, mGen.immed12_pre(-4, 1)); + } else { + mGen.STM(mGen.AL, mGen.DB, mGen.SP, 1, mRegList); + } + mRegFile.recycleSeveral(mRegList); + mCount = count; + } + } + ~Spill() { + if (mRegList) { + if (mCount == 1) { + int reg = 31 - __builtin_clz(mRegList); + mGen.LDR(mGen.AL, reg, mGen.SP, mGen.immed12_post(4)); + } else { + mGen.LDM(mGen.AL, mGen.IA, mGen.SP, 1, mRegList); + } + mRegFile.reserveSeveral(mRegList); + } + } + private: + RegisterFile& mRegFile; + ARMAssemblerInterface& mGen; + uint32_t mRegList; + int mCount; + }; + +private: + RegisterFile mRegs; +}; + +// ---------------------------------------------------------------------------- + +class GGLAssembler : public ARMAssemblerProxy, public RegisterAllocator +{ +public: + + GGLAssembler(ARMAssemblerInterface* target); + virtual ~GGLAssembler(); + + uint32_t* base() const { return 0; } // XXX + uint32_t* pc() const { return 0; } // XXX + + void reset(int opt_level); + + virtual void prolog(); + virtual void epilog(uint32_t touched); + + // generate scanline code for given needs + int scanline(const needs_t& needs, context_t const* c); + int scanline_core(const needs_t& needs, context_t const* c); + + enum { + CLEAR_LO = 0x0001, + CLEAR_HI = 0x0002, + CORRUPTIBLE = 0x0004, + FIRST = 0x0008 + }; + + enum { //load/store flags + WRITE_BACK = 0x0001 + }; + + struct reg_t { + reg_t() : reg(-1), flags(0) { + } + reg_t(int r, int f=0) + : reg(r), flags(f) { + } + void setTo(int r, int f=0) { + reg=r; flags=f; + } + int reg; + uint16_t flags; + }; + + struct integer_t : public reg_t { + integer_t() : reg_t(), s(0) { + } + integer_t(int r, int sz=32, int f=0) + : 
reg_t(r, f), s(sz) { + } + void setTo(int r, int sz=32, int f=0) { + reg_t::setTo(r, f); s=sz; + } + int8_t s; + inline int size() const { return s; } + }; + + struct pixel_t : public reg_t { + pixel_t() : reg_t() { + memset(&format, 0, sizeof(GGLFormat)); + } + pixel_t(int r, const GGLFormat* fmt, int f=0) + : reg_t(r, f), format(*fmt) { + } + void setTo(int r, const GGLFormat* fmt, int f=0) { + reg_t::setTo(r, f); format = *fmt; + } + GGLFormat format; + inline int hi(int c) const { return format.c[c].h; } + inline int low(int c) const { return format.c[c].l; } + inline int mask(int c) const { return ((1<<size(c))-1) << low(c); } + inline int size() const { return format.size*8; } + inline int size(int c) const { return component_size(c); } + inline int component_size(int c) const { return hi(c) - low(c); } + }; + + struct component_t : public reg_t { + component_t() : reg_t(), h(0), l(0) { + } + component_t(int r, int f=0) + : reg_t(r, f), h(0), l(0) { + } + component_t(int r, int lo, int hi, int f=0) + : reg_t(r, f), h(hi), l(lo) { + } + explicit component_t(const integer_t& rhs) + : reg_t(rhs.reg, rhs.flags), h(rhs.s), l(0) { + } + explicit component_t(const pixel_t& rhs, int component) { + setTo( rhs.reg, + rhs.format.c[component].l, + rhs.format.c[component].h, + rhs.flags|CLEAR_LO|CLEAR_HI); + } + void setTo(int r, int lo=0, int hi=0, int f=0) { + reg_t::setTo(r, f); h=hi; l=lo; + } + int8_t h; + int8_t l; + inline int size() const { return h-l; } + }; + + struct pointer_t : public reg_t { + pointer_t() : reg_t(), size(0) { + } + pointer_t(int r, int s, int f=0) + : reg_t(r, f), size(s) { + } + void setTo(int r, int s, int f=0) { + reg_t::setTo(r, f); size=s; + } + int8_t size; + }; + + +private: + struct tex_coord_t { + reg_t s; + reg_t t; + pointer_t ptr; + }; + + struct fragment_parts_t { + uint32_t packed : 1; + uint32_t reload : 2; + uint32_t iterated_packed : 1; + pixel_t iterated; + pointer_t cbPtr; + pointer_t covPtr; + reg_t count; + reg_t argb[4]; 
+ reg_t argb_dx[4]; + reg_t z; + reg_t dither; + pixel_t texel[GGL_TEXTURE_UNIT_COUNT]; + tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT]; + }; + + struct texture_unit_t { + int format_idx; + GGLFormat format; + int bits; + int swrap; + int twrap; + int env; + int pot; + int linear; + uint8_t mask; + uint8_t replaced; + }; + + struct texture_machine_t { + texture_unit_t tmu[GGL_TEXTURE_UNIT_COUNT]; + uint8_t mask; + uint8_t replaced; + uint8_t directTexture; + uint8_t activeUnits; + }; + + struct component_info_t { + bool masked : 1; + bool inDest : 1; + bool needed : 1; + bool replaced : 1; + bool iterated : 1; + bool smooth : 1; + bool blend : 1; + bool fog : 1; + }; + + struct builder_context_t { + context_t const* c; + needs_t needs; + int Rctx; + }; + + template <typename T> + void modify(T& r, Scratch& regs) + { + if (!(r.flags & CORRUPTIBLE)) { + r.reg = regs.obtain(); + r.flags |= CORRUPTIBLE; + } + } + + // helpers + void base_offset(const pointer_t& d, const pointer_t& b, const reg_t& o); + + // texture environement + void modulate( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component); + + void decal( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component); + + void blend( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component, int tmu); + + // load/store stuff + void store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0); + void load(const pointer_t& addr, const pixel_t& dest, uint32_t flags=0); + void extract(integer_t& d, const pixel_t& s, int component); + void extract(component_t& d, const pixel_t& s, int component); + void extract(integer_t& d, int s, int h, int l, int bits=32); + void expand(integer_t& d, const integer_t& s, int dbits); + void expand(integer_t& d, const component_t& s, int dbits); + void expand(component_t& d, const component_t& s, int dbits); + void downshift(pixel_t& d, int component, component_t s, const reg_t& dither); + 
+ + void mul_factor( component_t& d, + const integer_t& v, + const integer_t& f); + + void mul_factor_add( component_t& d, + const integer_t& v, + const integer_t& f, + const component_t& a); + + void component_add( component_t& d, + const integer_t& dst, + const integer_t& src); + + void component_sat( const component_t& v); + + + void build_scanline_prolog( fragment_parts_t& parts, + const needs_t& needs); + + void build_smooth_shade(const fragment_parts_t& parts); + + void build_component( pixel_t& pixel, + const fragment_parts_t& parts, + int component, + Scratch& global_scratches); + + void build_incoming_component( + component_t& temp, + int dst_size, + const fragment_parts_t& parts, + int component, + Scratch& scratches, + Scratch& global_scratches); + + void init_iterated_color(fragment_parts_t& parts, const reg_t& x); + + void build_iterated_color( component_t& fragment, + const fragment_parts_t& parts, + int component, + Scratch& regs); + + void decodeLogicOpNeeds(const needs_t& needs); + + void decodeTMUNeeds(const needs_t& needs, context_t const* c); + + void init_textures( tex_coord_t* coords, + const reg_t& x, + const reg_t& y); + + void build_textures( fragment_parts_t& parts, + Scratch& regs); + + void filter8( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS); + + void filter16( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS); + + void filter24( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS); + + void filter32( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS); + + void build_texture_environment( component_t& fragment, + const fragment_parts_t& parts, + int component, + Scratch& regs); + + void wrapping( int d, + int coord, int size, + int 
tx_wrap, int tx_linear); + + void build_fog( component_t& temp, + int component, + Scratch& parent_scratches); + + void build_blending( component_t& in_out, + const pixel_t& pixel, + int component, + Scratch& parent_scratches); + + void build_blend_factor( + integer_t& factor, int f, int component, + const pixel_t& dst_pixel, + integer_t& fragment, + integer_t& fb, + Scratch& scratches); + + void build_blendFOneMinusF( component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb); + + void build_blendOneMinusFF( component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb); + + void build_coverage_application(component_t& fragment, + const fragment_parts_t& parts, + Scratch& regs); + + void build_alpha_test(component_t& fragment, const fragment_parts_t& parts); + + enum { Z_TEST=1, Z_WRITE=2 }; + void build_depth_test(const fragment_parts_t& parts, uint32_t mask); + void build_iterate_z(const fragment_parts_t& parts); + void build_iterate_f(const fragment_parts_t& parts); + void build_iterate_texture_coordinates(const fragment_parts_t& parts); + + void build_logic_op(pixel_t& pixel, Scratch& regs); + + void build_masking(pixel_t& pixel, Scratch& regs); + + void build_and_immediate(int d, int s, uint32_t mask, int bits); + + bool isAlphaSourceNeeded() const; + + enum { + FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 + }; + + enum { + LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4 + }; + + static int blending_codes(int fs, int fd); + + builder_context_t mBuilderContext; + texture_machine_t mTextureMachine; + component_info_t mInfo[4]; + int mBlending; + int mMasking; + int mLogicOp; + int mAlphaTest; + int mAA; + int mDithering; + int mDepthTest; + + int mSmooth; + int mFog; + pixel_t mDstPixel; + + GGLFormat mCbFormat; + + int mBlendFactorCached; + integer_t mAlphaSource; + + int mBaseRegister; + + int mBlendSrc; + int mBlendDst; + int mBlendSrcA; + int mBlendDstA; + + int mOptLevel; +}; + 
+// ---------------------------------------------------------------------------- + +}; // namespace android + +#endif // ANDROID_GGLASSEMBLER_H diff --git a/libpixelflinger/codeflinger/armreg.h b/libpixelflinger/codeflinger/armreg.h new file mode 100644 index 0000000..fde81ba --- /dev/null +++ b/libpixelflinger/codeflinger/armreg.h @@ -0,0 +1,300 @@ +/* $NetBSD: armreg.h,v 1.28 2003/10/31 16:30:15 scw Exp $ */ + +/*- + * Copyright (c) 1998, 2001 Ben Harris + * Copyright (c) 1994-1996 Mark Brinicombe. + * Copyright (c) 1994 Brini. + * All rights reserved. + * + * This code is derived from software written for Brini by Mark Brinicombe + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Brini. + * 4. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/armreg.h,v 1.3 2005/11/21 19:06:25 cognet Exp $ + */ + +#ifndef MACHINE_ARMREG_H +#define MACHINE_ARMREG_H +#define INSN_SIZE 4 +#define INSN_COND_MASK 0xf0000000 /* Condition mask */ +#define PSR_MODE 0x0000001f /* mode mask */ +#define PSR_USR26_MODE 0x00000000 +#define PSR_FIQ26_MODE 0x00000001 +#define PSR_IRQ26_MODE 0x00000002 +#define PSR_SVC26_MODE 0x00000003 +#define PSR_USR32_MODE 0x00000010 +#define PSR_FIQ32_MODE 0x00000011 +#define PSR_IRQ32_MODE 0x00000012 +#define PSR_SVC32_MODE 0x00000013 +#define PSR_ABT32_MODE 0x00000017 +#define PSR_UND32_MODE 0x0000001b +#define PSR_SYS32_MODE 0x0000001f +#define PSR_32_MODE 0x00000010 +#define PSR_FLAGS 0xf0000000 /* flags */ + +#define PSR_C_bit (1 << 29) /* carry */ + +/* The high-order byte is always the implementor */ +#define CPU_ID_IMPLEMENTOR_MASK 0xff000000 +#define CPU_ID_ARM_LTD 0x41000000 /* 'A' */ +#define CPU_ID_DEC 0x44000000 /* 'D' */ +#define CPU_ID_INTEL 0x69000000 /* 'i' */ +#define CPU_ID_TI 0x54000000 /* 'T' */ + +/* How to decide what format the CPUID is in. */ +#define CPU_ID_ISOLD(x) (((x) & 0x0000f000) == 0x00000000) +#define CPU_ID_IS7(x) (((x) & 0x0000f000) == 0x00007000) +#define CPU_ID_ISNEW(x) (!CPU_ID_ISOLD(x) && !CPU_ID_IS7(x)) + +/* On ARM3 and ARM6, this byte holds the foundry ID. 
*/ +#define CPU_ID_FOUNDRY_MASK 0x00ff0000 +#define CPU_ID_FOUNDRY_VLSI 0x00560000 + +/* On ARM7 it holds the architecture and variant (sub-model) */ +#define CPU_ID_7ARCH_MASK 0x00800000 +#define CPU_ID_7ARCH_V3 0x00000000 +#define CPU_ID_7ARCH_V4T 0x00800000 +#define CPU_ID_7VARIANT_MASK 0x007f0000 + +/* On more recent ARMs, it does the same, but in a different format */ +#define CPU_ID_ARCH_MASK 0x000f0000 +#define CPU_ID_ARCH_V3 0x00000000 +#define CPU_ID_ARCH_V4 0x00010000 +#define CPU_ID_ARCH_V4T 0x00020000 +#define CPU_ID_ARCH_V5 0x00030000 +#define CPU_ID_ARCH_V5T 0x00040000 +#define CPU_ID_ARCH_V5TE 0x00050000 +#define CPU_ID_VARIANT_MASK 0x00f00000 + +/* Next three nybbles are part number */ +#define CPU_ID_PARTNO_MASK 0x0000fff0 + +/* Intel XScale has sub fields in part number */ +#define CPU_ID_XSCALE_COREGEN_MASK 0x0000e000 /* core generation */ +#define CPU_ID_XSCALE_COREREV_MASK 0x00001c00 /* core revision */ +#define CPU_ID_XSCALE_PRODUCT_MASK 0x000003f0 /* product number */ + +/* And finally, the revision number. */ +#define CPU_ID_REVISION_MASK 0x0000000f + +/* Individual CPUs are probably best IDed by everything but the revision. */ +#define CPU_ID_CPU_MASK 0xfffffff0 + +/* Fake CPU IDs for ARMs without CP15 */ +#define CPU_ID_ARM2 0x41560200 +#define CPU_ID_ARM250 0x41560250 + +/* Pre-ARM7 CPUs -- [15:12] == 0 */ +#define CPU_ID_ARM3 0x41560300 +#define CPU_ID_ARM600 0x41560600 +#define CPU_ID_ARM610 0x41560610 +#define CPU_ID_ARM620 0x41560620 + +/* ARM7 CPUs -- [15:12] == 7 */ +#define CPU_ID_ARM700 0x41007000 /* XXX This is a guess. */ +#define CPU_ID_ARM710 0x41007100 +#define CPU_ID_ARM7500 0x41027100 /* XXX This is a guess. 
*/ +#define CPU_ID_ARM710A 0x41047100 /* inc ARM7100 */ +#define CPU_ID_ARM7500FE 0x41077100 +#define CPU_ID_ARM710T 0x41807100 +#define CPU_ID_ARM720T 0x41807200 +#define CPU_ID_ARM740T8K 0x41807400 /* XXX no MMU, 8KB cache */ +#define CPU_ID_ARM740T4K 0x41817400 /* XXX no MMU, 4KB cache */ + +/* Post-ARM7 CPUs */ +#define CPU_ID_ARM810 0x41018100 +#define CPU_ID_ARM920T 0x41129200 +#define CPU_ID_ARM920T_ALT 0x41009200 +#define CPU_ID_ARM922T 0x41029220 +#define CPU_ID_ARM940T 0x41029400 /* XXX no MMU */ +#define CPU_ID_ARM946ES 0x41049460 /* XXX no MMU */ +#define CPU_ID_ARM966ES 0x41049660 /* XXX no MMU */ +#define CPU_ID_ARM966ESR1 0x41059660 /* XXX no MMU */ +#define CPU_ID_ARM1020E 0x4115a200 /* (AKA arm10 rev 1) */ +#define CPU_ID_ARM1022ES 0x4105a220 +#define CPU_ID_SA110 0x4401a100 +#define CPU_ID_SA1100 0x4401a110 +#define CPU_ID_TI925T 0x54029250 +#define CPU_ID_SA1110 0x6901b110 +#define CPU_ID_IXP1200 0x6901c120 +#define CPU_ID_80200 0x69052000 +#define CPU_ID_PXA250 0x69052100 /* sans core revision */ +#define CPU_ID_PXA210 0x69052120 +#define CPU_ID_PXA250A 0x69052100 /* 1st version Core */ +#define CPU_ID_PXA210A 0x69052120 /* 1st version Core */ +#define CPU_ID_PXA250B 0x69052900 /* 3rd version Core */ +#define CPU_ID_PXA210B 0x69052920 /* 3rd version Core */ +#define CPU_ID_PXA250C 0x69052d00 /* 4th version Core */ +#define CPU_ID_PXA210C 0x69052d20 /* 4th version Core */ +#define CPU_ID_80321_400 0x69052420 +#define CPU_ID_80321_600 0x69052430 +#define CPU_ID_80321_400_B0 0x69052c20 +#define CPU_ID_80321_600_B0 0x69052c30 +#define CPU_ID_IXP425_533 0x690541c0 +#define CPU_ID_IXP425_400 0x690541d0 +#define CPU_ID_IXP425_266 0x690541f0 + +/* ARM3-specific coprocessor 15 registers */ +#define ARM3_CP15_FLUSH 1 +#define ARM3_CP15_CONTROL 2 +#define ARM3_CP15_CACHEABLE 3 +#define ARM3_CP15_UPDATEABLE 4 +#define ARM3_CP15_DISRUPTIVE 5 + +/* ARM3 Control register bits */ +#define ARM3_CTL_CACHE_ON 0x00000001 +#define ARM3_CTL_SHARED 0x00000002 +#define 
ARM3_CTL_MONITOR 0x00000004 + +/* + * Post-ARM3 CP15 registers: + * + * 1 Control register + * + * 2 Translation Table Base + * + * 3 Domain Access Control + * + * 4 Reserved + * + * 5 Fault Status + * + * 6 Fault Address + * + * 7 Cache/write-buffer Control + * + * 8 TLB Control + * + * 9 Cache Lockdown + * + * 10 TLB Lockdown + * + * 11 Reserved + * + * 12 Reserved + * + * 13 Process ID (for FCSE) + * + * 14 Reserved + * + * 15 Implementation Dependent + */ + +/* Some of the definitions below need cleaning up for V3/V4 architectures */ + +/* CPU control register (CP15 register 1) */ +#define CPU_CONTROL_MMU_ENABLE 0x00000001 /* M: MMU/Protection unit enable */ +#define CPU_CONTROL_AFLT_ENABLE 0x00000002 /* A: Alignment fault enable */ +#define CPU_CONTROL_DC_ENABLE 0x00000004 /* C: IDC/DC enable */ +#define CPU_CONTROL_WBUF_ENABLE 0x00000008 /* W: Write buffer enable */ +#define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */ +#define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */ +#define CPU_CONTROL_LABT_ENABLE 0x00000040 /* L: Late abort enable */ +#define CPU_CONTROL_BEND_ENABLE 0x00000080 /* B: Big-endian mode */ +#define CPU_CONTROL_SYST_ENABLE 0x00000100 /* S: System protection bit */ +#define CPU_CONTROL_ROM_ENABLE 0x00000200 /* R: ROM protection bit */ +#define CPU_CONTROL_CPCLK 0x00000400 /* F: Implementation defined */ +#define CPU_CONTROL_BPRD_ENABLE 0x00000800 /* Z: Branch prediction enable */ +#define CPU_CONTROL_IC_ENABLE 0x00001000 /* I: IC enable */ +#define CPU_CONTROL_VECRELOC 0x00002000 /* V: Vector relocation */ +#define CPU_CONTROL_ROUNDROBIN 0x00004000 /* RR: Predictable replacement */ +#define CPU_CONTROL_V4COMPAT 0x00008000 /* L4: ARMv4 compat LDR R15 etc */ + +#define CPU_CONTROL_IDC_ENABLE CPU_CONTROL_DC_ENABLE + +/* XScale Auxillary Control Register (CP15 register 1, opcode2 1) */ +#define XSCALE_AUXCTL_K 0x00000001 /* dis. 
write buffer coalescing */ +#define XSCALE_AUXCTL_P 0x00000002 /* ECC protect page table access */ +#define XSCALE_AUXCTL_MD_WB_RA 0x00000000 /* mini-D$ wb, read-allocate */ +#define XSCALE_AUXCTL_MD_WB_RWA 0x00000010 /* mini-D$ wb, read/write-allocate */ +#define XSCALE_AUXCTL_MD_WT 0x00000020 /* mini-D$ wt, read-allocate */ +#define XSCALE_AUXCTL_MD_MASK 0x00000030 + +/* Cache type register definitions */ +#define CPU_CT_ISIZE(x) ((x) & 0xfff) /* I$ info */ +#define CPU_CT_DSIZE(x) (((x) >> 12) & 0xfff) /* D$ info */ +#define CPU_CT_S (1U << 24) /* split cache */ +#define CPU_CT_CTYPE(x) (((x) >> 25) & 0xf) /* cache type */ + +#define CPU_CT_CTYPE_WT 0 /* write-through */ +#define CPU_CT_CTYPE_WB1 1 /* write-back, clean w/ read */ +#define CPU_CT_CTYPE_WB2 2 /* w/b, clean w/ cp15,7 */ +#define CPU_CT_CTYPE_WB6 6 /* w/b, cp15,7, lockdown fmt A */ +#define CPU_CT_CTYPE_WB7 7 /* w/b, cp15,7, lockdown fmt B */ + +#define CPU_CT_xSIZE_LEN(x) ((x) & 0x3) /* line size */ +#define CPU_CT_xSIZE_M (1U << 2) /* multiplier */ +#define CPU_CT_xSIZE_ASSOC(x) (((x) >> 3) & 0x7) /* associativity */ +#define CPU_CT_xSIZE_SIZE(x) (((x) >> 6) & 0x7) /* size */ + +/* Fault status register definitions */ + +#define FAULT_TYPE_MASK 0x0f +#define FAULT_USER 0x10 + +#define FAULT_WRTBUF_0 0x00 /* Vector Exception */ +#define FAULT_WRTBUF_1 0x02 /* Terminal Exception */ +#define FAULT_BUSERR_0 0x04 /* External Abort on Linefetch -- Section */ +#define FAULT_BUSERR_1 0x06 /* External Abort on Linefetch -- Page */ +#define FAULT_BUSERR_2 0x08 /* External Abort on Non-linefetch -- Section */ +#define FAULT_BUSERR_3 0x0a /* External Abort on Non-linefetch -- Page */ +#define FAULT_BUSTRNL1 0x0c /* External abort on Translation -- Level 1 */ +#define FAULT_BUSTRNL2 0x0e /* External abort on Translation -- Level 2 */ +#define FAULT_ALIGN_0 0x01 /* Alignment */ +#define FAULT_ALIGN_1 0x03 /* Alignment */ +#define FAULT_TRANS_S 0x05 /* Translation -- Section */ +#define FAULT_TRANS_P 0x07 /* 
Translation -- Page */ +#define FAULT_DOMAIN_S 0x09 /* Domain -- Section */ +#define FAULT_DOMAIN_P 0x0b /* Domain -- Page */ +#define FAULT_PERM_S 0x0d /* Permission -- Section */ +#define FAULT_PERM_P 0x0f /* Permission -- Page */ + +#define FAULT_IMPRECISE 0x400 /* Imprecise exception (XSCALE) */ + +/* + * Address of the vector page, low and high versions. + */ +#define ARM_VECTORS_LOW 0x00000000U +#define ARM_VECTORS_HIGH 0xffff0000U + +/* + * ARM Instructions + * + * 3 3 2 2 2 + * 1 0 9 8 7 0 + * +-------+-------------------------------------------------------+ + * | cond | instruction dependant | + * |c c c c| | + * +-------+-------------------------------------------------------+ + */ + +#define INSN_SIZE 4 /* Always 4 bytes */ +#define INSN_COND_MASK 0xf0000000 /* Condition mask */ +#define INSN_COND_AL 0xe0000000 /* Always condition */ + +#endif /* !MACHINE_ARMREG_H */ diff --git a/libpixelflinger/codeflinger/blending.cpp b/libpixelflinger/codeflinger/blending.cpp new file mode 100644 index 0000000..6d3b282 --- /dev/null +++ b/libpixelflinger/codeflinger/blending.cpp @@ -0,0 +1,676 @@ +/* libs/pixelflinger/codeflinger/blending.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> + +#include <cutils/log.h> + +#include "codeflinger/GGLAssembler.h" + + +namespace android { + +// Emits the fog stage for one component. When fog is enabled for +// 'component', loads the fog color byte (state.fog.color[component]) and +// generated_vars.f as the factor, then blends R = S*f + D*(1-f) through +// build_blendFOneMinusF with S = fragment and D = fog color. The result is +// left in 'temp'; if 'temp' was not CORRUPTIBLE a new register is taken +// from 'regs' and 'temp' is marked CORRUPTIBLE. +void GGLAssembler::build_fog( + component_t& temp, // incoming fragment / output + int component, + Scratch& regs) +{ + if (mInfo[component].fog) { + Scratch scratches(registerFile()); + comment("fog"); + + integer_t fragment(temp.reg, temp.h, temp.flags); + if (!(temp.flags & CORRUPTIBLE)) { + temp.reg = regs.obtain(); + temp.flags |= CORRUPTIBLE; + } + + integer_t fogColor(scratches.obtain(), 8, CORRUPTIBLE); + LDRB(AL, fogColor.reg, mBuilderContext.Rctx, + immed12_pre(GGL_OFFSETOF(state.fog.color[component]))); + + integer_t factor(scratches.obtain(), 16, CORRUPTIBLE); + CONTEXT_LOAD(factor.reg, generated_vars.f); + + build_blendFOneMinusF(temp, factor, fragment, fogColor); + } +} + +// Emits the blending stage for one component: combines the incoming +// fragment ('temp') with the framebuffer 'pixel' using the source and +// destination blend factors selected for that component. Does nothing +// when blending is not enabled for 'component'. +void GGLAssembler::build_blending( + component_t& temp, // incoming fragment / output + const pixel_t& pixel, // framebuffer + int component, + Scratch& regs) +{ + if (!mInfo[component].blend) + return; + + int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc; + int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst; + if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) + fs = GGL_ONE; + const int blending = blending_codes(fs, fd); + if (!temp.size()) { + // here, blending will produce something which doesn't depend on + // that component (eg: GL_ZERO:GL_*), so the register has not been + // allocated yet. Will never be used as a source. + temp = component_t(regs.obtain(), CORRUPTIBLE); + } + + // we are doing real blending... + // fb: extracted dst + // fragment: extracted src + // temp: component_t(fragment) and result + + // scoped register allocator + Scratch scratches(registerFile()); + comment("blending"); + + // we can optimize these cases a bit... 
+ // (1) saturation is not needed + // (2) we can use only one multiply instead of 2 + // (3) we can reduce the register pressure + // R = S*f + D*(1-f) = (S-D)*f + D + // R = S*(1-f) + D*f = (D-S)*f + S + + const bool same_factor_opt1 = + (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) || + (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) || + (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) || + (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA); + + const bool same_factor_opt2 = + (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) || + (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || + (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) || + (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA); + + + // XXX: we could also optimize these cases: + // R = S*f + D*f = (S+D)*f + // R = S*(1-f) + D*(1-f) = (S+D)*(1-f) + // R = S*D + D*S = 2*S*D + + + // see if we need to extract 'component' from the destination (fb) + integer_t fb; + if (blending & (BLEND_DST|FACTOR_DST)) { + fb.setTo(scratches.obtain(), 32); + extract(fb, pixel, component); + if (mDithering) { + // XXX: maybe what we should do instead, is simply + // expand fb -or- fragment to the larger of the two + if (fb.size() < temp.size()) { + // for now we expand 'fb' to min(fragment, 8) + int new_size = temp.size() < 8 ? 
temp.size() : 8; + expand(fb, fb, new_size); + } + } + } + + + // convert input fragment to integer_t + if (temp.l && (temp.flags & CORRUPTIBLE)) { + MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l)); + temp.h -= temp.l; + temp.l = 0; + } + integer_t fragment(temp.reg, temp.size(), temp.flags); + + // if not done yet, convert input fragment to integer_t + if (temp.l) { + // here we know temp is not CORRUPTIBLE + fragment.reg = scratches.obtain(); + MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l)); + fragment.flags |= CORRUPTIBLE; + } + + if (!(temp.flags & CORRUPTIBLE)) { + // temp is not corruptible, but since it's the destination it + // will be modified, so we need to allocate a new register. + temp.reg = regs.obtain(); + temp.flags &= ~CORRUPTIBLE; + fragment.flags &= ~CORRUPTIBLE; + } + + if ((blending & BLEND_SRC) && !same_factor_opt1) { + // source (fragment) is needed for the blending stage + // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1) + fragment.flags &= ~CORRUPTIBLE; + } + + + if (same_factor_opt1) { + // R = S*f + D*(1-f) = (S-D)*f + D + integer_t factor; + build_blend_factor(factor, fs, + component, pixel, fragment, fb, scratches); + // fb is always corruptible from this point + fb.flags |= CORRUPTIBLE; + build_blendFOneMinusF(temp, factor, fragment, fb); + } else if (same_factor_opt2) { + // R = S*(1-f) + D*f = (D-S)*f + S + integer_t factor; + // fb is always corrruptible here + fb.flags |= CORRUPTIBLE; + build_blend_factor(factor, fd, + component, pixel, fragment, fb, scratches); + build_blendOneMinusFF(temp, factor, fragment, fb); + } else { + integer_t src_factor; + integer_t dst_factor; + + // if destination (fb) is not needed for the blending stage, + // then it can be marked as CORRUPTIBLE + if (!(blending & BLEND_DST)) { + fb.flags |= CORRUPTIBLE; + } + + // XXX: try to mark some registers as CORRUPTIBLE + // in most case we could make those corruptible + // when we're processing the last component + // but not 
always, for instance + // when fragment is constant and not reloaded + // when fb is needed for logic-ops or masking + // when a register is aliased (for instance with mAlphaSource) + + // blend away... + if (fs==GGL_ZERO) { + if (fd==GGL_ZERO) { // R = 0 + // already taken care of + } else if (fd==GGL_ONE) { // R = D + // already taken care of + } else { // R = D*fd + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + mul_factor(temp, fb, dst_factor); + } + } else if (fs==GGL_ONE) { + if (fd==GGL_ZERO) { // R = S + // NOP, taken care of + } else if (fd==GGL_ONE) { // R = S + D + component_add(temp, fb, fragment); // args order matters + component_sat(temp); + } else { // R = S + D*fd + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + mul_factor_add(temp, fb, dst_factor, component_t(fragment)); + if (fd==GGL_ONE_MINUS_SRC_ALPHA) { + // XXX: in theory this is not correct, we should + // saturate here. However, this mode is often + // used for displaying alpha-premultiplied graphics, + // in which case, saturation is not necessary. + // unfortunatelly, we have no way to know. + // This is a case, where we sacrifice correctness for + // performance. we should probably have some heuristics. 
+ } else { + component_sat(temp); + } + } + } else { + // compute fs + build_blend_factor(src_factor, fs, + component, pixel, fragment, fb, scratches); + if (fd==GGL_ZERO) { // R = S*fs + mul_factor(temp, fragment, src_factor); + } else if (fd==GGL_ONE) { // R = S*fs + D + mul_factor_add(temp, fragment, src_factor, component_t(fb)); + component_sat(temp); + } else { // R = S*fs + D*fd + mul_factor(temp, fragment, src_factor); + if (scratches.isUsed(src_factor.reg)) + scratches.recycle(src_factor.reg); + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + mul_factor_add(temp, fb, dst_factor, temp); + if (!same_factor_opt1 && !same_factor_opt2) { + component_sat(temp); + } + } + } + } + + // now we can be corrupted (it's the dest) + temp.flags |= CORRUPTIBLE; +} + +void GGLAssembler::build_blend_factor( + integer_t& factor, int f, int component, + const pixel_t& dst_pixel, + integer_t& fragment, + integer_t& fb, + Scratch& scratches) +{ + integer_t src_alpha(fragment); + + // src_factor/dst_factor won't be used after blending, + // so it's fine to mark them as CORRUPTIBLE (if not aliased) + factor.flags |= CORRUPTIBLE; + + switch(f) { + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) { + // we're processing alpha, so we already have + // src-alpha in fragment, and we need src-alpha just this time. + } else { + // alpha-src will be needed for other components + if (!mBlendFactorCached || mBlendFactorCached==f) { + src_alpha = mAlphaSource; + factor = mAlphaSource; + factor.flags &= ~CORRUPTIBLE; + // we already computed the blend factor before, nothing to do. + if (mBlendFactorCached) + return; + // this is the first time, make sure to compute the blend + // factor properly. 
+ mBlendFactorCached = f; + break; + } else { + // we have a cached alpha blend factor, but we want another one, + // this should really not happen because by construction, + // we cannot have BOTH source and destination + // blend factors use ALPHA *and* ONE_MINUS_ALPHA (because + // the blending stage uses the f/(1-f) optimization + + // for completeness, we handle this case though. Since there + // are only 2 choices, this meens we want "the other one" + // (1-factor) + factor = mAlphaSource; + factor.flags &= ~CORRUPTIBLE; + RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s))); + mBlendFactorCached = f; + return; + } + } + // fall-through... + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + case GGL_SRC_ALPHA_SATURATE: + // help us find out what register we can use for the blend-factor + // CORRUPTIBLE registers are chosen first, or a new one is allocated. + if (fragment.flags & CORRUPTIBLE) { + factor.setTo(fragment.reg, 32, CORRUPTIBLE); + fragment.flags &= ~CORRUPTIBLE; + } else if (fb.flags & CORRUPTIBLE) { + factor.setTo(fb.reg, 32, CORRUPTIBLE); + fb.flags &= ~CORRUPTIBLE; + } else { + factor.setTo(scratches.obtain(), 32, CORRUPTIBLE); + } + break; + } + + // XXX: doesn't work if size==1 + + switch(f) { + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + factor.s = fb.s; + ADD(AL, 0, factor.reg, fb.reg, reg_imm(fb.reg, LSR, fb.s-1)); + break; + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + factor.s = fragment.s; + ADD(AL, 0, factor.reg, fragment.reg, + reg_imm(fragment.reg, LSR, fragment.s-1)); + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + factor.s = src_alpha.s; + ADD(AL, 0, factor.reg, src_alpha.reg, + reg_imm(src_alpha.reg, LSR, src_alpha.s-1)); + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + // XXX: should be precomputed + extract(factor, dst_pixel, GGLFormat::ALPHA); + ADD(AL, 0, 
factor.reg, factor.reg, + reg_imm(factor.reg, LSR, factor.s-1)); + break; + case GGL_SRC_ALPHA_SATURATE: + // XXX: should be precomputed + // XXX: f = min(As, 1-Ad) + // btw, we're guaranteed that Ad's size is <= 8, because + // it's extracted from the framebuffer + break; + } + + switch(f) { + case GGL_ONE_MINUS_DST_COLOR: + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_ONE_MINUS_SRC_ALPHA: + RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s))); + } + + // don't need more than 8-bits for the blend factor + // and this will prevent overflows in the multiplies later + if (factor.s > 8) { + MOV(AL, 0, factor.reg, reg_imm(factor.reg, LSR, factor.s-8)); + factor.s = 8; + } +} + +int GGLAssembler::blending_codes(int fs, int fd) +{ + int blending = 0; + switch(fs) { + case GGL_ONE: + blending |= BLEND_SRC; + break; + + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + blending |= FACTOR_DST|BLEND_SRC; + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + // no need to extract 'component' from the destination + // for the blend factor, because we need ALPHA only. + blending |= BLEND_SRC; + break; + + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + blending |= FACTOR_SRC|BLEND_SRC; + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + case GGL_SRC_ALPHA_SATURATE: + blending |= FACTOR_SRC|BLEND_SRC; + break; + } + switch(fd) { + case GGL_ONE: + blending |= BLEND_DST; + break; + + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + blending |= FACTOR_DST|BLEND_DST; + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + blending |= FACTOR_DST|BLEND_DST; + break; + + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + blending |= FACTOR_SRC|BLEND_DST; + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + // no need to extract 'component' from the source + // for the blend factor, because we need ALPHA only. 
+ blending |= BLEND_DST; + break; + } + return blending; +} + +// --------------------------------------------------------------------------- + +// Emits R = S*f + D*(1-f), evaluated as (S-D)*f + D through mul_factor_add. +// S is 'fragment', D is 'fb'; the result is produced in 'temp'. +void GGLAssembler::build_blendFOneMinusF( + component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb) +{ + // R = S*f + D*(1-f) = (S-D)*f + D + Scratch scratches(registerFile()); + // compute S-D (S is shifted to D's size when the sizes differ) + integer_t diff(fragment.flags & CORRUPTIBLE ? + fragment.reg : scratches.obtain(), fb.size(), CORRUPTIBLE); + const int shift = fragment.size() - fb.size(); + if (shift>0) RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift)); + else if (shift<0) RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift)); + else RSB(AL, 0, diff.reg, fb.reg, fragment.reg); + mul_factor_add(temp, diff, factor, component_t(fb)); +} + +// Emits R = S*(1-f) + D*f, evaluated as (D-S)*f + S through mul_factor_add. +// S is 'fragment', D is 'fb'; the result is produced in 'temp'. +void GGLAssembler::build_blendOneMinusFF( + component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb) +{ + // R = S*(1-f) + D*f = (D-S)*f + S + Scratch scratches(registerFile()); + // compute D-S (S is shifted to D's size when the sizes differ) + integer_t diff(fb.flags & CORRUPTIBLE ? + fb.reg : scratches.obtain(), fb.size(), CORRUPTIBLE); + const int shift = fragment.size() - fb.size(); + if (shift>0) SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift)); + else if (shift<0) SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift)); + else SUB(AL, 0, diff.reg, fb.reg, fragment.reg); + mul_factor_add(temp, diff, factor, component_t(fragment)); +} + +// --------------------------------------------------------------------------- + +// Emits d = v * f. The multiply variant and any pre-shift of an operand +// are chosen from the operand sizes; d.h / d.l are then updated to describe +// where the product's significant bits sit. +void GGLAssembler::mul_factor( component_t& d, + const integer_t& v, + const integer_t& f) +{ + int vs = v.size(); + int fs = f.size(); + int ms = vs+fs; + + // XXX: we could have special cases for 1 bit mul + + // all this code below to use the best multiply instruction + // wrt the parameters size. 
We take advantage of the fact + // that the 16-bits multiplies allow a 16-bit shift + // The trick is that we just make sure that we have at least 8-bits + // per component (which is enough for a 8 bits display). + + int xy; + int vshift = 0; + int fshift = 0; + int smulw = 0; + + // Selection table: operands below 16 bits use the bottom halfword, + // operands of 24..31 bits use the top halfword (dropping 16 bits), + // and operands in between are pre-shifted down to 15 bits. + if (vs<16) { + if (fs<16) { + xy = xyBB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + ms -= 16; + xy = xyTB; + } else { + // eg: 15 * 18 -> 15 * 15 + fshift = fs - 15; + ms -= fshift; + xy = xyBB; + } + } else if (GGL_BETWEEN(vs, 24, 31)) { + if (fs<16) { + ms -= 16; + xy = xyTB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + ms -= 32; + xy = xyTT; + } else { + // eg: 24 * 18 -> 8 * 18 + fshift = fs - 15; + ms -= 16 + fshift; + xy = xyTB; + } + } else { + if (fs<16) { + // eg: 18 * 15 -> 15 * 15 + vshift = vs - 15; + ms -= vshift; + xy = xyBB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + // eg: 18 * 24 -> 15 * 8 + vshift = vs - 15; + ms -= 16 + vshift; + xy = xyBT; + } else { + // eg: 18 * 18 -> (15 * 18)>>16 + fshift = fs - 15; + ms -= 16 + fshift; + xy = yB; //XXX SMULWB + smulw = 1; + } + } + + LOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs); + + // vshift and fshift are never both set above, so d.reg can serve as + // the temporary for whichever operand needs the pre-shift. + int vreg = v.reg; + int freg = f.reg; + if (vshift) { + MOV(AL, 0, d.reg, reg_imm(vreg, LSR, vshift)); + vreg = d.reg; + } + if (fshift) { + // shift the factor itself (this used to shift vreg, which made the + // multiply use v>>fshift as the factor). + // NOTE(review): this overwrites d.reg before the multiply reads v; + // assumes d.reg != v.reg on this path -- confirm against callers. + MOV(AL, 0, d.reg, reg_imm(freg, LSR, fshift)); + freg = d.reg; + } + if (smulw) SMULW(AL, xy, d.reg, vreg, freg); + else SMUL(AL, xy, d.reg, vreg, freg); + + + d.h = ms; + if (mDithering) { + d.l = 0; + } else { + d.l = fs; + d.flags |= CLEAR_LO; + } +} + +void GGLAssembler::mul_factor_add( component_t& d, + const integer_t& v, + const integer_t& f, + const component_t& a) +{ + // XXX: we could have special cases for 1 bit mul + Scratch scratches(registerFile()); + + int vs = v.size(); + int fs = f.size(); + int as = a.h; + int ms = vs+fs; + + LOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as); + + integer_t add(a.reg, a.h, a.flags); + + // 'a' is a component_t but it is 
guaranteed to have + // its high bits set to 0. However in the dithering case, + // we can't get away with truncating the potentially bad bits + // so extraction is needed. + + if ((mDithering) && (a.size() < ms)) { + // we need to expand a + if (!(a.flags & CORRUPTIBLE)) { + // ... but it's not corruptible, so we need to pick a + // temporary register. + // Try to uses the destination register first (it's likely + // to be usable, unless it aliases an input). + if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) { + add.reg = d.reg; + } else { + add.reg = scratches.obtain(); + } + } + expand(add, a, ms); // extracts and expands + as = ms; + } + + if (ms == as) { + if (vs<16 && fs<16) SMLABB(AL, d.reg, v.reg, f.reg, add.reg); + else MLA(AL, 0, d.reg, v.reg, f.reg, add.reg); + } else { + int temp = d.reg; + if (temp == add.reg) { + // the mul will modify add.reg, we need an intermediary reg + if (v.flags & CORRUPTIBLE) temp = v.reg; + else if (f.flags & CORRUPTIBLE) temp = f.reg; + else temp = scratches.obtain(); + } + + if (vs<16 && fs<16) SMULBB(AL, temp, v.reg, f.reg); + else MUL(AL, 0, temp, v.reg, f.reg); + + if (ms>as) { + ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSL, ms-as)); + } else if (ms<as) { + // not sure if we should expand the mul instead? + ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSR, as-ms)); + } + } + + d.h = ms; + if (mDithering) { + d.l = a.l; + } else { + d.l = fs>a.l ? 
fs : a.l; + d.flags |= CLEAR_LO; + } +} + +void GGLAssembler::component_add(component_t& d, + const integer_t& dst, const integer_t& src) +{ + // here we're guaranteed that fragment.size() >= fb.size() + const int shift = src.size() - dst.size(); + if (!shift) { + ADD(AL, 0, d.reg, src.reg, dst.reg); + } else { + ADD(AL, 0, d.reg, src.reg, reg_imm(dst.reg, LSL, shift)); + } + + d.h = src.size(); + if (mDithering) { + d.l = 0; + } else { + d.l = shift; + d.flags |= CLEAR_LO; + } +} + +void GGLAssembler::component_sat(const component_t& v) +{ + const int one = ((1<<v.size())-1)<<v.l; + CMP(AL, v.reg, imm( 1<<v.h )); + if (isValidImmediate(one)) { + MOV(HS, 0, v.reg, imm( one )); + } else if (isValidImmediate(~one)) { + MVN(HS, 0, v.reg, imm( ~one )); + } else { + MOV(HS, 0, v.reg, imm( 1<<v.h )); + SUB(HS, 0, v.reg, v.reg, imm( 1<<v.l )); + } +} + +// ---------------------------------------------------------------------------- + +}; // namespace android + diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c new file mode 100644 index 0000000..4676da0 --- /dev/null +++ b/libpixelflinger/codeflinger/disassem.c @@ -0,0 +1,702 @@ +/* $NetBSD: disassem.c,v 1.14 2003/03/27 16:58:36 mycroft Exp $ */ + +/*- + * Copyright (c) 1996 Mark Brinicombe. + * Copyright (c) 1996 Brini. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Brini. + * 4. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * RiscBSD kernel project + * + * db_disasm.c + * + * Kernel disassembler + * + * Created : 10/02/96 + * + * Structured after the sparc/sparc/db_disasm.c by David S. Miller & + * Paul Kranenburg + * + * This code is not complete. Not all instructions are disassembled. + */ + +#include <sys/cdefs.h> +//__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/arm/arm/disassem.c,v 1.2 2005/01/05 21:58:47 imp Exp $"); +#include <sys/param.h> +#include <stdio.h> + +#include "disassem.h" +#include "armreg.h" +//#include <ddb/ddb.h> + +/* + * General instruction format + * + * insn[cc][mod] [operands] + * + * Those fields with an uppercase format code indicate that the field + * follows directly after the instruction before the separator i.e. + * they modify the instruction rather than just being an operand to + * the instruction. 
The only exception is the writeback flag which + * follows a operand. + * + * + * 2 - print Operand 2 of a data processing instruction + * d - destination register (bits 12-15) + * n - n register (bits 16-19) + * s - s register (bits 8-11) + * o - indirect register rn (bits 16-19) (used by swap) + * m - m register (bits 0-3) + * a - address operand of ldr/str instruction + * e - address operand of ldrh/strh instruction + * l - register list for ldm/stm instruction + * f - 1st fp operand (register) (bits 12-14) + * g - 2nd fp operand (register) (bits 16-18) + * h - 3rd fp operand (register/immediate) (bits 0-4) + * b - branch address + * t - thumb branch address (bits 24, 0-23) + * k - breakpoint comment (bits 0-3, 8-19) + * X - block transfer type + * Y - block transfer type (r13 base) + * c - comment field bits(0-23) + * p - saved or current status register + * F - PSR transfer fields + * D - destination-is-r15 (P) flag on TST, TEQ, CMP, CMN + * L - co-processor transfer size + * S - set status flag + * P - fp precision + * Q - fp precision (for ldf/stf) + * R - fp rounding + * v - co-processor data transfer registers + addressing mode + * W - writeback flag + * x - instruction in hex + * # - co-processor number + * y - co-processor data processing registers + * z - co-processor register transfer registers + */ + +struct arm32_insn { + u_int mask; + u_int pattern; + char* name; + char* format; +}; + +static const struct arm32_insn arm32_i[] = { + { 0x0fffffff, 0x0ff00000, "imb", "c" }, /* Before swi */ + { 0x0fffffff, 0x0ff00001, "imbrange", "c" }, /* Before swi */ + { 0x0f000000, 0x0f000000, "swi", "c" }, + { 0xfe000000, 0xfa000000, "blx", "t" }, /* Before b and bl */ + { 0x0f000000, 0x0a000000, "b", "b" }, + { 0x0f000000, 0x0b000000, "bl", "b" }, + { 0x0fe000f0, 0x00000090, "mul", "Snms" }, + { 0x0fe000f0, 0x00200090, "mla", "Snmsd" }, + { 0x0fe000f0, 0x00800090, "umull", "Sdnms" }, + { 0x0fe000f0, 0x00c00090, "smull", "Sdnms" }, + { 0x0fe000f0, 0x00a00090, 
"umlal", "Sdnms" }, + { 0x0fe000f0, 0x00e00090, "smlal", "Sdnms" }, + { 0x0d700000, 0x04200000, "strt", "daW" }, + { 0x0d700000, 0x04300000, "ldrt", "daW" }, + { 0x0d700000, 0x04600000, "strbt", "daW" }, + { 0x0d700000, 0x04700000, "ldrbt", "daW" }, + { 0x0c500000, 0x04000000, "str", "daW" }, + { 0x0c500000, 0x04100000, "ldr", "daW" }, + { 0x0c500000, 0x04400000, "strb", "daW" }, + { 0x0c500000, 0x04500000, "ldrb", "daW" }, + { 0x0e1f0000, 0x080d0000, "stm", "YnWl" },/* separate out r13 base */ + { 0x0e1f0000, 0x081d0000, "ldm", "YnWl" },/* separate out r13 base */ + { 0x0e100000, 0x08000000, "stm", "XnWl" }, + { 0x0e100000, 0x08100000, "ldm", "XnWl" }, + { 0x0e1000f0, 0x00100090, "ldrb", "deW" }, + { 0x0e1000f0, 0x00000090, "strb", "deW" }, + { 0x0e1000f0, 0x001000d0, "ldrsb", "deW" }, + { 0x0e1000f0, 0x001000b0, "ldrh", "deW" }, + { 0x0e1000f0, 0x000000b0, "strh", "deW" }, + { 0x0e1000f0, 0x001000f0, "ldrsh", "deW" }, + { 0x0f200090, 0x00200090, "und", "x" }, /* Before data processing */ + { 0x0e1000d0, 0x000000d0, "und", "x" }, /* Before data processing */ + { 0x0ff00ff0, 0x01000090, "swp", "dmo" }, + { 0x0ff00ff0, 0x01400090, "swpb", "dmo" }, + { 0x0fbf0fff, 0x010f0000, "mrs", "dp" }, /* Before data processing */ + { 0x0fb0fff0, 0x0120f000, "msr", "pFm" },/* Before data processing */ + { 0x0fb0f000, 0x0320f000, "msr", "pF2" },/* Before data processing */ + { 0x0ffffff0, 0x012fff10, "bx", "m" }, + { 0x0fff0ff0, 0x016f0f10, "clz", "dm" }, + { 0x0ffffff0, 0x012fff30, "blx", "m" }, + { 0xfff000f0, 0xe1200070, "bkpt", "k" }, + { 0x0de00000, 0x00000000, "and", "Sdn2" }, + { 0x0de00000, 0x00200000, "eor", "Sdn2" }, + { 0x0de00000, 0x00400000, "sub", "Sdn2" }, + { 0x0de00000, 0x00600000, "rsb", "Sdn2" }, + { 0x0de00000, 0x00800000, "add", "Sdn2" }, + { 0x0de00000, 0x00a00000, "adc", "Sdn2" }, + { 0x0de00000, 0x00c00000, "sbc", "Sdn2" }, + { 0x0de00000, 0x00e00000, "rsc", "Sdn2" }, + { 0x0df00000, 0x01100000, "tst", "Dn2" }, + { 0x0df00000, 0x01300000, "teq", "Dn2" }, + 
{ 0x0df00000, 0x01500000, "cmp", "Dn2" }, + { 0x0df00000, 0x01700000, "cmn", "Dn2" }, + { 0x0de00000, 0x01800000, "orr", "Sdn2" }, + { 0x0de00000, 0x01a00000, "mov", "Sd2" }, + { 0x0de00000, 0x01c00000, "bic", "Sdn2" }, + { 0x0de00000, 0x01e00000, "mvn", "Sd2" }, + { 0x0ff08f10, 0x0e000100, "adf", "PRfgh" }, + { 0x0ff08f10, 0x0e100100, "muf", "PRfgh" }, + { 0x0ff08f10, 0x0e200100, "suf", "PRfgh" }, + { 0x0ff08f10, 0x0e300100, "rsf", "PRfgh" }, + { 0x0ff08f10, 0x0e400100, "dvf", "PRfgh" }, + { 0x0ff08f10, 0x0e500100, "rdf", "PRfgh" }, + { 0x0ff08f10, 0x0e600100, "pow", "PRfgh" }, + { 0x0ff08f10, 0x0e700100, "rpw", "PRfgh" }, + { 0x0ff08f10, 0x0e800100, "rmf", "PRfgh" }, + { 0x0ff08f10, 0x0e900100, "fml", "PRfgh" }, + { 0x0ff08f10, 0x0ea00100, "fdv", "PRfgh" }, + { 0x0ff08f10, 0x0eb00100, "frd", "PRfgh" }, + { 0x0ff08f10, 0x0ec00100, "pol", "PRfgh" }, + { 0x0f008f10, 0x0e000100, "fpbop", "PRfgh" }, + { 0x0ff08f10, 0x0e008100, "mvf", "PRfh" }, + { 0x0ff08f10, 0x0e108100, "mnf", "PRfh" }, + { 0x0ff08f10, 0x0e208100, "abs", "PRfh" }, + { 0x0ff08f10, 0x0e308100, "rnd", "PRfh" }, + { 0x0ff08f10, 0x0e408100, "sqt", "PRfh" }, + { 0x0ff08f10, 0x0e508100, "log", "PRfh" }, + { 0x0ff08f10, 0x0e608100, "lgn", "PRfh" }, + { 0x0ff08f10, 0x0e708100, "exp", "PRfh" }, + { 0x0ff08f10, 0x0e808100, "sin", "PRfh" }, + { 0x0ff08f10, 0x0e908100, "cos", "PRfh" }, + { 0x0ff08f10, 0x0ea08100, "tan", "PRfh" }, + { 0x0ff08f10, 0x0eb08100, "asn", "PRfh" }, + { 0x0ff08f10, 0x0ec08100, "acs", "PRfh" }, + { 0x0ff08f10, 0x0ed08100, "atn", "PRfh" }, + { 0x0f008f10, 0x0e008100, "fpuop", "PRfh" }, + { 0x0e100f00, 0x0c000100, "stf", "QLv" }, + { 0x0e100f00, 0x0c100100, "ldf", "QLv" }, + { 0x0ff00f10, 0x0e000110, "flt", "PRgd" }, + { 0x0ff00f10, 0x0e100110, "fix", "PRdh" }, + { 0x0ff00f10, 0x0e200110, "wfs", "d" }, + { 0x0ff00f10, 0x0e300110, "rfs", "d" }, + { 0x0ff00f10, 0x0e400110, "wfc", "d" }, + { 0x0ff00f10, 0x0e500110, "rfc", "d" }, + { 0x0ff0ff10, 0x0e90f110, "cmf", "PRgh" }, + { 0x0ff0ff10, 
0x0eb0f110, "cnf", "PRgh" }, + { 0x0ff0ff10, 0x0ed0f110, "cmfe", "PRgh" }, + { 0x0ff0ff10, 0x0ef0f110, "cnfe", "PRgh" }, + { 0xff100010, 0xfe000010, "mcr2", "#z" }, + { 0x0f100010, 0x0e000010, "mcr", "#z" }, + { 0xff100010, 0xfe100010, "mrc2", "#z" }, + { 0x0f100010, 0x0e100010, "mrc", "#z" }, + { 0xff000010, 0xfe000000, "cdp2", "#y" }, + { 0x0f000010, 0x0e000000, "cdp", "#y" }, + { 0xfe100090, 0xfc100000, "ldc2", "L#v" }, + { 0x0e100090, 0x0c100000, "ldc", "L#v" }, + { 0xfe100090, 0xfc000000, "stc2", "L#v" }, + { 0x0e100090, 0x0c000000, "stc", "L#v" }, + { 0xf550f000, 0xf550f000, "pld", "ne" }, + { 0x0ff00ff0, 0x01000050, "qaad", "dmn" }, + { 0x0ff00ff0, 0x01400050, "qdaad", "dmn" }, + { 0x0ff00ff0, 0x01600050, "qdsub", "dmn" }, + { 0x0ff00ff0, 0x01200050, "dsub", "dmn" }, + { 0x0ff000f0, 0x01000080, "smlabb", "nmsd" }, // d & n inverted!! + { 0x0ff000f0, 0x010000a0, "smlatb", "nmsd" }, // d & n inverted!! + { 0x0ff000f0, 0x010000c0, "smlabt", "nmsd" }, // d & n inverted!! + { 0x0ff000f0, 0x010000e0, "smlatt", "nmsd" }, // d & n inverted!! + { 0x0ff000f0, 0x01400080, "smlalbb","ndms" }, // d & n inverted!! + { 0x0ff000f0, 0x014000a0, "smlaltb","ndms" }, // d & n inverted!! + { 0x0ff000f0, 0x014000c0, "smlalbt","ndms" }, // d & n inverted!! + { 0x0ff000f0, 0x014000e0, "smlaltt","ndms" }, // d & n inverted!! + { 0x0ff000f0, 0x01200080, "smlawb", "nmsd" }, // d & n inverted!! + { 0x0ff0f0f0, 0x012000a0, "smulwb","nms" }, // d & n inverted!! + { 0x0ff000f0, 0x012000c0, "smlawt", "nmsd" }, // d & n inverted!! + { 0x0ff0f0f0, 0x012000e0, "smulwt","nms" }, // d & n inverted!! + { 0x0ff0f0f0, 0x01600080, "smulbb","nms" }, // d & n inverted!! + { 0x0ff0f0f0, 0x016000a0, "smultb","nms" }, // d & n inverted!! + { 0x0ff0f0f0, 0x016000c0, "smulbt","nms" }, // d & n inverted!! + { 0x0ff0f0f0, 0x016000e0, "smultt","nms" }, // d & n inverted!! 
+ { 0x00000000, 0x00000000, NULL, NULL } +}; + +static char const arm32_insn_conditions[][4] = { + "eq", "ne", "cs", "cc", + "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", + "gt", "le", "", "nv" +}; + +static char const insn_block_transfers[][4] = { + "da", "ia", "db", "ib" +}; + +static char const insn_stack_block_transfers[][4] = { + "ed", "ea", "fd", "fa" +}; + +static char const op_shifts[][4] = { + "lsl", "lsr", "asr", "ror" +}; + +static char const insn_fpa_rounding[][2] = { + "", "p", "m", "z" +}; + +static char const insn_fpa_precision[][2] = { + "s", "d", "e", "p" +}; + +static char const insn_fpaconstants[][8] = { + "0.0", "1.0", "2.0", "3.0", + "4.0", "5.0", "0.5", "10.0" +}; + +#define insn_condition(x) arm32_insn_conditions[(x >> 28) & 0x0f] +#define insn_blktrans(x) insn_block_transfers[(x >> 23) & 3] +#define insn_stkblktrans(x) insn_stack_block_transfers[(x >> 23) & 3] +#define op2_shift(x) op_shifts[(x >> 5) & 3] +#define insn_fparnd(x) insn_fpa_rounding[(x >> 5) & 0x03] +#define insn_fpaprec(x) insn_fpa_precision[(((x >> 18) & 2)|(x >> 7)) & 1] +#define insn_fpaprect(x) insn_fpa_precision[(((x >> 21) & 2)|(x >> 15)) & 1] +#define insn_fpaimm(x) insn_fpaconstants[x & 0x07] + +/* Local prototypes */ +static void disasm_register_shift(const disasm_interface_t *di, u_int insn); +static void disasm_print_reglist(const disasm_interface_t *di, u_int insn); +static void disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, + u_int loc); +static void disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, + u_int loc); +static void disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, + u_int loc); +static u_int disassemble_readword(u_int address); +static void disassemble_printaddr(u_int address); + +u_int +disasm(const disasm_interface_t *di, u_int loc, int altfmt) +{ + const struct arm32_insn *i_ptr = &arm32_i[0]; + + u_int insn; + int matchp; + int branch; + char* f_ptr; + int fmt; + + fmt = 0; + matchp = 0; + insn = 
di->di_readword(loc); + +/* di->di_printf("loc=%08x insn=%08x : ", loc, insn);*/ + + while (i_ptr->name) { + if ((insn & i_ptr->mask) == i_ptr->pattern) { + matchp = 1; + break; + } + i_ptr++; + } + + if (!matchp) { + di->di_printf("und%s\t%08x\n", insn_condition(insn), insn); + return(loc + INSN_SIZE); + } + + /* If instruction forces condition code, don't print it. */ + if ((i_ptr->mask & 0xf0000000) == 0xf0000000) + di->di_printf("%s", i_ptr->name); + else + di->di_printf("%s%s", i_ptr->name, insn_condition(insn)); + + f_ptr = i_ptr->format; + + /* Insert tab if there are no instruction modifiers */ + + if (*(f_ptr) < 'A' || *(f_ptr) > 'Z') { + ++fmt; + di->di_printf("\t"); + } + + while (*f_ptr) { + switch (*f_ptr) { + /* 2 - print Operand 2 of a data processing instruction */ + case '2': + if (insn & 0x02000000) { + int rotate= ((insn >> 7) & 0x1e); + + di->di_printf("#0x%08x", + (insn & 0xff) << (32 - rotate) | + (insn & 0xff) >> rotate); + } else { + disasm_register_shift(di, insn); + } + break; + /* d - destination register (bits 12-15) */ + case 'd': + di->di_printf("r%d", ((insn >> 12) & 0x0f)); + break; + /* D - insert 'p' if Rd is R15 */ + case 'D': + if (((insn >> 12) & 0x0f) == 15) + di->di_printf("p"); + break; + /* n - n register (bits 16-19) */ + case 'n': + di->di_printf("r%d", ((insn >> 16) & 0x0f)); + break; + /* s - s register (bits 8-11) */ + case 's': + di->di_printf("r%d", ((insn >> 8) & 0x0f)); + break; + /* o - indirect register rn (bits 16-19) (used by swap) */ + case 'o': + di->di_printf("[r%d]", ((insn >> 16) & 0x0f)); + break; + /* m - m register (bits 0-4) */ + case 'm': + di->di_printf("r%d", ((insn >> 0) & 0x0f)); + break; + /* a - address operand of ldr/str instruction */ + case 'a': + disasm_insn_ldrstr(di, insn, loc); + break; + /* e - address operand of ldrh/strh instruction */ + case 'e': + disasm_insn_ldrhstrh(di, insn, loc); + break; + /* l - register list for ldm/stm instruction */ + case 'l': + disasm_print_reglist(di, 
insn); + break; + /* f - 1st fp operand (register) (bits 12-14) */ + case 'f': + di->di_printf("f%d", (insn >> 12) & 7); + break; + /* g - 2nd fp operand (register) (bits 16-18) */ + case 'g': + di->di_printf("f%d", (insn >> 16) & 7); + break; + /* h - 3rd fp operand (register/immediate) (bits 0-4) */ + case 'h': + if (insn & (1 << 3)) + di->di_printf("#%s", insn_fpaimm(insn)); + else + di->di_printf("f%d", insn & 7); + break; + /* b - branch address */ + case 'b': + branch = ((insn << 2) & 0x03ffffff); + if (branch & 0x02000000) + branch |= 0xfc000000; + di->di_printaddr(loc + 8 + branch); + break; + /* t - blx address */ + case 't': + branch = ((insn << 2) & 0x03ffffff) | + (insn >> 23 & 0x00000002); + if (branch & 0x02000000) + branch |= 0xfc000000; + di->di_printaddr(loc + 8 + branch); + break; + /* X - block transfer type */ + case 'X': + di->di_printf("%s", insn_blktrans(insn)); + break; + /* Y - block transfer type (r13 base) */ + case 'Y': + di->di_printf("%s", insn_stkblktrans(insn)); + break; + /* c - comment field bits(0-23) */ + case 'c': + di->di_printf("0x%08x", (insn & 0x00ffffff)); + break; + /* k - breakpoint comment (bits 0-3, 8-19) */ + case 'k': + di->di_printf("0x%04x", + (insn & 0x000fff00) >> 4 | (insn & 0x0000000f)); + break; + /* p - saved or current status register */ + case 'p': + if (insn & 0x00400000) + di->di_printf("spsr"); + else + di->di_printf("cpsr"); + break; + /* F - PSR transfer fields */ + case 'F': + di->di_printf("_"); + if (insn & (1 << 16)) + di->di_printf("c"); + if (insn & (1 << 17)) + di->di_printf("x"); + if (insn & (1 << 18)) + di->di_printf("s"); + if (insn & (1 << 19)) + di->di_printf("f"); + break; + /* B - byte transfer flag */ + case 'B': + if (insn & 0x00400000) + di->di_printf("b"); + break; + /* L - co-processor transfer size */ + case 'L': + if (insn & (1 << 22)) + di->di_printf("l"); + break; + /* S - set status flag */ + case 'S': + if (insn & 0x00100000) + di->di_printf("s"); + break; + /* P - fp precision 
*/ + case 'P': + di->di_printf("%s", insn_fpaprec(insn)); + break; + /* Q - fp precision (for ldf/stf) */ + case 'Q': + break; + /* R - fp rounding */ + case 'R': + di->di_printf("%s", insn_fparnd(insn)); + break; + /* W - writeback flag */ + case 'W': + if (insn & (1 << 21)) + di->di_printf("!"); + break; + /* # - co-processor number */ + case '#': + di->di_printf("p%d", (insn >> 8) & 0x0f); + break; + /* v - co-processor data transfer registers+addressing mode */ + case 'v': + disasm_insn_ldcstc(di, insn, loc); + break; + /* x - instruction in hex */ + case 'x': + di->di_printf("0x%08x", insn); + break; + /* y - co-processor data processing registers */ + case 'y': + di->di_printf("%d, ", (insn >> 20) & 0x0f); + + di->di_printf("c%d, c%d, c%d", (insn >> 12) & 0x0f, + (insn >> 16) & 0x0f, insn & 0x0f); + + di->di_printf(", %d", (insn >> 5) & 0x07); + break; + /* z - co-processor register transfer registers */ + case 'z': + di->di_printf("%d, ", (insn >> 21) & 0x07); + di->di_printf("r%d, c%d, c%d, %d", + (insn >> 12) & 0x0f, (insn >> 16) & 0x0f, + insn & 0x0f, (insn >> 5) & 0x07); + +/* if (((insn >> 5) & 0x07) != 0) + di->di_printf(", %d", (insn >> 5) & 0x07);*/ + break; + default: + di->di_printf("[%c - unknown]", *f_ptr); + break; + } + if (*(f_ptr+1) >= 'A' && *(f_ptr+1) <= 'Z') + ++f_ptr; + else if (*(++f_ptr)) { + ++fmt; + if (fmt == 1) + di->di_printf("\t"); + else + di->di_printf(", "); + } + }; + + di->di_printf("\n"); + + return(loc + INSN_SIZE); +} + + +static void +disasm_register_shift(const disasm_interface_t *di, u_int insn) +{ + di->di_printf("r%d", (insn & 0x0f)); + if ((insn & 0x00000ff0) == 0) + ; + else if ((insn & 0x00000ff0) == 0x00000060) + di->di_printf(", rrx"); + else { + if (insn & 0x10) + di->di_printf(", %s r%d", op2_shift(insn), + (insn >> 8) & 0x0f); + else + di->di_printf(", %s #%d", op2_shift(insn), + (insn >> 7) & 0x1f); + } +} + + +static void +disasm_print_reglist(const disasm_interface_t *di, u_int insn) +{ + int loop; + int 
start; + int comma; + + di->di_printf("{"); + start = -1; + comma = 0; + + for (loop = 0; loop < 17; ++loop) { + if (start != -1) { + if (loop == 16 || !(insn & (1 << loop))) { + if (comma) + di->di_printf(", "); + else + comma = 1; + if (start == loop - 1) + di->di_printf("r%d", start); + else + di->di_printf("r%d-r%d", start, loop - 1); + start = -1; + } + } else { + if (insn & (1 << loop)) + start = loop; + } + } + di->di_printf("}"); + + if (insn & (1 << 22)) + di->di_printf("^"); +} + +static void +disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, u_int loc) +{ + int offset; + + offset = insn & 0xfff; + if ((insn & 0x032f0000) == 0x010f0000) { + /* rA = pc, immediate index */ + if (insn & 0x00800000) + loc += offset; + else + loc -= offset; + di->di_printaddr(loc + 8); + } else { + di->di_printf("[r%d", (insn >> 16) & 0x0f); + if ((insn & 0x03000fff) != 0x01000000) { + di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]"); + if (!(insn & 0x00800000)) + di->di_printf("-"); + if (insn & (1 << 25)) + disasm_register_shift(di, insn); + else + di->di_printf("#0x%03x", offset); + } + if (insn & (1 << 24)) + di->di_printf("]"); + } +} + +static void +disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, u_int loc) +{ + int offset; + + offset = ((insn & 0xf00) >> 4) | (insn & 0xf); + if ((insn & 0x004f0000) == 0x004f0000) { + /* rA = pc, immediate index */ + if (insn & 0x00800000) + loc += offset; + else + loc -= offset; + di->di_printaddr(loc + 8); + } else { + di->di_printf("[r%d", (insn >> 16) & 0x0f); + if ((insn & 0x01400f0f) != 0x01400000) { + di->di_printf("%s, ", (insn & (1 << 24)) ? 
"" : "]"); + if (!(insn & 0x00800000)) + di->di_printf("-"); + if (insn & (1 << 22)) + di->di_printf("#0x%02x", offset); + else + di->di_printf("r%d", (insn & 0x0f)); + } + if (insn & (1 << 24)) + di->di_printf("]"); + } +} + +static void +disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, u_int loc) +{ + if (((insn >> 8) & 0xf) == 1) + di->di_printf("f%d, ", (insn >> 12) & 0x07); + else + di->di_printf("c%d, ", (insn >> 12) & 0x0f); + + di->di_printf("[r%d", (insn >> 16) & 0x0f); + + di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]"); + + if (!(insn & (1 << 23))) + di->di_printf("-"); + + di->di_printf("#0x%03x", (insn & 0xff) << 2); + + if (insn & (1 << 24)) + di->di_printf("]"); + + if (insn & (1 << 21)) + di->di_printf("!"); +} + +static u_int +disassemble_readword(u_int address) +{ + return(*((u_int *)address)); +} + +static void +disassemble_printaddr(u_int address) +{ + printf("0x%08x", address); +} + +static const disasm_interface_t disassemble_di = { + disassemble_readword, disassemble_printaddr, printf +}; + +void +disassemble(u_int address) +{ + + (void)disasm(&disassemble_di, address, 0); +} + +/* End of disassem.c */ diff --git a/libpixelflinger/codeflinger/disassem.h b/libpixelflinger/codeflinger/disassem.h new file mode 100644 index 0000000..02747cd --- /dev/null +++ b/libpixelflinger/codeflinger/disassem.h @@ -0,0 +1,65 @@ +/* $NetBSD: disassem.h,v 1.4 2001/03/04 04:15:58 matt Exp $ */ + +/*- + * Copyright (c) 1997 Mark Brinicombe. + * Copyright (c) 1997 Causality Limited. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Mark Brinicombe. + * 4. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Define the interface structure required by the disassembler. 
+ * + * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/disassem.h,v 1.2 2005/01/05 21:58:48 imp Exp $ + */ + +#ifndef ANDROID_MACHINE_DISASSEM_H +#define ANDROID_MACHINE_DISASSEM_H + +#include <sys/types.h> + +#if __cplusplus +extern "C" { +#endif + +typedef struct { + u_int (*di_readword)(u_int); + void (*di_printaddr)(u_int); + void (*di_printf)(const char *, ...); +} disasm_interface_t; + +/* Prototypes for callable functions */ + +u_int disasm(const disasm_interface_t *, u_int, int); +void disassemble(u_int); + +#if __cplusplus +} +#endif + +#endif /* !ANDROID_MACHINE_DISASSEM_H */ diff --git a/libpixelflinger/codeflinger/load_store.cpp b/libpixelflinger/codeflinger/load_store.cpp new file mode 100644 index 0000000..514ce07 --- /dev/null +++ b/libpixelflinger/codeflinger/load_store.cpp @@ -0,0 +1,378 @@ +/* libs/pixelflinger/codeflinger/load_store.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#include <assert.h> +#include <stdio.h> +#include <cutils/log.h> + +#include "codeflinger/GGLAssembler.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +void GGLAssembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags) +{ + const int bits = addr.size; + const int inc = (flags & WRITE_BACK)?1:0; + switch (bits) { + case 32: + if (inc) STR(AL, s.reg, addr.reg, immed12_post(4)); + else STR(AL, s.reg, addr.reg); + break; + case 24: + // 24 bits formats are a little special and used only for RGB + // 0x00BBGGRR is unpacked as R,G,B + STRB(AL, s.reg, addr.reg, immed12_pre(0)); + MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8)); + STRB(AL, s.reg, addr.reg, immed12_pre(1)); + MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8)); + STRB(AL, s.reg, addr.reg, immed12_pre(2)); + if (!(s.flags & CORRUPTIBLE)) { + MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 16)); + } + if (inc) + ADD(AL, 0, addr.reg, addr.reg, imm(3)); + break; + case 16: + if (inc) STRH(AL, s.reg, addr.reg, immed8_post(2)); + else STRH(AL, s.reg, addr.reg); + break; + case 8: + if (inc) STRB(AL, s.reg, addr.reg, immed12_post(1)); + else STRB(AL, s.reg, addr.reg); + break; + } +} + +void GGLAssembler::load(const pointer_t& addr, const pixel_t& s, uint32_t flags) +{ + Scratch scratches(registerFile()); + int s0; + + const int bits = addr.size; + const int inc = (flags & WRITE_BACK)?1:0; + switch (bits) { + case 32: + if (inc) LDR(AL, s.reg, addr.reg, immed12_post(4)); + else LDR(AL, s.reg, addr.reg); + break; + case 24: + // 24 bits formats are a little special and used only for RGB + // R,G,B is packed as 0x00BBGGRR + s0 = scratches.obtain(); + if (s.reg != addr.reg) { + LDRB(AL, s.reg, addr.reg, immed12_pre(0)); // R + LDRB(AL, s0, addr.reg, immed12_pre(1)); // G + ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 8)); + LDRB(AL, s0, addr.reg, immed12_pre(2)); // B + ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 16)); + } else { + int s1 = scratches.obtain(); + 
LDRB(AL, s1, addr.reg, immed12_pre(0)); // R + LDRB(AL, s0, addr.reg, immed12_pre(1)); // G + ORR(AL, 0, s1, s1, reg_imm(s0, LSL, 8)); + LDRB(AL, s0, addr.reg, immed12_pre(2)); // B + ORR(AL, 0, s.reg, s1, reg_imm(s0, LSL, 16)); + } + if (inc) + ADD(AL, 0, addr.reg, addr.reg, imm(3)); + break; + case 16: + if (inc) LDRH(AL, s.reg, addr.reg, immed8_post(2)); + else LDRH(AL, s.reg, addr.reg); + break; + case 8: + if (inc) LDRB(AL, s.reg, addr.reg, immed12_post(1)); + else LDRB(AL, s.reg, addr.reg); + break; + } +} + +void GGLAssembler::extract(integer_t& d, int s, int h, int l, int bits) +{ + const int maskLen = h-l; + + assert(maskLen<=8); + assert(h); + + if (h != bits) { + const int mask = ((1<<maskLen)-1) << l; + if (isValidImmediate(mask)) { + AND(AL, 0, d.reg, s, imm(mask)); // component = packed & mask; + } else if (isValidImmediate(~mask)) { + BIC(AL, 0, d.reg, s, imm(~mask)); // component = packed & mask; + } else { + MOV(AL, 0, d.reg, reg_imm(s, LSL, 32-h)); + l += 32-h; + h = 32; + } + s = d.reg; + } + + if (l) { + MOV(AL, 0, d.reg, reg_imm(s, LSR, l)); // component = packed >> l; + s = d.reg; + } + + if (s != d.reg) { + MOV(AL, 0, d.reg, s); + } + + d.s = maskLen; +} + +void GGLAssembler::extract(integer_t& d, const pixel_t& s, int component) +{ + extract(d, s.reg, + s.format.c[component].h, + s.format.c[component].l, + s.size()); +} + +void GGLAssembler::extract(component_t& d, const pixel_t& s, int component) +{ + integer_t r(d.reg, 32, d.flags); + extract(r, s.reg, + s.format.c[component].h, + s.format.c[component].l, + s.size()); + d = component_t(r); +} + + +void GGLAssembler::expand(integer_t& d, const component_t& s, int dbits) +{ + if (s.l || (s.flags & CLEAR_HI)) { + extract(d, s.reg, s.h, s.l, 32); + expand(d, d, dbits); + } else { + expand(d, integer_t(s.reg, s.size(), s.flags), dbits); + } +} + +void GGLAssembler::expand(component_t& d, const component_t& s, int dbits) +{ + integer_t r(d.reg, 32, d.flags); + expand(r, d, dbits); + d = 
component_t(r); +} + +void GGLAssembler::expand(integer_t& dst, const integer_t& src, int dbits) +{ + assert(src.size()); + + int sbits = src.size(); + int s = src.reg; + int d = dst.reg; + + // be sure to set 'dst' after we read 'src' as they may be identical + dst.s = dbits; + dst.flags = 0; + + if (dbits<=sbits) { + if (s != d) { + MOV(AL, 0, d, s); + } + return; + } + + if (sbits == 1) { + RSB(AL, 0, d, s, reg_imm(s, LSL, dbits)); + // d = (s<<dbits) - s; + return; + } + + if (dbits % sbits) { + MOV(AL, 0, d, reg_imm(s, LSL, dbits-sbits)); + // d = s << (dbits-sbits); + dbits -= sbits; + do { + ORR(AL, 0, d, d, reg_imm(d, LSR, sbits)); + // d |= d >> sbits; + dbits -= sbits; + sbits *= 2; + } while(dbits>0); + return; + } + + dbits -= sbits; + do { + ORR(AL, 0, d, s, reg_imm(s, LSL, sbits)); + // d |= d<<sbits; + s = d; + dbits -= sbits; + if (sbits*2 < dbits) { + sbits *= 2; + } + } while(dbits>0); +} + +void GGLAssembler::downshift( + pixel_t& d, int component, component_t s, const reg_t& dither) +{ + const needs_t& needs = mBuilderContext.needs; + Scratch scratches(registerFile()); + + int sh = s.h; + int sl = s.l; + int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0; + int maskLoBits = (sl!=0) ? ((s.flags & CLEAR_LO)?1:0) : 0; + int sbits = sh - sl; + + int dh = d.format.c[component].h; + int dl = d.format.c[component].l; + int dbits = dh - dl; + int dithering = 0; + + LOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits); + + if (sbits>dbits) { + // see if we need to dither + dithering = mDithering; + } + + int ireg = d.reg; + if (!(d.flags & FIRST)) { + if (s.flags & CORRUPTIBLE) { + ireg = s.reg; + } else { + ireg = scratches.obtain(); + } + } + d.flags &= ~FIRST; + + if (maskHiBits) { + // we need to mask the high bits (and possibly the lowbits too) + // and we might be able to use immediate mask. 
+ if (!dithering) { + // we don't do this if we only have maskLoBits because we can + // do it more efficiently below (in the case where dl=0) + const int offset = sh - dbits; + if (dbits<=8 && offset >= 0) { + const uint32_t mask = ((1<<dbits)-1) << offset; + if (isValidImmediate(mask) || isValidImmediate(~mask)) { + build_and_immediate(ireg, s.reg, mask, 32); + sl = offset; + s.reg = ireg; + sbits = dbits; + maskLoBits = maskHiBits = 0; + } + } + } else { + // in the dithering case though, we need to preserve the lower bits + const uint32_t mask = ((1<<sbits)-1) << sl; + if (isValidImmediate(mask) || isValidImmediate(~mask)) { + build_and_immediate(ireg, s.reg, mask, 32); + s.reg = ireg; + maskLoBits = maskHiBits = 0; + } + } + } + + // XXX: we could special case (maskHiBits & !maskLoBits) + // like we do for maskLoBits below, but it happens very rarely + // that we have maskHiBits only and the conditions necessary to lead + // to better code (like doing d |= s << 24) + + if (maskHiBits) { + MOV(AL, 0, ireg, reg_imm(s.reg, LSL, 32-sh)); + sl += 32-sh; + sh = 32; + s.reg = ireg; + maskHiBits = 0; + } + + // Downsampling should be performed as follows: + // V * ((1<<dbits)-1) / ((1<<sbits)-1) + // V * [(1<<dbits)/((1<<sbits)-1) - 1/((1<<sbits)-1)] + // V * [1/((1<<sbits)-1)>>dbits - 1/((1<<sbits)-1)] + // V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/((1<<sbits)-1)>>sbits + // V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/(1-(1>>sbits)) + // + // By approximating (1>>dbits) and (1>>sbits) to 0: + // + // V>>(sbits-dbits) - V>>sbits + // + // A good approximation is V>>(sbits-dbits), + // but better one (needed for dithering) is: + // + // (V>>(sbits-dbits)<<sbits - V)>>sbits + // (V<<dbits - V)>>sbits + // (V - V>>dbits)>>(sbits-dbits) + + // Dithering is done here + if (dithering) { + comment("dithering"); + if (sl) { + MOV(AL, 0, ireg, reg_imm(s.reg, LSR, sl)); + sh -= sl; + sl = 0; + s.reg = ireg; + } + // scaling (V-V>>dbits) + SUB(AL, 0, ireg, s.reg, 
reg_imm(s.reg, LSR, dbits)); + const int shift = (GGL_DITHER_BITS - (sbits-dbits)); + if (shift>0) ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSR, shift)); + else if (shift<0) ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSL,-shift)); + else ADD(AL, 0, ireg, ireg, dither.reg); + s.reg = ireg; + } + + if ((maskLoBits|dithering) && (sh > dbits)) { + int shift = sh-dbits; + if (dl) { + MOV(AL, 0, ireg, reg_imm(s.reg, LSR, shift)); + if (ireg == d.reg) { + MOV(AL, 0, d.reg, reg_imm(ireg, LSL, dl)); + } else { + ORR(AL, 0, d.reg, d.reg, reg_imm(ireg, LSL, dl)); + } + } else { + if (ireg == d.reg) { + MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift)); + } else { + ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift)); + } + } + } else { + int shift = sh-dh; + if (shift>0) { + if (ireg == d.reg) { + MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift)); + } else { + ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift)); + } + } else if (shift<0) { + if (ireg == d.reg) { + MOV(AL, 0, d.reg, reg_imm(s.reg, LSL, -shift)); + } else { + ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSL, -shift)); + } + } else { + if (ireg == d.reg) { + if (s.reg != d.reg) { + MOV(AL, 0, d.reg, s.reg); + } + } else { + ORR(AL, 0, d.reg, d.reg, s.reg); + } + } + } +} + +}; // namespace android diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp new file mode 100644 index 0000000..269b6c0 --- /dev/null +++ b/libpixelflinger/codeflinger/texturing.cpp @@ -0,0 +1,1208 @@ +/* libs/pixelflinger/codeflinger/texturing.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+
+namespace android {
+
+// ---------------------------------------------------------------------------
+
+// iterators are initialized like this:
+// (intToFixedCenter(x) * dx)>>16 + x0
+// ((x<<16 + 0x8000) * dx)>>16 + x0
+// ((x<<16)*dx + (0x8000*dx))>>16 + x0
+// ( (x*dx) + dx>>1 ) + x0
+// (x*dx) + (dx>>1 + x0)
+
+// Emits the set-up code for the iterated color, i.e. the code that runs
+// before the per-pixel code.
+//
+//  - mSmooth: every component flagged mInfo[i].iterated gets its start value
+//    loaded from the context (iterators.yd{a,r,g,b}dy); components flagged
+//    mInfo[i].smooth are then advanced by x*dx. Depending on parts.reload
+//    the value stays in its own register or is stored back to
+//    generated_vars.argb[i].c so that it can be reloaded later.
+//  - otherwise: one packed color is loaded, either as 8888 or in the
+//    destination format (pre-masked when mMasking is set).
+//
+// The outcome is recorded in `parts` (iterated_packed, packed, reload,
+// argb[], argb_dx[], iterated). `x` holds the starting x coordinate.
+void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+
+    if (mSmooth) {
+        // NOTE: we could take this case in the mDithering + !mSmooth case,
+        // but this would use up to 4 more registers for the color components
+        // for only a little added quality.
+        // Currently, this causes the system to run out of registers in
+        // some case (see issue #719496)
+
+        comment("compute initial iterated color (smooth and/or dither case)");
+
+        parts.iterated_packed = 0;
+        parts.packed = 0;
+
+        // 0x1: color component
+        // 0x2: iterators
+        const int optReload = mOptLevel >> 1;
+        if (optReload >= 3)         parts.reload = 0; // reload nothing
+        else if (optReload == 2)    parts.reload = 2; // reload iterators
+        else if (optReload == 1)    parts.reload = 1; // reload colors
+        else if (optReload <= 0)    parts.reload = 3; // reload both
+
+        // NOTE(review): this test can never be true here, we are inside
+        // the `if (mSmooth)` branch.
+        if (!mSmooth) {
+            // we're not smoothing (just dithering), we never have to
+            // reload the iterators
+            parts.reload &= ~2;
+        }
+
+        Scratch scratches(registerFile());
+        // when a value is reloaded for each pixel, all the components share
+        // one temporary register (t0 for colors, t1 for iterators)
+        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
+        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated)
+                continue;
+
+            // this component exists in the destination and is not replaced
+            // by a texture unit.
+            // (this `c` shadows the context pointer declared above)
+            const int c = (parts.reload & 1) ? t0 : obtainReg();
+            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
+            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
+            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
+            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
+            parts.argb[i].reg = c;
+
+            if (mInfo[i].smooth) {
+                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
+                const int dvdx = parts.argb_dx[i].reg;
+                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
+                // c += x * dvdx
+                MLA(AL, 0, c, x.reg, dvdx, c);
+
+                // adjust the color iterator to make sure it won't overflow
+                if (!mAA) {
+                    // this is not needed when we're using anti-aliasing
+                    // because we will (have to) clamp the components
+                    // anyway.
+                    // end = c + dvdx * (count>>16); when that is negative
+                    // (MI) it is subtracted from c, and c itself is then
+                    // forced to 0 if negative (BIC with its sign mask).
+                    // NOTE(review): assumes the pixel count is kept in the
+                    // upper 16 bits of parts.count -- confirm.
+                    int end = scratches.obtain();
+                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
+                    MLA(AL, 1, end, dvdx, end, c);
+                    SUB(MI, 0, c, c, end);
+                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
+                    scratches.recycle(end);
+                }
+            }
+
+            if (parts.reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+            }
+        }
+    } else {
+        // We're not smoothed, so we can
+        // just use a packed version of the color and extract the
+        // components as needed (or not at all if we don't blend)
+
+        // figure out if we need the iterated color
+        int load = 0;
+        for (int i=0 ; i<4 ; i++) {
+            component_info_t& info = mInfo[i];
+            if ((info.inDest || info.needed) && !info.replaced)
+                load |= 1;
+        }
+
+        parts.iterated_packed = 1;
+        parts.packed = (!mTextureMachine.mask && !mBlending
+                && !mFog && !mDithering);
+        parts.reload = 0;
+        if (load || parts.packed) {
+            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
+                comment("load initial iterated color (8888 packed)");
+                parts.iterated.setTo(obtainReg(),
+                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
+                CONTEXT_LOAD(parts.iterated.reg, packed8888);
+            } else {
+                comment("load initial iterated color (dest format packed)");
+
+                parts.iterated.setTo(obtainReg(), &mCbFormat);
+
+                // pre-mask the iterated color
+                // `size` has the low `bits` bits set (all ones for 32 bits);
+                // `mask` collects the bit ranges of the components whose
+                // bit is NOT set in mMasking.
+                const int bits = parts.iterated.size();
+                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+                uint32_t mask = 0;
+                if (mMasking) {
+                    for (int i=0 ; i<4 ; i++) {
+                        const int component_mask = 1<<i;
+                        const int h = parts.iterated.format.c[i].h;
+                        const int l = parts.iterated.format.c[i].l;
+                        if (h && (!(mMasking & component_mask))) {
+                            mask |= ((1<<(h-l))-1) << l;
+                        }
+                    }
+                }
+
+                if (mMasking && ((mask & size)==0)) {
+                    // none of the components are present in the mask
+                } else {
+                    CONTEXT_LOAD(parts.iterated.reg, packed);
+                    if (mCbFormat.size == 1) {
+                        AND(AL, 0, parts.iterated.reg,
+                                parts.iterated.reg, imm(0xFF));
+                    } else if (mCbFormat.size == 2) {
+                        MOV(AL, 0, parts.iterated.reg,
+                                reg_imm(parts.iterated.reg, LSR, 16));
+                    }
+                }
+
+                // pre-mask the iterated color
+                if (mMasking) {
+                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
+                            mask, bits);
+                }
+            }
+        }
+    }
+}
+
+// Emits the code that makes the iterated value of color `component`
+// available in `fragment` for the current pixel.
+//
+// `fragment` is first given a fresh CORRUPTIBLE register from `regs`; when
+// the component is not iterated nothing else is emitted. In the
+// "no reload" case that register is given back and `fragment` is simply
+// renamed to the register that already holds the component.
+void GGLAssembler::build_iterated_color(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
+
+    if (!mInfo[component].iterated)
+        return;
+
+    if (parts.iterated_packed) {
+        // iterated colors are packed, extract the one we need
+        extract(fragment, parts.iterated, component);
+    } else {
+        fragment.h = GGL_COLOR_BITS;
+        fragment.l = GGL_COLOR_BITS - 8;
+        fragment.flags |= CLEAR_LO;
+        // iterated colors are held in their own register,
+        // (smooth and/or dithering case)
+        if (parts.reload==3) {
+            // this implies mSmooth
+            // both the color and its increment live in the context:
+            // load the current value, and store value+dx back for the
+            // next pixel.
+            Scratch scratches(registerFile());
+            int dx = scratches.obtain();
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
+            ADD(AL, 0, dx, fragment.reg, dx);
+            CONTEXT_STORE(dx, generated_vars.argb[component].c);
+        } else if (parts.reload & 1) {
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+        } else {
+            // we don't reload, so simply rename the register and mark as
+            // non CORRUPTIBLE so that the texture env or blending code
+            // won't modify this (renamed) register
+            regs.recycle(fragment.reg);
+            fragment.reg = parts.argb[component].reg;
+            fragment.flags &= ~CORRUPTIBLE;
+        }
+        if (mInfo[component].smooth && mAA) {
+            // when using smooth shading AND anti-aliasing, we need to clamp
+            // the iterators because there is always an extra pixel on the
+            // edges, which most of the time will cause an overflow
+            // (since technically it's outside of the domain).
+            BIC(AL, 0, fragment.reg, fragment.reg,
+                    reg_imm(fragment.reg, ASR, 31));
+            component_sat(fragment);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+// Translates the LOGIC_OP field of needs.n into mLogicOp:
+// 0 for a plain copy, otherwise LOGIC_OP combined with LOGIC_OP_SRC and/or
+// LOGIC_OP_DST depending on which operands the operation reads.
+// An opcode that matches none of the cases leaves mLogicOp untouched.
+void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
+{
+    // gather some information about the components we need to process...
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    switch(opcode) {
+    case GGL_COPY:
+        mLogicOp = 0;
+        break;
+    case GGL_CLEAR:
+    case GGL_SET:
+        mLogicOp = LOGIC_OP;
+        break;
+    case GGL_AND:
+    case GGL_AND_REVERSE:
+    case GGL_AND_INVERTED:
+    case GGL_XOR:
+    case GGL_OR:
+    case GGL_NOR:
+    case GGL_EQUIV:
+    case GGL_OR_REVERSE:
+    case GGL_OR_INVERTED:
+    case GGL_NAND:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
+        break;
+    case GGL_NOOP:
+    case GGL_INVERT:
+        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
+        break;
+    case GGL_COPY_INVERTED:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
+        break;
+    };
+}
+
+// Decodes the per-texture-unit needs (needs.t[]) into mTextureMachine.
+//
+// The units are visited from the last one down to the first so that
+// `replaced` accumulates the components written by a GGL_REPLACE unit with a
+// higher index: tmu.replaced is the set of components replaced by later
+// units, and once all four components are replaced the remaining (earlier)
+// units are disabled (format_idx = 0).
+// tmu.mask is the set of components the unit contributes to.
+void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
+{
+    uint8_t replaced=0;
+    mTextureMachine.mask = 0;
+    mTextureMachine.activeUnits = 0;
+    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (replaced == 0xF) {
+            // all components are replaced, skip this TMU.
+            tmu.format_idx = 0;
+            tmu.mask = 0;
+            tmu.replaced = replaced;
+            continue;
+        }
+        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
+        tmu.format = c->formats[tmu.format_idx];
+        tmu.bits = tmu.format.size*8;
+        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
+        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
+        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
+        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
+        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
+                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
+
+        // 5551 linear filtering is not supported
+        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
+            tmu.linear = 0;
+
+        tmu.mask = 0;
+        tmu.replaced = replaced;
+
+        if (tmu.format_idx) {
+            mTextureMachine.activeUnits++;
+            // one mask bit per component present in the texture format
+            if (tmu.format.c[0].h) tmu.mask |= 0x1;
+            if (tmu.format.c[1].h) tmu.mask |= 0x2;
+            if (tmu.format.c[2].h) tmu.mask |= 0x4;
+            if (tmu.format.c[3].h) tmu.mask |= 0x8;
+            if (tmu.env == GGL_REPLACE) {
+                replaced |= tmu.mask;
+            } else if (tmu.env == GGL_DECAL) {
+                if (!tmu.format.c[GGLFormat::ALPHA].h) {
+                    // if we don't have alpha, decal does nothing
+                    tmu.mask = 0;
+                } else {
+                    // decal always ignores At
+                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
+                }
+            }
+        }
+        mTextureMachine.mask |= tmu.mask;
+        //printf("%d: mask=%08lx, replaced=%08lx\n",
+        //    i, int(tmu.mask), int(tmu.replaced));
+    }
+    mTextureMachine.replaced = replaced;
+    mTextureMachine.directTexture = 0;
+    //printf("replaced=%08lx\n", mTextureMachine.replaced);
+}
+
+
+// Emits the code computing the initial texture coordinates of every active
+// texture unit.
+//
+//  - 1:1 textures (both wraps are GGL_NEEDS_WRAP_11): a pointer to the first
+//    texel is built in coords[i].ptr. Note that the registers holding x and
+//    y are modified in place by the emitted code.
+//  - otherwise: s and t are computed in coords[i].s / coords[i].t; when
+//    (mOptLevel&1)==0 they are spilled to generated_vars.texture[i].spill[]
+//    and their registers released.
+//
+// Also records in mTextureMachine.directTexture (as index+1) a unit that
+// satisfies the "direct texture" conditions tested at the end of the loop.
+void GGLAssembler::init_textures(
+        tex_coord_t* coords,
+        const reg_t& x, const reg_t& y)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+    int Rctx = mBuilderContext.Rctx;
+    int Rx = x.reg;
+    int Ry = y.reg;
+
+    if (mTextureMachine.mask) {
+        comment("compute texture coordinates");
+    }
+
+    // init texture coordinates for each tmu
+    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        {
+            // 1:1 texture
+            pointer_t& txPtr = coords[i].ptr;
+            txPtr.setTo(obtainReg(), tmu.bits);
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
+            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
+            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
+            // merge base & offset
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
+            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            base_offset(txPtr, txPtr, Rx);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = coords[i].s;
+            reg_t& t = coords[i].t;
+            // s = (x * dsdx)>>16 + ydsdy
+            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
+            // t = (x * dtdx)>>16 + ydtdy
+            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
+            s.setTo(obtainReg());
+            t.setTo(obtainReg());
+            const int need_w = GGL_READ_NEEDS(W, needs.n);
+            if (need_w) {
+                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
+            } else {
+                int ydsdy = scratches.obtain();
+                int ydtdy = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
+                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
+                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
+                // s = x*dsdx + ydsdy ; t = x*dtdx + ydtdy
+                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
+                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
+            }
+
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                recycleReg(s.reg);
+                recycleReg(t.reg);
+            }
+        }
+
+        // direct texture?
+        if (!multiTexture && !mBlending && !mDithering && !mFog &&
+            cb_format_idx == tmu.format_idx && !tmu.linear &&
+            mTextureMachine.replaced == tmu.mask)
+        {
+                mTextureMachine.directTexture = i + 1;
+        }
+    }
+}
+
+// Emits the per-pixel texel fetch of every active texture unit; the fetched
+// (and, for linear filtering, filtered) texel of unit i ends up described by
+// parts.texel[i].
+//
+// For non 1:1 textures this also wraps/clamps the coordinates, advances s
+// and t to the next pixel, and, for linear filtering, stores the offsets
+// to the neighbouring texels in generated_vars.rt and generated_vars.lb for
+// the filterN() routines.
+void GGLAssembler::build_textures(  fragment_parts_t& parts,
+                                    Scratch& regs)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+    int Rctx = mBuilderContext.Rctx;
+
+    // We don't have a way to spill registers automatically
+    // spill depth and AA regs, when we know we may have to.
+    // build the spill list...
+    uint32_t spill_list = 0;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if (tmu.linear) {
+            // we may run out of register if we have linear filtering
+            // at 1 or 4 bytes / pixel on any texture unit.
+            if (tmu.format.size == 1) {
+                // if depth and AA enabled, we'll run out of 1 register
+                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+            if (tmu.format.size == 4) {
+                // if depth or AA enabled, we'll run out of 1 or 2 registers
+                if (parts.z.reg > 0)
+                    spill_list |= 1<<parts.z.reg;
+                if (parts.covPtr.reg > 0)
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+        }
+    }
+
+    Spill spill(registerFile(), *this, spill_list);
+
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        pointer_t& txPtr = parts.coords[i].ptr;
+        pixel_t& texel = parts.texel[i];
+
+        // repeat...
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        {   // 1:1 textures
+            comment("fetch texel");
+            texel.setTo(regs.obtain(), &tmu.format);
+            load(txPtr, texel, WRITE_BACK);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = parts.coords[i].s;
+            reg_t& t = parts.coords[i].t;
+            if ((mOptLevel&1)==0) {
+                comment("reload s/t (multitexture or linear filtering)");
+                s.reg = scratches.obtain();
+                t.reg = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
+            }
+
+            comment("compute repeat/clamp");
+            int u       = scratches.obtain();
+            int v       = scratches.obtain();
+            int width   = scratches.obtain();
+            int height  = scratches.obtain();
+            int U = 0;
+            int V = 0;
+
+            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
+            CONTEXT_LOAD(height, generated_vars.texture[i].height);
+
+            // number of fractional bits kept in u,v (0 when not filtering)
+            int FRAC_BITS = 0;
+            if (tmu.linear) {
+                // linear interpolation
+                if (tmu.format.size == 1) {
+                    // for 8-bits textures, we can afford
+                    // 7 bits of fractional precision at no
+                    // additional cost (we can't do 8 bits
+                    // because filter8 uses signed 16 bits muls)
+                    FRAC_BITS = 7;
+                } else if (tmu.format.size == 2) {
+                    // filter16() is internally limited to 4 bits, so:
+                    // FRAC_BITS=2 generates less instructions,
+                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
+                    // FRAC_BITS=6+ looks good
+                    FRAC_BITS = 6;
+                } else if (tmu.format.size == 4) {
+                    // filter32() is internally limited to 8 bits, so:
+                    // FRAC_BITS=4 looks good
+                    // FRAC_BITS=5+ looks better, but generates 3 extra ipp
+                    FRAC_BITS = 6;
+                } else {
+                    // for all other cases we use 4 bits.
+                    FRAC_BITS = 4;
+                }
+            }
+            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
+            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
+
+            if (tmu.linear) {
+                comment("compute linear filtering offsets");
+                // pixel size scale
+                const int shift = 31 - gglClz(tmu.format.size);
+                U = scratches.obtain();
+                V = scratches.obtain();
+
+                // sample the texel center
+                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
+                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
+
+                // get the fractional part of U,V
+                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
+                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
+
+                // compute width-1 and height-1
+                SUB(AL, 0, width,  width,  imm(1));
+                SUB(AL, 0, height, height, imm(1));
+
+                // get the integer part of U,V and clamp/wrap
+                // and compute offset to the next texel
+                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // u has already been REPEATed
+                    // a negative u (after centering) wraps to the last
+                    // column; the offset to the right neighbour is one
+                    // pixel, except on the last column where it is
+                    // -(width-1) pixels (back to column 0)
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, width);
+                    CMP(AL, u, width);
+                    MOV(LT, 0, width, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
+                    RSB(GE, 0, width, width, imm(0));
+                } else {
+                    // u has not been CLAMPed yet
+                    // algorithm:
+                    // if ((u>>4) >= width)
+                    //      u = width<<4
+                    //      width = 0
+                    // else
+                    //      width = 1<<shift
+                    // u = u>>4; // get integer part
+                    // if (u<0)
+                    //      u = 0
+                    //      width = 0
+                    // generated_vars.rt = width
+
+                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
+                    MOV(LE, 0, width, imm(0));
+                    MOV(GT, 0, width, imm(1 << shift));
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, imm(0));
+                    MOV(MI, 0, width, imm(0));
+                }
+                // `width` now holds the offset to the right neighbour
+                CONTEXT_STORE(width, generated_vars.rt);
+
+                // the register that held width is reused for the stride
+                const int stride = width;
+                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
+                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // v has already been REPEATed
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, height);
+                    CMP(AL, v, height);
+                    MOV(LT, 0, height, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
+                    RSB(GE, 0, height, height, imm(0));
+                    MUL(AL, 0, height, stride, height);
+                } else {
+                    // v has not been CLAMPed yet
+                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
+                    MOV(LE, 0, height, imm(0));
+                    if (shift) {
+                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
+                    } else {
+                        MOV(GT, 0, height, stride);
+                    }
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, imm(0));
+                    MOV(MI, 0, height, imm(0));
+                }
+                // `height` now holds the offset to the texel one row below
+                CONTEXT_STORE(height, generated_vars.lb);
+            }
+
+            scratches.recycle(width);
+            scratches.recycle(height);
+
+            // iterate texture coordinates...
+            comment("iterate s,t");
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s.reg, s.reg, dsdx);
+            ADD(AL, 0, t.reg, t.reg, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                scratches.recycle(s.reg);
+                scratches.recycle(t.reg);
+            }
+            scratches.recycle(dsdx);
+            scratches.recycle(dtdx);
+
+            // merge base & offset...
+            comment("merge base & offset");
+            // the texel and the texture pointer share the same register
+            texel.setTo(regs.obtain(), &tmu.format);
+            txPtr.setTo(texel.reg, tmu.bits);
+            int stride = scratches.obtain();
+            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            SMLABB(AL, u, v, stride, u);    // u+v*stride
+            base_offset(txPtr, txPtr, u);
+
+            // load texel
+            if (!tmu.linear) {
+                comment("fetch texel");
+                load(txPtr, texel, 0);
+            } else {
+                // recycle registers we don't need anymore
+                scratches.recycle(u);
+                scratches.recycle(v);
+                scratches.recycle(stride);
+
+                comment("fetch texel, bilinear");
+                switch (tmu.format.size) {
+                case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                }
+            }
+        }
+    }
+}
+
+// Emits the code advancing the texture coordinates of every active unit by
+// one pixel, without fetching any texel: the texel pointer is bumped by one
+// pixel for 1:1 textures, s and t are incremented by dsdx / dtdx otherwise
+// (going through the spill slots when (mOptLevel&1)==0).
+void GGLAssembler::build_iterate_texture_coordinates(
+    const fragment_parts_t& parts)
+{
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        {   // 1:1 textures
+            const pointer_t& txPtr = parts.coords[i].ptr;
+            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
+        } else {
+            Scratch scratches(registerFile());
+            int s = parts.coords[i].s.reg;
+            int t = parts.coords[i].t.reg;
+            if ((mOptLevel&1)==0) {
+                s = scratches.obtain();
+                t = scratches.obtain();
+                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
+            }
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s, s, dsdx);
+            ADD(AL, 0, t, t, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
+            }
+        }
+    }
+}
+
+// Emits bilinear filtering for 1 byte/pixel textures (GGL_ALPHA and
+// GGL_LUMINANCE only; any other 8-bit format gets a plain byte load).
+//
+// U and V are registers holding the fractional coordinates on FRAC_BITS
+// bits; txPtr points to the top-left texel, and the offsets to the right
+// and lower neighbours are read from generated_vars.rt / generated_vars.lb.
+// The four texels are accumulated with weights that add up to
+// 1<<(FRAC_BITS*2), which is why the resulting component is described as
+// bits [FRAC_BITS*2, FRAC_BITS*2+8) of texel.reg.
+// Note that the emitted code overwrites the U and V registers.
+void GGLAssembler::filter8(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    if (tmu.format.components != GGL_ALPHA &&
+        tmu.format.components != GGL_LUMINANCE)
+    {
+        // this is a packed format, and we don't support
+        // linear filtering (it's probably RGB 332)
+        // Should not happen with OpenGL|ES
+        LDRB(AL, texel.reg, txPtr.reg);
+        return;
+    }
+
+    // ------------------------
+    // about ~22 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+    int rt   = scratches.obtain();
+    int lb   = scratches.obtain();
+
+    // RB -> U * V
+
+    CONTEXT_LOAD(rt, generated_vars.rt);
+    CONTEXT_LOAD(lb, generated_vars.lb);
+
+    // `offset` shares its register with `pixel`
+    int offset = pixel;
+    ADD(AL, 0, offset, lb, rt);
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    SMULBB(AL, d, pixel, u);
+    // k = weight left for the other 3 texels
+    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
+
+    // LB -> (1-U) * V
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+
+    // RT -> U*(1-V)
+    // (the last weight is whatever remains, so that the weights sum to 1.0)
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
+    SUB(AL, 0, u, k, u);
+    SMLABB(AL, texel.reg, pixel, u, d);
+
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        texel.format.c[i].h = FRAC_BITS*2+8;
+        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
+    }
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_LO;
+}
+
+// Emits bilinear filtering for 2 bytes/pixel textures (RGB_565, RGBA_4444
+// and LA_88; any other 16-bit format is logged and gets a plain halfword
+// load).
+//
+// Each texel is spread over 32 bits (pixel | pixel<<shift, then masked) so
+// that every component has free bits above it, which lets a single multiply
+// weight all the components at once. The weights are reduced to `prec` bits
+// and add up to 1<<prec. texel.format is updated to describe where the
+// components end up in the 32-bit result.
+// Note that the emitted code overwrites the U and V registers.
+void GGLAssembler::filter16(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    // compute the mask
+    // XXX: it would be nice if the mask below could be computed
+    // automatically.
+    uint32_t mask = 0;
+    int shift = 0;
+    int prec = 0;
+    switch (tmu.format_idx) {
+        case GGL_PIXEL_FORMAT_RGB_565:
+            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
+            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
+            mask = 0x07E0F81F;
+            shift = 16;
+            prec = 5;
+            break;
+        case GGL_PIXEL_FORMAT_RGBA_4444:
+            // 0000,1111,0000,1111 | 0000,1111,0000,1111
+            mask = 0x0F0F0F0F;
+            shift = 12;
+            prec = 4;
+            break;
+        case GGL_PIXEL_FORMAT_LA_88:
+            // 0000,0000,1111,1111 | 0000,0000,1111,1111
+            // AALL -> 00AA | 00LL
+            mask = 0x00FF00FF;
+            shift = 8;
+            prec = 8;
+            break;
+        default:
+            // unsupported format, do something sensible...
+            LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
+            LDRH(AL, texel.reg, txPtr.reg);
+            return;
+    }
+
+    // U*V has FRAC_BITS*2 fractional bits; `adjust` brings it down to `prec`
+    const int adjust = FRAC_BITS*2 - prec;
+    const int round  = 0;
+
+    // update the texel format
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_HI|CLEAR_LO;
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        // components not selected by the mask at their original position
+        // are the ones that have been moved up by `shift`
+        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
+        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
+        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
+    }
+
+    // ------------------------
+    // about ~40 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    // RB -> U * V
+    // `offset` shares its register with `pixel`
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, d, pixel, u);
+    // k = weight left for the other 3 texels
+    RSB(AL, 0, k, u, imm(1<<prec));
+
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+
+    // RT -> U*(1-V)
+    // (the last weight is whatever remains, so that the weights sum to 1.0)
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SUB(AL, 0, u, k, u);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    MLA(AL, 0, texel.reg, pixel, u, d);
+}
+
+// Placeholder for 3 bytes/pixel textures: no filtering is done, the texel
+// at txPtr is simply loaded.
+void GGLAssembler::filter24(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    // not supported yet (currently disabled)
+    load(txPtr, texel, 0);
+}
+
+// Emits bilinear filtering for 4 bytes/pixel textures.
+//
+// Each texel is processed as two halves selected with the 0x00FF00FF mask:
+// `dh` accumulates bytes 0 and 2, `dl` accumulates bytes 1 and 3 (taken from
+// pixel>>8). The weights are reduced to 8 bits and add up to 0x100; the two
+// accumulators are recombined into texel.reg at the end.
+// Note that the emitted code overwrites the U and V registers.
+void GGLAssembler::filter32(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    // U*V has FRAC_BITS*2 fractional bits; `adjust` brings it down to 8
+    const int adjust = FRAC_BITS*2 - 8;
+    const int round  = 0;
+
+    // ------------------------
+    // about ~38 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int dh   = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    int temp = scratches.obtain();
+    int dl   = scratches.obtain();
+    int mask = scratches.obtain();
+
+    // mask = 0x00FF00FF
+    MOV(AL, 0, mask, imm(0xFF));
+    ORR(AL, 0, mask, mask, imm(0xFF0000));
+
+    // RB -> U * V
+    // `offset` shares its register with `pixel`
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, dh, temp, u);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MUL(AL, 0, dl, temp, u);
+    // k = weight left for the other 3 texels
+    RSB(AL, 0, k, u, imm(0x100));
+
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    // RT -> U*(1-V)
+    // (the last weight is whatever remains, so that the weights sum to 1.0)
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SUB(AL, 0, u, k, u);
+    AND(AL, 0, temp, mask, pixel);
+    MLA(AL, 0, dh, temp, u, dh);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    // drop the 8 fractional bits of each half and merge them back
+    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
+    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
+    ORR(AL, 0, texel.reg, dh, dl);
+}
+
+// Emits the code applying the texture environment of every texture unit to
+// `fragment` for the given color `component`.
+//
+// A unit is applied only if it contributes to this component (tmu.mask) and
+// the component is not replaced by a later unit (tmu.replaced). With
+// multitexturing, 1:1 textures are fetched here, into a scratch register,
+// rather than using parts.texel[i].reg.
+void GGLAssembler::build_texture_environment(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    const uint32_t component_mask = 1<<component;
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+
+        if (tmu.mask & component_mask) {
+            // replace or modulate with this texture
+            if ((tmu.replaced & component_mask) == 0) {
+                // not replaced by a later tmu...
+
+                Scratch scratches(registerFile());
+                pixel_t texel(parts.texel[i]);
+                if (multiTexture &&
+                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
+                    tmu.twrap == GGL_NEEDS_WRAP_11)
+                {
+                    texel.reg = scratches.obtain();
+                    texel.flags |= CORRUPTIBLE;
+                    comment("fetch texel (multitexture 1:1)");
+                    load(parts.coords[i].ptr, texel, WRITE_BACK);
+                 }
+
+                // keep a description of the incoming fragment, `fragment`
+                // itself becomes the destination
+                component_t incoming(fragment);
+                modify(fragment, regs);
+
+                switch (tmu.env) {
+                case GGL_REPLACE:
+                    extract(fragment, texel, component);
+                    break;
+                case GGL_MODULATE:
+                    modulate(fragment, incoming, texel, component);
+                    break;
+                case GGL_DECAL:
+                    decal(fragment, incoming, texel, component);
+                    break;
+                case GGL_BLEND:
+                    blend(fragment, incoming, texel, component, i);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+// Emits the code computing, in register `d`, the wrapped texture coordinate
+// for the 16.16 coordinate held in register `coord` and the texture
+// dimension held in register `size`.
+//
+// `tx_linear` is the number of fractional bits to keep in the result: the
+// callers in this file pass FRAC_BITS, which is 0 when no filtering is done.
+//  - GGL_NEEDS_WRAP_REPEAT: d = ((coord >> (16-tx_linear)) * size) >> 16
+//  - GGL_NEEDS_WRAP_CLAMP_TO_EDGE: without filtering, the integer part of
+//    coord clamped to [0, size-1]; with filtering only the shift is done
+//    here and the clamping is left to the caller.
+// Nothing is emitted for any other wrap mode.
+void GGLAssembler::wrapping(
+            int d,
+            int coord, int size,
+            int tx_wrap, int tx_linear)
+{
+    // notes:
+    // if tx_linear is set, we need 4 extra bits of precision on the result
+    // SMULL/UMULL is 3 cycles
+    Scratch scratches(registerFile());
+    int c = coord;
+    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
+        // UMULL takes 4 cycles (interlocked), and we can get away with
+        // 2 cycles using SMULWB, but we're losing 16 bits of precision
+        // out of 32 (this is not a problem because the iterator keeps
+        // its full precision)
+        // UMULL(AL, 0, size, d, c, size);
+        // note: we can't use SMULTB because it's signed.
+        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
+        SMULWB(AL, d, d, size);
+    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
+        if (tx_linear) {
+            // 1 cycle
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
+        } else {
+            // 4 cycles (common case)
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
+            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
+            CMP(AL, d, size);
+            SUB(GE, 0, d, size, imm(1));
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+// GGL_MODULATE texture environment: emits dest = incoming * texel for the
+// given component, and updates dest.h / dest.l / dest.flags to describe
+// where the result lies in dest.reg.
+//
+// Three cases are handled: a 1-bit texel (used as a mask on incoming), a
+// 1-bit incoming value (used as a mask on the texel), and the general case
+// where the value with the lowest precision is adjusted and used as the
+// multiplier.
+void GGLAssembler::modulate(
+        component_t& dest,
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
+    extract(texel, incomingTexel, component);
+
+    const int Nt = texel.size();
+        // Nt should always be less than 10 bits because it comes
+        // from the TMU.
+
+    int Ni = incoming.size();
+        // Ni could be big because it comes from previous MODULATEs
+
+    if (Nt == 1) {
+        // texel acts as a bit-mask
+        // dest = incoming & ((texel << incoming.h)-texel)
+        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
+        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
+        dest.l = incoming.l;
+        dest.h = incoming.h;
+        dest.flags |= (incoming.flags & CLEAR_LO);
+    } else if (Ni == 1) {
+        // incoming acts as a bit-mask: its single bit is moved to the sign
+        // bit, then replicated over the whole word with ASR #31
+        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
+        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
+        dest.l = 0;
+        dest.h = Nt;
+    } else {
+        int inReg = incoming.reg;
+        int shift = incoming.l;
+        if ((Nt + Ni) > 32) {
+            // we will overflow, reduce the precision of Ni to 8 bits
+            // (Note Nt cannot be more than 10 bits which happens with
+            // 565 textures and GGL_LINEAR)
+            shift += Ni-8;
+            Ni = 8;
+        }
+
+        // modulate by the component with the lowest precision
+        if (Nt >= Ni) {
+            if (shift) {
+                // XXX: we should be able to avoid this shift
+                // when shift==16 && Nt<16 && Ni<16, in which
+                // we could use SMULBT below.
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Ni)-1)
+            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
+            // this operation doesn't change texel's size
+            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
+            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
+            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
+            dest.l = Ni;
+            dest.h = Nt + Ni;
+        } else {
+            if (shift && (shift != 16)) {
+                // if shift==16, we can use 16-bits mul instructions later
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Nt)-1)
+            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
+            // this operation doesn't change incoming's size
+            Scratch scratches(registerFile());
+            // the adjusted texel must not land in the register that still
+            // holds the incoming value
+            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
+            if (t == inReg)
+                t = scratches.obtain();
+            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
+            if (Nt<16 && Ni<16) {
+                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
+                else            SMULBB(AL, dest.reg, t, inReg);
+            } else              MUL(AL, 0, dest.reg, t, inReg);
+            dest.l = Nt;
+            dest.h = Nt + Ni;
+        }
+
+        // low bits are not valid
+        dest.flags |= CLEAR_LO;
+
+        // no need to keep more than 8 bits/component
+        if (dest.size() > 8)
+            dest.l = dest.h-8;
+    }
+}
+
+// GGL_DECAL texture environment: emits the blend of the incoming fragment
+// with the texel, using the texel's alpha as the factor (see the equations
+// below). The incoming value is first reduced to at most 8 bits, and the
+// actual blend is delegated to build_blendOneMinusFF().
+void GGLAssembler::decal(
+        component_t& dest,
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
+    // Av = Af
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    extract(texel, incomingTexel, component);
+    extract(factor, incomingTexel, GGLFormat::ALPHA);
+
+    // no need to keep more than 8-bits for decal
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    // factor += factor >> (s-1), the same adjustment modulate() applies to
+    // its multiplier
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
+}
+
+// GGL_BLEND texture environment: for the color components, emits the blend
+// of the incoming fragment with the environment color of texture unit `tmu`
+// (read as a byte from the context), using the texel as the factor; the
+// alpha component is simply modulated. See the equations below.
+void GGLAssembler::blend(
+        component_t& dest,
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component, int tmu)
+{
+    // RGBA:
+    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
+    // Av = At*Af
+
+    if (component == GGLFormat::ALPHA) {
+        modulate(dest, incoming, incomingTexel, component);
+        return;
+    }
+
+    Scratch locals(registerFile());
+    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    LDRB(AL, color.reg, mBuilderContext.Rctx,
+            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
+    extract(factor, incomingTexel, component);
+
+    // no need to keep more than 8-bits for blend
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    // factor += factor >> (s-1), the same adjustment modulate() applies to
+    // its multiplier
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, color);
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+ |