47 files changed, 14322 insertions, 0 deletions
diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk
new file mode 100644
index 0000000..50eb5f5
--- /dev/null
+++ b/libpixelflinger/Android.mk
@@ -0,0 +1,92 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+#
+# ARMv6 specific objects
+#
+
+ifeq ($(TARGET_ARCH),arm)
+LOCAL_ASFLAGS := -march=armv6
+LOCAL_SRC_FILES := rotate90CW_4x4_16v6.S
+LOCAL_MODULE := libpixelflinger_armv6
+include $(BUILD_STATIC_LIBRARY)
+endif
+
+#
+# C/C++ and ARMv5 objects
+#
+
+include $(CLEAR_VARS)
+PIXELFLINGER_SRC_FILES:= \
+    codeflinger/ARMAssemblerInterface.cpp \
+    codeflinger/ARMAssemblerProxy.cpp \
+    codeflinger/ARMAssembler.cpp \
+    codeflinger/CodeCache.cpp \
+    codeflinger/GGLAssembler.cpp \
+    codeflinger/load_store.cpp \
+    codeflinger/blending.cpp \
+    codeflinger/texturing.cpp \
+    codeflinger/disassem.c \
+	tinyutils/SharedBuffer.cpp \
+	tinyutils/VectorImpl.cpp \
+	fixed.cpp.arm \
+	picker.cpp.arm \
+	pixelflinger.cpp.arm \
+	trap.cpp.arm \
+	scanline.cpp.arm \
+	format.cpp \
+	clear.cpp \
+	raster.cpp \
+	buffer.cpp
+
+ifeq ($(TARGET_ARCH),arm)
+PIXELFLINGER_SRC_FILES += t32cb16blend.S
+endif
+
+ifeq ($(TARGET_ARCH),arm)
+# special optimization flags for pixelflinger
+PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer
+endif
+
+LOCAL_SHARED_LIBRARIES := libcutils
+
+ifneq ($(TARGET_ARCH),arm)
+# Required to define logging functions on the simulator.
+# TODO: move the simulator logging functions into libcutils with
+# the rest of the basic log stuff.
+LOCAL_SHARED_LIBRARIES += libutils
+endif
+
+#
+# Shared library
+#
+
+LOCAL_MODULE:= libpixelflinger
+LOCAL_SRC_FILES := $(PIXELFLINGER_SRC_FILES)
+LOCAL_CFLAGS := $(PIXELFLINGER_CFLAGS)
+ifneq ($(BUILD_TINY_ANDROID),true)
+# Really this should go away entirely or at least not depend on
+# libhardware, but this at least gets us built.
+LOCAL_SHARED_LIBRARIES += libhardware_legacy
+LOCAL_CFLAGS += -DWITH_LIB_HARDWARE
+endif
+ifeq ($(TARGET_ARCH),arm)
+LOCAL_WHOLE_STATIC_LIBRARIES := libpixelflinger_armv6
+endif
+include $(BUILD_SHARED_LIBRARY)
+
+#
+# Static library version
+#
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libpixelflinger_static
+LOCAL_SRC_FILES := $(PIXELFLINGER_SRC_FILES)
+LOCAL_CFLAGS := $(PIXELFLINGER_CFLAGS) 
+ifeq ($(TARGET_ARCH),arm)
+LOCAL_WHOLE_STATIC_LIBRARIES := libpixelflinger_armv6
+endif
+include $(BUILD_STATIC_LIBRARY)
+
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
diff --git a/libpixelflinger/MODULE_LICENSE_APACHE2 b/libpixelflinger/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libpixelflinger/MODULE_LICENSE_APACHE2
diff --git a/libpixelflinger/NOTICE b/libpixelflinger/NOTICE
new file mode 100644
index 0000000..c5b1efa
--- /dev/null
+++ b/libpixelflinger/NOTICE
@@ -0,0 +1,190 @@
+
+   Copyright (c) 2005-2008, The Android Open Source Project
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
diff --git a/libpixelflinger/buffer.cpp b/libpixelflinger/buffer.cpp
new file mode 100644
index 0000000..af7356b
--- /dev/null
+++ b/libpixelflinger/buffer.cpp
@@ -0,0 +1,384 @@
+/* libs/pixelflinger/buffer.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <assert.h>
+
+#include "buffer.h"
+
+namespace android {
+// ----------------------------------------------------------------------------
+
+static void read_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+static void write_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, const pixel_t* pixel);
+static void readRGB565(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+static void readABGR8888(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+
+static uint32_t logic_op(int op, uint32_t s, uint32_t d);
+static uint32_t extract(uint32_t v, int h, int l, int bits);
+static uint32_t expand(uint32_t v, int sbits, int dbits);
+static uint32_t downshift_component(uint32_t in, uint32_t v,
+        int sh, int sl, int dh, int dl, int ch, int cl, int dither);
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_texture(context_t* c)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_t& t = c->state.texture[i];
+        t.s_coord = GGL_ONE_TO_ONE;
+        t.t_coord = GGL_ONE_TO_ONE;
+        t.s_wrap = GGL_REPEAT;
+        t.t_wrap = GGL_REPEAT;
+        t.min_filter = GGL_NEAREST;
+        t.mag_filter = GGL_NEAREST;
+        t.env = GGL_MODULATE;
+    }
+    c->activeTMU = &(c->state.texture[0]);
+}
+
+void ggl_set_surface(context_t* c, surface_t* dst, const GGLSurface* src)
+{
+    dst->width = src->width;
+    dst->height = src->height;
+    dst->stride = src->stride;
+    dst->data = src->data;
+    dst->format = src->format;
+    dst->dirty = 1;
+    if (__builtin_expect(dst->stride < 0, false)) {
+        const GGLFormat& pixelFormat(c->formats[dst->format]);
+        const int32_t bpr = -dst->stride * pixelFormat.size;
+        dst->data += bpr * (dst->height-1);
+    }
+}
+
+static void pick_read_write(surface_t* s)
+{
+    // Choose best reader/writers.
+    switch (s->format) {
+        case GGL_PIXEL_FORMAT_RGBA_8888:    s->read = readABGR8888;  break;
+        case GGL_PIXEL_FORMAT_RGB_565:      s->read = readRGB565;    break;
+        default:                            s->read = read_pixel;    break;
+    }
+    s->write = write_pixel;
+}
+
+void ggl_pick_texture(context_t* c)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        surface_t& s = c->state.texture[i].surface;
+        if ((!c->state.texture[i].enable) || (!s.dirty))
+            continue;
+        s.dirty = 0;
+        pick_read_write(&s);
+        generated_tex_vars_t& gen = c->generated_vars.texture[i];
+        gen.width   = s.width;
+        gen.height  = s.height;
+        gen.stride  = s.stride;
+        gen.data    = int32_t(s.data);
+    }
+}
+
+void ggl_pick_cb(context_t* c)
+{
+    surface_t& s = c->state.buffers.color;
+    if (s.dirty) {
+        s.dirty = 0;
+        pick_read_write(&s);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+void read_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    assert((x < s->width) && (y < s->height));
+
+    const GGLFormat* f = &(c->formats[s->format]);
+    int32_t index = x + (s->stride * y);
+    uint8_t* const data = s->data + index * f->size;
+    uint32_t v = 0;
+    switch (f->size) {
+        case 1:		v = *data;									break;
+        case 2:		v = *(uint16_t*)data;						break;
+        case 3:		v = (data[2]<<16)|(data[1]<<8)|data[0];     break;
+        case 4:		v = GGL_RGBA_TO_HOST(*(uint32_t*)data);		break;
+    }
+    for (int i=0 ; i<4 ; i++) {
+        pixel->s[i] = f->c[i].h - f->c[i].l;
+        if (pixel->s[i])
+            pixel->c[i] = extract(v,  f->c[i].h,  f->c[i].l, f->size*8);
+    }
+}
+
+void readRGB565(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    uint16_t v = *(reinterpret_cast<uint16_t*>(s->data) + (x + (s->stride * y)));
+    pixel->c[0] = 0;
+    pixel->c[1] = v>>11;
+    pixel->c[2] = (v>>5)&0x3F;
+    pixel->c[3] = v&0x1F;
+    pixel->s[0] = 0;
+    pixel->s[1] = 5;
+    pixel->s[2] = 6;
+    pixel->s[3] = 5;
+}
+
+void readABGR8888(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    uint32_t v = *(reinterpret_cast<uint32_t*>(s->data) + (x + (s->stride * y)));
+    v = GGL_RGBA_TO_HOST(v);
+    pixel->c[0] = v>>24;        // A
+    pixel->c[1] = v&0xFF;       // R
+    pixel->c[2] = (v>>8)&0xFF;  // G
+    pixel->c[3] = (v>>16)&0xFF; // B
+    pixel->s[0] = 
+    pixel->s[1] = 
+    pixel->s[2] = 
+    pixel->s[3] = 8;
+}
+
+void write_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, const pixel_t* pixel)
+{
+    assert((x < s->width) && (y < s->height));
+
+    int dither = -1;
+    if (c->state.enables & GGL_ENABLE_DITHER) {
+        dither = c->ditherMatrix[ (x & GGL_DITHER_MASK) +
+                ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
+    }
+
+    const GGLFormat* f = &(c->formats[s->format]);
+    int32_t index = x + (s->stride * y);
+    uint8_t* const data = s->data + index * f->size;
+        
+    uint32_t mask = 0;
+    uint32_t v = 0;
+    for (int i=0 ; i<4 ; i++) {
+        const int component_mask = 1 << i;
+        if (f->components>=GGL_LUMINANCE &&
+                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
+            // destinations L formats don't have G or B
+            continue;
+        }
+        const int l = f->c[i].l;
+        const int h = f->c[i].h;
+        if (h && (c->state.mask.color & component_mask)) {
+            mask |= (((1<<(h-l))-1)<<l);
+            uint32_t u = pixel->c[i];
+            int32_t pixelSize = pixel->s[i];
+            if (pixelSize < (h-l)) {
+                u = expand(u, pixelSize, h-l);
+                pixelSize = h-l;
+            }
+            v = downshift_component(v, u, pixelSize, 0, h, l, 0, 0, dither);
+        }
+    }
+
+    if ((c->state.mask.color != 0xF) || 
+        (c->state.enables & GGL_ENABLE_LOGIC_OP)) {
+        uint32_t d = 0;
+        switch (f->size) {
+            case 1:	d = *data;									break;
+            case 2:	d = *(uint16_t*)data;						break;
+            case 3:	d = (data[2]<<16)|(data[1]<<8)|data[0];     break;
+            case 4:	d = GGL_RGBA_TO_HOST(*(uint32_t*)data);		break;
+        }
+        if (c->state.enables & GGL_ENABLE_LOGIC_OP) {
+            v = logic_op(c->state.logic_op.opcode, v, d);            
+            v &= mask;
+        }
+        v |= (d & ~mask);
+    }
+
+    switch (f->size) {
+        case 1:		*data = v;									break;
+        case 2:		*(uint16_t*)data = v;						break;
+        case 3:
+            data[0] = v;
+            data[1] = v>>8;
+            data[2] = v>>16;
+            break;
+        case 4:		*(uint32_t*)data = GGL_HOST_TO_RGBA(v);     break;
+    }
+}
+
+static uint32_t logic_op(int op, uint32_t s, uint32_t d)
+{
+    switch(op) {
+    case GGL_CLEAR:         return 0;
+    case GGL_AND:           return s & d;
+    case GGL_AND_REVERSE:   return s & ~d;
+    case GGL_COPY:          return s;
+    case GGL_AND_INVERTED:  return ~s & d;
+    case GGL_NOOP:          return d;
+    case GGL_XOR:           return s ^ d;
+    case GGL_OR:            return s | d;
+    case GGL_NOR:           return ~(s | d);
+    case GGL_EQUIV:         return ~(s ^ d);
+    case GGL_INVERT:        return ~d;
+    case GGL_OR_REVERSE:    return s | ~d;
+    case GGL_COPY_INVERTED: return ~s;
+    case GGL_OR_INVERTED:   return ~s | d;
+    case GGL_NAND:          return ~(s & d);
+    case GGL_SET:           return ~0;
+    };
+    return s;
+}            
+
+
+uint32_t ggl_expand(uint32_t v, int sbits, int dbits)
+{
+    return expand(v, sbits, dbits);
+}
+
+uint32_t ggl_pack_color(context_t* c, int32_t format,
+        GGLcolor r, GGLcolor g, GGLcolor b, GGLcolor a)
+{
+    const GGLFormat* f = &(c->formats[format]);
+    uint32_t p = 0;
+    const int32_t hbits = GGL_COLOR_BITS;
+    const int32_t lbits = GGL_COLOR_BITS - 8;
+    p = downshift_component(p, r,   hbits, lbits,  f->rh, f->rl, 0, 1, -1);
+    p = downshift_component(p, g,   hbits, lbits,  f->gh, f->gl, 0, 1, -1);
+    p = downshift_component(p, b,   hbits, lbits,  f->bh, f->bl, 0, 1, -1);
+    p = downshift_component(p, a,   hbits, lbits,  f->ah, f->al, 0, 1, -1);
+    switch (f->size) {
+    case 1: p |= p << 8;    // fallthrough
+    case 2: p |= p << 16;
+    }
+    return p;
+}
+
+// ----------------------------------------------------------------------------
+
+// extract a component from a word
+uint32_t extract(uint32_t v, int h, int l, int bits)
+{
+	assert(h);
+	if (l) {
+		v >>= l;
+	}
+	if (h != bits) {
+		v &= (1<<(h-l))-1;
+	}
+	return v;
+}
+
+// expand a component from sbits to dbits
+uint32_t expand(uint32_t v, int sbits, int dbits)
+{
+    if (dbits > sbits) {
+        assert(sbits);
+        if (sbits==1) {
+            v = (v<<dbits) - v;
+        } else {
+            if (dbits % sbits) {
+                v <<= (dbits-sbits);
+                dbits -= sbits;
+                do {
+                    v |= v>>sbits;
+                    dbits -= sbits;
+                    sbits *= 2;
+                } while (dbits>0);
+            } else {
+                dbits -= sbits;
+                do {
+                    v |= v<<sbits;
+                    dbits -= sbits;
+                    if (sbits*2 < dbits) {
+                        sbits *= 2;
+                    }
+                } while (dbits > 0);
+            }
+        }
+    }
+	return v;
+}
+
+// downsample a component from sbits to dbits
+// and shift / construct the pixel
+uint32_t downshift_component(	uint32_t in, uint32_t v,
+                                int sh, int sl,		// src
+                                int dh, int dl,		// dst
+                                int ch, int cl,		// clear
+                                int dither)
+{
+	const int sbits = sh-sl;
+	const int dbits = dh-dl;
+    
+	assert(sbits>=dbits);
+
+
+    if (sbits>dbits) {
+        if (dither>=0) {
+            v -= (v>>dbits);				// fix up
+            const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+            if (shift >= 0)   v += (dither >> shift) << sl;
+            else              v += (dither << (-shift)) << sl;
+        } else {
+            // don't do that right now, so we can reproduce the same
+            // artifacts we get on ARM (Where we don't do this)
+            // -> this is not really needed if we don't dither
+            //if (dBits > 1) { // result already OK if dBits==1
+            //    v -= (v>>dbits);				// fix up
+            //    v += 1 << ((sbits-dbits)-1);	// rounding
+            //}
+        }
+    }
+
+
+	// we need to clear the high bits of the source
+	if (ch) {
+		v <<= 32-sh;
+		sl += 32-sh;
+        sh = 32;
+	}
+	
+	if (dl) {
+		if (cl || (sbits>dbits)) {
+			v >>= sh-dbits;
+			sl = 0;
+			sh = dbits;
+            in |= v<<dl;
+		} else {
+			// sbits==dbits and we don't need to clean the lower bits
+			// so we just have to shift the component to the right location
+            int shift = dh-sh;
+            in |= v<<shift;
+		}
+	} else {
+		// destination starts at bit 0
+		// ie: sh-dh == sh-dbits
+		int shift = sh-dh;
+		if (shift > 0)      in |= v>>shift;
+		else if (shift < 0) in |= v<<shift;
+		else                in |= v;
+	}
+	return in;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
diff --git a/libpixelflinger/buffer.h b/libpixelflinger/buffer.h
new file mode 100644
index 0000000..9c9e4bc
--- /dev/null
+++ b/libpixelflinger/buffer.h
@@ -0,0 +1,39 @@
+/* libs/pixelflinger/buffer.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGL_TEXTURE_H
+#define ANDROID_GGL_TEXTURE_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_texture(context_t* c);
+
+void ggl_set_surface(context_t* c, surface_t* dst, const GGLSurface* src);
+
+void ggl_pick_texture(context_t* c);
+void ggl_pick_cb(context_t* c);
+
+uint32_t ggl_expand(uint32_t v, int sbits, int dbits);
+uint32_t ggl_pack_color(context_t* c, int32_t format,
+            GGLcolor r, GGLcolor g, GGLcolor b, GGLcolor a);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_TEXTURE_H
diff --git a/libpixelflinger/clear.cpp b/libpixelflinger/clear.cpp
new file mode 100644
index 0000000..b962456
--- /dev/null
+++ b/libpixelflinger/clear.cpp
@@ -0,0 +1,171 @@
+/* libs/pixelflinger/clear.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <cutils/memory.h>
+
+#include "clear.h"
+#include "buffer.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+static void ggl_clear(void* c, GGLbitfield mask);
+static void ggl_clearColorx(void* c,
+        GGLclampx r, GGLclampx g, GGLclampx b, GGLclampx a);        
+static void ggl_clearDepthx(void* c, GGLclampx depth);
+static void ggl_clearStencil(void* c, GGLint s);
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_clear(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, clear);
+    GGL_INIT_PROC(procs, clearColorx);
+    GGL_INIT_PROC(procs, clearDepthx);
+    GGL_INIT_PROC(procs, clearStencil);
+    c->state.clear.dirty =  GGL_STENCIL_BUFFER_BIT |
+                            GGL_COLOR_BUFFER_BIT |
+                            GGL_DEPTH_BUFFER_BIT;
+    c->state.clear.depth = FIXED_ONE;
+}
+
+// ----------------------------------------------------------------------------
+
+static void memset2d(context_t* c, const surface_t& s, uint32_t packed,
+        uint32_t l, uint32_t t, uint32_t w, uint32_t h)
+{
+    const uint32_t size = c->formats[s.format].size;
+    const int32_t stride = s.stride * size;
+    uint8_t* dst = (uint8_t*)s.data + (l + t*s.stride)*size;
+    w *= size;
+
+    if (ggl_likely(int32_t(w) == stride)) {
+        // clear the whole thing in one call
+        w *= h;
+        h = 1;
+    }
+
+    switch (size) {
+    case 1:
+        do {
+            memset(dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    case 2:
+        do {
+            android_memset16((uint16_t*)dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    case 3: // XXX: 24-bit clear.
+        break;
+    case 4:
+        do {
+            android_memset32((uint32_t*)dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    }    
+}
+
+static inline GGLfixed fixedToZ(GGLfixed z) {
+    return GGLfixed(((int64_t(z) << 16) - z) >> 16);
+}
+
+static void ggl_clear(void* con, GGLbitfield mask)
+{
+    GGL_CONTEXT(c, con);
+
+    // XXX: rgba-dithering, rgba-masking
+    // XXX: handle all formats of Z and S
+
+    const uint32_t l = c->state.scissor.left;
+    const uint32_t t = c->state.scissor.top;
+    uint32_t w = c->state.scissor.right - l;
+    uint32_t h = c->state.scissor.bottom - t;
+
+    if (!w || !h)
+        return;
+
+    // unexsiting buffers have no effect...
+    if (c->state.buffers.color.format == 0)
+        mask &= ~GGL_COLOR_BUFFER_BIT;
+
+    if (c->state.buffers.depth.format == 0)
+        mask &= ~GGL_DEPTH_BUFFER_BIT;
+
+    if (c->state.buffers.stencil.format == 0)
+        mask &= ~GGL_STENCIL_BUFFER_BIT;
+
+    if (mask & GGL_COLOR_BUFFER_BIT) {
+        if (c->state.clear.dirty & GGL_COLOR_BUFFER_BIT) {
+            c->state.clear.dirty &= ~GGL_COLOR_BUFFER_BIT;
+
+            uint32_t colorPacked = ggl_pack_color(c,
+                    c->state.buffers.color.format,
+                    gglFixedToIteratedColor(c->state.clear.r),
+                    gglFixedToIteratedColor(c->state.clear.g),
+                    gglFixedToIteratedColor(c->state.clear.b),
+                    gglFixedToIteratedColor(c->state.clear.a));
+
+            c->state.clear.colorPacked = GGL_HOST_TO_RGBA(colorPacked);
+        }
+        const uint32_t packed = c->state.clear.colorPacked;
+        memset2d(c, c->state.buffers.color, packed, l, t, w, h);
+    }
+    if (mask & GGL_DEPTH_BUFFER_BIT) {
+        if (c->state.clear.dirty & GGL_DEPTH_BUFFER_BIT) {
+            c->state.clear.dirty &= ~GGL_DEPTH_BUFFER_BIT;
+            uint32_t depth = fixedToZ(c->state.clear.depth);
+            c->state.clear.depthPacked = (depth<<16)|depth;
+        }
+        const uint32_t packed = c->state.clear.depthPacked;
+        memset2d(c, c->state.buffers.depth, packed, l, t, w, h);
+    }
+
+    // XXX: do stencil buffer
+}
+
+static void ggl_clearColorx(void* con,
+        GGLclampx r, GGLclampx g, GGLclampx b, GGLclampx a)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.r = gglClampx(r);
+    c->state.clear.g = gglClampx(g);
+    c->state.clear.b = gglClampx(b);
+    c->state.clear.a = gglClampx(a);
+    c->state.clear.dirty |= GGL_COLOR_BUFFER_BIT;
+}
+
+static void ggl_clearDepthx(void* con, GGLclampx depth)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.depth = gglClampx(depth);
+    c->state.clear.dirty |= GGL_DEPTH_BUFFER_BIT;
+}
+
+static void ggl_clearStencil(void* con, GGLint s)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.stencil = s;
+    c->state.clear.dirty |= GGL_STENCIL_BUFFER_BIT;
+}
+
+}; // namespace android
diff --git a/libpixelflinger/clear.h b/libpixelflinger/clear.h
new file mode 100644
index 0000000..b071df0
--- /dev/null
+++ b/libpixelflinger/clear.h
@@ -0,0 +1,30 @@
+/* libs/pixelflinger/clear.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_GGL_CLEAR_H
+#define ANDROID_GGL_CLEAR_H
+
+#include <pixelflinger/pixelflinger.h>
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_clear(context_t* c);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_CLEAR_H
diff --git a/libpixelflinger/codeflinger/ARMAssembler.cpp b/libpixelflinger/codeflinger/ARMAssembler.cpp
new file mode 100644
index 0000000..ff7b0b3
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.cpp
@@ -0,0 +1,428 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#define LOG_TAG "ARMAssembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cutils/log.h>
+#include <cutils/properties.h>
+
+#if defined(WITH_LIB_HARDWARE)
+#include <hardware_legacy/qemu_tracing.h>
+#endif
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/ARMAssembler.h"
+#include "codeflinger/CodeCache.h"
+#include "codeflinger/disassem.h"
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ARMAssembler...
+#endif
+
+ARMAssembler::ARMAssembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+#if defined(WITH_LIB_HARDWARE)
+    mQemuTracing = true;
+#endif
+}
+
+ARMAssembler::~ARMAssembler()
+{
+}
+
+uint32_t* ARMAssembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* ARMAssembler::base() const
+{
+    return mBase;
+}
+
+void ARMAssembler::reset()
+{
+    mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+// ----------------------------------------------------------------------------
+
+void ARMAssembler::disassemble(const char* name)
+{
+    if (name) {
+        printf("%s:\n", name);
+    }
+    size_t count = pc()-base();
+    uint32_t* i = base();
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(i);
+        if (label >= 0) {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(i);
+        if (comment >= 0) {
+            printf("; %s\n", mComments.valueAt(comment));
+        }
+        printf("%08x:    %08x    ", int(i), int(i[0]));
+        ::disassemble((u_int)i);
+        i++;
+    }
+}
+
+void ARMAssembler::comment(const char* string)
+{
+    mComments.add(mPC, string);
+}
+
+void ARMAssembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, mPC);
+    mLabelsInverseMapping.add(mPC, theLabel);
+}
+
+void ARMAssembler::B(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (cc<<28) | (0xA<<24) | 0;
+}
+
+void ARMAssembler::BL(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (cc<<28) | (0xB<<24) | 0;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+
+void ARMAssembler::prolog()
+{
+    // write dummy prolog code
+    mPrologPC = mPC;
+    STM(AL, FD, SP, 1, LSAVED);
+}
+
+void ARMAssembler::epilog(uint32_t touched)
+{
+    touched &= LSAVED;
+    if (touched) {
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        STM(AL, FD, SP, 1, touched | LLR);
+        mPC = pc;
+        // write epilog code
+        LDM(AL, FD, SP, 1, touched | LLR);
+        BX(AL, LR);
+    } else {   // heh, no registers to save!
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        MOV(AL, 0, R0, R0); // NOP
+        mPC = pc;
+        // write epilog code
+        BX(AL, LR);
+    }
+}
+
+int ARMAssembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+2));
+        *bt.pc |= offset & 0xFFFFFF;
+    }
+
+    mAssembly->resize( int(pc()-base())*4 );
+    
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %lld ns\n";
+    LOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+#if defined(WITH_LIB_HARDWARE)
+    if (__builtin_expect(mQemuTracing, 0)) {
+        int err = qemu_add_mapping(int(base()), name);
+        mQemuTracing = (err >= 0);
+    }
+#endif
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    if (atoi(value) != 0) {
+        printf(format, name, int(pc()-base()), base(), pc(), duration);
+        disassemble(name);
+    }
+    
+    return NO_ERROR;
+}
+
+uint32_t* ARMAssembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+void ARMAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    *mPC++ = (cc<<28) | (opcode<<21) | (s<<20) | (Rn<<16) | (Rd<<12) | Op2;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply...
+void ARMAssembler::MLA(int cc, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; } 
+    LOG_FATAL_IF(Rd==Rm, "MLA(r%u,r%u,r%u,r%u)", Rd,Rm,Rs,Rn);
+    *mPC++ =    (cc<<28) | (1<<21) | (s<<20) |
+                (Rd<<16) | (Rn<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::MUL(int cc, int s,
+        int Rd, int Rm, int Rs) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; } 
+    LOG_FATAL_IF(Rd==Rm, "MUL(r%u,r%u,r%u)", Rd,Rm,Rs);
+    *mPC++ = (cc<<28) | (s<<20) | (Rd<<16) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+void ARMAssembler::B(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xA<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BL(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xB<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BX(int cc, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x12FFF10 | Rn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfert...
+void ARMAssembler::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (Rn<<16) | (Rd<<12) | offset;
+}
+
+void ARMAssembler::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+void ARMAssembler::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xD0 | offset;
+}
+void ARMAssembler::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xF0 | offset;
+}
+void ARMAssembler::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ARMAssembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    ED FD EA FA      IB IA DB DA
+    const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+void ARMAssembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    FA EA FD ED      IB IA DB DA
+    const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ARMAssembler::SWP(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWPB(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWI(int cc, uint32_t comment) {
+    *mPC++ = (cc<<28) | (0xF<<24) | comment;
+}
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ARMAssembler::PLD(int Rn, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+}
+
+void ARMAssembler::CLZ(int cc, int Rd, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x16F0F10| (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x1600080 | (Rd<<16) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMULW(int cc, int y,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x12000A0 | (Rd<<16) | (Rs<<8) | (y<<4) | Rm;
+}
+
+void ARMAssembler::SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+}
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssembler.h b/libpixelflinger/codeflinger/ARMAssembler.h
new file mode 100644
index 0000000..8837e07
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.h
@@ -0,0 +1,155 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_ARMASSEMBLER_H
+#define ANDROID_ARMASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <utils/Vector.h>
+#include <utils/KeyedVector.h>
+
+#include "tinyutils/smartpointer.h"
+#include "codeflinger/ARMAssemblerInterface.h"
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssembler : public ARMAssemblerInterface
+{
+public:
+                ARMAssembler(const sp<Assembly>& assembly);
+    virtual     ~ARMAssembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+
+
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+private:
+                ARMAssembler(const ARMAssembler& rhs);
+                ARMAssembler& operator = (const ARMAssembler& rhs);
+
+    sp<Assembly>    mAssembly;
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+#if defined(WITH_LIB_HARDWARE)
+    bool            mQemuTracing;
+#endif
+    
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+    
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
new file mode 100644
index 0000000..7fa0de0
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
@@ -0,0 +1,173 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+#include "codeflinger/ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerInterface::~ARMAssemblerInterface()
+{
+}
+
+int ARMAssemblerInterface::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    rot = 0;
+    imm = immediate;
+    if (imm > 0x7F) { // skip the easy cases
+        while (!(imm&3)  || (imm&0xFC000000)) {
+            uint32_t newval;
+            newval = imm >> 2;
+            newval |= (imm&3) << 30;
+            imm = newval;
+            rot += 2;
+            if (rot == 32) {
+                rot = 0;
+                break;
+            }
+        }
+    }
+    rot = (16 - (rot>>1)) & 0xF;
+
+    if (imm>=0x100)
+        return -EINVAL;
+
+    if (((imm>>(rot<<1)) | (imm<<(32-(rot<<1)))) != immediate)
+        return -1;
+
+    return 0;
+}
+
+// shifters...
+
+bool ARMAssemblerInterface::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    return buildImmediate(immediate, rot, imm) == 0;
+}
+
+uint32_t ARMAssemblerInterface::imm(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    int err = buildImmediate(immediate, rot, imm);
+
+    LOG_ALWAYS_FATAL_IF(err==-EINVAL,
+                        "immediate %08x cannot be encoded",
+                        immediate);
+
+    LOG_ALWAYS_FATAL_IF(err,
+                        "immediate (%08x) encoding bogus!",
+                        immediate);
+
+    return (1<<25) | (rot<<8) | imm;
+}
+
+uint32_t ARMAssemblerInterface::reg_imm(int Rm, int type, uint32_t shift)
+{
+    return ((shift&0x1F)<<7) | ((type&0x3)<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssemblerInterface::reg_rrx(int Rm)
+{
+    return (ROR<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssemblerInterface::reg_reg(int Rm, int type, int Rs)
+{
+    return ((Rs&0xF)<<8) | ((type&0x3)<<5) | (1<<4) | (Rm&0xF);
+}
+
+// addressing modes... 
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicate U=0)
+uint32_t ARMAssemblerInterface::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) |
+            ((W&1)<<21) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssemblerInterface::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    return (((uint32_t(immed12)>>31)^1)<<23) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssemblerInterface::reg_scale_pre(int Rm, int type, 
+        uint32_t shift, int W)
+{
+    return  (1<<25) | (1<<24) | 
+            (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) |
+            reg_imm(abs(Rm), type, shift);
+}
+
+uint32_t ARMAssemblerInterface::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    return (1<<25) | (((uint32_t(Rm)>>31)^1)<<23) | reg_imm(abs(Rm), type, shift);
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicate U=0)
+uint32_t ARMAssemblerInterface::immed8_pre(int32_t immed8, int W)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return  (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF));
+}
+
+uint32_t ARMAssemblerInterface::immed8_post(int32_t immed8)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            (((offset&0xF0)<<4) | (offset&0xF));
+}
+
+uint32_t ARMAssemblerInterface::reg_pre(int Rm, int W)
+{
+    return (1<<24) | (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) | (abs(Rm)&0xF);
+}
+
+uint32_t ARMAssemblerInterface::reg_post(int Rm)
+{
+    return (((uint32_t(Rm)>>31)^1)<<23) | (abs(Rm)&0xF);
+}
+
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.h b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
new file mode 100644
index 0000000..465b3bd
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
@@ -0,0 +1,324 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_INTERFACE_H
+#define ANDROID_ARMASSEMBLER_INTERFACE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerInterface
+{
+public:
+    virtual ~ARMAssemblerInterface();
+
+    enum {
+        EQ, NE, CS, CC, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV,
+        HS = CS,
+        LO = CC
+    };
+    enum {
+        S = 1
+    };
+    enum {
+        LSL, LSR, ASR, ROR
+    };
+    enum {
+        ED, FD, EA, FA,
+        IB, IA, DB, DA
+    };
+    enum {
+        R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15,
+        SP = R13,
+        LR = R14,
+        PC = R15
+    };
+    enum {
+        #define LIST(rr) L##rr=1<<rr
+        LIST(R0), LIST(R1), LIST(R2), LIST(R3), LIST(R4), LIST(R5), LIST(R6),
+        LIST(R7), LIST(R8), LIST(R9), LIST(R10), LIST(R11), LIST(R12),
+        LIST(R13), LIST(R14), LIST(R15),
+        LIST(SP), LIST(LR), LIST(PC),
+        #undef LIST
+        LSAVED = LR4|LR5|LR6|LR7|LR8|LR9|LR10|LR11 | LLR
+    };
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    static bool        isValidImmediate(uint32_t immed);
+    static int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    static uint32_t    imm(uint32_t immediate);
+    static uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    static uint32_t    reg_rrx(int Rm);
+    static uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes... 
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    static uint32_t    immed12_pre(int32_t immed12, int W=0);
+    static uint32_t    immed12_post(int32_t immed12);
+    static uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    static uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    static uint32_t    immed8_pre(int32_t immed8, int W=0);
+    static uint32_t    immed8_post(int32_t immed8);
+    static uint32_t    reg_pre(int Rm, int W=0);
+    static uint32_t    reg_post(int Rm);
+
+    // -----------------------------------------------------------------------
+    // basic instructions & code generation
+    // -----------------------------------------------------------------------
+
+    // generate the code
+    virtual void reset() = 0;
+    virtual int  generate(const char* name) = 0;
+    virtual void disassemble(const char* name) = 0;
+    
+    // construct prolog and epilog
+    virtual void prolog() = 0;
+    virtual void epilog(uint32_t touched) = 0;
+    virtual void comment(const char* string) = 0;
+
+    // data processing...
+    enum {
+        opAND, opEOR, opSUB, opRSB, opADD, opADC, opSBC, opRSC, 
+        opTST, opTEQ, opCMP, opCMN, opORR, opMOV, opBIC, opMVN
+    };
+
+    virtual void
+            dataProcessing( int opcode, int cc, int s,
+                            int Rd, int Rn,
+                            uint32_t Op2) = 0;
+    
+    // multiply...
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+
+    // branches...
+    virtual void B(int cc, uint32_t* pc) = 0;
+    virtual void BL(int cc, uint32_t* pc) = 0;
+    virtual void BX(int cc, int Rn) = 0;
+
+    virtual void label(const char* theLabel) = 0;
+    virtual void B(int cc, const char* label) = 0;
+    virtual void BL(int cc, const char* label) = 0;
+
+    // valid only after generate() has been called
+    virtual uint32_t* pcForLabel(const char* label) = 0;
+
+    // data transfer...
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+
+    // block data transfer...
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+
+    // special...
+    virtual void SWP(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWI(int cc, uint32_t comment) = 0;
+
+    // DSP instructions...
+    enum {
+        // B=0, T=1
+        //     yx
+        xyBB = 0, // 0000,
+        xyTB = 2, // 0010,
+        xyBT = 4, // 0100,
+        xyTT = 6, // 0110,
+        yB   = 0, // 0000,
+        yT   = 4, // 0100
+    };
+
+    virtual void PLD(int Rn, uint32_t offset) = 0;
+
+    virtual void CLZ(int cc, int Rd, int Rm) = 0;
+    
+    virtual void QADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm) = 0;
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+
+    // -----------------------------------------------------------------------
+    // convenience...
+    // -----------------------------------------------------------------------
+    inline void
+    ADC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    ADD(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    AND(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opAND, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    BIC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opBIC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    EOR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opEOR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    MOV(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMOV, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    MVN(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMVN, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    ORR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opORR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SBC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSBC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SUB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    TEQ(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTEQ, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    TST(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTST, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMP(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMP, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMN(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMN, cc, 1, 0, Rn, Op2);
+    }
+
+    inline void SMULBB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBB, Rd, Rm, Rs);    }
+    inline void SMULTB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTB, Rd, Rm, Rs);    }
+    inline void SMULBT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBT, Rd, Rm, Rs);    }
+    inline void SMULTT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTT, Rd, Rm, Rs);    }
+
+    inline void SMULWB(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yB, Rd, Rm, Rs);    }
+    inline void SMULWT(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yT, Rd, Rm, Rs);    }
+
+    inline void
+    SMLABB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLABT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBT, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTT, Rd, Rm, Rs, Rn);    }
+
+    inline void
+    SMLALBB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALBT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBT, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTT, RdHi, RdLo, Rs, Rm);    }
+
+    inline void
+    SMLAWB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLAWT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yT, Rd, Rm, Rs, Rn);    }
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_INTERFACE_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
new file mode 100644
index 0000000..18c4618
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
@@ -0,0 +1,200 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "codeflinger/ARMAssemblerProxy.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerProxy::ARMAssemblerProxy()
+    : mTarget(0)
+{
+}
+
+ARMAssemblerProxy::ARMAssemblerProxy(ARMAssemblerInterface* target)
+    : mTarget(target)
+{
+}
+
+ARMAssemblerProxy::~ARMAssemblerProxy()
+{
+    delete mTarget;
+}
+
+void ARMAssemblerProxy::setTarget(ARMAssemblerInterface* target)
+{
+    delete mTarget;
+    mTarget = target;
+}
+
+void ARMAssemblerProxy::reset() {
+    mTarget->reset();
+}
+int ARMAssemblerProxy::generate(const char* name) {
+    return mTarget->generate(name);
+}
+void ARMAssemblerProxy::disassemble(const char* name) {
+    return mTarget->disassemble(name);
+}
+void ARMAssemblerProxy::prolog() {
+    mTarget->prolog();
+}
+void ARMAssemblerProxy::epilog(uint32_t touched) {
+    mTarget->epilog(touched);
+}
+void ARMAssemblerProxy::comment(const char* string) {
+    mTarget->comment(string);
+}
+
+
+void ARMAssemblerProxy::dataProcessing( int opcode, int cc, int s,
+                                        int Rd, int Rn, uint32_t Op2)
+{
+    mTarget->dataProcessing(opcode, cc, s, Rd, Rn, Op2);
+}
+
+void ARMAssemblerProxy::MLA(int cc, int s, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->MLA(cc, s, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::MUL(int cc, int s, int Rd, int Rm, int Rs) {
+    mTarget->MUL(cc, s, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::UMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::UMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+
+void ARMAssemblerProxy::B(int cc, uint32_t* pc) {
+    mTarget->B(cc, pc); 
+}
+void ARMAssemblerProxy::BL(int cc, uint32_t* pc) {
+    mTarget->BL(cc, pc); 
+}
+void ARMAssemblerProxy::BX(int cc, int Rn) {
+    mTarget->BX(cc, Rn); 
+}
+void ARMAssemblerProxy::label(const char* theLabel) {
+    mTarget->label(theLabel);
+}
+void ARMAssemblerProxy::B(int cc, const char* label) {
+    mTarget->B(cc, label);
+}
+void ARMAssemblerProxy::BL(int cc, const char* label) {
+    mTarget->BL(cc, label);
+}
+
+uint32_t* ARMAssemblerProxy::pcForLabel(const char* label) {
+    return mTarget->pcForLabel(label);
+}
+
+void ARMAssemblerProxy::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->LDM(cc, dir, Rn, W, reg_list);
+}
+void ARMAssemblerProxy::STM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->STM(cc, dir, Rn, W, reg_list);
+}
+
+void ARMAssemblerProxy::SWP(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWP(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWPB(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWPB(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWI(int cc, uint32_t comment) {
+    mTarget->SWI(cc, comment);
+}
+
+
+void ARMAssemblerProxy::PLD(int Rn, uint32_t offset) {
+    mTarget->PLD(Rn, offset);
+}
+void ARMAssemblerProxy::CLZ(int cc, int Rd, int Rm) {
+    mTarget->CLZ(cc, Rd, Rm);
+}
+void ARMAssemblerProxy::QADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::SMUL(int cc, int xy, int Rd, int Rm, int Rs) {
+    mTarget->SMUL(cc, xy, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMULW(int cc, int y, int Rd, int Rm, int Rs) {
+    mTarget->SMULW(cc, y, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLA(cc, xy, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::SMLAL(  int cc, int xy,
+                                int RdHi, int RdLo, int Rs, int Rm) {
+    mTarget->SMLAL(cc, xy, RdHi, RdLo, Rs, Rm);
+}
+void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn);
+}
+
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.h b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
new file mode 100644
index 0000000..4bdca9c
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
@@ -0,0 +1,123 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_PROXY_H
+#define ANDROID_ARMASSEMBLER_PROXY_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "codeflinger/ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerProxy : public ARMAssemblerInterface
+{
+public:
+    // ARMAssemblerProxy take ownership of the target
+
+                ARMAssemblerProxy();
+                ARMAssemblerProxy(ARMAssemblerInterface* target);
+    virtual     ~ARMAssemblerProxy();
+
+    void setTarget(ARMAssemblerInterface* target);
+
+    virtual void    reset();
+    virtual int     generate(const char* name);
+    virtual void    disassemble(const char* name);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+private:
+    ARMAssemblerInterface*  mTarget;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_PROXY_H
diff --git a/libpixelflinger/codeflinger/CodeCache.cpp b/libpixelflinger/codeflinger/CodeCache.cpp
new file mode 100644
index 0000000..29410c8
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.cpp
@@ -0,0 +1,151 @@
+/* libs/pixelflinger/codeflinger/CodeCache.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/log.h>
+#include <cutils/atomic.h>
+
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#if defined(__arm__)
+#include <unistd.h>
+#include <errno.h>
+#endif
+
+// ----------------------------------------------------------------------------
+
+Assembly::Assembly(size_t size)
+    : mCount(1), mSize(0)
+{
+    mBase = (uint32_t*)malloc(size);
+    if (mBase) {
+        mSize = size;
+    }
+}
+
+Assembly::~Assembly()
+{
+    free(mBase);
+}
+
+void Assembly::incStrong(const void*) const
+{
+    android_atomic_inc(&mCount);
+}
+
+void Assembly::decStrong(const void*) const
+{
+    if (android_atomic_dec(&mCount) == 1) {
+        delete this;
+    }
+}
+
+ssize_t Assembly::size() const
+{
+    if (!mBase) return NO_MEMORY;
+    return mSize;
+}
+
+uint32_t* Assembly::base() const
+{
+    return mBase;
+}
+
+ssize_t Assembly::resize(size_t newSize)
+{
+    mBase = (uint32_t*)realloc(mBase, newSize);
+    mSize = newSize;
+    return size();
+}
+
+// ----------------------------------------------------------------------------
+
+CodeCache::CodeCache(size_t size)
+    : mCacheSize(size), mCacheInUse(0)
+{
+    pthread_mutex_init(&mLock, 0);
+}
+
+CodeCache::~CodeCache()
+{
+    pthread_mutex_destroy(&mLock);
+}
+
+sp<Assembly> CodeCache::lookup(const AssemblyKeyBase& keyBase) const
+{
+    pthread_mutex_lock(&mLock);
+    sp<Assembly> r;
+    ssize_t index = mCacheData.indexOfKey(key_t(keyBase));
+    if (index >= 0) {
+        const cache_entry_t& e = mCacheData.valueAt(index);
+        e.when = mWhen++;
+        r = e.entry;
+    }
+    pthread_mutex_unlock(&mLock);
+    return r;
+}
+
+int CodeCache::cache(  const AssemblyKeyBase& keyBase,
+                            const sp<Assembly>& assembly)
+{
+    pthread_mutex_lock(&mLock);
+
+    const ssize_t assemblySize = assembly->size();
+    while (mCacheInUse + assemblySize > mCacheSize) {
+        // evict the LRU
+        size_t lru = 0;
+        size_t count = mCacheData.size();
+        for (size_t i=0 ; i<count ; i++) {
+            const cache_entry_t& e = mCacheData.valueAt(i);
+            if (e.when < mCacheData.valueAt(lru).when) {
+                lru = i;
+            }
+        }
+        const cache_entry_t& e = mCacheData.valueAt(lru);
+        mCacheInUse -= e.entry->size();
+        mCacheData.removeItemsAt(lru);
+    }
+
+    ssize_t err = mCacheData.add(key_t(keyBase), cache_entry_t(assembly, mWhen));
+    if (err >= 0) {
+        mCacheInUse += assemblySize;
+        mWhen++;
+        // synchronize caches...
+#if defined(__arm__)
+        const long base = long(assembly->base());
+        const long curr = base + long(assembly->size());
+        err = cacheflush(base, curr, 0);
+        LOGE_IF(err, "__ARM_NR_cacheflush error %s\n",
+                strerror(errno));
+#endif
+    }
+
+    pthread_mutex_unlock(&mLock);
+    return err;
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/CodeCache.h b/libpixelflinger/codeflinger/CodeCache.h
new file mode 100644
index 0000000..370ce17
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.h
@@ -0,0 +1,134 @@
+/* libs/pixelflinger/codeflinger/CodeCache.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_CODECACHE_H
+#define ANDROID_CODECACHE_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/types.h>
+
+#include <utils/KeyedVector.h>
+
+#include "tinyutils/smartpointer.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class AssemblyKeyBase {
+public:
+    virtual ~AssemblyKeyBase() { }
+    virtual int compare_type(const AssemblyKeyBase& key) const = 0;
+};
+
+template  <typename T>
+class AssemblyKey : public AssemblyKeyBase
+{
+public:
+    AssemblyKey(const T& rhs) : mKey(rhs) { }
+    virtual int compare_type(const AssemblyKeyBase& key) const {
+        const T& rhs = static_cast<const AssemblyKey&>(key).mKey;
+        return android::compare_type(mKey, rhs);
+    }
+private:
+    T mKey;
+};
+
+// ----------------------------------------------------------------------------
+
+class Assembly
+{
+public:
+                Assembly(size_t size);
+    virtual     ~Assembly();
+
+    ssize_t     size() const;
+    uint32_t*   base() const;
+    ssize_t     resize(size_t size);
+
+    // protocol for sp<>
+            void    incStrong(const void* id) const;
+            void    decStrong(const void* id) const;
+    typedef void    weakref_type;
+
+private:
+    mutable int32_t     mCount;
+            uint32_t*   mBase;
+            ssize_t     mSize;
+};
+
+// ----------------------------------------------------------------------------
+
+class CodeCache
+{
+public:
+// pretty simple cache API...
+                CodeCache(size_t size);
+                ~CodeCache();
+    
+            sp<Assembly>        lookup(const AssemblyKeyBase& key) const;
+
+            int                 cache(  const AssemblyKeyBase& key,
+                                        const sp<Assembly>& assembly);
+
+private:
+    // nothing to see here...
+    struct cache_entry_t {
+        inline cache_entry_t() { }
+        inline cache_entry_t(const sp<Assembly>& a, int64_t w)
+                : entry(a), when(w) { }
+        sp<Assembly>            entry;
+        mutable int64_t         when;
+    };
+
+    class key_t {
+        friend int compare_type(
+            const key_value_pair_t<key_t, cache_entry_t>&,
+            const key_value_pair_t<key_t, cache_entry_t>&);
+        const AssemblyKeyBase* mKey;
+    public:
+        key_t() { };
+        key_t(const AssemblyKeyBase& k) : mKey(&k)  { }
+    };
+
+    mutable pthread_mutex_t             mLock;
+    mutable int64_t                     mWhen;
+    size_t                              mCacheSize;
+    size_t                              mCacheInUse;
+    KeyedVector<key_t, cache_entry_t>   mCacheData;
+
+    friend int compare_type(
+        const key_value_pair_t<key_t, cache_entry_t>&,
+        const key_value_pair_t<key_t, cache_entry_t>&);
+};
+
+// KeyedVector uses compare_type(), which is more efficient, than
+// just using operator < ()
+inline int compare_type(
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& lhs,
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& rhs)
+{
+    return lhs.key.mKey->compare_type(*(rhs.key.mKey));
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif //ANDROID_CODECACHE_H
diff --git a/libpixelflinger/codeflinger/GGLAssembler.cpp b/libpixelflinger/codeflinger/GGLAssembler.cpp
new file mode 100644
index 0000000..1cd189c
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.cpp
@@ -0,0 +1,1150 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#define LOG_TAG "GGLAssembler"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
+    : ARMAssemblerProxy(target), RegisterAllocator(), mOptLevel(7)
+{
+}
+
+GGLAssembler::~GGLAssembler()
+{
+}
+
+void GGLAssembler::prolog()
+{
+    ARMAssemblerProxy::prolog();
+}
+
+void GGLAssembler::epilog(uint32_t touched)
+{
+    ARMAssemblerProxy::epilog(touched);
+}
+
+void GGLAssembler::reset(int opt_level)
+{
+    ARMAssemblerProxy::reset();
+    RegisterAllocator::reset();
+    mOptLevel = opt_level;
+}
+
+// ---------------------------------------------------------------------------
+
+int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
+{
+    int err = 0;
+    int opt_level = mOptLevel;
+    while (opt_level >= 0) {
+        reset(opt_level);
+        err = scanline_core(needs, c);
+        if (err == 0)
+            break;
+        opt_level--;
+    }
+    
+    // XXX: in theory, pcForLabel is not valid before generate()
+    uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
+    uint32_t* fragment_end_pc = pcForLabel("epilog");
+    const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);
+    
+    // build a name for our pipeline
+    char name[64];    
+    sprintf(name,
+            "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
+            needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);
+
+    if (err) {
+        LOGE("Error while generating ""%s""\n", name);
+        disassemble(name);
+        return -1;
+    }
+
+    return generate(name);
+}
+
+int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
+{
+    int64_t duration = ggl_system_time();
+
+    mBlendFactorCached = 0;
+    mBlending = 0;
+    mMasking = 0;
+    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
+    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
+    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
+    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
+    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
+    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
+    mBuilderContext.needs = needs;
+    mBuilderContext.c = c;
+    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
+    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];
+
+    // ------------------------------------------------------------------------
+
+    decodeLogicOpNeeds(needs);
+
+    decodeTMUNeeds(needs, c);
+
+    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
+    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
+    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
+    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));
+
+    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
+        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrc == GGL_DST_ALPHA)) {
+            mBlendSrc = GGL_ONE;
+        }
+        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrcA == GGL_DST_ALPHA)) {
+            mBlendSrcA = GGL_ONE;
+        }
+        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDst == GGL_DST_ALPHA)) {
+            mBlendDst = GGL_ONE;
+        }
+        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDstA == GGL_DST_ALPHA)) {
+            mBlendDstA = GGL_ONE;
+        }
+    }
+
+    // if we need the framebuffer, read it now
+    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
+                            blending_codes(mBlendSrcA, mBlendDstA);
+
+    // XXX: handle special cases, destination not modified...
+    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
+        // Destination unmodified (beware of logic ops)
+    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
+        // Destination is zero (beware of logic ops)
+    }
+    
+    int fbComponents = 0;
+    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
+    for (int i=0 ; i<4 ; i++) {
+        const int mask = 1<<i;
+        component_info_t& info = mInfo[i];
+        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
+            fs = GGL_ONE;
+        info.masked =   !!(masking & mask);
+        info.inDest =   !info.masked && mCbFormat.c[i].h && 
+                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
+        if (mCbFormat.components >= GGL_LUMINANCE &&
+                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
+            info.inDest = false;
+        }
+        info.needed =   (i==GGLFormat::ALPHA) && 
+                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
+        info.replaced = !!(mTextureMachine.replaced & mask);
+        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
+        info.smooth =   mSmooth && info.iterated;
+        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
+        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+
+        mBlending |= (info.blend ? mask : 0);
+        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
+        fbComponents |= mCbFormat.c[i].h ? mask : 0;
+    }
+
+    mAllMasked = (mMasking == fbComponents);
+    if (mAllMasked) {
+        mDithering = 0;
+    }
+    
+    fragment_parts_t parts;
+
+    // ------------------------------------------------------------------------
+    prolog();
+    // ------------------------------------------------------------------------
+
+    build_scanline_prolog(parts, needs);
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // ------------------------------------------------------------------------
+    label("fragment_loop");
+    // ------------------------------------------------------------------------
+    {
+        Scratch regs(registerFile());
+
+        if (mDithering) {
+            // update the dither index.
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
+            ADD(AL, 0, parts.count.reg, parts.count.reg,
+                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
+        }
+
+        // XXX: could we do an early alpha-test here in some cases?
+        // It would probaly be used only with smooth-alpha and no texture
+        // (or no alpha component in the texture).
+
+        // Early z-test
+        if (mAlphaTest==GGL_ALWAYS) {
+            build_depth_test(parts, Z_TEST|Z_WRITE);
+        } else {
+            // we cannot do the z-write here, because
+            // it might be killed by the alpha-test later
+            build_depth_test(parts, Z_TEST);
+        }
+
+        { // texture coordinates
+            Scratch scratches(registerFile());
+
+            // texel generation
+            build_textures(parts, regs);
+        }        
+
+        if ((blending & (FACTOR_DST|BLEND_DST)) || 
+                (mMasking && !mAllMasked) ||
+                (mLogicOp & LOGIC_OP_DST)) 
+        {
+            // blending / logic_op / masking need the framebuffer
+            mDstPixel.setTo(regs.obtain(), &mCbFormat);
+
+            // load the framebuffer pixel
+            comment("fetch color-buffer");
+            load(parts.cbPtr, mDstPixel);
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+
+        pixel_t pixel;
+        int directTex = mTextureMachine.directTexture;
+        if (directTex | parts.packed) {
+            // note: we can't have both here
+            // iterated color or direct texture
+            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
+            pixel.flags &= ~CORRUPTIBLE;
+        } else {
+            if (mDithering) {
+                const int ctxtReg = mBuilderContext.Rctx;
+                const int mask = GGL_DITHER_SIZE-1;
+                parts.dither = reg_t(regs.obtain());
+                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
+                ADD(AL, 0, parts.dither.reg, parts.dither.reg, ctxtReg);
+                LDRB(AL, parts.dither.reg, parts.dither.reg,
+                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
+            }
+        
+            // allocate a register for the resulting pixel
+            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);
+
+            build_component(pixel, parts, GGLFormat::ALPHA,    regs);
+
+            if (mAlphaTest!=GGL_ALWAYS) {
+                // only handle the z-write part here. We know z-test
+                // was successful, as well as alpha-test.
+                build_depth_test(parts, Z_WRITE);
+            }
+
+            build_component(pixel, parts, GGLFormat::RED,      regs);
+            build_component(pixel, parts, GGLFormat::GREEN,    regs);
+            build_component(pixel, parts, GGLFormat::BLUE,     regs);
+
+            pixel.flags |= CORRUPTIBLE;
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+        
+        if (pixel.reg == -1) {
+            // be defensive here. if we're here it's probably
+            // that this whole fragment is a no-op.
+            pixel = mDstPixel;
+        }
+        
+        if (!mAllMasked) {
+            // logic operation
+            build_logic_op(pixel, regs);
+    
+            // masking
+            build_masking(pixel, regs); 
+    
+            comment("store");
+            store(parts.cbPtr, pixel, WRITE_BACK);
+        }
+    }
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // update the iterated color...
+    if (parts.reload != 3) {
+        build_smooth_shade(parts);
+    }
+
+    // update iterated z
+    build_iterate_z(parts);
+
+    // update iterated fog
+    build_iterate_f(parts);
+
+    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+    B(PL, "fragment_loop");
+    label("epilog");
+    epilog(registerFile().touched());
+
+    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
+        if (mDepthTest!=GGL_ALWAYS) {
+            label("discard_before_textures");
+            build_iterate_texture_coordinates(parts);
+        }
+        label("discard_after_textures");
+        build_smooth_shade(parts);
+        build_iterate_z(parts);
+        build_iterate_f(parts);
+        if (!mAllMasked) {
+            ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
+        }
+        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+        B(PL, "fragment_loop");
+        epilog(registerFile().touched());
+    }
+
+    return registerFile().status();
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_scanline_prolog(
+    fragment_parts_t& parts, const needs_t& needs)
+{
+    Scratch scratches(registerFile());
+    int Rctx = mBuilderContext.Rctx;
+
+    // compute count
+    comment("compute ct (# of pixels to process)");
+    parts.count.setTo(obtainReg());
+    int Rx = scratches.obtain();    
+    int Ry = scratches.obtain();
+    CONTEXT_LOAD(Rx, iterators.xl);
+    CONTEXT_LOAD(parts.count.reg, iterators.xr);
+    CONTEXT_LOAD(Ry, iterators.y);
+
+    // parts.count = iterators.xr - Rx
+    SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
+    SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));
+
+    if (mDithering) {
+        // parts.count.reg = 0xNNNNXXDD
+        // NNNN = count-1
+        // DD   = dither offset
+        // XX   = 0xxxxxxx (x = garbage)
+        Scratch scratches(registerFile());
+        int tx = scratches.obtain();
+        int ty = scratches.obtain();
+        AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
+        AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
+        ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
+        ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
+    } else {
+        // parts.count.reg = 0xNNNN0000
+        // NNNN = count-1
+        MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
+    }
+
+    if (!mAllMasked) {
+        // compute dst ptr
+        comment("compute color-buffer pointer");
+        const int cb_bits = mCbFormat.size*8;
+        int Rs = scratches.obtain();
+        parts.cbPtr.setTo(obtainReg(), cb_bits);
+        CONTEXT_LOAD(Rs, state.buffers.color.stride);
+        CONTEXT_LOAD(parts.cbPtr.reg, state.buffers.color.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
+        base_offset(parts.cbPtr, parts.cbPtr, Rs);
+        scratches.recycle(Rs);
+    }
+    
+    // init fog
+    const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
+    if (need_fog) {
+        comment("compute initial fog coordinate");
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int ydfdy = scratches.obtain();
+        int f = ydfdy;
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
+        CONTEXT_LOAD(ydfdy, iterators.ydfdy);
+        MLA(AL, 0, f, Rx, dfdx, ydfdy);
+        CONTEXT_STORE(f, generated_vars.f);
+    }
+
+    // init Z coordinate
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        parts.z = reg_t(obtainReg());
+        comment("compute initial Z coordinate");
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        int ydzdy = parts.z.reg;
+        CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
+        CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
+        MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);
+
+        // we're going to index zbase of parts.count
+        // zbase = base + (xl-count + stride*y)*2
+        int Rs = dzdx;
+        int zbase = scratches.obtain();
+        CONTEXT_LOAD(Rs, state.buffers.depth.stride);
+        CONTEXT_LOAD(zbase, state.buffers.depth.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);
+        ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
+        ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
+        CONTEXT_STORE(zbase, generated_vars.zbase);
+    }
+
+    // init texture coordinates
+    init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
+    scratches.recycle(Ry);
+
+    // iterated color
+    init_iterated_color(parts, reg_t(Rx));
+
+    // init coverage factor application (anti-aliasing)
+    if (mAA) {
+        parts.covPtr.setTo(obtainReg(), 16);
+        CONTEXT_LOAD(parts.covPtr.reg, state.buffers.coverage);
+        ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_component( pixel_t& pixel,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs)
+{
+    static char const * comments[] = {"alpha", "red", "green", "blue"};
+    comment(comments[component]);
+
+    // local register file
+    Scratch scratches(registerFile());
+    const int dst_component_size = pixel.component_size(component);
+
+    component_t temp(-1);
+    build_incoming_component( temp, dst_component_size,
+            parts, component, scratches, regs);
+
+    if (mInfo[component].inDest) {
+
+        // blending...
+        build_blending( temp, mDstPixel, component, scratches );
+
+        // downshift component and rebuild pixel...
+        downshift(pixel, component, temp, parts.dither);
+    }
+}
+
+void GGLAssembler::build_incoming_component(
+                                    component_t& temp,
+                                    int dst_size,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& scratches,
+                                    Scratch& global_regs)
+{
+    const uint32_t component_mask = 1<<component;
+
+    // Figure out what we need for the blending stage...
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
+        fs = GGL_ONE;
+    }
+
+    // Figure out what we need to extract and for what reason
+    const int blending = blending_codes(fs, fd);
+
+    // Are we actually going to blend?
+    const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+    
+    // expand the source if the destination has more bits
+    int need_expander = false;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if ((tmu.format_idx) &&
+            (parts.texel[i].component_size(component) < dst_size)) {
+            need_expander = true;
+        }
+    }
+
+    // do we need to extract this component?
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
+                                        (isAlphaSourceNeeded());
+    int need_extract = mInfo[component].needed;
+    if (mInfo[component].inDest)
+    {
+        need_extract |= ((need_blending ?
+                (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
+        need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
+        need_extract |= mInfo[component].smooth;
+        need_extract |= mInfo[component].fog;
+        need_extract |= mDithering;
+        need_extract |= multiTexture;
+    }
+
+    if (need_extract) {
+        Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
+        component_t fragment;
+
+        // iterated color
+        build_iterated_color(fragment, parts, component, regs);
+
+        // texture environement (decal, modulate, replace)
+        build_texture_environment(fragment, parts, component, regs);
+
+        // expand the source if the destination has more bits
+        if (need_expander && (fragment.size() < dst_size)) {
+            // we're here only if we fetched a texel
+            // (so we know for sure fragment is CORRUPTIBLE)
+            expand(fragment, fragment, dst_size);
+        }
+
+        // We have a few specific things to do for the alpha-channel
+        if ((component==GGLFormat::ALPHA) &&
+            (mInfo[component].needed || fragment.size()<dst_size))
+        {
+            // convert to integer_t first and make sure
+            // we don't corrupt a needed register
+            if (fragment.l) {
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
+                fragment.h -= fragment.l;
+                fragment.l = 0;
+            }
+
+            // coverage factor application
+            build_coverage_application(fragment, parts, regs);
+
+            // alpha-test
+            build_alpha_test(fragment, parts);
+
+            if (blend_needs_alpha_source) {
+                // We keep only 8 bits for the blending stage
+                const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
+                if (fragment.flags & CORRUPTIBLE) {
+                    fragment.flags &= ~CORRUPTIBLE;
+                    mAlphaSource.setTo(fragment.reg,
+                            fragment.size(), fragment.flags);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(mAlphaSource.reg, LSR, shift));
+                    }
+                } else {
+                    // XXX: it would better to do this in build_blend_factor()
+                    // so we can avoid the extra MOV below.
+                    mAlphaSource.setTo(regs.obtain(),
+                            fragment.size(), CORRUPTIBLE);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(fragment.reg, LSR, shift));
+                    } else {
+                        MOV(AL, 0, mAlphaSource.reg, fragment.reg);
+                    }
+                }
+                mAlphaSource.s -= shift;
+            }
+        }
+
+        // fog...
+        build_fog( fragment, component, regs );
+
+        temp = fragment;
+    } else {
+        if (mInfo[component].inDest) {
+            // extraction not needed and replace
+            // we just select the right component
+            if ((mTextureMachine.replaced & component_mask) == 0) {
+                // component wasn't replaced, so use it!
+                temp = component_t(parts.iterated, component);
+            }
+            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+                const texture_unit_t& tmu = mTextureMachine.tmu[i];
+                if ((tmu.mask & component_mask) &&
+                    ((tmu.replaced & component_mask) == 0)) {
+                    temp = component_t(parts.texel[i], component);
+                }
+            }
+        }
+    }
+}
+
+bool GGLAssembler::isAlphaSourceNeeded() const
+{
+    // XXX: also needed for alpha-test
+    const int bs = mBlendSrc;
+    const int bd = mBlendDst;
+    return  bs==GGL_SRC_ALPHA_SATURATE ||
+            bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA ||
+            bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; 
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
+{
+    if (mSmooth && !parts.iterated_packed) {
+        // update the iterated color in a pipelined way...
+        comment("update iterated color");
+        Scratch scratches(registerFile());
+
+        const int reload = parts.reload;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated) 
+                continue;
+                
+            int c = parts.argb[i].reg;
+            int dx = parts.argb_dx[i].reg;
+            
+            if (reload & 1) {
+                c = scratches.obtain();
+                CONTEXT_LOAD(c, generated_vars.argb[i].c);
+            }
+            if (reload & 2) {
+                dx = scratches.obtain();
+                CONTEXT_LOAD(dx, generated_vars.argb[i].dx);
+            }
+            
+            if (mSmooth) {
+                ADD(AL, 0, c, c, dx);
+            }
+            
+            if (reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+                scratches.recycle(c);
+            }
+            if (reload & 2) {
+                scratches.recycle(dx);
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_coverage_application(component_t& fragment,
+        const fragment_parts_t& parts, Scratch& regs)
+{
+    // here fragment.l is guarenteed to be 0
+    if (mAA) {
+        // coverages are 1.15 fixed-point numbers
+        comment("coverage application");
+
+        component_t incoming(fragment);
+        modify(fragment, regs);
+
+        Scratch scratches(registerFile());
+        int cf = scratches.obtain();
+        LDRH(AL, cf, parts.covPtr.reg, immed8_post(2));
+        if (fragment.h > 31) {
+            fragment.h--;
+            SMULWB(AL, fragment.reg, incoming.reg, cf);
+        } else {
+            MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1));
+            SMULWB(AL, fragment.reg, fragment.reg, cf);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_alpha_test(component_t& fragment,
+                                    const fragment_parts_t& parts)
+{
+    if (mAlphaTest != GGL_ALWAYS) {
+        comment("Alpha Test");
+        Scratch scratches(registerFile());
+        int ref = scratches.obtain();
+        const int shift = GGL_COLOR_BITS-fragment.size();
+        CONTEXT_LOAD(ref, state.alpha_test.ref);
+        if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift));
+        else       CMP(AL, fragment.reg, ref);
+        int cc = NV;
+        switch (mAlphaTest) {
+        case GGL_NEVER:     cc = NV;    break;
+        case GGL_LESS:      cc = LT;    break;
+        case GGL_EQUAL:     cc = EQ;    break;
+        case GGL_LEQUAL:    cc = LS;    break;
+        case GGL_GREATER:   cc = HI;    break;
+        case GGL_NOTEQUAL:  cc = NE;    break;
+        case GGL_GEQUAL:    cc = HS;    break;
+        }
+        B(cc^1, "discard_after_textures");
+    }
+}
+
+// ---------------------------------------------------------------------------
+            
+void GGLAssembler::build_depth_test(
+        const fragment_parts_t& parts, uint32_t mask)
+{
+    mask &= Z_TEST|Z_WRITE;
+    const needs_t& needs = mBuilderContext.needs;
+    const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
+    Scratch scratches(registerFile());
+
+    if (mDepthTest != GGL_ALWAYS || zmask) {
+        int cc=AL, ic=AL;
+        switch (mDepthTest) {
+        case GGL_LESS:      ic = HI;    break;
+        case GGL_EQUAL:     ic = EQ;    break;
+        case GGL_LEQUAL:    ic = HS;    break;
+        case GGL_GREATER:   ic = LT;    break;
+        case GGL_NOTEQUAL:  ic = NE;    break;
+        case GGL_GEQUAL:    ic = LS;    break;
+        case GGL_NEVER:
+            // this never happens, because it's taken care of when 
+            // computing the needs. but we keep it for completness.
+            comment("Depth Test (NEVER)");
+            B(AL, "discard_before_textures");
+            return;
+        case GGL_ALWAYS:
+            // we're here because zmask is enabled
+            mask &= ~Z_TEST;    // test always passes.
+            break;
+        }
+        
+        // inverse the condition
+        cc = ic^1;
+        
+        if ((mask & Z_WRITE) && !zmask) {
+            mask &= ~Z_WRITE;
+        }
+        
+        if (!mask)
+            return;
+
+        comment("Depth Test");
+
+        int zbase = scratches.obtain();
+        int depth = scratches.obtain();
+        int z = parts.z.reg;
+        
+        CONTEXT_LOAD(zbase, generated_vars.zbase);  // stall
+        SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
+            // above does zbase = zbase + ((count >> 16) << 1)
+
+        if (mask & Z_TEST) {
+            LDRH(AL, depth, zbase);  // stall
+            CMP(AL, depth, reg_imm(z, LSR, 16));
+            B(cc, "discard_before_textures");
+        }
+        if (mask & Z_WRITE) {
+            if (mask == Z_WRITE) {
+                // only z-write asked, cc is meaningless
+                ic = AL;
+            }
+            MOV(AL, 0, depth, reg_imm(z, LSR, 16));
+            STRH(ic, depth, zbase);
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        CONTEXT_LOAD(dzdx, generated_vars.dzdx);    // stall
+        ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx); 
+    }
+}
+
+void GGLAssembler::build_iterate_f(const fragment_parts_t& parts)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if (GGL_READ_NEEDS(P_FOG, needs.p)) {
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int f = scratches.obtain();
+        CONTEXT_LOAD(f,     generated_vars.f);
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);   // stall
+        ADD(AL, 0, f, f, dfdx);
+        CONTEXT_STORE(f,    generated_vars.f);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    if (opcode == GGL_COPY)
+        return;
+    
+    comment("logic operation");
+
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+    
+    pixel_t d(mDstPixel);
+    switch(opcode) {
+    case GGL_CLEAR:         MOV(AL, 0, pixel.reg, imm(0));          break;
+    case GGL_AND:           AND(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_AND_REVERSE:   BIC(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_COPY:                                                  break;
+    case GGL_AND_INVERTED:  BIC(AL, 0, pixel.reg, d.reg, s.reg);    break;
+    case GGL_NOOP:          MOV(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_XOR:           EOR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_OR:            ORR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_NOR:           ORR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_EQUIV:         EOR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_INVERT:        MVN(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_OR_REVERSE:    // s | ~d == ~(~s & d)
+                            BIC(AL, 0, pixel.reg, d.reg, s.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg);           break;
+    case GGL_OR_INVERTED:   // ~s | d == ~(s & ~d)
+                            BIC(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_NAND:          AND(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_SET:           MVN(AL, 0, pixel.reg, imm(0));          break;
+    };        
+}
+
+// ---------------------------------------------------------------------------
+
+static uint32_t find_bottom(uint32_t val)
+{
+    uint32_t i = 0;
+    while (!(val & (3<<i)))
+        i+= 2;
+    return i;
+}
+
+static void normalize(uint32_t& val, uint32_t& rot)
+{
+    rot = 0;
+    while (!(val&3)  || (val & 0xFC000000)) {
+        uint32_t newval;
+        newval = val >> 2;
+        newval |= (val&3) << 30;
+        val = newval;
+        rot += 2;
+        if (rot == 32) {
+            rot = 0;
+            break;
+        }
+    }
+}
+
+void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
+{
+    uint32_t rot;
+    uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+    mask &= size;
+
+    if (mask == size) {
+        if (d != s)
+            MOV( AL, 0, d, s);
+        return;
+    }
+    
+    int negative_logic = !isValidImmediate(mask);
+    if (negative_logic) {
+        mask = ~mask & size;
+    }
+    normalize(mask, rot);
+
+    if (mask) {
+        while (mask) {
+            uint32_t bitpos = find_bottom(mask);
+            int shift = rot + bitpos;
+            uint32_t m = mask & (0xff << bitpos);
+            mask &= ~m;
+            m >>= bitpos;
+            int32_t newMask =  (m<<shift) | (m>>(32-shift));
+            if (!negative_logic) {
+                AND( AL, 0, d, s, imm(newMask) );
+            } else {
+                BIC( AL, 0, d, s, imm(newMask) );
+            }
+            s = d;
+        }
+    } else {
+        MOV( AL, 0, d, imm(0));
+    }
+}		
+
+void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
+{
+    if (!mMasking || mAllMasked) {
+        return;
+    }
+
+    comment("color mask");
+
+    pixel_t fb(mDstPixel);
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+
+    int mask = 0;
+    for (int i=0 ; i<4 ; i++) {
+        const int component_mask = 1<<i;
+        const int h = fb.format.c[i].h;
+        const int l = fb.format.c[i].l;
+        if (h && (!(mMasking & component_mask))) {
+            mask |= ((1<<(h-l))-1) << l;
+        }
+    }
+
+    // There is no need to clear the masked components of the source
+    // (unless we applied a logic op), because they're already zeroed 
+    // by construction (masked components are not computed)
+
+    if (mLogicOp) {
+        const needs_t& needs = mBuilderContext.needs;
+        const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+        if (opcode != GGL_CLEAR) {
+            // clear masked component of source
+            build_and_immediate(pixel.reg, s.reg, mask, fb.size());
+            s = pixel;
+        }
+    }
+
+    // clear non masked components of destination
+    build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); 
+
+    // or back the channels that were masked
+    if (s.reg == fb.reg) {
+         // this is in fact a MOV
+        if (s.reg == pixel.reg) {
+            // ugh. this in in fact a nop
+        } else {
+            MOV(AL, 0, pixel.reg, fb.reg);
+        }
+    } else {
+        ORR(AL, 0, pixel.reg, s.reg, fb.reg);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::base_offset(
+        const pointer_t& d, const pointer_t& b, const reg_t& o)
+{
+    switch (b.size) {
+    case 32:
+        ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
+        break;
+    case 24:
+        if (d.reg == b.reg) {
+            ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+            ADD(AL, 0, d.reg, d.reg, o.reg);
+        } else {
+            ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
+            ADD(AL, 0, d.reg, d.reg, b.reg);
+        }
+        break;
+    case 16:
+        ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+        break;
+    case 8:
+        ADD(AL, 0, d.reg, b.reg, o.reg);
+        break;
+    }
+}
+
+// ----------------------------------------------------------------------------
+// cheezy register allocator...
+// ----------------------------------------------------------------------------
+
+void RegisterAllocator::reset()
+{
+    mRegs.reset();
+}
+
+int RegisterAllocator::reserveReg(int reg)
+{
+    return mRegs.reserve(reg);
+}
+
+int RegisterAllocator::obtainReg()
+{
+    return mRegs.obtain();
+}
+
+void RegisterAllocator::recycleReg(int reg)
+{
+    mRegs.recycle(reg);
+}
+
+RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
+{
+    return mRegs;
+}
+
+// ----------------------------------------------------------------------------
+
+RegisterAllocator::RegisterFile::RegisterFile()
+    : mRegs(0), mTouched(0), mStatus(0)
+{
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs)
+    : mRegs(rhs.mRegs), mTouched(rhs.mTouched)
+{
+}
+
+RegisterAllocator::RegisterFile::~RegisterFile()
+{
+}
+
+bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
+{
+    return (mRegs == rhs.mRegs);
+}
+
+void RegisterAllocator::RegisterFile::reset()
+{
+    mRegs = mTouched = mStatus = 0;
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+int RegisterAllocator::RegisterFile::reserve(int reg)
+{
+    LOG_ALWAYS_FATAL_IF(isUsed(reg),
+                        "reserving register %d, but already in use",
+                        reg);
+    mRegs |= (1<<reg);
+    mTouched |= mRegs;
+    return reg;
+}
+
+void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
+{
+    mRegs |= regMask;
+    mTouched |= regMask;
+}
+
+int RegisterAllocator::RegisterFile::isUsed(int reg) const
+{
+    LOG_ALWAYS_FATAL_IF(reg>=16, "invalid register %d", reg);
+    return mRegs & (1<<reg);
+}
+
+int RegisterAllocator::RegisterFile::obtain()
+{
+    const char priorityList[14] = {  0,  1, 2, 3, 
+                                    12, 14, 4, 5, 
+                                     6,  7, 8, 9,
+                                    10, 11 };
+    const int nbreg = sizeof(priorityList);
+    int i, r;
+    for (i=0 ; i<nbreg ; i++) {
+        r = priorityList[i];
+        if (!isUsed(r)) {
+            break;
+        }
+    }
+    // this is not an error anymore because, we'll try again with
+    // a lower optimization level.
+    //LOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n");
+    if (i >= nbreg) {
+        mStatus |= OUT_OF_REGISTERS;
+        // we return SP so we can more easily debug things
+        // the code will never be run anyway.
+        return ARMAssemblerInterface::SP; 
+    }
+    reserve(r);
+    return r;
+}
+
+bool RegisterAllocator::RegisterFile::hasFreeRegs() const
+{
+    return ((mRegs & 0xFFFF) == 0xFFFF) ? false : true;
+}
+
+int RegisterAllocator::RegisterFile::countFreeRegs() const
+{
+    int f = ~mRegs & 0xFFFF;
+    // now count number of 1
+   f = (f & 0x5555) + ((f>>1) & 0x5555);
+   f = (f & 0x3333) + ((f>>2) & 0x3333);
+   f = (f & 0x0F0F) + ((f>>4) & 0x0F0F);
+   f = (f & 0x00FF) + ((f>>8) & 0x00FF);
+   return f;
+}
+
+void RegisterAllocator::RegisterFile::recycle(int reg)
+{
+    LOG_FATAL_IF(!isUsed(reg),
+            "recycling unallocated register %d",
+            reg);
+    mRegs &= ~(1<<reg);
+}
+
+void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
+{
+    LOG_FATAL_IF((mRegs & regMask)!=regMask,
+            "recycling unallocated registers "
+            "(recycle=%08x, allocated=%08x, unallocated=%08x)",
+            regMask, mRegs, mRegs&regMask);
+    mRegs &= ~regMask;
+}
+
+uint32_t RegisterAllocator::RegisterFile::touched() const
+{
+    return mTouched;
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/GGLAssembler.h b/libpixelflinger/codeflinger/GGLAssembler.h
new file mode 100644
index 0000000..d1d29f0
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.h
@@ -0,0 +1,554 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGLASSEMBLER_H
+#define ANDROID_GGLASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/ARMAssemblerProxy.h"
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#define CONTEXT_LOAD(REG, FIELD) \
+    LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_STORE(REG, FIELD) \
+    STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+
+class RegisterAllocator
+{
+public:
+    class RegisterFile;
+    
+    RegisterFile&   registerFile();
+    int             reserveReg(int reg);
+    int             obtainReg();
+    void            recycleReg(int reg);
+    void            reset();
+
+    class RegisterFile
+    {
+    public:
+                            RegisterFile();
+                            RegisterFile(const RegisterFile& rhs);
+                            ~RegisterFile();
+
+                void        reset();
+
+                bool operator == (const RegisterFile& rhs) const;
+                bool operator != (const RegisterFile& rhs) const {
+                    return !operator == (rhs);
+                }
+
+                int         reserve(int reg);
+                void        reserveSeveral(uint32_t regMask);
+
+                void        recycle(int reg);
+                void        recycleSeveral(uint32_t regMask);
+
+                int         obtain();
+        inline  int         isUsed(int reg) const;
+
+                bool        hasFreeRegs() const;
+                int         countFreeRegs() const;                
+
+                uint32_t    touched() const;
+        inline  uint32_t    status() const { return mStatus; }
+        
+        enum {
+            OUT_OF_REGISTERS = 0x1
+        };
+
+    private:
+        uint32_t    mRegs;
+        uint32_t    mTouched;
+        uint32_t    mStatus;
+    };
+ 
+    class Scratch
+    {
+    public:
+            Scratch(RegisterFile& regFile)
+                : mRegFile(regFile), mScratch(0) { 
+            }
+            ~Scratch() {
+                mRegFile.recycleSeveral(mScratch);
+            }
+        int obtain() { 
+            int reg = mRegFile.obtain();
+            mScratch |= 1<<reg;
+            return reg;
+        }
+        void recycle(int reg) {
+            mRegFile.recycle(reg);
+            mScratch &= ~(1<<reg);
+        }
+        bool isUsed(int reg) {
+            return (mScratch & (1<<reg));
+        }
+        int countFreeRegs() {
+            return mRegFile.countFreeRegs();
+        }
+    private:
+        RegisterFile&   mRegFile;
+        uint32_t        mScratch;
+    };
+
+    class Spill
+    {
+    public:
+        Spill(RegisterFile& regFile, ARMAssemblerInterface& gen, uint32_t reglist)
+            : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0)
+        {
+            if (reglist) {
+                int count = 0;
+                while (reglist) {
+                    count++;
+                    reglist &= ~(1 << (31 - __builtin_clz(reglist)));
+                }
+                if (count == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.STR(mGen.AL, reg, mGen.SP, mGen.immed12_pre(-4, 1));
+                } else {
+                    mGen.STM(mGen.AL, mGen.DB, mGen.SP, 1, mRegList);
+                }
+                mRegFile.recycleSeveral(mRegList);
+                mCount = count;
+            }
+        }
+        ~Spill() {
+            if (mRegList) {
+                if (mCount == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.LDR(mGen.AL, reg, mGen.SP, mGen.immed12_post(4));
+                } else {
+                    mGen.LDM(mGen.AL, mGen.IA, mGen.SP, 1, mRegList);
+                }
+                mRegFile.reserveSeveral(mRegList);
+            }
+        }
+    private:
+        RegisterFile&           mRegFile;
+        ARMAssemblerInterface&  mGen;
+        uint32_t                mRegList;
+        int                     mCount;
+    };
+    
+private:
+    RegisterFile    mRegs;
+};
+
+// ----------------------------------------------------------------------------
+
+class GGLAssembler : public ARMAssemblerProxy, public RegisterAllocator
+{
+public:
+
+                    GGLAssembler(ARMAssemblerInterface* target);
+        virtual     ~GGLAssembler();
+
+    uint32_t*   base() const { return 0; } // XXX
+    uint32_t*   pc() const { return 0; } // XXX
+
+    void        reset(int opt_level);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+
+        // generate scanline code for given needs
+    int         scanline(const needs_t& needs, context_t const* c);
+    int         scanline_core(const needs_t& needs, context_t const* c);
+
+        enum {
+            CLEAR_LO    = 0x0001,
+            CLEAR_HI    = 0x0002,
+            CORRUPTIBLE = 0x0004,
+            FIRST       = 0x0008
+        };
+
+        enum { //load/store flags
+            WRITE_BACK  = 0x0001
+        };
+
+        struct reg_t {
+            reg_t() : reg(-1), flags(0) {
+            }
+            reg_t(int r, int f=0)
+                : reg(r), flags(f) {
+            }
+            void setTo(int r, int f=0) {
+                reg=r; flags=f;
+            }
+            int         reg;
+            uint16_t    flags;
+        };
+
+        struct integer_t : public reg_t {
+            integer_t() : reg_t(), s(0) {
+            }
+            integer_t(int r, int sz=32, int f=0)
+                : reg_t(r, f), s(sz) {
+            }
+            void setTo(int r, int sz=32, int f=0) {
+                reg_t::setTo(r, f); s=sz;
+            }
+            int8_t s;
+            inline int size() const { return s; }
+        };
+        
+        struct pixel_t : public reg_t {
+            pixel_t() : reg_t() {
+                memset(&format, 0, sizeof(GGLFormat));
+            }
+            pixel_t(int r, const GGLFormat* fmt, int f=0)
+                : reg_t(r, f), format(*fmt) {
+            }
+            void setTo(int r, const GGLFormat* fmt, int f=0) {
+                reg_t::setTo(r, f); format = *fmt;
+            }
+            GGLFormat format;
+            inline int hi(int c) const { return format.c[c].h; }
+            inline int low(int c) const { return format.c[c].l; }
+            inline int mask(int c) const { return ((1<<size(c))-1) << low(c); }
+            inline int size() const { return format.size*8; }
+            inline int size(int c) const { return component_size(c); }
+            inline int component_size(int c) const { return hi(c) - low(c); }
+        };
+
+        struct component_t : public reg_t {
+            component_t() : reg_t(), h(0), l(0) {
+            }
+            component_t(int r, int f=0)
+                : reg_t(r, f), h(0), l(0) {
+            }
+            component_t(int r, int lo, int hi, int f=0)
+                : reg_t(r, f), h(hi), l(lo) {
+            }
+            explicit component_t(const integer_t& rhs)
+                : reg_t(rhs.reg, rhs.flags), h(rhs.s), l(0) {
+            }
+            explicit component_t(const pixel_t& rhs, int component) {
+                setTo(  rhs.reg, 
+                        rhs.format.c[component].l,
+                        rhs.format.c[component].h,
+                        rhs.flags|CLEAR_LO|CLEAR_HI);
+            }
+            void setTo(int r, int lo=0, int hi=0, int f=0) {
+                reg_t::setTo(r, f); h=hi; l=lo;
+            }
+            int8_t h;
+            int8_t l;
+            inline int size() const { return h-l; }
+        };
+
+        struct pointer_t : public reg_t {
+            pointer_t() : reg_t(), size(0) {
+            }
+            pointer_t(int r, int s, int f=0)
+                : reg_t(r, f), size(s) {
+            }
+            void setTo(int r, int s, int f=0) {
+                reg_t::setTo(r, f); size=s;
+            }
+            int8_t size;
+        };
+
+
+private:
+    struct tex_coord_t {
+        reg_t       s;
+        reg_t       t;
+        pointer_t   ptr;
+    };
+
+    struct fragment_parts_t {
+        uint32_t    packed  : 1;
+        uint32_t    reload  : 2;
+        uint32_t    iterated_packed  : 1;
+        pixel_t     iterated;
+        pointer_t   cbPtr;
+        pointer_t   covPtr;
+        reg_t       count;
+        reg_t       argb[4];
+        reg_t       argb_dx[4];
+        reg_t       z;
+        reg_t       dither;
+        pixel_t     texel[GGL_TEXTURE_UNIT_COUNT];
+        tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT];
+    };
+    
+    struct texture_unit_t {
+        int         format_idx;
+        GGLFormat   format;
+        int         bits;
+        int         swrap;
+        int         twrap;
+        int         env;
+        int         pot;
+        int         linear;
+        uint8_t     mask;
+        uint8_t     replaced;
+    };
+
+    struct texture_machine_t {
+        texture_unit_t  tmu[GGL_TEXTURE_UNIT_COUNT];
+        uint8_t         mask;
+        uint8_t         replaced;
+        uint8_t         directTexture;
+        uint8_t         activeUnits;
+    };
+
+    struct component_info_t {
+        bool    masked      : 1;
+        bool    inDest      : 1;
+        bool    needed      : 1;
+        bool    replaced    : 1;
+        bool    iterated    : 1;
+        bool    smooth      : 1;
+        bool    blend       : 1;
+        bool    fog         : 1;
+    };
+
+    struct builder_context_t {
+        context_t const*    c;
+        needs_t             needs;
+        int                 Rctx;
+    };
+
+    template <typename T>
+    void modify(T& r, Scratch& regs)
+    {
+        if (!(r.flags & CORRUPTIBLE)) {
+            r.reg = regs.obtain();
+            r.flags |= CORRUPTIBLE;
+        }
+    }
+
+    // helpers
+    void    base_offset(const pointer_t& d, const pointer_t& b, const reg_t& o);
+
+    // texture environement
+    void    modulate(   component_t& dest,
+                        const component_t& incoming,
+                        const pixel_t& texel, int component);
+
+    void    decal(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    void    blend(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component, int tmu);
+
+    void    add(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    // load/store stuff
+    void    store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0);
+    void    load(const pointer_t& addr, const pixel_t& dest, uint32_t flags=0);
+    void    extract(integer_t& d, const pixel_t& s, int component);    
+    void    extract(component_t& d, const pixel_t& s, int component);    
+    void    extract(integer_t& d, int s, int h, int l, int bits=32);
+    void    expand(integer_t& d, const integer_t& s, int dbits);
+    void    expand(integer_t& d, const component_t& s, int dbits);
+    void    expand(component_t& d, const component_t& s, int dbits);
+    void    downshift(pixel_t& d, int component, component_t s, const reg_t& dither);
+
+
+    void    mul_factor( component_t& d,
+                        const integer_t& v,
+                        const integer_t& f);
+
+    void    mul_factor_add( component_t& d,
+                            const integer_t& v,
+                            const integer_t& f,
+                            const component_t& a);
+
+    void    component_add(  component_t& d,
+                            const integer_t& dst,
+                            const integer_t& src);
+
+    void    component_sat(  const component_t& v);
+
+
+    void    build_scanline_prolog(  fragment_parts_t& parts,
+                                    const needs_t& needs);
+
+    void    build_smooth_shade(const fragment_parts_t& parts);
+
+    void    build_component(    pixel_t& pixel,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& global_scratches);
+                                
+    void    build_incoming_component(
+                                component_t& temp,
+                                int dst_size,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& scratches,
+                                Scratch& global_scratches);
+
+    void    init_iterated_color(fragment_parts_t& parts, const reg_t& x);
+
+    void    build_iterated_color(   component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs);
+
+    void    decodeLogicOpNeeds(const needs_t& needs);
+    
+    void    decodeTMUNeeds(const needs_t& needs, context_t const* c);
+
+    void    init_textures(  tex_coord_t* coords,
+                            const reg_t& x,
+                            const reg_t& y);
+
+    void    build_textures( fragment_parts_t& parts,
+                            Scratch& regs);
+
+    void    filter8(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter16(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter24(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter32(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    build_texture_environment(  component_t& fragment,
+                                        const fragment_parts_t& parts,
+                                        int component,
+                                        Scratch& regs);
+
+    void    wrapping(   int d,
+                        int coord, int size,
+                        int tx_wrap, int tx_linear);
+
+    void    build_fog(  component_t& temp,
+                        int component,
+                        Scratch& parent_scratches);
+
+    void    build_blending(     component_t& in_out,
+                                const pixel_t& pixel,
+                                int component,
+                                Scratch& parent_scratches);
+
+    void    build_blend_factor(
+                integer_t& factor, int f, int component,
+                const pixel_t& dst_pixel,
+                integer_t& fragment,
+                integer_t& fb,
+                Scratch& scratches);
+
+    void    build_blendFOneMinusF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void    build_blendOneMinusFF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void build_coverage_application(component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    Scratch& regs);
+
+    void build_alpha_test(component_t& fragment, const fragment_parts_t& parts);
+
+    enum { Z_TEST=1, Z_WRITE=2 }; 
+    void build_depth_test(const fragment_parts_t& parts, uint32_t mask);
+    void build_iterate_z(const fragment_parts_t& parts);
+    void build_iterate_f(const fragment_parts_t& parts);
+    void build_iterate_texture_coordinates(const fragment_parts_t& parts);
+
+    void build_logic_op(pixel_t& pixel, Scratch& regs);
+
+    void build_masking(pixel_t& pixel, Scratch& regs);
+
+    void build_and_immediate(int d, int s, uint32_t mask, int bits);
+
+    bool    isAlphaSourceNeeded() const;
+
+    enum {
+        FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 
+    };
+    
+    enum {
+        LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4
+    };
+
+    static int blending_codes(int fs, int fd);
+
+    builder_context_t   mBuilderContext;
+    texture_machine_t   mTextureMachine;
+    component_info_t    mInfo[4];
+    int                 mBlending;
+    int                 mMasking;
+    int                 mAllMasked;
+    int                 mLogicOp;
+    int                 mAlphaTest;
+    int                 mAA;
+    int                 mDithering;
+    int                 mDepthTest;
+
+    int             mSmooth;
+    int             mFog;
+    pixel_t         mDstPixel;
+    
+    GGLFormat       mCbFormat;
+    
+    int             mBlendFactorCached;
+    integer_t       mAlphaSource;
+    
+    int             mBaseRegister;
+    
+    int             mBlendSrc;
+    int             mBlendDst;
+    int             mBlendSrcA;
+    int             mBlendDstA;
+    
+    int             mOptLevel;
+};
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif // ANDROID_GGLASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/armreg.h b/libpixelflinger/codeflinger/armreg.h
new file mode 100644
index 0000000..fde81ba
--- /dev/null
+++ b/libpixelflinger/codeflinger/armreg.h
@@ -0,0 +1,300 @@
+/*	$NetBSD: armreg.h,v 1.28 2003/10/31 16:30:15 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1998, 2001 Ben Harris
+ * Copyright (c) 1994-1996 Mark Brinicombe.
+ * Copyright (c) 1994 Brini.
+ * All rights reserved.
+ *
+ * This code is derived from software written for Brini by Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/armreg.h,v 1.3 2005/11/21 19:06:25 cognet Exp $
+ */
+
+#ifndef MACHINE_ARMREG_H
+#define MACHINE_ARMREG_H
+#define INSN_SIZE	4
+#define INSN_COND_MASK	0xf0000000	/* Condition mask */
+#define PSR_MODE        0x0000001f      /* mode mask */
+#define PSR_USR26_MODE  0x00000000
+#define PSR_FIQ26_MODE  0x00000001
+#define PSR_IRQ26_MODE  0x00000002
+#define PSR_SVC26_MODE  0x00000003
+#define PSR_USR32_MODE  0x00000010
+#define PSR_FIQ32_MODE  0x00000011
+#define PSR_IRQ32_MODE  0x00000012
+#define PSR_SVC32_MODE  0x00000013
+#define PSR_ABT32_MODE  0x00000017
+#define PSR_UND32_MODE  0x0000001b
+#define PSR_SYS32_MODE  0x0000001f
+#define PSR_32_MODE     0x00000010
+#define PSR_FLAGS	0xf0000000    /* flags */
+
+#define PSR_C_bit (1 << 29)       /* carry */
+
+/* The high-order byte is always the implementor */
+#define CPU_ID_IMPLEMENTOR_MASK	0xff000000
+#define CPU_ID_ARM_LTD		0x41000000 /* 'A' */
+#define CPU_ID_DEC		0x44000000 /* 'D' */
+#define CPU_ID_INTEL		0x69000000 /* 'i' */
+#define	CPU_ID_TI		0x54000000 /* 'T' */
+
+/* How to decide what format the CPUID is in. */
+#define CPU_ID_ISOLD(x)		(((x) & 0x0000f000) == 0x00000000)
+#define CPU_ID_IS7(x)		(((x) & 0x0000f000) == 0x00007000)
+#define CPU_ID_ISNEW(x)		(!CPU_ID_ISOLD(x) && !CPU_ID_IS7(x))
+
+/* On ARM3 and ARM6, this byte holds the foundry ID. */
+#define CPU_ID_FOUNDRY_MASK	0x00ff0000
+#define CPU_ID_FOUNDRY_VLSI	0x00560000
+
+/* On ARM7 it holds the architecture and variant (sub-model) */
+#define CPU_ID_7ARCH_MASK	0x00800000
+#define CPU_ID_7ARCH_V3		0x00000000
+#define CPU_ID_7ARCH_V4T	0x00800000
+#define CPU_ID_7VARIANT_MASK	0x007f0000
+
+/* On more recent ARMs, it does the same, but in a different format */
+#define CPU_ID_ARCH_MASK	0x000f0000
+#define CPU_ID_ARCH_V3		0x00000000
+#define CPU_ID_ARCH_V4		0x00010000
+#define CPU_ID_ARCH_V4T		0x00020000
+#define CPU_ID_ARCH_V5		0x00030000
+#define CPU_ID_ARCH_V5T		0x00040000
+#define CPU_ID_ARCH_V5TE	0x00050000
+#define CPU_ID_VARIANT_MASK	0x00f00000
+
+/* Next three nybbles are part number */
+#define CPU_ID_PARTNO_MASK	0x0000fff0
+
+/* Intel XScale has sub fields in part number */
+#define CPU_ID_XSCALE_COREGEN_MASK	0x0000e000 /* core generation */
+#define CPU_ID_XSCALE_COREREV_MASK	0x00001c00 /* core revision */
+#define CPU_ID_XSCALE_PRODUCT_MASK	0x000003f0 /* product number */
+
+/* And finally, the revision number. */
+#define CPU_ID_REVISION_MASK	0x0000000f
+
+/* Individual CPUs are probably best IDed by everything but the revision. */
+#define CPU_ID_CPU_MASK		0xfffffff0
+
+/* Fake CPU IDs for ARMs without CP15 */
+#define CPU_ID_ARM2		0x41560200
+#define CPU_ID_ARM250		0x41560250
+
+/* Pre-ARM7 CPUs -- [15:12] == 0 */
+#define CPU_ID_ARM3		0x41560300
+#define CPU_ID_ARM600		0x41560600
+#define CPU_ID_ARM610		0x41560610
+#define CPU_ID_ARM620		0x41560620
+
+/* ARM7 CPUs -- [15:12] == 7 */
+#define CPU_ID_ARM700		0x41007000 /* XXX This is a guess. */
+#define CPU_ID_ARM710		0x41007100
+#define CPU_ID_ARM7500		0x41027100 /* XXX This is a guess. */
+#define CPU_ID_ARM710A		0x41047100 /* inc ARM7100 */
+#define CPU_ID_ARM7500FE	0x41077100
+#define CPU_ID_ARM710T		0x41807100
+#define CPU_ID_ARM720T		0x41807200
+#define CPU_ID_ARM740T8K	0x41807400 /* XXX no MMU, 8KB cache */
+#define CPU_ID_ARM740T4K	0x41817400 /* XXX no MMU, 4KB cache */
+
+/* Post-ARM7 CPUs */
+#define CPU_ID_ARM810		0x41018100
+#define CPU_ID_ARM920T		0x41129200
+#define CPU_ID_ARM920T_ALT	0x41009200
+#define CPU_ID_ARM922T		0x41029220
+#define CPU_ID_ARM940T		0x41029400 /* XXX no MMU */
+#define CPU_ID_ARM946ES		0x41049460 /* XXX no MMU */
+#define	CPU_ID_ARM966ES		0x41049660 /* XXX no MMU */
+#define	CPU_ID_ARM966ESR1	0x41059660 /* XXX no MMU */
+#define CPU_ID_ARM1020E		0x4115a200 /* (AKA arm10 rev 1) */
+#define CPU_ID_ARM1022ES	0x4105a220
+#define CPU_ID_SA110		0x4401a100
+#define CPU_ID_SA1100		0x4401a110
+#define	CPU_ID_TI925T		0x54029250
+#define CPU_ID_SA1110		0x6901b110
+#define CPU_ID_IXP1200		0x6901c120
+#define CPU_ID_80200		0x69052000
+#define CPU_ID_PXA250    	0x69052100 /* sans core revision */
+#define CPU_ID_PXA210    	0x69052120
+#define CPU_ID_PXA250A		0x69052100 /* 1st version Core */
+#define CPU_ID_PXA210A		0x69052120 /* 1st version Core */
+#define CPU_ID_PXA250B		0x69052900 /* 3rd version Core */
+#define CPU_ID_PXA210B		0x69052920 /* 3rd version Core */
+#define CPU_ID_PXA250C		0x69052d00 /* 4th version Core */
+#define CPU_ID_PXA210C		0x69052d20 /* 4th version Core */
+#define	CPU_ID_80321_400	0x69052420
+#define	CPU_ID_80321_600	0x69052430
+#define	CPU_ID_80321_400_B0	0x69052c20
+#define	CPU_ID_80321_600_B0	0x69052c30
+#define	CPU_ID_IXP425_533	0x690541c0
+#define	CPU_ID_IXP425_400	0x690541d0
+#define	CPU_ID_IXP425_266	0x690541f0
+
+/* ARM3-specific coprocessor 15 registers */
+#define ARM3_CP15_FLUSH		1
+#define ARM3_CP15_CONTROL	2
+#define ARM3_CP15_CACHEABLE	3
+#define ARM3_CP15_UPDATEABLE	4
+#define ARM3_CP15_DISRUPTIVE	5	
+
+/* ARM3 Control register bits */
+#define ARM3_CTL_CACHE_ON	0x00000001
+#define ARM3_CTL_SHARED		0x00000002
+#define ARM3_CTL_MONITOR	0x00000004
+
+/*
+ * Post-ARM3 CP15 registers:
+ *
+ *	1	Control register
+ *
+ *	2	Translation Table Base
+ *
+ *	3	Domain Access Control
+ *
+ *	4	Reserved
+ *
+ *	5	Fault Status
+ *
+ *	6	Fault Address
+ *
+ *	7	Cache/write-buffer Control
+ *
+ *	8	TLB Control
+ *
+ *	9	Cache Lockdown
+ *
+ *	10	TLB Lockdown
+ *
+ *	11	Reserved
+ *
+ *	12	Reserved
+ *
+ *	13	Process ID (for FCSE)
+ *
+ *	14	Reserved
+ *
+ *	15	Implementation Dependent
+ */
+
+/* Some of the definitions below need cleaning up for V3/V4 architectures */
+
+/* CPU control register (CP15 register 1) */
+#define CPU_CONTROL_MMU_ENABLE	0x00000001 /* M: MMU/Protection unit enable */
+#define CPU_CONTROL_AFLT_ENABLE	0x00000002 /* A: Alignment fault enable */
+#define CPU_CONTROL_DC_ENABLE	0x00000004 /* C: IDC/DC enable */
+#define CPU_CONTROL_WBUF_ENABLE 0x00000008 /* W: Write buffer enable */
+#define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */
+#define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */
+#define CPU_CONTROL_LABT_ENABLE 0x00000040 /* L: Late abort enable */
+#define CPU_CONTROL_BEND_ENABLE 0x00000080 /* B: Big-endian mode */
+#define CPU_CONTROL_SYST_ENABLE 0x00000100 /* S: System protection bit */
+#define CPU_CONTROL_ROM_ENABLE	0x00000200 /* R: ROM protection bit */
+#define CPU_CONTROL_CPCLK	0x00000400 /* F: Implementation defined */
+#define CPU_CONTROL_BPRD_ENABLE 0x00000800 /* Z: Branch prediction enable */
+#define CPU_CONTROL_IC_ENABLE   0x00001000 /* I: IC enable */
+#define CPU_CONTROL_VECRELOC	0x00002000 /* V: Vector relocation */
+#define CPU_CONTROL_ROUNDROBIN	0x00004000 /* RR: Predictable replacement */
+#define CPU_CONTROL_V4COMPAT	0x00008000 /* L4: ARMv4 compat LDR R15 etc */
+
+#define CPU_CONTROL_IDC_ENABLE	CPU_CONTROL_DC_ENABLE
+
+/* XScale Auxillary Control Register (CP15 register 1, opcode2 1) */
+#define	XSCALE_AUXCTL_K		0x00000001 /* dis. write buffer coalescing */
+#define	XSCALE_AUXCTL_P		0x00000002 /* ECC protect page table access */
+#define	XSCALE_AUXCTL_MD_WB_RA	0x00000000 /* mini-D$ wb, read-allocate */
+#define	XSCALE_AUXCTL_MD_WB_RWA	0x00000010 /* mini-D$ wb, read/write-allocate */
+#define	XSCALE_AUXCTL_MD_WT	0x00000020 /* mini-D$ wt, read-allocate */
+#define	XSCALE_AUXCTL_MD_MASK	0x00000030
+
+/* Cache type register definitions */
+#define	CPU_CT_ISIZE(x)		((x) & 0xfff)		/* I$ info */
+#define	CPU_CT_DSIZE(x)		(((x) >> 12) & 0xfff)	/* D$ info */
+#define	CPU_CT_S		(1U << 24)		/* split cache */
+#define	CPU_CT_CTYPE(x)		(((x) >> 25) & 0xf)	/* cache type */
+
+#define	CPU_CT_CTYPE_WT		0	/* write-through */
+#define	CPU_CT_CTYPE_WB1	1	/* write-back, clean w/ read */
+#define	CPU_CT_CTYPE_WB2	2	/* w/b, clean w/ cp15,7 */
+#define	CPU_CT_CTYPE_WB6	6	/* w/b, cp15,7, lockdown fmt A */
+#define	CPU_CT_CTYPE_WB7	7	/* w/b, cp15,7, lockdown fmt B */
+
+#define	CPU_CT_xSIZE_LEN(x)	((x) & 0x3)		/* line size */
+#define	CPU_CT_xSIZE_M		(1U << 2)		/* multiplier */
+#define	CPU_CT_xSIZE_ASSOC(x)	(((x) >> 3) & 0x7)	/* associativity */
+#define	CPU_CT_xSIZE_SIZE(x)	(((x) >> 6) & 0x7)	/* size */
+
+/* Fault status register definitions */
+
+#define FAULT_TYPE_MASK 0x0f
+#define FAULT_USER      0x10
+
+#define FAULT_WRTBUF_0  0x00 /* Vector Exception */
+#define FAULT_WRTBUF_1  0x02 /* Terminal Exception */
+#define FAULT_BUSERR_0  0x04 /* External Abort on Linefetch -- Section */
+#define FAULT_BUSERR_1  0x06 /* External Abort on Linefetch -- Page */
+#define FAULT_BUSERR_2  0x08 /* External Abort on Non-linefetch -- Section */
+#define FAULT_BUSERR_3  0x0a /* External Abort on Non-linefetch -- Page */
+#define FAULT_BUSTRNL1  0x0c /* External abort on Translation -- Level 1 */
+#define FAULT_BUSTRNL2  0x0e /* External abort on Translation -- Level 2 */
+#define FAULT_ALIGN_0   0x01 /* Alignment */
+#define FAULT_ALIGN_1   0x03 /* Alignment */
+#define FAULT_TRANS_S   0x05 /* Translation -- Section */
+#define FAULT_TRANS_P   0x07 /* Translation -- Page */
+#define FAULT_DOMAIN_S  0x09 /* Domain -- Section */
+#define FAULT_DOMAIN_P  0x0b /* Domain -- Page */
+#define FAULT_PERM_S    0x0d /* Permission -- Section */
+#define FAULT_PERM_P    0x0f /* Permission -- Page */
+
+#define	FAULT_IMPRECISE	0x400	/* Imprecise exception (XSCALE) */
+
+/*
+ * Address of the vector page, low and high versions.
+ */
+#define	ARM_VECTORS_LOW		0x00000000U
+#define	ARM_VECTORS_HIGH	0xffff0000U
+
+/*
+ * ARM Instructions
+ *
+ *       3 3 2 2 2                              
+ *       1 0 9 8 7                                                     0
+ *      +-------+-------------------------------------------------------+
+ *      | cond  |              instruction dependant                    |
+ *      |c c c c|                                                       |
+ *      +-------+-------------------------------------------------------+
+ */
+
+#define INSN_SIZE		4		/* Always 4 bytes */
+#define INSN_COND_MASK		0xf0000000	/* Condition mask */
+#define INSN_COND_AL		0xe0000000	/* Always condition */
+
+#endif /* !MACHINE_ARMREG_H */
diff --git a/libpixelflinger/codeflinger/blending.cpp b/libpixelflinger/codeflinger/blending.cpp
new file mode 100644
index 0000000..f10217b
--- /dev/null
+++ b/libpixelflinger/codeflinger/blending.cpp
@@ -0,0 +1,682 @@
+/* libs/pixelflinger/codeflinger/blending.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+
+namespace android {
+
+void GGLAssembler::build_fog(
+                        component_t& temp,      // incomming fragment / output
+                        int component,
+                        Scratch& regs)
+{
+   if (mInfo[component].fog) {
+        Scratch scratches(registerFile());
+        comment("fog");
+
+        integer_t fragment(temp.reg, temp.h, temp.flags);
+        if (!(temp.flags & CORRUPTIBLE)) {
+            temp.reg = regs.obtain();
+            temp.flags |= CORRUPTIBLE;
+        }
+
+        integer_t fogColor(scratches.obtain(), 8, CORRUPTIBLE); 
+        LDRB(AL, fogColor.reg, mBuilderContext.Rctx,
+                immed12_pre(GGL_OFFSETOF(state.fog.color[component])));
+
+        integer_t factor(scratches.obtain(), 16, CORRUPTIBLE);
+        CONTEXT_LOAD(factor.reg, generated_vars.f);
+
+        // clamp fog factor (TODO: see if there is a way to guarantee
+        // we won't overflow, when setting the iterators)
+        BIC(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, ASR, 31));
+        CMP(AL, factor.reg, imm( 0x10000 ));
+        MOV(HS, 0, factor.reg, imm( 0x10000 ));
+
+        build_blendFOneMinusF(temp, factor, fragment, fogColor);
+    }
+}
+
+void GGLAssembler::build_blending(
+                        component_t& temp,      // incomming fragment / output
+                        const pixel_t& pixel,   // framebuffer
+                        int component,
+                        Scratch& regs)
+{
+   if (!mInfo[component].blend)
+        return;
+        
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA)
+        fs = GGL_ONE;
+    const int blending = blending_codes(fs, fd);
+    if (!temp.size()) {
+        // here, blending will produce something which doesn't depend on
+        // that component (eg: GL_ZERO:GL_*), so the register has not been
+        // allocated yet. Will never be used as a source.
+        temp = component_t(regs.obtain(), CORRUPTIBLE);
+    }
+
+    // we are doing real blending...
+    // fb:          extracted dst
+    // fragment:    extracted src
+    // temp:        component_t(fragment) and result
+
+    // scoped register allocator
+    Scratch scratches(registerFile());
+    comment("blending");
+
+    // we can optimize these cases a bit...
+    // (1) saturation is not needed
+    // (2) we can use only one multiply instead of 2
+    // (3) we can reduce the register pressure
+    //      R = S*f + D*(1-f) = (S-D)*f + D
+    //      R = S*(1-f) + D*f = (D-S)*f + S
+
+    const bool same_factor_opt1 =
+        (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) ||
+        (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) ||
+        (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) ||
+        (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA);
+
+    const bool same_factor_opt2 =
+        (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) ||
+        (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || 
+        (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) ||
+        (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA);
+
+
+    // XXX: we could also optimize these cases:
+    // R = S*f + D*f = (S+D)*f
+    // R = S*(1-f) + D*(1-f) = (S+D)*(1-f)
+    // R = S*D + D*S = 2*S*D
+
+
+    // see if we need to extract 'component' from the destination (fb)
+    integer_t fb;
+    if (blending & (BLEND_DST|FACTOR_DST)) { 
+        fb.setTo(scratches.obtain(), 32); 
+        extract(fb, pixel, component);
+        if (mDithering) {
+            // XXX: maybe what we should do instead, is simply
+            // expand fb -or- fragment to the larger of the two
+            if (fb.size() < temp.size()) {
+                // for now we expand 'fb' to min(fragment, 8)
+                int new_size = temp.size() < 8 ? temp.size() : 8;
+                expand(fb, fb, new_size);
+            }
+        }
+    }
+
+
+    // convert input fragment to integer_t
+    if (temp.l && (temp.flags & CORRUPTIBLE)) {
+        MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l));
+        temp.h -= temp.l;
+        temp.l = 0;
+    }
+    integer_t fragment(temp.reg, temp.size(), temp.flags);
+
+    // if not done yet, convert input fragment to integer_t
+    if (temp.l) {
+        // here we know temp is not CORRUPTIBLE
+        fragment.reg = scratches.obtain();
+        MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l));
+        fragment.flags |= CORRUPTIBLE;
+    }
+
+    if (!(temp.flags & CORRUPTIBLE)) {
+        // temp is not corruptible, but since it's the destination it
+        // will be modified, so we need to allocate a new register.
+        temp.reg = regs.obtain();
+        temp.flags &= ~CORRUPTIBLE;
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+    if ((blending & BLEND_SRC) && !same_factor_opt1) {
+        // source (fragment) is needed for the blending stage
+        // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1)
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+
+    if (same_factor_opt1) {
+        //  R = S*f + D*(1-f) = (S-D)*f + D
+        integer_t factor;
+        build_blend_factor(factor, fs, 
+                component, pixel, fragment, fb, scratches);
+        // fb is always corruptible from this point
+        fb.flags |= CORRUPTIBLE;
+        build_blendFOneMinusF(temp, factor, fragment, fb);
+    } else if (same_factor_opt2) {
+        //  R = S*(1-f) + D*f = (D-S)*f + S
+        integer_t factor;
+        // fb is always corrruptible here
+        fb.flags |= CORRUPTIBLE;
+        build_blend_factor(factor, fd,
+                component, pixel, fragment, fb, scratches);
+        build_blendOneMinusFF(temp, factor, fragment, fb);
+    } else {
+        integer_t src_factor;
+        integer_t dst_factor;
+
+        // if destination (fb) is not needed for the blending stage, 
+        // then it can be marked as CORRUPTIBLE
+        if (!(blending & BLEND_DST)) {
+            fb.flags |= CORRUPTIBLE;
+        }
+
+        // XXX: try to mark some registers as CORRUPTIBLE
+        // in most case we could make those corruptible
+        // when we're processing the last component
+        // but not always, for instance
+        //    when fragment is constant and not reloaded
+        //    when fb is needed for logic-ops or masking
+        //    when a register is aliased (for instance with mAlphaSource)
+
+        // blend away...
+        if (fs==GGL_ZERO) {
+            if (fd==GGL_ZERO) {         // R = 0
+                // already taken care of
+            } else if (fd==GGL_ONE) {   // R = D
+                // already taken care of
+            } else {                    // R = D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor(temp, fb, dst_factor);
+            }
+        } else if (fs==GGL_ONE) {
+            if (fd==GGL_ZERO) {         // R = S
+                // NOP, taken care of
+            } else if (fd==GGL_ONE) {   // R = S + D
+                component_add(temp, fb, fragment); // args order matters
+                component_sat(temp);
+            } else {                    // R = S + D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, component_t(fragment));
+                if (fd==GGL_ONE_MINUS_SRC_ALPHA) {
+                    // XXX: in theory this is not correct, we should
+                    // saturate here. However, this mode is often
+                    // used for displaying alpha-premultiplied graphics,
+                    // in which case, saturation is not necessary.
+                    // unfortunatelly, we have no way to know.
+                    // This is a case, where we sacrifice correctness for
+                    // performance. we should probably have some heuristics.
+                } else {
+                    component_sat(temp);
+                }
+            }
+        } else {
+            // compute fs
+            build_blend_factor(src_factor, fs, 
+                    component, pixel, fragment, fb, scratches);
+            if (fd==GGL_ZERO) {         // R = S*fs
+                mul_factor(temp, fragment, src_factor);
+            } else if (fd==GGL_ONE) {   // R = S*fs + D
+                mul_factor_add(temp, fragment, src_factor, component_t(fb));
+                component_sat(temp);
+            } else {                    // R = S*fs + D*fd
+                mul_factor(temp, fragment, src_factor);
+                if (scratches.isUsed(src_factor.reg))
+                    scratches.recycle(src_factor.reg);
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, temp);
+                if (!same_factor_opt1 && !same_factor_opt2) {
+                    component_sat(temp);
+                }
+            }
+        }
+    }
+
+    // now we can be corrupted (it's the dest)
+    temp.flags |= CORRUPTIBLE;
+}
+
+void GGLAssembler::build_blend_factor(
+        integer_t& factor, int f, int component,
+        const pixel_t& dst_pixel,
+        integer_t& fragment,
+        integer_t& fb,
+        Scratch& scratches)
+{
+    integer_t src_alpha(fragment);
+
+    // src_factor/dst_factor won't be used after blending,
+    // so it's fine to mark them as CORRUPTIBLE (if not aliased)
+    factor.flags |= CORRUPTIBLE;
+
+    switch(f) {
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) {
+            // we're processing alpha, so we already have
+            // src-alpha in fragment, and we need src-alpha just this time.
+        } else {
+           // alpha-src will be needed for other components
+            if (!mBlendFactorCached || mBlendFactorCached==f) {
+                src_alpha = mAlphaSource;
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                // we already computed the blend factor before, nothing to do.
+                if (mBlendFactorCached)
+                    return;
+                // this is the first time, make sure to compute the blend
+                // factor properly.
+                mBlendFactorCached = f;
+                break;
+            } else {
+                // we have a cached alpha blend factor, but we want another one,
+                // this should really not happen because by construction,
+                // we cannot have BOTH source and destination
+                // blend factors use ALPHA *and* ONE_MINUS_ALPHA (because
+                // the blending stage uses the f/(1-f) optimization
+                
+                // for completeness, we handle this case though. Since there
+                // are only 2 choices, this meens we want "the other one"
+                // (1-factor)
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+                mBlendFactorCached = f;
+                return;
+            }                
+        }
+        // fall-through...
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        // help us find out what register we can use for the blend-factor
+        // CORRUPTIBLE registers are chosen first, or a new one is allocated.
+        if (fragment.flags & CORRUPTIBLE) {
+            factor.setTo(fragment.reg, 32, CORRUPTIBLE);
+            fragment.flags &= ~CORRUPTIBLE;
+        } else if (fb.flags & CORRUPTIBLE) {
+            factor.setTo(fb.reg, 32, CORRUPTIBLE);
+            fb.flags &= ~CORRUPTIBLE;
+        } else {
+            factor.setTo(scratches.obtain(), 32, CORRUPTIBLE);
+        } 
+        break;
+    }
+
+    // XXX: doesn't work if size==1
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        factor.s = fb.s;
+        ADD(AL, 0, factor.reg, fb.reg, reg_imm(fb.reg, LSR, fb.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+        factor.s = fragment.s;
+        ADD(AL, 0, factor.reg, fragment.reg,
+            reg_imm(fragment.reg, LSR, fragment.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        factor.s = src_alpha.s;
+        ADD(AL, 0, factor.reg, src_alpha.reg,
+                reg_imm(src_alpha.reg, LSR, src_alpha.s-1));
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // XXX: should be precomputed
+        extract(factor, dst_pixel, GGLFormat::ALPHA);
+        ADD(AL, 0, factor.reg, factor.reg,
+                reg_imm(factor.reg, LSR, factor.s-1));
+        break;
+    case GGL_SRC_ALPHA_SATURATE:
+        // XXX: should be precomputed
+        // XXX: f = min(As, 1-Ad)
+        // btw, we're guaranteed that Ad's size is <= 8, because
+        // it's extracted from the framebuffer
+        break;
+    }
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+        RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+    }
+    
+    // don't need more than 8-bits for the blend factor
+    // and this will prevent overflows in the multiplies later
+    if (factor.s > 8) {
+        MOV(AL, 0, factor.reg, reg_imm(factor.reg, LSR, factor.s-8));
+        factor.s = 8;
+    }
+}
+
+int GGLAssembler::blending_codes(int fs, int fd)
+{
+    int blending = 0;
+    switch(fs) {
+    case GGL_ONE:
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // no need to extract 'component' from the destination
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    }
+    switch(fd) {
+    case GGL_ONE:
+        blending |= BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        // no need to extract 'component' from the source
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_DST;
+        break;
+    }
+    return blending;
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_blendFOneMinusF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*f + D*(1-f) = (S-D)*f + D
+    Scratch scratches(registerFile());
+    // compute S-D
+    integer_t diff(fragment.flags & CORRUPTIBLE ?
+            fragment.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                RSB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fb));
+}
+
+void GGLAssembler::build_blendOneMinusFF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*f + D*(1-f) = (S-D)*f + D
+    Scratch scratches(registerFile());
+    // compute D-S
+    integer_t diff(fb.flags & CORRUPTIBLE ?
+            fb.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                SUB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fragment));
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::mul_factor(  component_t& d,
+                                const integer_t& v,
+                                const integer_t& f)
+{
+    int vs = v.size();
+    int fs = f.size();
+    int ms = vs+fs;
+
+    // XXX: we could have special cases for 1 bit mul
+
+    // all this code below to use the best multiply instruction
+    // wrt the parameters size. We take advantage of the fact
+    // that the 16-bits multiplies allow a 16-bit shift
+    // The trick is that we just make sure that we have at least 8-bits
+    // per component (which is enough for a 8 bits display).
+
+    int xy;
+    int vshift = 0;
+    int fshift = 0;
+    int smulw = 0;
+
+    if (vs<16) {
+        if (fs<16) {
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 16;
+            xy = xyTB;
+        } else {
+            // eg: 15 * 18  ->  15 * 15
+            fshift = fs - 15;
+            ms -= fshift;
+            xy = xyBB;
+        }
+    } else if (GGL_BETWEEN(vs, 24, 31)) {
+        if (fs<16) {
+            ms -= 16;
+            xy = xyTB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 32;
+            xy = xyTT;
+        } else {
+            // eg: 24 * 18  ->  8 * 18
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = xyTB;
+        }
+    } else {
+        if (fs<16) {
+            // eg: 18 * 15  ->  15 * 15
+            vshift = vs - 15;
+            ms -= vshift;
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            // eg: 18 * 24  ->  15 * 8
+            vshift = vs - 15;
+            ms -= 16 + vshift;
+            xy = xyBT;
+        } else {
+            // eg: 18 * 18  ->  (15 * 18)>>16
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = yB;    //XXX SMULWB
+            smulw = 1;
+        }
+    }
+
+    LOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs);
+
+    int vreg = v.reg;
+    int freg = f.reg;
+    if (vshift) {
+        MOV(AL, 0, d.reg, reg_imm(vreg, LSR, vshift));
+        vreg = d.reg;
+    }
+    if (fshift) {
+        MOV(AL, 0, d.reg, reg_imm(vreg, LSR, fshift));
+        freg = d.reg;
+    }
+    if (smulw)  SMULW(AL, xy, d.reg, vreg, freg);
+    else        SMUL(AL, xy, d.reg, vreg, freg);
+
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = 0; 
+    } else {
+        d.l = fs; 
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::mul_factor_add(  component_t& d,
+                                    const integer_t& v,
+                                    const integer_t& f,
+                                    const component_t& a)
+{
+    // XXX: we could have special cases for 1 bit mul
+    Scratch scratches(registerFile());
+
+    int vs = v.size();
+    int fs = f.size();
+    int as = a.h;
+    int ms = vs+fs;
+
+    LOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as);
+
+    integer_t add(a.reg, a.h, a.flags);
+
+    // 'a' is a component_t but it is guaranteed to have
+    // its high bits set to 0. However in the dithering case,
+    // we can't get away with truncating the potentially bad bits
+    // so extraction is needed.
+
+   if ((mDithering) && (a.size() < ms)) {
+        // we need to expand a
+        if (!(a.flags & CORRUPTIBLE)) {
+            // ... but it's not corruptible, so we need to pick a
+            // temporary register.
+            // Try to uses the destination register first (it's likely
+            // to be usable, unless it aliases an input).
+            if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) {
+                add.reg = d.reg;
+            } else {
+                add.reg = scratches.obtain();
+            }
+        }
+        expand(add, a, ms); // extracts and expands
+        as = ms;
+    }
+
+    if (ms == as) {
+        if (vs<16 && fs<16) SMLABB(AL, d.reg, v.reg, f.reg, add.reg);
+        else                MLA(AL, 0, d.reg, v.reg, f.reg, add.reg);
+    } else {
+        int temp = d.reg;
+        if (temp == add.reg) {
+            // the mul will modify add.reg, we need an intermediary reg
+            if (v.flags & CORRUPTIBLE)      temp = v.reg;
+            else if (f.flags & CORRUPTIBLE) temp = f.reg;
+            else                            temp = scratches.obtain();
+        }
+
+        if (vs<16 && fs<16) SMULBB(AL, temp, v.reg, f.reg);
+        else                MUL(AL, 0, temp, v.reg, f.reg);
+
+        if (ms>as) {
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSL, ms-as));
+        } else if (ms<as) {
+            // not sure if we should expand the mul instead?
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSR, as-ms));
+        }
+    }
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = a.l; 
+    } else {
+        d.l = fs>a.l ? fs : a.l;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_add(component_t& d,
+        const integer_t& dst, const integer_t& src)
+{
+    // here we're guaranteed that fragment.size() >= fb.size()
+    const int shift = src.size() - dst.size();
+    if (!shift) {
+        ADD(AL, 0, d.reg, src.reg, dst.reg);
+    } else {
+        ADD(AL, 0, d.reg, src.reg, reg_imm(dst.reg, LSL, shift));
+    }
+
+    d.h = src.size();
+    if (mDithering) {
+        d.l = 0;
+    } else {
+        d.l = shift;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_sat(const component_t& v)
+{
+    const int one = ((1<<v.size())-1)<<v.l;
+    CMP(AL, v.reg, imm( 1<<v.h ));
+    if (isValidImmediate(one)) {
+        MOV(HS, 0, v.reg, imm( one ));
+    } else if (isValidImmediate(~one)) {
+        MVN(HS, 0, v.reg, imm( ~one ));
+    } else {
+        MOV(HS, 0, v.reg, imm( 1<<v.h ));
+        SUB(HS, 0, v.reg, v.reg, imm( 1<<v.l ));
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c
new file mode 100644
index 0000000..4676da0
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.c
@@ -0,0 +1,702 @@
+/*	$NetBSD: disassem.c,v 1.14 2003/03/27 16:58:36 mycroft Exp $	*/
+
+/*-
+ * Copyright (c) 1996 Mark Brinicombe.
+ * Copyright (c) 1996 Brini.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * RiscBSD kernel project
+ *
+ * db_disasm.c
+ *
+ * Kernel disassembler
+ *
+ * Created      : 10/02/96
+ *
+ * Structured after the sparc/sparc/db_disasm.c by David S. Miller &
+ * Paul Kranenburg
+ *
+ * This code is not complete. Not all instructions are disassembled.
+ */
+
+#include <sys/cdefs.h>
+//__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/arm/arm/disassem.c,v 1.2 2005/01/05 21:58:47 imp Exp $");
+#include <sys/param.h>
+#include <stdio.h>
+
+#include "disassem.h"
+#include "armreg.h"
+//#include <ddb/ddb.h>
+
+/*
+ * General instruction format
+ *
+ *	insn[cc][mod]	[operands]
+ *
+ * Those fields with an uppercase format code indicate that the field
+ * follows directly after the instruction before the separator i.e.
+ * they modify the instruction rather than just being an operand to
+ * the instruction. The only exception is the writeback flag which
+ * follows a operand.
+ *
+ *
+ * 2 - print Operand 2 of a data processing instruction
+ * d - destination register (bits 12-15)
+ * n - n register (bits 16-19)
+ * s - s register (bits 8-11)
+ * o - indirect register rn (bits 16-19) (used by swap)
+ * m - m register (bits 0-3)
+ * a - address operand of ldr/str instruction
+ * e - address operand of ldrh/strh instruction
+ * l - register list for ldm/stm instruction
+ * f - 1st fp operand (register) (bits 12-14)
+ * g - 2nd fp operand (register) (bits 16-18)
+ * h - 3rd fp operand (register/immediate) (bits 0-4)
+ * b - branch address
+ * t - thumb branch address (bits 24, 0-23)
+ * k - breakpoint comment (bits 0-3, 8-19)
+ * X - block transfer type
+ * Y - block transfer type (r13 base)
+ * c - comment field bits(0-23)
+ * p - saved or current status register
+ * F - PSR transfer fields
+ * D - destination-is-r15 (P) flag on TST, TEQ, CMP, CMN
+ * L - co-processor transfer size
+ * S - set status flag
+ * P - fp precision
+ * Q - fp precision (for ldf/stf)
+ * R - fp rounding
+ * v - co-processor data transfer registers + addressing mode
+ * W - writeback flag
+ * x - instruction in hex
+ * # - co-processor number
+ * y - co-processor data processing registers
+ * z - co-processor register transfer registers
+ */
+
+struct arm32_insn {
+	u_int mask;
+	u_int pattern;
+	char* name;
+	char* format;
+};
+
+static const struct arm32_insn arm32_i[] = {
+    { 0x0fffffff, 0x0ff00000, "imb",	"c" },		/* Before swi */
+    { 0x0fffffff, 0x0ff00001, "imbrange",	"c" },	/* Before swi */
+    { 0x0f000000, 0x0f000000, "swi",	"c" },
+    { 0xfe000000, 0xfa000000, "blx",	"t" },		/* Before b and bl */
+    { 0x0f000000, 0x0a000000, "b",	"b" },
+    { 0x0f000000, 0x0b000000, "bl",	"b" },
+    { 0x0fe000f0, 0x00000090, "mul",	"Snms" },
+    { 0x0fe000f0, 0x00200090, "mla",	"Snmsd" },
+    { 0x0fe000f0, 0x00800090, "umull",	"Sdnms" },
+    { 0x0fe000f0, 0x00c00090, "smull",	"Sdnms" },
+    { 0x0fe000f0, 0x00a00090, "umlal",	"Sdnms" },
+    { 0x0fe000f0, 0x00e00090, "smlal",	"Sdnms" },
+    { 0x0d700000, 0x04200000, "strt",	"daW" },
+    { 0x0d700000, 0x04300000, "ldrt",	"daW" },
+    { 0x0d700000, 0x04600000, "strbt",	"daW" },
+    { 0x0d700000, 0x04700000, "ldrbt",	"daW" },
+    { 0x0c500000, 0x04000000, "str",	"daW" },
+    { 0x0c500000, 0x04100000, "ldr",	"daW" },
+    { 0x0c500000, 0x04400000, "strb",	"daW" },
+    { 0x0c500000, 0x04500000, "ldrb",	"daW" },
+    { 0x0e1f0000, 0x080d0000, "stm",	"YnWl" },/* separate out r13 base */
+    { 0x0e1f0000, 0x081d0000, "ldm",	"YnWl" },/* separate out r13 base */    
+    { 0x0e100000, 0x08000000, "stm",	"XnWl" },
+    { 0x0e100000, 0x08100000, "ldm",	"XnWl" },    
+    { 0x0e1000f0, 0x00100090, "ldrb",	"deW" },
+    { 0x0e1000f0, 0x00000090, "strb",	"deW" },
+    { 0x0e1000f0, 0x001000d0, "ldrsb",	"deW" },
+    { 0x0e1000f0, 0x001000b0, "ldrh",	"deW" },
+    { 0x0e1000f0, 0x000000b0, "strh",	"deW" },
+    { 0x0e1000f0, 0x001000f0, "ldrsh",	"deW" },
+    { 0x0f200090, 0x00200090, "und",	"x" },	/* Before data processing */
+    { 0x0e1000d0, 0x000000d0, "und",	"x" },	/* Before data processing */
+    { 0x0ff00ff0, 0x01000090, "swp",	"dmo" },
+    { 0x0ff00ff0, 0x01400090, "swpb",	"dmo" },
+    { 0x0fbf0fff, 0x010f0000, "mrs",	"dp" },	/* Before data processing */
+    { 0x0fb0fff0, 0x0120f000, "msr",	"pFm" },/* Before data processing */
+    { 0x0fb0f000, 0x0320f000, "msr",	"pF2" },/* Before data processing */
+    { 0x0ffffff0, 0x012fff10, "bx",     "m" },
+    { 0x0fff0ff0, 0x016f0f10, "clz",	"dm" },
+    { 0x0ffffff0, 0x012fff30, "blx",	"m" },
+    { 0xfff000f0, 0xe1200070, "bkpt",	"k" },
+    { 0x0de00000, 0x00000000, "and",	"Sdn2" },
+    { 0x0de00000, 0x00200000, "eor",	"Sdn2" },
+    { 0x0de00000, 0x00400000, "sub",	"Sdn2" },
+    { 0x0de00000, 0x00600000, "rsb",	"Sdn2" },
+    { 0x0de00000, 0x00800000, "add",	"Sdn2" },
+    { 0x0de00000, 0x00a00000, "adc",	"Sdn2" },
+    { 0x0de00000, 0x00c00000, "sbc",	"Sdn2" },
+    { 0x0de00000, 0x00e00000, "rsc",	"Sdn2" },
+    { 0x0df00000, 0x01100000, "tst",	"Dn2" },
+    { 0x0df00000, 0x01300000, "teq",	"Dn2" },
+    { 0x0df00000, 0x01500000, "cmp",	"Dn2" },
+    { 0x0df00000, 0x01700000, "cmn",	"Dn2" },
+    { 0x0de00000, 0x01800000, "orr",	"Sdn2" },
+    { 0x0de00000, 0x01a00000, "mov",	"Sd2" },
+    { 0x0de00000, 0x01c00000, "bic",	"Sdn2" },
+    { 0x0de00000, 0x01e00000, "mvn",	"Sd2" },
+    { 0x0ff08f10, 0x0e000100, "adf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e100100, "muf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e200100, "suf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e300100, "rsf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e400100, "dvf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e500100, "rdf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e600100, "pow",	"PRfgh" },
+    { 0x0ff08f10, 0x0e700100, "rpw",	"PRfgh" },
+    { 0x0ff08f10, 0x0e800100, "rmf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e900100, "fml",	"PRfgh" },
+    { 0x0ff08f10, 0x0ea00100, "fdv",	"PRfgh" },
+    { 0x0ff08f10, 0x0eb00100, "frd",	"PRfgh" },
+    { 0x0ff08f10, 0x0ec00100, "pol",	"PRfgh" },
+    { 0x0f008f10, 0x0e000100, "fpbop",	"PRfgh" },
+    { 0x0ff08f10, 0x0e008100, "mvf",	"PRfh" },
+    { 0x0ff08f10, 0x0e108100, "mnf",	"PRfh" },
+    { 0x0ff08f10, 0x0e208100, "abs",	"PRfh" },
+    { 0x0ff08f10, 0x0e308100, "rnd",	"PRfh" },
+    { 0x0ff08f10, 0x0e408100, "sqt",	"PRfh" },
+    { 0x0ff08f10, 0x0e508100, "log",	"PRfh" },
+    { 0x0ff08f10, 0x0e608100, "lgn",	"PRfh" },
+    { 0x0ff08f10, 0x0e708100, "exp",	"PRfh" },
+    { 0x0ff08f10, 0x0e808100, "sin",	"PRfh" },
+    { 0x0ff08f10, 0x0e908100, "cos",	"PRfh" },
+    { 0x0ff08f10, 0x0ea08100, "tan",	"PRfh" },
+    { 0x0ff08f10, 0x0eb08100, "asn",	"PRfh" },
+    { 0x0ff08f10, 0x0ec08100, "acs",	"PRfh" },
+    { 0x0ff08f10, 0x0ed08100, "atn",	"PRfh" },
+    { 0x0f008f10, 0x0e008100, "fpuop",	"PRfh" },
+    { 0x0e100f00, 0x0c000100, "stf",	"QLv" },
+    { 0x0e100f00, 0x0c100100, "ldf",	"QLv" },
+    { 0x0ff00f10, 0x0e000110, "flt",	"PRgd" },
+    { 0x0ff00f10, 0x0e100110, "fix",	"PRdh" },
+    { 0x0ff00f10, 0x0e200110, "wfs",	"d" },
+    { 0x0ff00f10, 0x0e300110, "rfs",	"d" },
+    { 0x0ff00f10, 0x0e400110, "wfc",	"d" },
+    { 0x0ff00f10, 0x0e500110, "rfc",	"d" },
+    { 0x0ff0ff10, 0x0e90f110, "cmf",	"PRgh" },
+    { 0x0ff0ff10, 0x0eb0f110, "cnf",	"PRgh" },
+    { 0x0ff0ff10, 0x0ed0f110, "cmfe",	"PRgh" },
+    { 0x0ff0ff10, 0x0ef0f110, "cnfe",	"PRgh" },
+    { 0xff100010, 0xfe000010, "mcr2",	"#z" },
+    { 0x0f100010, 0x0e000010, "mcr",	"#z" },
+    { 0xff100010, 0xfe100010, "mrc2",	"#z" },
+    { 0x0f100010, 0x0e100010, "mrc",	"#z" },
+    { 0xff000010, 0xfe000000, "cdp2",	"#y" },
+    { 0x0f000010, 0x0e000000, "cdp",	"#y" },
+    { 0xfe100090, 0xfc100000, "ldc2",	"L#v" },
+    { 0x0e100090, 0x0c100000, "ldc",	"L#v" },
+    { 0xfe100090, 0xfc000000, "stc2",	"L#v" },
+    { 0x0e100090, 0x0c000000, "stc",	"L#v" },
+    { 0xf550f000, 0xf550f000, "pld",	"ne" },
+    { 0x0ff00ff0, 0x01000050, "qaad",	"dmn" },
+    { 0x0ff00ff0, 0x01400050, "qdaad",	"dmn" },
+    { 0x0ff00ff0, 0x01600050, "qdsub",	"dmn" },
+    { 0x0ff00ff0, 0x01200050, "dsub",	"dmn" },
+    { 0x0ff000f0, 0x01000080, "smlabb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000a0, "smlatb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000c0, "smlabt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000e0, "smlatt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01400080, "smlalbb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000a0, "smlaltb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000c0, "smlalbt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000e0, "smlaltt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01200080, "smlawb", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000a0, "smulwb","nms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x012000c0, "smlawt", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000e0, "smulwt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x01600080, "smulbb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000a0, "smultb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000c0, "smulbt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000e0, "smultt","nms" },   // d & n inverted!!
+    { 0x00000000, 0x00000000, NULL,	NULL }
+};
+
+static char const arm32_insn_conditions[][4] = {
+	"eq", "ne", "cs", "cc",
+	"mi", "pl", "vs", "vc",
+	"hi", "ls", "ge", "lt",
+	"gt", "le", "",   "nv"
+};
+
+static char const insn_block_transfers[][4] = {
+	"da", "ia", "db", "ib"
+};
+
+static char const insn_stack_block_transfers[][4] = {
+	"ed", "ea", "fd", "fa"
+};
+
+static char const op_shifts[][4] = {
+	"lsl", "lsr", "asr", "ror"
+};
+
+static char const insn_fpa_rounding[][2] = {
+	"", "p", "m", "z"
+};
+
+static char const insn_fpa_precision[][2] = {
+	"s", "d", "e", "p"
+};
+
+static char const insn_fpaconstants[][8] = {
+	"0.0", "1.0", "2.0", "3.0",
+	"4.0", "5.0", "0.5", "10.0"
+};
+
+#define insn_condition(x)	arm32_insn_conditions[(x >> 28) & 0x0f]
+#define insn_blktrans(x)	insn_block_transfers[(x >> 23) & 3]
+#define insn_stkblktrans(x)	insn_stack_block_transfers[(x >> 23) & 3]
+#define op2_shift(x)		op_shifts[(x >> 5) & 3]
+#define insn_fparnd(x)		insn_fpa_rounding[(x >> 5) & 0x03]
+#define insn_fpaprec(x)		insn_fpa_precision[(((x >> 18) & 2)|(x >> 7)) & 1]
+#define insn_fpaprect(x)	insn_fpa_precision[(((x >> 21) & 2)|(x >> 15)) & 1]
+#define insn_fpaimm(x)		insn_fpaconstants[x & 0x07]
+
+/* Local prototypes */
+static void disasm_register_shift(const disasm_interface_t *di, u_int insn);
+static void disasm_print_reglist(const disasm_interface_t *di, u_int insn);
+static void disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static u_int disassemble_readword(u_int address);
+static void disassemble_printaddr(u_int address);
+
+u_int
+disasm(const disasm_interface_t *di, u_int loc, int altfmt)
+{
+	const struct arm32_insn *i_ptr = &arm32_i[0];
+
+	u_int insn;
+	int matchp;
+	int branch;
+	char* f_ptr;
+	int fmt;
+
+	fmt = 0;
+	matchp = 0;
+	insn = di->di_readword(loc);
+
+/*	di->di_printf("loc=%08x insn=%08x : ", loc, insn);*/
+
+	while (i_ptr->name) {
+		if ((insn & i_ptr->mask) ==  i_ptr->pattern) {
+			matchp = 1;
+			break;
+		}
+		i_ptr++;
+	}
+
+	if (!matchp) {
+		di->di_printf("und%s\t%08x\n", insn_condition(insn), insn);
+		return(loc + INSN_SIZE);
+	}
+
+	/* If instruction forces condition code, don't print it. */
+	if ((i_ptr->mask & 0xf0000000) == 0xf0000000)
+		di->di_printf("%s", i_ptr->name);
+	else
+		di->di_printf("%s%s", i_ptr->name, insn_condition(insn));
+
+	f_ptr = i_ptr->format;
+
+	/* Insert tab if there are no instruction modifiers */
+
+	if (*(f_ptr) < 'A' || *(f_ptr) > 'Z') {
+		++fmt;
+		di->di_printf("\t");
+	}
+
+	while (*f_ptr) {
+		switch (*f_ptr) {
+		/* 2 - print Operand 2 of a data processing instruction */
+		case '2':
+			if (insn & 0x02000000) {
+				int rotate= ((insn >> 7) & 0x1e);
+
+				di->di_printf("#0x%08x",
+					      (insn & 0xff) << (32 - rotate) |
+					      (insn & 0xff) >> rotate);
+			} else {  
+				disasm_register_shift(di, insn);
+			}
+			break;
+		/* d - destination register (bits 12-15) */
+		case 'd':
+			di->di_printf("r%d", ((insn >> 12) & 0x0f));
+			break;
+		/* D - insert 'p' if Rd is R15 */
+		case 'D':
+			if (((insn >> 12) & 0x0f) == 15)
+				di->di_printf("p");
+			break;
+		/* n - n register (bits 16-19) */
+		case 'n':
+			di->di_printf("r%d", ((insn >> 16) & 0x0f));
+			break;
+		/* s - s register (bits 8-11) */
+		case 's':
+			di->di_printf("r%d", ((insn >> 8) & 0x0f));
+			break;
+		/* o - indirect register rn (bits 16-19) (used by swap) */
+		case 'o':
+			di->di_printf("[r%d]", ((insn >> 16) & 0x0f));
+			break;
+		/* m - m register (bits 0-4) */
+		case 'm':
+			di->di_printf("r%d", ((insn >> 0) & 0x0f));
+			break;
+		/* a - address operand of ldr/str instruction */
+		case 'a':
+			disasm_insn_ldrstr(di, insn, loc);
+			break;
+		/* e - address operand of ldrh/strh instruction */
+		case 'e':
+			disasm_insn_ldrhstrh(di, insn, loc);
+			break;
+		/* l - register list for ldm/stm instruction */
+		case 'l':
+			disasm_print_reglist(di, insn);
+			break;
+		/* f - 1st fp operand (register) (bits 12-14) */
+		case 'f':
+			di->di_printf("f%d", (insn >> 12) & 7);
+			break;
+		/* g - 2nd fp operand (register) (bits 16-18) */
+		case 'g':
+			di->di_printf("f%d", (insn >> 16) & 7);
+			break;
+		/* h - 3rd fp operand (register/immediate) (bits 0-4) */
+		case 'h':
+			if (insn & (1 << 3))
+				di->di_printf("#%s", insn_fpaimm(insn));
+			else
+				di->di_printf("f%d", insn & 7);
+			break;
+		/* b - branch address */
+		case 'b':
+			branch = ((insn << 2) & 0x03ffffff);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* t - blx address */
+		case 't':
+			branch = ((insn << 2) & 0x03ffffff) |
+			    (insn >> 23 & 0x00000002);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* X - block transfer type */
+		case 'X':
+			di->di_printf("%s", insn_blktrans(insn));
+			break;
+		/* Y - block transfer type (r13 base) */
+		case 'Y':
+			di->di_printf("%s", insn_stkblktrans(insn));
+			break;
+		/* c - comment field bits(0-23) */
+		case 'c':
+			di->di_printf("0x%08x", (insn & 0x00ffffff));
+			break;
+		/* k - breakpoint comment (bits 0-3, 8-19) */
+		case 'k':
+			di->di_printf("0x%04x",
+			    (insn & 0x000fff00) >> 4 | (insn & 0x0000000f));
+			break;
+		/* p - saved or current status register */
+		case 'p':
+			if (insn & 0x00400000)
+				di->di_printf("spsr");
+			else
+				di->di_printf("cpsr");
+			break;
+		/* F - PSR transfer fields */
+		case 'F':
+			di->di_printf("_");
+			if (insn & (1 << 16))
+				di->di_printf("c");
+			if (insn & (1 << 17))
+				di->di_printf("x");
+			if (insn & (1 << 18))
+				di->di_printf("s");
+			if (insn & (1 << 19))
+				di->di_printf("f");
+			break;
+		/* B - byte transfer flag */
+		case 'B':
+			if (insn & 0x00400000)
+				di->di_printf("b");
+			break;
+		/* L - co-processor transfer size */
+		case 'L':
+			if (insn & (1 << 22))
+				di->di_printf("l");
+			break;
+		/* S - set status flag */
+		case 'S':
+			if (insn & 0x00100000)
+				di->di_printf("s");
+			break;
+		/* P - fp precision */
+		case 'P':
+			di->di_printf("%s", insn_fpaprec(insn));
+			break;
+		/* Q - fp precision (for ldf/stf) */
+		case 'Q':
+			break;
+		/* R - fp rounding */
+		case 'R':
+			di->di_printf("%s", insn_fparnd(insn));
+			break;
+		/* W - writeback flag */
+		case 'W':
+			if (insn & (1 << 21))
+				di->di_printf("!");
+			break;
+		/* # - co-processor number */
+		case '#':
+			di->di_printf("p%d", (insn >> 8) & 0x0f);
+			break;
+		/* v - co-processor data transfer registers+addressing mode */
+		case 'v':
+			disasm_insn_ldcstc(di, insn, loc);
+			break;
+		/* x - instruction in hex */
+		case 'x':
+			di->di_printf("0x%08x", insn);
+			break;
+		/* y - co-processor data processing registers */
+		case 'y':
+			di->di_printf("%d, ", (insn >> 20) & 0x0f);
+
+			di->di_printf("c%d, c%d, c%d", (insn >> 12) & 0x0f,
+			    (insn >> 16) & 0x0f, insn & 0x0f);
+
+			di->di_printf(", %d", (insn >> 5) & 0x07);
+			break;
+		/* z - co-processor register transfer registers */
+		case 'z':
+			di->di_printf("%d, ", (insn >> 21) & 0x07);
+			di->di_printf("r%d, c%d, c%d, %d",
+			    (insn >> 12) & 0x0f, (insn >> 16) & 0x0f,
+			    insn & 0x0f, (insn >> 5) & 0x07);
+
+/*			if (((insn >> 5) & 0x07) != 0)
+				di->di_printf(", %d", (insn >> 5) & 0x07);*/
+			break;
+		default:
+			di->di_printf("[%c - unknown]", *f_ptr);
+			break;
+		}
+		if (*(f_ptr+1) >= 'A' && *(f_ptr+1) <= 'Z')
+			++f_ptr;
+		else if (*(++f_ptr)) {
+			++fmt;
+			if (fmt == 1)
+				di->di_printf("\t");
+			else
+				di->di_printf(", ");
+		}
+	};
+
+	di->di_printf("\n");
+
+	return(loc + INSN_SIZE);
+}
+
+
+static void
+disasm_register_shift(const disasm_interface_t *di, u_int insn)
+{
+	di->di_printf("r%d", (insn & 0x0f));
+	if ((insn & 0x00000ff0) == 0)
+		;
+	else if ((insn & 0x00000ff0) == 0x00000060)
+		di->di_printf(", rrx");
+	else {
+		if (insn & 0x10)
+			di->di_printf(", %s r%d", op2_shift(insn),
+			    (insn >> 8) & 0x0f);
+		else
+			di->di_printf(", %s #%d", op2_shift(insn),
+			    (insn >> 7) & 0x1f);
+	}
+}
+
+
+static void
+disasm_print_reglist(const disasm_interface_t *di, u_int insn)
+{
+	int loop;
+	int start;
+	int comma;
+
+	di->di_printf("{");
+	start = -1;
+	comma = 0;
+
+	for (loop = 0; loop < 17; ++loop) {
+		if (start != -1) {
+			if (loop == 16 || !(insn & (1 << loop))) {
+				if (comma)
+					di->di_printf(", ");
+				else
+					comma = 1;
+        			if (start == loop - 1)
+        				di->di_printf("r%d", start);
+        			else
+        				di->di_printf("r%d-r%d", start, loop - 1);
+        			start = -1;
+        		}
+        	} else {
+        		if (insn & (1 << loop))
+        			start = loop;
+        	}
+        }
+	di->di_printf("}");
+
+	if (insn & (1 << 22))
+		di->di_printf("^");
+}
+
+static void
+disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = insn & 0xfff;
+	if ((insn & 0x032f0000) == 0x010f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x03000fff) != 0x01000000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 25))
+				disasm_register_shift(di, insn);
+			else
+				di->di_printf("#0x%03x", offset);
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = ((insn & 0xf00) >> 4) | (insn & 0xf);
+	if ((insn & 0x004f0000) == 0x004f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x01400f0f) != 0x01400000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 22))
+				di->di_printf("#0x%02x", offset);
+			else
+				di->di_printf("r%d", (insn & 0x0f));
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	if (((insn >> 8) & 0xf) == 1)
+		di->di_printf("f%d, ", (insn >> 12) & 0x07);
+	else
+		di->di_printf("c%d, ", (insn >> 12) & 0x0f);
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+
+	di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+
+	if (!(insn & (1 << 23)))
+		di->di_printf("-");
+
+	di->di_printf("#0x%03x", (insn & 0xff) << 2);
+
+	if (insn & (1 << 24))
+		di->di_printf("]");
+
+	if (insn & (1 << 21))
+		di->di_printf("!");
+}
+
+static u_int
+disassemble_readword(u_int address)
+{
+	return(*((u_int *)address));
+}
+
+static void
+disassemble_printaddr(u_int address)
+{
+	printf("0x%08x", address);
+}
+
+static const disasm_interface_t disassemble_di = {
+	disassemble_readword, disassemble_printaddr, printf
+};
+
+void
+disassemble(u_int address)
+{
+
+	(void)disasm(&disassemble_di, address, 0);
+}
+
+/* End of disassem.c */
diff --git a/libpixelflinger/codeflinger/disassem.h b/libpixelflinger/codeflinger/disassem.h
new file mode 100644
index 0000000..02747cd
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.h
@@ -0,0 +1,65 @@
+/*	$NetBSD: disassem.h,v 1.4 2001/03/04 04:15:58 matt Exp $	*/
+
+/*-
+ * Copyright (c) 1997 Mark Brinicombe.
+ * Copyright (c) 1997 Causality Limited.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Mark Brinicombe.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Define the interface structure required by the disassembler.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/disassem.h,v 1.2 2005/01/05 21:58:48 imp Exp $
+ */
+
+#ifndef ANDROID_MACHINE_DISASSEM_H
+#define ANDROID_MACHINE_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+	u_int	(*di_readword)(u_int);
+	void	(*di_printaddr)(u_int);	
+	void	(*di_printf)(const char *, ...);
+} disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+u_int disasm(const disasm_interface_t *, u_int, int);
+void disassemble(u_int);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MACHINE_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/load_store.cpp b/libpixelflinger/codeflinger/load_store.cpp
new file mode 100644
index 0000000..93c5825
--- /dev/null
+++ b/libpixelflinger/codeflinger/load_store.cpp
@@ -0,0 +1,378 @@
+/* libs/pixelflinger/codeflinger/load_store.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+void GGLAssembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{    
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    STR(AL, s.reg, addr.reg, immed12_post(4));
+        else        STR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24 bits formats are a little special and used only for RGB
+        // 0x00BBGGRR is unpacked as R,G,B
+        STRB(AL, s.reg, addr.reg, immed12_pre(0));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(1));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(2));
+        if (!(s.flags & CORRUPTIBLE)) {
+            MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;
+    case 16:
+        if (inc)    STRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        STRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    STRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        STRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
+
+void GGLAssembler::load(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{    
+    Scratch scratches(registerFile());    
+    int s0;
+
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    LDR(AL, s.reg, addr.reg, immed12_post(4));
+        else        LDR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24 bits formats are a little special and used only for RGB
+        // R,G,B is packed as 0x00BBGGRR 
+        s0 = scratches.obtain();
+        if (s.reg != addr.reg) {
+            LDRB(AL, s.reg, addr.reg, immed12_pre(0));      // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 16));
+        } else {
+            int s1 = scratches.obtain();
+            LDRB(AL, s1, addr.reg, immed12_pre(0));         // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s1, s1, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s1, reg_imm(s0, LSL, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;        
+    case 16:
+        if (inc)    LDRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        LDRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    LDRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        LDRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
+
+void GGLAssembler::extract(integer_t& d, int s, int h, int l, int bits)
+{
+    const int maskLen = h-l;
+
+    assert(maskLen<=8);
+    assert(h);
+    
+    if (h != bits) {
+        const int mask = ((1<<maskLen)-1) << l;
+        if (isValidImmediate(mask)) {
+            AND(AL, 0, d.reg, s, imm(mask));    // component = packed & mask;
+        } else if (isValidImmediate(~mask)) {
+            BIC(AL, 0, d.reg, s, imm(~mask));   // component = packed & mask;
+        } else {
+            MOV(AL, 0, d.reg, reg_imm(s, LSL, 32-h));
+            l += 32-h;
+            h = 32;
+        }
+        s = d.reg;
+    }
+    
+    if (l) {
+        MOV(AL, 0, d.reg, reg_imm(s, LSR, l));  // component = packed >> l;
+        s = d.reg;
+    }
+    
+    if (s != d.reg) {
+        MOV(AL, 0, d.reg, s);
+    }
+
+    d.s = maskLen;
+}
+
+void GGLAssembler::extract(integer_t& d, const pixel_t& s, int component)
+{
+    extract(d,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+}
+
+void GGLAssembler::extract(component_t& d, const pixel_t& s, int component)
+{
+    integer_t r(d.reg, 32, d.flags);
+    extract(r,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+    d = component_t(r);
+}
+
+
+void GGLAssembler::expand(integer_t& d, const component_t& s, int dbits)
+{
+    if (s.l || (s.flags & CLEAR_HI)) {
+        extract(d, s.reg, s.h, s.l, 32);
+        expand(d, d, dbits);
+    } else {
+        expand(d, integer_t(s.reg, s.size(), s.flags), dbits);
+    }
+}
+
+void GGLAssembler::expand(component_t& d, const component_t& s, int dbits)
+{
+    integer_t r(d.reg, 32, d.flags);
+    expand(r, s, dbits);
+    d = component_t(r);
+}
+
+void GGLAssembler::expand(integer_t& dst, const integer_t& src, int dbits)
+{
+    assert(src.size());
+
+    int sbits = src.size();
+    int s = src.reg;
+    int d = dst.reg;
+
+    // be sure to set 'dst' after we read 'src' as they may be identical
+    dst.s = dbits;
+    dst.flags = 0;
+
+    if (dbits<=sbits) {
+        if (s != d) {
+            MOV(AL, 0, d, s);
+        }
+        return;
+    }
+
+    if (sbits == 1) {
+        RSB(AL, 0, d, s, reg_imm(s, LSL, dbits));
+            // d = (s<<dbits) - s;
+        return;
+    }
+
+    if (dbits % sbits) {
+        MOV(AL, 0, d, reg_imm(s, LSL, dbits-sbits));
+            // d = s << (dbits-sbits);
+        dbits -= sbits;
+        do {
+            ORR(AL, 0, d, d, reg_imm(d, LSR, sbits));
+                // d |= d >> sbits;
+            dbits -= sbits;
+            sbits *= 2;
+        } while(dbits>0);
+        return;
+    }
+    
+    dbits -= sbits;
+    do {
+        ORR(AL, 0, d, s, reg_imm(s, LSL, sbits));
+            // d |= d<<sbits;
+        s = d;        
+        dbits -= sbits;
+        if (sbits*2 < dbits) {
+            sbits *= 2;
+        }
+    } while(dbits>0);
+}
+
+void GGLAssembler::downshift(
+        pixel_t& d, int component, component_t s, const reg_t& dither)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    Scratch scratches(registerFile());
+
+    int sh = s.h;
+    int sl = s.l;
+    int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0;
+    int maskLoBits = (sl!=0)  ? ((s.flags & CLEAR_LO)?1:0) : 0;
+    int sbits = sh - sl;
+
+    int dh = d.format.c[component].h;
+    int dl = d.format.c[component].l;
+    int dbits = dh - dl;
+    int dithering = 0;
+    
+    LOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits);
+
+    if (sbits>dbits) {
+        // see if we need to dither
+        dithering = mDithering;
+    }
+    
+    int ireg = d.reg;
+    if (!(d.flags & FIRST)) {
+        if (s.flags & CORRUPTIBLE)  {
+            ireg = s.reg;
+        } else {
+            ireg = scratches.obtain();
+        }
+    }
+    d.flags &= ~FIRST;
+
+    if (maskHiBits) {
+        // we need to mask the high bits (and possibly the lowbits too)
+        // and we might be able to use immediate mask.
+        if (!dithering) {
+            // we don't do this if we only have maskLoBits because we can
+            // do it more efficiently below (in the case where dl=0)
+            const int offset = sh - dbits;
+            if (dbits<=8 && offset >= 0) {
+                const uint32_t mask = ((1<<dbits)-1) << offset;
+                if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                    build_and_immediate(ireg, s.reg, mask, 32);
+                    sl = offset;
+                    s.reg = ireg; 
+                    sbits = dbits;
+                    maskLoBits = maskHiBits = 0;
+                }
+            }
+        } else {
+            // in the dithering case though, we need to preserve the lower bits
+            const uint32_t mask = ((1<<sbits)-1) << sl;
+            if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                build_and_immediate(ireg, s.reg, mask, 32);
+                s.reg = ireg; 
+                maskLoBits = maskHiBits = 0;
+            }
+        }
+    }
+
+    // XXX: we could special case (maskHiBits & !maskLoBits)
+    // like we do for maskLoBits below, but it happens very rarely
+    // that we have maskHiBits only and the conditions necessary to lead
+    // to better code (like doing d |= s << 24)
+
+    if (maskHiBits) {
+        MOV(AL, 0, ireg, reg_imm(s.reg, LSL, 32-sh));
+        sl += 32-sh;
+        sh = 32;
+        s.reg = ireg;
+        maskHiBits = 0;
+    }
+
+    //	Downsampling should be performed as follows:
+    //  V * ((1<<dbits)-1) / ((1<<sbits)-1)
+    //	V * [(1<<dbits)/((1<<sbits)-1)	-	1/((1<<sbits)-1)]
+    //	V * [1/((1<<sbits)-1)>>dbits	-	1/((1<<sbits)-1)]
+    //	V/((1<<(sbits-dbits))-(1>>dbits))	-	(V>>sbits)/((1<<sbits)-1)>>sbits
+    //	V/((1<<(sbits-dbits))-(1>>dbits))	-	(V>>sbits)/(1-(1>>sbits))
+    //
+    //	By approximating (1>>dbits) and (1>>sbits) to 0:
+    //
+    //		V>>(sbits-dbits)	-	V>>sbits
+    //
+	//  A good approximation is V>>(sbits-dbits),
+    //  but better one (needed for dithering) is:
+    //
+    //		(V>>(sbits-dbits)<<sbits	-	V)>>sbits
+    //		(V<<dbits	-	V)>>sbits
+    //		(V	-	V>>dbits)>>(sbits-dbits)
+
+    // Dithering is done here
+    if (dithering) {
+        comment("dithering");
+        if (sl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, sl));
+            sh -= sl;
+            sl = 0;
+            s.reg = ireg; 
+        }
+        // scaling (V-V>>dbits)
+        SUB(AL, 0, ireg, s.reg, reg_imm(s.reg, LSR, dbits));
+        const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+        if (shift>0)        ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSR, shift));
+        else if (shift<0)   ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSL,-shift));
+        else                ADD(AL, 0, ireg, ireg, dither.reg);
+        s.reg = ireg; 
+    }
+
+    if ((maskLoBits|dithering) && (sh > dbits)) {
+        int shift = sh-dbits;
+        if (dl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, shift));
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(ireg, LSL, dl));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(ireg, LSL, dl));
+            }
+        } else {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        }
+    } else {
+        int shift = sh-dh;
+        if (shift>0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        } else if (shift<0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSL, -shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSL, -shift));
+            }
+        } else {
+            if (ireg == d.reg) {
+                if (s.reg != d.reg) {
+                    MOV(AL, 0, d.reg, s.reg);
+                }
+            } else {
+                ORR(AL, 0, d.reg, d.reg, s.reg);
+            }
+        }
+    }
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp
new file mode 100644
index 0000000..90e6584
--- /dev/null
+++ b/libpixelflinger/codeflinger/texturing.cpp
@@ -0,0 +1,1251 @@
+/* libs/pixelflinger/codeflinger/texturing.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+
+namespace android {
+
+// ---------------------------------------------------------------------------
+
+// iterators are initialized like this:
+// (intToFixedCenter(x) * dx)>>16 + x0
+// ((x<<16 + 0x8000) * dx)>>16 + x0
+// ((x<<16)*dx + (0x8000*dx))>>16 + x0
+// ( (x*dx) + dx>>1 ) + x0
+// (x*dx) + (dx>>1 + x0)
+
+void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+
+    if (mSmooth) {
+        // NOTE: we could take this case in the mDithering + !mSmooth case,
+        // but this would use up to 4 more registers for the color components
+        // for only a little added quality.
+        // Currently, this causes the system to run out of registers in
+        // some case (see issue #719496)
+
+        comment("compute initial iterated color (smooth and/or dither case)");
+
+        parts.iterated_packed = 0;
+        parts.packed = 0;
+
+        // 0x1: color component
+        // 0x2: iterators
+        const int optReload = mOptLevel >> 1;
+        if (optReload >= 3)         parts.reload = 0; // reload nothing
+        else if (optReload == 2)    parts.reload = 2; // reload iterators
+        else if (optReload == 1)    parts.reload = 1; // reload colors
+        else if (optReload <= 0)    parts.reload = 3; // reload both
+
+        if (!mSmooth) {
+            // we're not smoothing (just dithering), we never have to 
+            // reload the iterators
+            parts.reload &= ~2;
+        }
+
+        Scratch scratches(registerFile());
+        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
+        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated)
+                continue;            
+            
+            // this component exists in the destination and is not replaced
+            // by a texture unit.
+            const int c = (parts.reload & 1) ? t0 : obtainReg();              
+            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
+            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
+            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
+            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
+            parts.argb[i].reg = c;
+
+            if (mInfo[i].smooth) {
+                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
+                const int dvdx = parts.argb_dx[i].reg;
+                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
+                MLA(AL, 0, c, x.reg, dvdx, c);
+                
+                // adjust the color iterator to make sure it won't overflow
+                if (!mAA) {
+                    // this is not needed when we're using anti-aliasing
+                    // because we will (have to) clamp the components
+                    // anyway.
+                    int end = scratches.obtain();
+                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
+                    MLA(AL, 1, end, dvdx, end, c);
+                    SUB(MI, 0, c, c, end);
+                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31)); 
+                    scratches.recycle(end);
+                }
+            }
+            
+            if (parts.reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+            }
+        }
+    } else {
+        // We're not smoothed, so we can 
+        // just use a packed version of the color and extract the
+        // components as needed (or not at all if we don't blend)
+
+        // figure out if we need the iterated color
+        int load = 0;
+        for (int i=0 ; i<4 ; i++) {
+            component_info_t& info = mInfo[i];
+            if ((info.inDest || info.needed) && !info.replaced)
+                load |= 1;
+        }
+        
+        parts.iterated_packed = 1;
+        parts.packed = (!mTextureMachine.mask && !mBlending
+                && !mFog && !mDithering);
+        parts.reload = 0;
+        if (load || parts.packed) {
+            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
+                comment("load initial iterated color (8888 packed)");
+                parts.iterated.setTo(obtainReg(),
+                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
+                CONTEXT_LOAD(parts.iterated.reg, packed8888);
+            } else {
+                comment("load initial iterated color (dest format packed)");
+
+                parts.iterated.setTo(obtainReg(), &mCbFormat);
+
+                // pre-mask the iterated color
+                const int bits = parts.iterated.size();
+                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+                uint32_t mask = 0;
+                if (mMasking) {
+                    for (int i=0 ; i<4 ; i++) {
+                        const int component_mask = 1<<i;
+                        const int h = parts.iterated.format.c[i].h;
+                        const int l = parts.iterated.format.c[i].l;
+                        if (h && (!(mMasking & component_mask))) {
+                            mask |= ((1<<(h-l))-1) << l;
+                        }
+                    }
+                }
+
+                if (mMasking && ((mask & size)==0)) {
+                    // none of the components are present in the mask
+                } else {
+                    CONTEXT_LOAD(parts.iterated.reg, packed);
+                    if (mCbFormat.size == 1) {
+                        AND(AL, 0, parts.iterated.reg,
+                                parts.iterated.reg, imm(0xFF));
+                    } else if (mCbFormat.size == 2) {
+                        MOV(AL, 0, parts.iterated.reg,
+                                reg_imm(parts.iterated.reg, LSR, 16));
+                    }
+                }
+
+                // pre-mask the iterated color
+                if (mMasking) {
+                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
+                            mask, bits);
+                }
+            }
+        }
+    }
+}
+
+void GGLAssembler::build_iterated_color(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE); 
+
+    if (!mInfo[component].iterated)
+        return;
+
+    if (parts.iterated_packed) {
+        // iterated colors are packed, extract the one we need
+        extract(fragment, parts.iterated, component);
+    } else {
+        fragment.h = GGL_COLOR_BITS;
+        fragment.l = GGL_COLOR_BITS - 8;
+        fragment.flags |= CLEAR_LO;
+        // iterated colors are held in their own register,
+        // (smooth and/or dithering case)
+        if (parts.reload==3) {
+            // this implies mSmooth
+            Scratch scratches(registerFile());
+            int dx = scratches.obtain();
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
+            ADD(AL, 0, dx, fragment.reg, dx);
+            CONTEXT_STORE(dx, generated_vars.argb[component].c);
+        } else if (parts.reload & 1) {
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+        } else {
+            // we don't reload, so simply rename the register and mark as
+            // non CORRUPTIBLE so that the texture env or blending code
+            // won't modify this (renamed) register
+            regs.recycle(fragment.reg);
+            fragment.reg = parts.argb[component].reg;
+            fragment.flags &= ~CORRUPTIBLE;
+        }
+        if (mInfo[component].smooth && mAA) {
+            // when using smooth shading AND anti-aliasing, we need to clamp
+            // the iterators because there is always an extra pixel on the
+            // edges, which most of the time will cause an overflow
+            // (since technically its outside of the domain).
+            BIC(AL, 0, fragment.reg, fragment.reg,
+                    reg_imm(fragment.reg, ASR, 31));
+            component_sat(fragment);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
+{
+    // gather some informations about the components we need to process...
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    switch(opcode) {
+    case GGL_COPY:
+        mLogicOp = 0;
+        break;
+    case GGL_CLEAR:
+    case GGL_SET:
+        mLogicOp = LOGIC_OP;
+        break;
+    case GGL_AND:
+    case GGL_AND_REVERSE:
+    case GGL_AND_INVERTED:
+    case GGL_XOR:
+    case GGL_OR:
+    case GGL_NOR:
+    case GGL_EQUIV:
+    case GGL_OR_REVERSE:
+    case GGL_OR_INVERTED:
+    case GGL_NAND:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
+        break;
+    case GGL_NOOP:
+    case GGL_INVERT:
+        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
+        break;        
+    case GGL_COPY_INVERTED:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
+        break;
+    };        
+}
+
+void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
+{
+    uint8_t replaced=0;
+    mTextureMachine.mask = 0;
+    mTextureMachine.activeUnits = 0;
+    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (replaced == 0xF) {
+            // all components are replaced, skip this TMU.
+            tmu.format_idx = 0;
+            tmu.mask = 0;
+            tmu.replaced = replaced;
+            continue;
+        }
+        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
+        tmu.format = c->formats[tmu.format_idx];
+        tmu.bits = tmu.format.size*8;
+        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
+        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
+        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
+        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
+        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
+                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
+
+        // 5551 linear filtering is not supported
+        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
+            tmu.linear = 0;
+        
+        tmu.mask = 0;
+        tmu.replaced = replaced;
+
+        if (tmu.format_idx) {
+            mTextureMachine.activeUnits++;
+            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
+            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
+            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
+            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
+            if (tmu.env == GGL_REPLACE) {
+                replaced |= tmu.mask;
+            } else if (tmu.env == GGL_DECAL) {
+                if (!tmu.format.c[GGLFormat::ALPHA].h) {
+                    // if we don't have alpha, decal does nothing
+                    tmu.mask = 0;
+                } else {
+                    // decal always ignores At
+                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
+                }
+            }
+        }
+        mTextureMachine.mask |= tmu.mask;
+        //printf("%d: mask=%08lx, replaced=%08lx\n",
+        //    i, int(tmu.mask), int(tmu.replaced));
+    }
+    mTextureMachine.replaced = replaced;
+    mTextureMachine.directTexture = 0;
+    //printf("replaced=%08lx\n", mTextureMachine.replaced);
+}
+
+
+void GGLAssembler::init_textures(
+        tex_coord_t* coords,
+        const reg_t& x, const reg_t& y)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+    int Rctx = mBuilderContext.Rctx;
+    int Rx = x.reg;
+    int Ry = y.reg;
+
+    if (mTextureMachine.mask) {
+        comment("compute texture coordinates");
+    }
+
+    // init texture coordinates for each tmu
+    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11)) 
+        {
+            // 1:1 texture
+            pointer_t& txPtr = coords[i].ptr;
+            txPtr.setTo(obtainReg(), tmu.bits);
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
+            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
+            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
+            // merge base & offset
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
+            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            base_offset(txPtr, txPtr, Rx);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = coords[i].s;
+            reg_t& t = coords[i].t;
+            // s = (x * dsdx)>>16 + ydsdy
+            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
+            // t = (x * dtdx)>>16 + ydtdy
+            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
+            s.setTo(obtainReg());
+            t.setTo(obtainReg());
+            const int need_w = GGL_READ_NEEDS(W, needs.n);
+            if (need_w) {
+                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
+            } else {
+                int ydsdy = scratches.obtain();
+                int ydtdy = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
+                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
+                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
+                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
+                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
+            }
+            
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                recycleReg(s.reg);
+                recycleReg(t.reg);
+            }
+        }
+
+        // direct texture?
+        if (!multiTexture && !mBlending && !mDithering && !mFog && 
+            cb_format_idx == tmu.format_idx && !tmu.linear &&
+            mTextureMachine.replaced == tmu.mask) 
+        {
+                mTextureMachine.directTexture = i + 1; 
+        }
+    }
+}
+
+void GGLAssembler::build_textures(  fragment_parts_t& parts,
+                                    Scratch& regs)
+{
+    context_t const* c = mBuilderContext.c;
+    const needs_t& needs = mBuilderContext.needs;
+    int Rctx = mBuilderContext.Rctx;
+
+    // We don't have a way to spill registers automatically
+    // spill depth and AA regs, when we know we may have to.
+    // build the spill list...
+    uint32_t spill_list = 0;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if (tmu.linear) {
+            // we may run out of register if we have linear filtering
+            // at 1 or 4 bytes / pixel on any texture unit.
+            if (tmu.format.size == 1) {
+                // if depth and AA enabled, we'll run out of 1 register
+                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+            if (tmu.format.size == 4) {
+                // if depth or AA enabled, we'll run out of 1 or 2 registers
+                if (parts.z.reg > 0)
+                    spill_list |= 1<<parts.z.reg;
+                if (parts.covPtr.reg > 0)   
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+        }
+    }
+
+    Spill spill(registerFile(), *this, spill_list);
+
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        pointer_t& txPtr = parts.coords[i].ptr;
+        pixel_t& texel = parts.texel[i];
+            
+        // repeat...
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            comment("fetch texel");
+            texel.setTo(regs.obtain(), &tmu.format);
+            load(txPtr, texel, WRITE_BACK);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = parts.coords[i].s;
+            reg_t& t = parts.coords[i].t;
+            if ((mOptLevel&1)==0) {
+                comment("reload s/t (multitexture or linear filtering)");
+                s.reg = scratches.obtain();
+                t.reg = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
+            }
+
+            comment("compute repeat/clamp");
+            int u       = scratches.obtain();
+            int v       = scratches.obtain();
+            int width   = scratches.obtain();
+            int height  = scratches.obtain();
+            int U = 0;
+            int V = 0;
+
+            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
+            CONTEXT_LOAD(height, generated_vars.texture[i].height);
+
+            int FRAC_BITS = 0;
+            if (tmu.linear) {
+                // linear interpolation
+                if (tmu.format.size == 1) {
+                    // for 8-bits textures, we can afford
+                    // 7 bits of fractional precision at no
+                    // additional cost (we can't do 8 bits
+                    // because filter8 uses signed 16 bits muls)
+                    FRAC_BITS = 7;
+                } else if (tmu.format.size == 2) {
+                    // filter16() is internally limited to 4 bits, so:
+                    // FRAC_BITS=2 generates less instructions,
+                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
+                    // FRAC_BITS=6+ looks good
+                    FRAC_BITS = 6;
+                } else if (tmu.format.size == 4) {
+                    // filter32() is internally limited to 8 bits, so:
+                    // FRAC_BITS=4 looks good
+                    // FRAC_BITS=5+ looks better, but generates 3 extra ipp
+                    FRAC_BITS = 6;
+                } else {
+                    // for all other cases we use 4 bits.
+                    FRAC_BITS = 4;
+                }
+            }
+            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
+            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
+
+            if (tmu.linear) {
+                comment("compute linear filtering offsets");
+                // pixel size scale
+                const int shift = 31 - gglClz(tmu.format.size);
+                U = scratches.obtain();
+                V = scratches.obtain();
+
+                // sample the texel center
+                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
+                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
+
+                // get the fractionnal part of U,V
+                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
+                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
+
+                // compute width-1 and height-1
+                SUB(AL, 0, width,  width,  imm(1));
+                SUB(AL, 0, height, height, imm(1));
+
+                // get the integer part of U,V and clamp/wrap
+                // and compute offset to the next texel
+                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // u has already been REPEATed
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, width);                    
+                    CMP(AL, u, width);
+                    MOV(LT, 0, width, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
+                    RSB(GE, 0, width, width, imm(0));
+                } else {
+                    // u has not been CLAMPed yet
+                    // algorithm:
+                    // if ((u>>4) >= width)
+                    //      u = width<<4
+                    //      width = 0
+                    // else
+                    //      width = 1<<shift
+                    // u = u>>4; // get integer part
+                    // if (u<0)
+                    //      u = 0
+                    //      width = 0
+                    // generated_vars.rt = width
+                    
+                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
+                    MOV(LE, 0, width, imm(0));
+                    MOV(GT, 0, width, imm(1 << shift));
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, imm(0));
+                    MOV(MI, 0, width, imm(0));
+                }
+                CONTEXT_STORE(width, generated_vars.rt);
+
+                const int stride = width;
+                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
+                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // v has already been REPEATed
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, height);
+                    CMP(AL, v, height);
+                    MOV(LT, 0, height, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
+                    RSB(GE, 0, height, height, imm(0));
+                    MUL(AL, 0, height, stride, height);
+                } else {
+                    // u has not been CLAMPed yet
+                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
+                    MOV(LE, 0, height, imm(0));
+                    if (shift) {
+                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
+                    } else {
+                        MOV(GT, 0, height, stride);
+                    }
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, imm(0));
+                    MOV(MI, 0, height, imm(0));
+                }
+                CONTEXT_STORE(height, generated_vars.lb);
+            }
+    
+            scratches.recycle(width);
+            scratches.recycle(height);
+
+            // iterate texture coordinates...
+            comment("iterate s,t");
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s.reg, s.reg, dsdx);
+            ADD(AL, 0, t.reg, t.reg, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                scratches.recycle(s.reg);
+                scratches.recycle(t.reg);
+            }
+            scratches.recycle(dsdx);
+            scratches.recycle(dtdx);
+
+            // merge base & offset...
+            comment("merge base & offset");
+            texel.setTo(regs.obtain(), &tmu.format);
+            txPtr.setTo(texel.reg, tmu.bits);
+            int stride = scratches.obtain();
+            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            SMLABB(AL, u, v, stride, u);    // u+v*stride 
+            base_offset(txPtr, txPtr, u);
+
+            // load texel
+            if (!tmu.linear) {
+                comment("fetch texel");
+                load(txPtr, texel, 0);
+            } else {
+                // recycle registers we don't need anymore
+                scratches.recycle(u);
+                scratches.recycle(v);
+                scratches.recycle(stride);
+
+                comment("fetch texel, bilinear");
+                switch (tmu.format.size) {
+                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                }
+            }            
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_texture_coordinates(
+    const fragment_parts_t& parts)
+{
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            const pointer_t& txPtr = parts.coords[i].ptr;
+            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
+        } else {
+            Scratch scratches(registerFile());
+            int s = parts.coords[i].s.reg;
+            int t = parts.coords[i].t.reg;
+            if ((mOptLevel&1)==0) {
+                s = scratches.obtain();
+                t = scratches.obtain();
+                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
+            }
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s, s, dsdx);
+            ADD(AL, 0, t, t, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
+            }
+        }
+    }
+}
+
+void GGLAssembler::filter8(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    if (tmu.format.components != GGL_ALPHA &&
+        tmu.format.components != GGL_LUMINANCE)
+    {
+        // this is a packed format, and we don't support
+        // linear filtering (it's probably RGB 332)
+        // Should not happen with OpenGL|ES
+        LDRB(AL, texel.reg, txPtr.reg);
+        return;
+    }
+
+    // ------------------------
+    // about ~22 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+    int rt   = scratches.obtain();
+    int lb   = scratches.obtain();
+
+    // RB -> U * V
+
+    CONTEXT_LOAD(rt, generated_vars.rt);
+    CONTEXT_LOAD(lb, generated_vars.lb);
+
+    int offset = pixel;
+    ADD(AL, 0, offset, lb, rt);
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    SMULBB(AL, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
+    
+    // LB -> (1-U) * V
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+
+    // RT -> U*(1-V)
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
+    SUB(AL, 0, u, k, u);
+    SMLABB(AL, texel.reg, pixel, u, d);
+    
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        texel.format.c[i].h = FRAC_BITS*2+8;
+        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough
+    }
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_LO;
+}
+
+void GGLAssembler::filter16(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{    
+    // compute the mask
+    // XXX: it would be nice if the mask below could be computed
+    // automatically.
+    uint32_t mask = 0;
+    int shift = 0;
+    int prec = 0;
+    switch (tmu.format_idx) {
+        case GGL_PIXEL_FORMAT_RGB_565:
+            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
+            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
+            mask = 0x07E0F81F;
+            shift = 16;
+            prec = 5;
+            break;
+        case GGL_PIXEL_FORMAT_RGBA_4444:
+            // 0000,1111,0000,1111 | 0000,1111,0000,1111
+            mask = 0x0F0F0F0F;
+            shift = 12;
+            prec = 4;
+            break;
+        case GGL_PIXEL_FORMAT_LA_88:
+            // 0000,0000,1111,1111 | 0000,0000,1111,1111
+            // AALL -> 00AA | 00LL
+            mask = 0x00FF00FF;
+            shift = 8;
+            prec = 8;
+            break;
+        default:
+            // unsupported format, do something sensical...
+            LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
+            LDRH(AL, texel.reg, txPtr.reg);
+            return;
+    }
+
+    const int adjust = FRAC_BITS*2 - prec;
+    const int round  = 0;
+
+    // update the texel format
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_HI|CLEAR_LO;
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
+        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
+        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
+    }
+
+    // ------------------------
+    // about ~40 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<prec));
+    
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SUB(AL, 0, u, k, u);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    MLA(AL, 0, texel.reg, pixel, u, d);
+}
+
+void GGLAssembler::filter24(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    // not supported yet (currently disabled)
+    load(txPtr, texel, 0);
+}
+
+void GGLAssembler::filter32(
+        const fragment_parts_t& parts,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    const int adjust = FRAC_BITS*2 - 8;
+    const int round  = 0;
+
+    // ------------------------
+    // about ~38 cycles / pixel
+    Scratch scratches(registerFile());
+    
+    int pixel= scratches.obtain();
+    int dh   = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    int temp = scratches.obtain();
+    int dl   = scratches.obtain();
+    int mask = scratches.obtain();
+
+    MOV(AL, 0, mask, imm(0xFF));
+    ORR(AL, 0, mask, mask, imm(0xFF0000));
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, dh, temp, u);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MUL(AL, 0, dl, temp, u);
+    RSB(AL, 0, k, u, imm(0x100));
+
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SUB(AL, 0, u, k, u);
+    AND(AL, 0, temp, mask, pixel);
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
+    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
+    ORR(AL, 0, texel.reg, dh, dl);
+}
+
+void GGLAssembler::build_texture_environment(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    const uint32_t component_mask = 1<<component;
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+
+        if (tmu.mask & component_mask) {
+            // replace or modulate with this texture
+            if ((tmu.replaced & component_mask) == 0) {
+                // not replaced by a later tmu...
+
+                Scratch scratches(registerFile());
+                pixel_t texel(parts.texel[i]);
+                if (multiTexture && 
+                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
+                    tmu.twrap == GGL_NEEDS_WRAP_11)
+                {
+                    texel.reg = scratches.obtain();
+                    texel.flags |= CORRUPTIBLE;
+                    comment("fetch texel (multitexture 1:1)");
+                    load(parts.coords[i].ptr, texel, WRITE_BACK);
+                 }
+
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                
+                switch (tmu.env) {
+                case GGL_REPLACE:
+                    extract(fragment, texel, component);
+                    break;
+                case GGL_MODULATE:
+                    modulate(fragment, incoming, texel, component);
+                    break;
+                case GGL_DECAL:
+                    decal(fragment, incoming, texel, component);
+                    break;
+                case GGL_BLEND:
+                    blend(fragment, incoming, texel, component, i);
+                    break;
+                case GGL_ADD:
+                    add(fragment, incoming, texel, component);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::wrapping(
+            int d,
+            int coord, int size,
+            int tx_wrap, int tx_linear)
+{
+    // notes:
+    // if tx_linear is set, we need 4 extra bits of precision on the result
+    // SMULL/UMULL is 3 cycles
+    Scratch scratches(registerFile());
+    int c = coord;
+    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
+        // UMULL takes 4 cycles (interlocked), and we can get away with
+        // 2 cycles using SMULWB, but we're loosing 16 bits of precision
+        // out of 32 (this is not a problem because the iterator keeps
+        // its full precision)
+        // UMULL(AL, 0, size, d, c, size);
+        // note: we can't use SMULTB because it's signed.
+        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
+        SMULWB(AL, d, d, size);
+    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
+        if (tx_linear) {
+            // 1 cycle
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
+        } else {
+            // 4 cycles (common case)
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
+            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
+            CMP(AL, d, size);
+            SUB(GE, 0, d, size, imm(1));
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::modulate(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    extract(texel, incomingTexel, component);
+
+    const int Nt = texel.size();
+        // Nt should always be less than 10 bits because it comes
+        // from the TMU.
+
+    int Ni = incoming.size();
+        // Ni could be big because it comes from previous MODULATEs
+
+    if (Nt == 1) {
+        // texel acts as a bit-mask
+        // dest = incoming & ((texel << incoming.h)-texel)
+        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
+        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
+        dest.l = incoming.l;
+        dest.h = incoming.h;
+        dest.flags |= (incoming.flags & CLEAR_LO);
+    } else if (Ni == 1) {
+        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
+        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
+        dest.l = 0;
+        dest.h = Nt;
+    } else {
+        int inReg = incoming.reg;
+        int shift = incoming.l;
+        if ((Nt + Ni) > 32) {
+            // we will overflow, reduce the precision of Ni to 8 bits
+            // (Note Nt cannot be more than 10 bits which happens with 
+            // 565 textures and GGL_LINEAR)
+            shift += Ni-8;
+            Ni = 8;
+        }
+
+        // modulate by the component with the lowest precision
+        if (Nt >= Ni) {
+            if (shift) {
+                // XXX: we should be able to avoid this shift
+                // when shift==16 && Nt<16 && Ni<16, in which
+                // we could use SMULBT below.
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Ni)-1)
+            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
+            // this operation doesn't change texel's size
+            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
+            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
+            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
+            dest.l = Ni;
+            dest.h = Nt + Ni;            
+        } else {
+            if (shift && (shift != 16)) {
+                // if shift==16, we can use 16-bits mul instructions later
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Nt)-1)
+            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
+            // this operation doesn't change incoming's size
+            Scratch scratches(registerFile());
+            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
+            if (t == inReg)
+                t = scratches.obtain();
+            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
+            if (Nt<16 && Ni<16) {
+                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
+                else            SMULBB(AL, dest.reg, t, inReg);
+            } else              MUL(AL, 0, dest.reg, t, inReg);
+            dest.l = Nt;
+            dest.h = Nt + Ni;
+        }
+
+        // low bits are not valid
+        dest.flags |= CLEAR_LO;
+
+        // no need to keep more than 8 bits/component
+        if (dest.size() > 8)
+            dest.l = dest.h-8;
+    }
+}
+
+void GGLAssembler::decal(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
+    // Av = Af
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    extract(texel, incomingTexel, component);
+    extract(factor, incomingTexel, GGLFormat::ALPHA);
+
+    // no need to keep more than 8-bits for decal 
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
+}
+
+void GGLAssembler::blend(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component, int tmu)
+{
+    // RGBA:
+    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
+    // Av = At*Af
+
+    if (component == GGLFormat::ALPHA) {
+        modulate(dest, incoming, incomingTexel, component);
+        return;
+    }
+    
+    Scratch locals(registerFile());
+    integer_t color(locals.obtain(), 8, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    LDRB(AL, color.reg, mBuilderContext.Rctx,
+            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
+    extract(factor, incomingTexel, component);
+
+    // no need to keep more than 8-bits for blend 
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, color);
+}
+
+void GGLAssembler::add(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf + Ct;
+    Scratch locals(registerFile());
+    
+    component_t incomingTemp(incoming);
+
+    // use "dest" as a temporary for extracting the texel, unless "dest"
+    // overlaps "incoming".
+    integer_t texel(dest.reg, 32, CORRUPTIBLE);
+    if (dest.reg == incomingTemp.reg)
+        texel.reg = locals.obtain();
+    extract(texel, incomingTexel, component);
+
+    if (texel.s < incomingTemp.size()) {
+        expand(texel, texel, incomingTemp.size());
+    } else if (texel.s > incomingTemp.size()) {
+        if (incomingTemp.flags & CORRUPTIBLE) {
+            expand(incomingTemp, incomingTemp, texel.s);
+        } else {
+            incomingTemp.reg = locals.obtain();
+            expand(incomingTemp, incoming, texel.s);
+        }
+    }
+
+    if (incomingTemp.l) {
+        ADD(AL, 0, dest.reg, texel.reg,
+                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
+    } else {
+        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
+    }
+    dest.l = 0;
+    dest.h = texel.size();
+    component_sat(dest);
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/fixed.cpp b/libpixelflinger/fixed.cpp
new file mode 100644
index 0000000..5b92062
--- /dev/null
+++ b/libpixelflinger/fixed.cpp
@@ -0,0 +1,339 @@
+/* libs/pixelflinger/fixed.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <stdio.h>
+
+#include <private/pixelflinger/ggl_context.h>
+#include <private/pixelflinger/ggl_fixed.h>
+
+
+// ------------------------------------------------------------------------
+
+int32_t gglRecipQNormalized(int32_t x, int* exponent)
+{
+    const int32_t s = x>>31;
+    uint32_t a = s ? -x : x;
+
+    // the result will overflow, so just set it to the biggest/inf value
+    if (ggl_unlikely(a <= 2LU)) {
+        *exponent = 0;
+        return s ? FIXED_MIN : FIXED_MAX;
+    }
+
+    // Newton-Raphson iteration:
+    // x = r*(2 - a*r)
+
+    const int32_t lz = gglClz(a);
+    a <<= lz;  // 0.32
+    uint32_t r = a;
+    // note: if a == 0x80000000, this means x was a power-of-2, in this
+    // case we don't need to compute anything. We get the reciprocal for
+    // (almost) free.
+    if (a != 0x80000000) {
+        r = (0x2E800 << (30-16)) - (r>>(2-1)); // 2.30, r = 2.90625 - 2*a
+        // 0.32 + 2.30 = 2.62 -> 2.30
+        // 2.30 + 2.30 = 4.60 -> 2.30
+        r = (((2LU<<30) - uint32_t((uint64_t(a)*r) >> 32)) * uint64_t(r)) >> 30;
+        r = (((2LU<<30) - uint32_t((uint64_t(a)*r) >> 32)) * uint64_t(r)) >> 30;
+    }
+
+    // shift right 1-bit to make room for the sign bit
+    *exponent = 30-lz-1;
+    r >>= 1;
+    return s ? -r : r;
+}
+
+int32_t gglRecipQ(GGLfixed x, int q)
+{
+    int shift;
+    x = gglRecipQNormalized(x, &shift);
+    shift += 16-q;
+    x += 1L << (shift-1);   // rounding
+    x >>= shift;
+    return x;
+}    
+
+// ------------------------------------------------------------------------
+
+GGLfixed gglFastDivx(GGLfixed n, GGLfixed d)
+{
+    if ((d>>24) && ((d>>24)+1)) {
+        n >>= 8;
+        d >>= 8;
+    }
+    return gglMulx(n, gglRecip(d));
+}
+
+// ------------------------------------------------------------------------
+
+static const GGLfixed ggl_sqrt_reciproc_approx_tab[8] = {
+    // 1/sqrt(x) with x = 1-N/16, N=[8...1]
+    0x16A09, 0x15555, 0x143D1, 0x134BF, 0x1279A, 0x11C01, 0x111AC, 0x10865
+};
+
+GGLfixed gglSqrtRecipx(GGLfixed x)
+{
+    if (x == 0)         return FIXED_MAX;
+    if (x == FIXED_ONE) return x;
+    const GGLfixed a = x;
+    const int32_t lz = gglClz(x);
+    x = ggl_sqrt_reciproc_approx_tab[(a>>(28-lz))&0x7];
+    const int32_t exp = lz - 16;
+    if (exp <= 0)   x >>= -exp>>1;
+    else            x <<= (exp>>1) + (exp & 1);        
+    if (exp & 1) {
+        x = gglMulx(x, ggl_sqrt_reciproc_approx_tab[0])>>1;
+    }
+    // 2 Newton-Raphson iterations: x = x/2*(3-(a*x)*x)
+    x = gglMulx((x>>1),(0x30000 - gglMulx(gglMulx(a,x),x)));
+    x = gglMulx((x>>1),(0x30000 - gglMulx(gglMulx(a,x),x)));
+    return x;
+}
+
+GGLfixed gglSqrtx(GGLfixed a)
+{
+    // Compute a full precision square-root (24 bits accuracy)
+    GGLfixed r = 0;
+    GGLfixed bit = 0x800000;
+    int32_t bshift = 15;
+    do {
+        GGLfixed temp = bit + (r<<1);
+        if (bshift >= 8)    temp <<= (bshift-8);
+        else                temp >>= (8-bshift);
+        if (a >= temp) {
+            r += bit;
+            a -= temp;
+        }
+        bshift--;
+    } while (bit>>=1);
+    return r;
+}
+
+// ------------------------------------------------------------------------
+
+static const GGLfixed ggl_log_approx_tab[] = {
+    // -ln(x)/ln(2) with x = N/16, N=[8...16]
+    0xFFFF, 0xd47f, 0xad96, 0x8a62, 0x6a3f, 0x4caf, 0x3151, 0x17d6, 0x0000
+};
+
+static const GGLfixed ggl_alog_approx_tab[] = { // domain [0 - 1.0]
+	0xffff, 0xeac0, 0xd744, 0xc567, 0xb504, 0xa5fe, 0x9837, 0x8b95, 0x8000
+};
+
+GGLfixed gglPowx(GGLfixed x, GGLfixed y)
+{
+    // prerequisite: 0 <= x <= 1, and y >=0
+
+    // pow(x,y) = 2^(y*log2(x))
+    // =  2^(y*log2(x*(2^exp)*(2^-exp))))
+    // =  2^(y*(log2(X)-exp))
+    // =  2^(log2(X)*y - y*exp)
+    // =  2^( - (-log2(X)*y + y*exp) )
+    
+    int32_t exp = gglClz(x) - 16;
+    GGLfixed f = x << exp;
+    x = (f & 0x0FFF)<<4;
+    f = (f >> 12) & 0x7;
+    GGLfixed p = gglMulAddx(
+            ggl_log_approx_tab[f+1] - ggl_log_approx_tab[f], x,
+            ggl_log_approx_tab[f]);
+    p = gglMulAddx(p, y, y*exp);
+    exp = gglFixedToIntFloor(p);
+    if (exp < 31) {
+        p = gglFracx(p);
+        x = (p & 0x1FFF)<<3;
+        p >>= 13;    
+        p = gglMulAddx(
+                ggl_alog_approx_tab[p+1] - ggl_alog_approx_tab[p], x,
+                ggl_alog_approx_tab[p]);
+        p >>= exp;
+    } else {
+        p = 0;
+    }
+    return p;
+        // ( powf((a*65536.0f), (b*65536.0f)) ) * 65536.0f;
+}
+
+// ------------------------------------------------------------------------
+
+int32_t gglDivQ(GGLfixed n, GGLfixed d, int32_t i)
+{
+    //int32_t r =int32_t((int64_t(n)<<i)/d);
+    const int32_t ds = n^d;
+    if (n<0) n = -n;
+    if (d<0) d = -d;
+    int nd = gglClz(d) - gglClz(n);
+    i += nd + 1;
+    if (nd > 0) d <<= nd;
+    else        n <<= -nd;
+    uint32_t q = 0;
+
+    int j = i & 7;
+    i >>= 3;
+
+    // gcc deals with the code below pretty well.
+    // we get 3.75 cycles per bit in the main loop
+    // and 8 cycles per bit in the termination loop
+    if (ggl_likely(i)) {
+        n -= d;
+        do {
+            q <<= 8;
+            if (n>=0)   q |= 128;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 64;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 32;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 16;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 8;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 4;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 2;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 1;
+            else        n += d;
+            
+            if (--i == 0)
+                goto finish;
+
+            n = n*2 - d;
+        } while(true);
+        do {
+            q <<= 1;
+            n = n*2 - d;
+            if (n>=0)   q |= 1;
+            else        n += d;
+        finish: ;
+        } while (j--);
+        return (ds<0) ? -q : q;
+    }
+
+    n -= d;
+    if (n>=0)   q |= 1;
+    else        n += d;
+    j--;
+    goto finish;
+}
+
+// ------------------------------------------------------------------------
+
+// assumes that the int32_t values of a, b, and c are all positive
+// use when both a and b are larger than c
+
+template <typename T>
+static inline void swap(T& a, T& b) {
+    T t(a);
+    a = b;
+    b = t;
+}
+
+static __attribute__((noinline))
+int32_t slow_muldiv(uint32_t a, uint32_t b, uint32_t c)
+{
+	// first we compute a*b as a 64-bit integer
+    // (GCC generates umull with the code below)
+    uint64_t ab = uint64_t(a)*b;
+    uint32_t hi = ab>>32;
+    uint32_t lo = ab;
+    uint32_t result;
+
+	// now perform the division
+	if (hi >= c) {
+	overflow:
+		result = 0x7fffffff;  // basic overflow
+	} else if (hi == 0) {
+		result = lo/c;  // note: c can't be 0
+		if ((result >> 31) != 0)  // result must fit in 31 bits
+			goto overflow;
+	} else {
+		uint32_t r = hi;
+		int bits = 31;
+	    result = 0;
+		do {
+			r = (r << 1) | (lo >> 31);
+			lo <<= 1;
+			result <<= 1;
+			if (r >= c) {
+				r -= c;
+				result |= 1;
+			}
+		} while (bits--);
+	}
+	return int32_t(result);
+}
+
+// assumes a >= 0 and c >= b >= 0
+static inline
+int32_t quick_muldiv(int32_t a, int32_t b, int32_t c)
+{
+    int32_t r = 0, q = 0, i;
+    int leading = gglClz(a);
+    i = 32 - leading;
+    a <<= leading;
+    do {
+        r <<= 1;
+        if (a < 0)
+            r += b;
+        a <<= 1;
+        q <<= 1;
+        if (r >= c) {
+            r -= c;
+            q++;
+        }
+        asm(""::); // gcc generates better code this way
+        if (r >= c) {
+            r -= c;
+            q++;
+        }
+    }
+    while (--i);
+    return q;
+}
+
+// this function computes a*b/c with 64-bit intermediate accuracy
+// overflows (e.g. division by 0) are handled and return INT_MAX
+
+int32_t gglMulDivi(int32_t a, int32_t b, int32_t c)
+{
+	int32_t result;
+	int32_t sign = a^b^c;
+
+	if (a < 0) a = -a;
+	if (b < 0) b = -b;
+	if (c < 0) c = -c;
+
+    if (a < b) {
+        swap(a, b);
+    }
+    
+	if (b <= c) result = quick_muldiv(a, b, c);
+	else        result = slow_muldiv((uint32_t)a, (uint32_t)b, (uint32_t)c);
+	
+	if (sign < 0)
+		result = -result;
+	  
+    return result;
+}
diff --git a/libpixelflinger/format.cpp b/libpixelflinger/format.cpp
new file mode 100644
index 0000000..161e6d6
--- /dev/null
+++ b/libpixelflinger/format.cpp
@@ -0,0 +1,67 @@
+/* libs/pixelflinger/format.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <stdio.h>
+#include <pixelflinger/format.h>
+
+namespace android {
+
+static GGLFormat const gPixelFormatInfos[] =
+{   //          Alpha    Red     Green   Blue
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  4, 32, {{32,24,   8, 0,  16, 8,  24,16 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_8888
+    {  4, 24, {{ 0, 0,   8, 0,  16, 8,  24,16 }}, GGL_RGB  },   // PIXEL_FORMAT_RGBX_8888
+    {  3, 24, {{ 0, 0,   8, 0,  16, 8,  24,16 }}, GGL_RGB  },   // PIXEL_FORMAT_RGB_888
+    {  2, 16, {{ 0, 0,  16,11,  11, 5,   5, 0 }}, GGL_RGB  },   // PIXEL_FORMAT_RGB_565
+    {  4, 32, {{32,24,  24,16,  16, 8,   8, 0 }}, GGL_RGBA },   // PIXEL_FORMAT_BGRA_8888
+    {  2, 16, {{ 1, 0,  16,11,  11, 6,   6, 1 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_5551
+    {  2, 16, {{ 4, 0,  16,12,  12, 8,   8, 4 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_4444
+    {  1,  8, {{ 8, 0,   0, 0,   0, 0,   0, 0 }}, GGL_ALPHA},   // PIXEL_FORMAT_A8
+    {  1,  8, {{ 0, 0,   8, 0,   8, 0,   8, 0 }}, GGL_LUMINANCE},//PIXEL_FORMAT_L8
+    {  2, 16, {{16, 8,   8, 0,   8, 0,   8, 0 }}, GGL_LUMINANCE_ALPHA},// PIXEL_FORMAT_LA_88
+    {  1,  8, {{ 0, 0,   8, 5,   5, 2,   2, 0 }}, GGL_RGB  },   // PIXEL_FORMAT_RGB_332
+    
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+
+    {  1, 16, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_SP },// PIXEL_FORMAT_YCbCr_422_SP
+    {  1, 12, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_SP },// PIXEL_FORMAT_YCbCr_420_SP
+    {  1, 16, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_P }, // PIXEL_FORMAT_YCbCr_422_P
+    {  1, 12, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_P }, // PIXEL_FORMAT_YCbCr_420_P
+    {  1, 16, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_I }, // PIXEL_FORMAT_YCbCr_422_I
+    {  1, 12, {{ 0, 8,   0, 8,   0, 8,   0, 0 }}, GGL_Y_CB_CR_I }, // PIXEL_FORMAT_YCbCr_420_I
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    
+    {  2, 16, {{  0, 0, 16, 0,   0, 0,   0, 0 }}, GGL_DEPTH_COMPONENT},
+    {  1,  8, {{  8, 0,  0, 0,   0, 0,   0, 0 }}, GGL_STENCIL_INDEX  },
+    {  4, 24, {{  0, 0, 24, 0,   0, 0,   0, 0 }}, GGL_DEPTH_COMPONENT},
+    {  4,  8, {{ 32,24,  0, 0,   0, 0,   0, 0 }}, GGL_STENCIL_INDEX  },
+};
+
+}; // namespace android
+
+
+const GGLFormat* gglGetPixelFormatTable(size_t* numEntries)
+{
+    if (numEntries) {
+        *numEntries = sizeof(android::gPixelFormatInfos)/sizeof(GGLFormat);
+    }
+    return android::gPixelFormatInfos;
+}
diff --git a/libpixelflinger/picker.cpp b/libpixelflinger/picker.cpp
new file mode 100644
index 0000000..030ef19
--- /dev/null
+++ b/libpixelflinger/picker.cpp
@@ -0,0 +1,173 @@
+/* libs/pixelflinger/picker.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdio.h>
+
+#include "buffer.h"
+#include "scanline.h"
+#include "picker.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_picker(context_t* c)
+{
+}
+
+void ggl_pick(context_t* c)
+{
+    if (ggl_likely(!c->dirty))
+        return;
+        
+    // compute needs, see if they changed...
+    const uint32_t enables = c->state.enables;
+    needs_t new_needs(c->state.needs);
+
+    if (c->dirty & GGL_CB_STATE) {
+        new_needs.n &= ~GGL_NEEDS_CB_FORMAT_MASK;
+        new_needs.n |= GGL_BUILD_NEEDS(c->state.buffers.color.format, CB_FORMAT);
+        if (enables & GGL_ENABLE_BLENDING)
+            c->dirty |= GGL_PIXEL_PIPELINE_STATE;
+    }
+
+    if (c->dirty & GGL_PIXEL_PIPELINE_STATE) {
+        uint32_t n = GGL_BUILD_NEEDS(c->state.buffers.color.format, CB_FORMAT);
+        uint32_t p = 0;
+        if (enables & GGL_ENABLE_BLENDING) {
+            uint32_t src = c->state.blend.src;
+            uint32_t dst = c->state.blend.dst;
+            uint32_t src_alpha = c->state.blend.src_alpha;
+            uint32_t dst_alpha = c->state.blend.dst_alpha;
+            const GGLFormat& cbf = c->formats[ c->state.buffers.color.format ];
+            if (!cbf.c[GGLFormat::ALPHA].h) {
+                if ((src == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (src == GGL_DST_ALPHA)) {
+                    src = GGL_ONE;
+                }
+                if ((src_alpha == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (src_alpha == GGL_DST_ALPHA)) {
+                    src_alpha = GGL_ONE;
+                }
+                if ((dst == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (dst == GGL_DST_ALPHA)) {
+                    dst = GGL_ONE;
+                }
+                if ((dst_alpha == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (dst_alpha == GGL_DST_ALPHA)) {
+                    dst_alpha = GGL_ONE;
+                }
+            }
+
+            src       = ggl_blendfactor_to_needs(src);
+            dst       = ggl_blendfactor_to_needs(dst);
+            src_alpha = ggl_blendfactor_to_needs(src_alpha);
+            dst_alpha = ggl_blendfactor_to_needs(dst_alpha);
+                    
+            n |= GGL_BUILD_NEEDS( src, BLEND_SRC );
+            n |= GGL_BUILD_NEEDS( dst, BLEND_DST );
+            if (c->state.blend.alpha_separate) {
+                n |= GGL_BUILD_NEEDS( src_alpha, BLEND_SRCA );
+                n |= GGL_BUILD_NEEDS( dst_alpha, BLEND_DSTA );
+            } else {
+                n |= GGL_BUILD_NEEDS( src, BLEND_SRCA );
+                n |= GGL_BUILD_NEEDS( dst, BLEND_DSTA );
+            }
+        } else {
+            n |= GGL_BUILD_NEEDS( GGL_ONE,  BLEND_SRC );
+            n |= GGL_BUILD_NEEDS( GGL_ZERO, BLEND_DST );
+            n |= GGL_BUILD_NEEDS( GGL_ONE,  BLEND_SRCA );
+            n |= GGL_BUILD_NEEDS( GGL_ZERO, BLEND_DSTA );
+        }
+
+
+        n |= GGL_BUILD_NEEDS(c->state.mask.color^0xF,               MASK_ARGB);
+        n |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_SMOOTH)  ?1:0,   SHADE);
+        if (enables & GGL_ENABLE_TMUS) {
+            n |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_W)       ?1:0,   W);
+        }
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_DITHER)  ?1:0,   P_DITHER);
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_AA)      ?1:0,   P_AA);
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_FOG)     ?1:0,   P_FOG);
+
+        if (enables & GGL_ENABLE_LOGIC_OP) {
+            n |= GGL_BUILD_NEEDS(c->state.logic_op.opcode, LOGIC_OP);
+        } else {
+            n |= GGL_BUILD_NEEDS(GGL_COPY, LOGIC_OP);
+        }
+
+        if (enables & GGL_ENABLE_ALPHA_TEST) {
+            p |= GGL_BUILD_NEEDS(c->state.alpha_test.func, P_ALPHA_TEST);
+        } else {
+            p |= GGL_BUILD_NEEDS(GGL_ALWAYS, P_ALPHA_TEST);
+        }
+
+        if (enables & GGL_ENABLE_DEPTH_TEST) {
+            p |= GGL_BUILD_NEEDS(c->state.depth_test.func, P_DEPTH_TEST);
+            p |= GGL_BUILD_NEEDS(c->state.mask.depth&1, P_MASK_Z);
+        } else {
+            p |= GGL_BUILD_NEEDS(GGL_ALWAYS, P_DEPTH_TEST);
+            // writing to the z-buffer is always disabled if depth-test
+            // is disabled.
+        }
+        new_needs.n = n;
+        new_needs.p = p;
+    }
+
+    if (c->dirty & GGL_TMU_STATE) {
+        int idx = 0;
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            const texture_t& tx = c->state.texture[i];
+            if (tx.enable) {
+                uint32_t t = 0;
+                t |= GGL_BUILD_NEEDS(tx.surface.format, T_FORMAT);
+                t |= GGL_BUILD_NEEDS(ggl_env_to_needs(tx.env), T_ENV);
+                t |= GGL_BUILD_NEEDS(0, T_POT);       // XXX: not used yet
+                if (tx.s_coord==GGL_ONE_TO_ONE && tx.t_coord==GGL_ONE_TO_ONE) {
+                    // we encode 1-to-1 into the wrap mode
+                    t |= GGL_BUILD_NEEDS(GGL_NEEDS_WRAP_11, T_S_WRAP);
+                    t |= GGL_BUILD_NEEDS(GGL_NEEDS_WRAP_11, T_T_WRAP);
+                } else {
+                    t |= GGL_BUILD_NEEDS(ggl_wrap_to_needs(tx.s_wrap), T_S_WRAP);
+                    t |= GGL_BUILD_NEEDS(ggl_wrap_to_needs(tx.t_wrap), T_T_WRAP);
+                }
+                if (tx.mag_filter == GGL_LINEAR) {
+                    t |= GGL_BUILD_NEEDS(1, T_LINEAR);
+                }
+                if (tx.min_filter == GGL_LINEAR) {
+                    t |= GGL_BUILD_NEEDS(1, T_LINEAR);
+                }
+                new_needs.t[idx++] = t;
+            } else {
+                new_needs.t[i] = 0;
+            }
+        }
+    }
+
+    if (new_needs != c->state.needs) {
+        c->state.needs = new_needs;
+        ggl_pick_texture(c);
+        ggl_pick_cb(c);
+        ggl_pick_scanline(c);
+    }
+    c->dirty = 0;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
diff --git a/libpixelflinger/picker.h b/libpixelflinger/picker.h
new file mode 100644
index 0000000..9cdbc3c
--- /dev/null
+++ b/libpixelflinger/picker.h
@@ -0,0 +1,31 @@
+/* libs/pixelflinger/picker.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_PICKER_H
+#define ANDROID_PICKER_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_picker(context_t* c);
+void ggl_pick(context_t* c);
+
+}; // namespace android
+
+#endif
diff --git a/libpixelflinger/pixelflinger.cpp b/libpixelflinger/pixelflinger.cpp
new file mode 100644
index 0000000..b54da0c
--- /dev/null
+++ b/libpixelflinger/pixelflinger.cpp
@@ -0,0 +1,843 @@
+/* libs/pixelflinger/pixelflinger.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <sys/time.h>
+
+#include <pixelflinger/pixelflinger.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "buffer.h"
+#include "clear.h"
+#include "picker.h"
+#include "raster.h"
+#include "scanline.h"
+#include "trap.h"
+
+#include "codeflinger/GGLAssembler.h"
+#include "codeflinger/CodeCache.h"
+
+#include <stdio.h> 
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// 8x8 Bayer dither matrix
+static const uint8_t gDitherMatrix[GGL_DITHER_SIZE] = {
+     0, 32,  8, 40,  2, 34, 10, 42,
+    48, 16, 56, 24, 50, 18, 58, 26,
+    12, 44,  4, 36, 14, 46,  6, 38,
+    60, 28, 52, 20, 62, 30, 54, 22,
+     3, 35, 11, 43,  1, 33,  9, 41,
+    51, 19, 59, 27, 49, 17, 57, 25,
+    15, 47,  7, 39, 13, 45,  5, 37,
+    63, 31, 55, 23, 61, 29, 53, 21
+};
+
+static void ggl_init_procs(context_t* c);
+static void ggl_set_scissor(context_t* c);
+
+static void ggl_enable_blending(context_t* c, int enable);
+static void ggl_enable_scissor_test(context_t* c, int enable);
+static void ggl_enable_alpha_test(context_t* c, int enable);
+static void ggl_enable_logic_op(context_t* c, int enable);
+static void ggl_enable_dither(context_t* c, int enable);
+static void ggl_enable_stencil_test(context_t* c, int enable);
+static void ggl_enable_depth_test(context_t* c, int enable);
+static void ggl_enable_aa(context_t* c, int enable);
+static void ggl_enable_point_aa_nice(context_t* c, int enable);
+static void ggl_enable_texture2d(context_t* c, int enable);
+static void ggl_enable_w_lerp(context_t* c, int enable);
+static void ggl_enable_fog(context_t* c, int enable);
+
+static inline int min(int a, int b) CONST;
+static inline int min(int a, int b) {
+    return a < b ? a : b;
+}
+
+static inline int max(int a, int b) CONST;
+static inline int max(int a, int b) {
+    return a < b ? b : a;
+}
+
+// ----------------------------------------------------------------------------
+
+void ggl_error(context_t* c, GGLenum error)
+{
+    if (c->error == GGL_NO_ERROR)
+        c->error = error;
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_bindTexture(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format != c->activeTMU->surface.format)
+        ggl_state_changed(c, GGL_TMU_STATE);    
+    ggl_set_surface(c, &(c->activeTMU->surface), surface);
+}
+
+
+static void ggl_bindTextureLod(void* con, GGLuint tmu,const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    // All LODs must have the same format
+    ggl_set_surface(c, &c->state.texture[tmu].surface, surface);
+}
+
+static void ggl_colorBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format != c->state.buffers.color.format)
+        ggl_state_changed(c, GGL_CB_STATE);
+
+    if (surface->width > c->state.buffers.coverageBufferSize) {
+        // allocate the coverage factor buffer
+        free(c->state.buffers.coverage);
+        c->state.buffers.coverage = (int16_t*)malloc(surface->width * 2);
+        c->state.buffers.coverageBufferSize =
+                c->state.buffers.coverage ? surface->width : 0;
+    }
+    ggl_set_surface(c, &(c->state.buffers.color), surface);
+    if (c->state.buffers.read.format == 0) {
+        ggl_set_surface(c, &(c->state.buffers.read), surface);
+    }
+    ggl_set_scissor(c);
+}
+
+static void ggl_readBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    ggl_set_surface(c, &(c->state.buffers.read), surface);
+}
+
+static void ggl_depthBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format == GGL_PIXEL_FORMAT_Z_16) {
+        ggl_set_surface(c, &(c->state.buffers.depth), surface);
+    } else {
+        c->state.buffers.depth.format = GGL_PIXEL_FORMAT_NONE;
+        ggl_enable_depth_test(c, 0);
+    }
+}
+
+static void ggl_scissor(void* con, GGLint x, GGLint y,
+        GGLsizei width, GGLsizei height)
+{
+    GGL_CONTEXT(c, con);
+    c->state.scissor.user_left = x;
+    c->state.scissor.user_top = y;
+    c->state.scissor.user_right = x + width;
+    c->state.scissor.user_bottom = y + height;
+    ggl_set_scissor(c);
+}
+
+// ----------------------------------------------------------------------------
+
+static void enable_disable(context_t* c, GGLenum name, int en)
+{
+    switch (name) {
+    case GGL_BLEND:             ggl_enable_blending(c, en);      break;
+    case GGL_SCISSOR_TEST:      ggl_enable_scissor_test(c, en);  break;
+    case GGL_ALPHA_TEST:        ggl_enable_alpha_test(c, en);    break;
+    case GGL_COLOR_LOGIC_OP:    ggl_enable_logic_op(c, en);      break;
+    case GGL_DITHER:            ggl_enable_dither(c, en);        break;
+    case GGL_STENCIL_TEST:      ggl_enable_stencil_test(c, en);  break;
+    case GGL_DEPTH_TEST:        ggl_enable_depth_test(c, en);    break;
+    case GGL_AA:                ggl_enable_aa(c, en);            break;
+    case GGL_TEXTURE_2D:        ggl_enable_texture2d(c, en);     break;
+    case GGL_W_LERP:            ggl_enable_w_lerp(c, en);        break;
+    case GGL_FOG:               ggl_enable_fog(c, en);           break;
+    case GGL_POINT_SMOOTH_NICE: ggl_enable_point_aa_nice(c, en); break;
+    }
+}
+
+static void ggl_enable(void* con, GGLenum name)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, 1);
+}
+
+static void ggl_disable(void* con, GGLenum name)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, 0);
+}
+
+static void ggl_enableDisable(void* con, GGLenum name, GGLboolean en)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, en ? 1 : 0);
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_shadeModel(void* con, GGLenum mode)
+{
+    GGL_CONTEXT(c, con);
+    switch (mode) {
+    case GGL_FLAT:
+        if (c->state.enables & GGL_ENABLE_SMOOTH) {
+            c->state.enables &= ~GGL_ENABLE_SMOOTH;
+            ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+        }
+        break;
+    case GGL_SMOOTH:
+        if (!(c->state.enables & GGL_ENABLE_SMOOTH)) {
+            c->state.enables |= GGL_ENABLE_SMOOTH;
+            ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+        }
+        break;
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_color4xv(void* con, const GGLclampx* color)
+{
+    GGL_CONTEXT(c, con);
+	c->shade.r0 = gglFixedToIteratedColor(color[0]);
+	c->shade.g0 = gglFixedToIteratedColor(color[1]);
+	c->shade.b0 = gglFixedToIteratedColor(color[2]);
+	c->shade.a0 = gglFixedToIteratedColor(color[3]);
+}
+
+static void ggl_colorGrad12xv(void* con, const GGLcolor* grad)
+{
+    GGL_CONTEXT(c, con);
+    // it is very important to round the iterated value here because
+    // the rasterizer doesn't clamp them, therefore the iterated value
+    //must absolutely be correct.
+    // GGLColor is encoded as 8.16 value
+    const int32_t round = 0x8000;
+	c->shade.r0   = grad[ 0] + round;
+	c->shade.drdx = grad[ 1];
+	c->shade.drdy = grad[ 2];
+	c->shade.g0   = grad[ 3] + round;
+	c->shade.dgdx = grad[ 4];
+	c->shade.dgdy = grad[ 5];
+	c->shade.b0   = grad[ 6] + round;
+	c->shade.dbdx = grad[ 7];
+	c->shade.dbdy = grad[ 8];
+	c->shade.a0   = grad[ 9] + round;
+	c->shade.dadx = grad[10];
+	c->shade.dady = grad[11];
+}
+
+static void ggl_zGrad3xv(void* con, const GGLfixed32* grad)
+{
+    GGL_CONTEXT(c, con);
+    // z iterators are encoded as 0.32 fixed point and the z-buffer
+    // holds 16 bits, the rounding value is 0x8000.
+    const uint32_t round = 0x8000;
+	c->shade.z0   = grad[0] + round;
+	c->shade.dzdx = grad[1];
+	c->shade.dzdy = grad[2];
+}
+
+static void ggl_wGrad3xv(void* con, const GGLfixed* grad)
+{
+    GGL_CONTEXT(c, con);
+	c->shade.w0   = grad[0];
+	c->shade.dwdx = grad[1];
+	c->shade.dwdy = grad[2];
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_fogGrad3xv(void* con, const GGLfixed* grad)
+{
+    GGL_CONTEXT(c, con);
+	c->shade.f0     = grad[0];
+	c->shade.dfdx   = grad[1];
+	c->shade.dfdy   = grad[2];
+}
+
+static void ggl_fogColor3xv(void* con, const GGLclampx* color)
+{
+    GGL_CONTEXT(c, con);
+    const int32_t r = gglClampx(color[0]);
+    const int32_t g = gglClampx(color[1]);
+    const int32_t b = gglClampx(color[2]);
+	c->state.fog.color[GGLFormat::RED]  = (r - (r>>8))>>8;
+	c->state.fog.color[GGLFormat::GREEN]= (g - (g>>8))>>8;
+	c->state.fog.color[GGLFormat::BLUE] = (b - (b>>8))>>8;
+}
+
+static void ggl_enable_fog(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_FOG)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_FOG;
+        else        c->state.enables &= ~GGL_ENABLE_FOG;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_blendFunc(void* con, GGLenum src, GGLenum dst)
+{
+    GGL_CONTEXT(c, con);
+    c->state.blend.src = src;
+    c->state.blend.src_alpha = src;
+    c->state.blend.dst = dst;
+    c->state.blend.dst_alpha = dst;
+    c->state.blend.alpha_separate = 0;
+    if (c->state.enables & GGL_ENABLE_BLENDING) {
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_blendFuncSeparate(void* con,
+        GGLenum src, GGLenum dst,
+        GGLenum srcAlpha, GGLenum dstAplha)
+{
+    GGL_CONTEXT(c, con);
+    c->state.blend.src = src;
+    c->state.blend.src_alpha = srcAlpha;
+    c->state.blend.dst = dst;
+    c->state.blend.dst_alpha = dstAplha;
+    c->state.blend.alpha_separate = 1;
+    if (c->state.enables & GGL_ENABLE_BLENDING) {
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_texEnvi(void* con,	GGLenum target,
+							GGLenum pname,
+							GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_ENV || pname != GGL_TEXTURE_ENV_MODE) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    switch (param) {
+    case GGL_REPLACE:
+    case GGL_MODULATE:
+    case GGL_DECAL:
+    case GGL_BLEND:
+    case GGL_ADD:
+        if (c->activeTMU->env != param) {
+            c->activeTMU->env = param;
+            ggl_state_changed(c, GGL_TMU_STATE);
+        }
+        break;
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_texEnvxv(void* con, GGLenum target,
+        GGLenum pname, const GGLfixed* params)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_ENV) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    switch (pname) {
+    case GGL_TEXTURE_ENV_MODE:
+        ggl_texEnvi(con, target, pname, params[0]);
+        break;
+    case GGL_TEXTURE_ENV_COLOR: {
+        uint8_t* const color = c->activeTMU->env_color;
+        const GGLclampx r = gglClampx(params[0]);
+        const GGLclampx g = gglClampx(params[1]);
+        const GGLclampx b = gglClampx(params[2]);
+        const GGLclampx a = gglClampx(params[3]);
+        color[0] = (a-(a>>8))>>8;
+        color[1] = (r-(r>>8))>>8;
+        color[2] = (g-(g>>8))>>8;
+        color[3] = (b-(b>>8))>>8;
+        break;
+    }
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+}
+
+
+static void ggl_texParameteri(void* con,
+        GGLenum target,
+        GGLenum pname,
+        GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_2D) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+
+    if (param == GGL_CLAMP_TO_EDGE)
+        param = GGL_CLAMP;
+
+    uint16_t* what = 0;
+    switch (pname) {
+    case GGL_TEXTURE_WRAP_S:
+        if ((param == GGL_CLAMP) ||
+            (param == GGL_REPEAT)) {
+            what = &c->activeTMU->s_wrap;
+        }
+        break;
+    case GGL_TEXTURE_WRAP_T:
+        if ((param == GGL_CLAMP) ||
+            (param == GGL_REPEAT)) {
+            what = &c->activeTMU->t_wrap;
+        }
+        break;
+    case GGL_TEXTURE_MIN_FILTER:
+        if ((param == GGL_NEAREST) ||
+            (param == GGL_NEAREST_MIPMAP_NEAREST) ||
+            (param == GGL_NEAREST_MIPMAP_LINEAR)) {
+            what = &c->activeTMU->min_filter;
+            param = GGL_NEAREST;
+        }
+        if ((param == GGL_LINEAR) ||
+            (param == GGL_LINEAR_MIPMAP_NEAREST) ||
+            (param == GGL_LINEAR_MIPMAP_LINEAR)) {
+            what = &c->activeTMU->min_filter;
+            param = GGL_LINEAR;
+        }
+        break;
+    case GGL_TEXTURE_MAG_FILTER:
+        if ((param == GGL_NEAREST) ||
+            (param == GGL_LINEAR)) {
+            what = &c->activeTMU->mag_filter;
+        }
+        break;
+    }
+    
+    if (!what) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    
+    if (*what != param) {
+        *what = param;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+}
+
+static void ggl_texCoordGradScale8xv(void* con, GGLint tmu, const int32_t* grad)
+{
+    GGL_CONTEXT(c, con);
+    texture_t& u = c->state.texture[tmu];
+	u.shade.is0   = grad[0];
+	u.shade.idsdx = grad[1];
+	u.shade.idsdy = grad[2];
+	u.shade.it0   = grad[3];
+	u.shade.idtdx = grad[4];
+	u.shade.idtdy = grad[5];
+    u.shade.sscale= grad[6];
+    u.shade.tscale= grad[7];
+}
+
+static void ggl_texCoord2x(void* con, GGLfixed s, GGLfixed t)
+{
+    GGL_CONTEXT(c, con);
+	c->activeTMU->shade.is0 = s;
+	c->activeTMU->shade.it0 = t;
+    c->activeTMU->shade.sscale= 0;
+    c->activeTMU->shade.tscale= 0;
+}
+
+static void ggl_texCoord2i(void* con, GGLint s, GGLint t)
+{
+    ggl_texCoord2x(con, s<<16, t<<16);
+}
+
+static void ggl_texGeni(void* con, GGLenum coord, GGLenum pname, GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (pname != GGL_TEXTURE_GEN_MODE) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+
+    uint32_t* coord_ptr = 0;
+    if (coord == GGL_S)         coord_ptr = &(c->activeTMU->s_coord);
+    else if (coord == GGL_T)    coord_ptr = &(c->activeTMU->t_coord);
+
+    if (coord_ptr) {
+        if (*coord_ptr != uint32_t(param)) {
+            *coord_ptr = uint32_t(param);
+            ggl_state_changed(c, GGL_TMU_STATE);
+        }
+    } else {
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_activeTexture(void* con, GGLuint tmu)
+{
+    GGL_CONTEXT(c, con);
+    if (tmu >= GGLuint(GGL_TEXTURE_UNIT_COUNT)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    c->activeTMUIndex = tmu;
+    c->activeTMU = &(c->state.texture[tmu]);
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_colorMask(void* con, GGLboolean r,
+                                     GGLboolean g,
+                                     GGLboolean b,
+                                     GGLboolean a)
+{
+    GGL_CONTEXT(c, con);
+    int mask = 0;
+    if (a)  mask |= 1 << GGLFormat::ALPHA;
+    if (r)  mask |= 1 << GGLFormat::RED;
+    if (g)  mask |= 1 << GGLFormat::GREEN;
+    if (b)  mask |= 1 << GGLFormat::BLUE;
+    if (c->state.mask.color != mask) {
+        c->state.mask.color = mask;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_depthMask(void* con, GGLboolean flag)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.mask.depth != flag?1:0) {
+        c->state.mask.depth = flag?1:0;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_stencilMask(void* con, GGLuint mask)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.mask.stencil != mask) {
+        c->state.mask.stencil = mask;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_alphaFuncx(void* con, GGLenum func, GGLclampx ref)
+{
+    GGL_CONTEXT(c, con);
+    if ((func < GGL_NEVER) || (func > GGL_ALWAYS)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    c->state.alpha_test.ref = gglFixedToIteratedColor(gglClampx(ref));
+    if (c->state.alpha_test.func != func) {
+        c->state.alpha_test.func = func;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_depthFunc(void* con, GGLenum func)
+{
+    GGL_CONTEXT(c, con);
+    if ((func < GGL_NEVER) || (func > GGL_ALWAYS)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    if (c->state.depth_test.func != func) {
+        c->state.depth_test.func = func;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_logicOp(void* con, GGLenum opcode)
+{
+    GGL_CONTEXT(c, con);
+    if ((opcode < GGL_CLEAR) || (opcode > GGL_SET)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    if (c->state.logic_op.opcode != opcode) {
+        c->state.logic_op.opcode = opcode;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+
+void ggl_set_scissor(context_t* c)
+{
+    if (c->state.enables & GGL_ENABLE_SCISSOR_TEST) {
+        const int32_t l = c->state.scissor.user_left;
+        const int32_t t = c->state.scissor.user_top;
+        const int32_t r = c->state.scissor.user_right;
+        const int32_t b = c->state.scissor.user_bottom;
+        c->state.scissor.left   = max(0, l);
+        c->state.scissor.right  = min(c->state.buffers.color.width, r);
+        c->state.scissor.top    = max(0, t);
+        c->state.scissor.bottom = min(c->state.buffers.color.height, b);
+    } else {
+        c->state.scissor.left   = 0;
+        c->state.scissor.top    = 0;
+        c->state.scissor.right  = c->state.buffers.color.width;
+        c->state.scissor.bottom = c->state.buffers.color.height;
+    }
+}
+
+void ggl_enable_blending(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_BLENDING)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_BLENDING;
+        else        c->state.enables &= ~GGL_ENABLE_BLENDING;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_scissor_test(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_SCISSOR_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_SCISSOR_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_SCISSOR_TEST;
+        ggl_set_scissor(c);
+    }
+}
+
+void ggl_enable_alpha_test(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_ALPHA_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_ALPHA_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_ALPHA_TEST;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_logic_op(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_LOGIC_OP)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_LOGIC_OP;
+        else        c->state.enables &= ~GGL_ENABLE_LOGIC_OP;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_dither(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_DITHER)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_DITHER;
+        else        c->state.enables &= ~GGL_ENABLE_DITHER;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_stencil_test(context_t* c, int enable)
+{
+}
+
+void ggl_enable_depth_test(context_t* c, int enable)
+{
+    if (c->state.buffers.depth.format == 0)
+        enable = 0;
+    const int e = (c->state.enables & GGL_ENABLE_DEPTH_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_DEPTH_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_DEPTH_TEST;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_aa(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_AA)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_AA;
+        else        c->state.enables &= ~GGL_ENABLE_AA;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_point_aa_nice(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_POINT_AA_NICE)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_POINT_AA_NICE;
+        else        c->state.enables &= ~GGL_ENABLE_POINT_AA_NICE;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_w_lerp(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_W)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_W;
+        else        c->state.enables &= ~GGL_ENABLE_W;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_texture2d(context_t* c, int enable)
+{
+    if (c->activeTMU->enable != enable) {
+        const uint32_t tmu = c->activeTMUIndex;
+        c->activeTMU->enable = enable;
+        const uint32_t mask = 1UL << tmu;
+        if (enable)                 c->state.enabled_tmu |= mask;
+        else                        c->state.enabled_tmu &= ~mask;
+        if (c->state.enabled_tmu)   c->state.enables |= GGL_ENABLE_TMUS;
+        else                        c->state.enables &= ~GGL_ENABLE_TMUS;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+}
+
+        
+// ----------------------------------------------------------------------------
+
+int64_t ggl_system_time()
+{
+#if defined(HAVE_POSIX_CLOCKS)
+    struct timespec t;
+    t.tv_sec = t.tv_nsec = 0;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
+    return int64_t(t.tv_sec)*1000000000LL + t.tv_nsec;
+#else
+    // we don't support the clocks here.
+    struct timeval t;
+    t.tv_sec = t.tv_usec = 0;
+    gettimeofday(&t, NULL);
+    return int64_t(t.tv_sec)*1000000000LL + int64_t(t.tv_usec)*1000LL;
+#endif
+}
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_procs(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, scissor);
+    GGL_INIT_PROC(procs, activeTexture);
+    GGL_INIT_PROC(procs, bindTexture);
+    GGL_INIT_PROC(procs, bindTextureLod);
+    GGL_INIT_PROC(procs, colorBuffer);
+    GGL_INIT_PROC(procs, readBuffer);
+    GGL_INIT_PROC(procs, depthBuffer);
+    GGL_INIT_PROC(procs, enable);
+    GGL_INIT_PROC(procs, disable);
+    GGL_INIT_PROC(procs, enableDisable);
+    GGL_INIT_PROC(procs, shadeModel);
+    GGL_INIT_PROC(procs, color4xv);
+    GGL_INIT_PROC(procs, colorGrad12xv);
+    GGL_INIT_PROC(procs, zGrad3xv);
+    GGL_INIT_PROC(procs, wGrad3xv);
+    GGL_INIT_PROC(procs, fogGrad3xv);
+    GGL_INIT_PROC(procs, fogColor3xv);
+    GGL_INIT_PROC(procs, blendFunc);
+    GGL_INIT_PROC(procs, blendFuncSeparate);
+    GGL_INIT_PROC(procs, texEnvi);
+    GGL_INIT_PROC(procs, texEnvxv);
+    GGL_INIT_PROC(procs, texParameteri);
+    GGL_INIT_PROC(procs, texCoord2i);
+    GGL_INIT_PROC(procs, texCoord2x);
+    GGL_INIT_PROC(procs, texCoordGradScale8xv);
+    GGL_INIT_PROC(procs, texGeni);
+    GGL_INIT_PROC(procs, colorMask);
+    GGL_INIT_PROC(procs, depthMask);
+    GGL_INIT_PROC(procs, stencilMask);
+    GGL_INIT_PROC(procs, alphaFuncx);
+    GGL_INIT_PROC(procs, depthFunc);
+    GGL_INIT_PROC(procs, logicOp);
+    ggl_init_clear(c);
+}
+
+void ggl_init_context(context_t* c)
+{
+    memset(c, 0, sizeof(context_t));
+    ggl_init_procs(c);
+    ggl_init_trap(c);
+    ggl_init_scanline(c);
+    ggl_init_texture(c);
+    ggl_init_picker(c);
+    ggl_init_raster(c);
+    c->formats = gglGetPixelFormatTable();
+    c->state.blend.src = GGL_ONE;
+    c->state.blend.dst = GGL_ZERO;
+    c->state.blend.src_alpha = GGL_ONE;
+    c->state.blend.dst_alpha = GGL_ZERO;
+    c->state.mask.color = 0xF;
+    c->state.mask.depth = 0;
+    c->state.mask.stencil = 0xFFFFFFFF;
+    c->state.logic_op.opcode = GGL_COPY;
+    c->state.alpha_test.func = GGL_ALWAYS;
+    c->state.depth_test.func = GGL_LESS;
+    c->state.depth_test.clearValue = FIXED_ONE;
+    c->shade.w0 = FIXED_ONE;
+    memcpy(c->ditherMatrix, gDitherMatrix, sizeof(gDitherMatrix));
+}
+
+void ggl_uninit_context(context_t* c)
+{
+    ggl_uninit_scanline(c);
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+// ----------------------------------------------------------------------------
+
+
+
+using namespace android;
+
+ssize_t gglInit(GGLContext** context)
+{
+    void* const base = malloc(sizeof(context_t) + 32);
+	if (base) {
+        // always align the context on cache lines
+        context_t *c = (context_t *)((ptrdiff_t(base)+31) & ~0x1FL);
+        ggl_init_context(c);
+        c->base = base;
+		*context = (GGLContext*)c;
+	} else {
+		return -1;
+	}
+	return 0;
+}
+
+ssize_t gglUninit(GGLContext* con)
+{
+    GGL_CONTEXT(c, (void*)con);
+    ggl_uninit_context(c);
+	free(c->base);
+	return 0;
+}
+
diff --git a/libpixelflinger/raster.cpp b/libpixelflinger/raster.cpp
new file mode 100644
index 0000000..d751202
--- /dev/null
+++ b/libpixelflinger/raster.cpp
@@ -0,0 +1,217 @@
+/* libs/pixelflinger/raster.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+
+#include <string.h>
+
+#include "raster.h"
+#include "trap.h"
+
+namespace android {
+
+static void ggl_rasterPos2x(void* con, GGLfixed x, GGLfixed y);
+static void ggl_rasterPos2i(void* con, GGLint x, GGLint y);
+static void ggl_copyPixels(void* con, GGLint xs, GGLint ys,
+        GGLsizei width, GGLsizei height, GGLenum type);
+
+void ggl_init_raster(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, copyPixels);
+    GGL_INIT_PROC(procs, rasterPos2x);
+    GGL_INIT_PROC(procs, rasterPos2i);
+}
+
+void ggl_rasterPos2x(void* con, GGLfixed x, GGLfixed y)
+{
+    GGL_CONTEXT(c, con);
+    // raster pos should be processed just like glVertex
+    c->state.raster.x = x;
+    c->state.raster.y = y;
+}
+
+void ggl_rasterPos2i(void* con, GGLint x, GGLint y)
+{
+    ggl_rasterPos2x(con, gglIntToFixed(x), gglIntToFixed(y));
+}
+
+void ggl_copyPixels(void* con, GGLint xs, GGLint ys,
+        GGLsizei width, GGLsizei height, GGLenum type)
+{
+    GGL_CONTEXT(c, con);
+
+    // color-buffer
+    surface_t* cb = &(c->state.buffers.color);
+
+    // undefined behaviour if we try to copy from outside the surface
+    if (uint32_t(xs) > cb->width)
+        return;
+    if (uint32_t(ys) > cb->height)
+        return;
+    if (uint32_t(xs + width) > cb->width)
+        return;
+    if (uint32_t(ys + height) > cb->height)
+        return;
+
+    // copy to current raster position
+    GGLint xd = gglFixedToIntRound(c->state.raster.x);
+    GGLint yd = gglFixedToIntRound(c->state.raster.y);
+
+    // clip to scissor
+    if (xd < GGLint(c->state.scissor.left)) {
+        GGLint offset = GGLint(c->state.scissor.left) - xd;
+        xd = GGLint(c->state.scissor.left);
+        xs += offset;
+        width -= offset;
+    }
+    if (yd < GGLint(c->state.scissor.top)) {
+        GGLint offset = GGLint(c->state.scissor.top) - yd;
+        yd = GGLint(c->state.scissor.top);
+        ys += offset;
+        height -= offset;
+    }
+    if ((xd + width) > GGLint(c->state.scissor.right)) {
+        width = GGLint(c->state.scissor.right) - xd;
+    }
+    if ((yd + height) > GGLint(c->state.scissor.bottom)) {
+        height = GGLint(c->state.scissor.bottom) - yd;
+    }
+
+    if (width<=0 || height<=0) {
+        return; // nothing to copy
+    }
+
+    if (xs==xd && ys==yd) {
+        // nothing to do, but be careful, this might not be true when we support
+        // gglPixelTransfer, gglPixelMap and gglPixelZoom
+        return;
+    }
+
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* src = reinterpret_cast<uint8_t*>(cb->data)
+            + (xs + (cb->stride * ys)) * fp->size;
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data)
+            + (xd + (cb->stride * yd)) * fp->size;
+    const size_t bpr = cb->stride * fp->size;
+    const size_t rowsize = width * fp->size;
+    size_t yc = height;
+
+    if (ys < yd) {
+        // bottom to top
+        src += height * bpr;
+        dst += height * bpr;
+        do {
+            dst -= bpr;
+            src -= bpr;
+            memcpy(dst, src, rowsize);
+        } while (--yc);
+    } else {
+        if (ys == yd) {
+            // might be right to left
+            do {
+                memmove(dst, src, rowsize);
+                dst += bpr;
+                src += bpr;
+            } while (--yc);
+        } else {
+            // top to bottom
+            do {
+                memcpy(dst, src, rowsize);
+                dst += bpr;
+                src += bpr;
+            } while (--yc);
+        }
+    }
+}
+
+}; // namespace android
+
+using namespace android;
+
+GGLint gglBitBlti(GGLContext* con, int tmu, GGLint crop[4], GGLint where[4])
+{
+    GGL_CONTEXT(c, (void*)con);
+
+     GGLint x = where[0];
+     GGLint y = where[1];
+     GGLint w = where[2];
+     GGLint h = where[3];
+
+    // exclsively enable this tmu
+    const GGLSurface& cbSurface = c->state.buffers.color.s;
+    c->procs.activeTexture(c, tmu);
+    c->procs.disable(c, GGL_W_LERP);
+
+    uint32_t tmus = 1UL<<tmu;
+    if (c->state.enabled_tmu != tmus) {
+        c->activeTMU->enable = 1;
+        c->state.enabled_tmu = tmus;
+        c->state.enables |= GGL_ENABLE_TMUS;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+
+    const GGLint Wcr = crop[2];
+    const GGLint Hcr = crop[3];
+    if ((w == Wcr) && (h == Hcr)) {
+        c->procs.texGeni(c, GGL_S, GGL_TEXTURE_GEN_MODE, GGL_ONE_TO_ONE);
+        c->procs.texGeni(c, GGL_T, GGL_TEXTURE_GEN_MODE, GGL_ONE_TO_ONE);
+        const GGLint Ucr = crop[0];
+        const GGLint Vcr = crop[1];
+        const GGLint s0  = Ucr - x;
+        const GGLint t0  = Vcr - y;
+        c->procs.texCoord2i(c, s0, t0);
+        c->procs.recti(c, x, y, x+w, y+h);
+    } else {
+        int32_t texcoords[8];
+        x = gglIntToFixed(x);
+        y = gglIntToFixed(y); 
+    
+        // we CLAMP here, which works with premultiplied (s,t)
+        c->procs.texParameteri(c, GGL_TEXTURE_2D, GGL_TEXTURE_WRAP_S, GGL_CLAMP);
+        c->procs.texParameteri(c, GGL_TEXTURE_2D, GGL_TEXTURE_WRAP_T, GGL_CLAMP);
+        c->procs.texGeni(c, GGL_S, GGL_TEXTURE_GEN_MODE, GGL_AUTOMATIC);
+        c->procs.texGeni(c, GGL_T, GGL_TEXTURE_GEN_MODE, GGL_AUTOMATIC);
+    
+        const GGLint Ucr = crop[0] << 16;
+        const GGLint Vcr = crop[1] << 16;
+        const GGLint Wcr = crop[2] << 16;
+        const GGLint Hcr = crop[3] << 16;
+    
+        // computes texture coordinates (pre-multiplied)
+        int32_t dsdx = Wcr / w;   // dsdx = ((Wcr/w)/Wt)*Wt
+        int32_t dtdy = Hcr / h;   // dtdy = ((Hcr/h)/Ht)*Ht
+        int32_t s0   = Ucr - gglMulx(dsdx, x); // s0 = Ucr - x * dsdx
+        int32_t t0   = Vcr - gglMulx(dtdy, y); // t0 = Vcr - y * dtdy
+        texcoords[0] = s0;
+        texcoords[1] = dsdx;
+        texcoords[2] = 0;
+        texcoords[3] = t0;
+        texcoords[4] = 0;
+        texcoords[5] = dtdy;
+        texcoords[6] = 0;
+        texcoords[7] = 0;
+        c->procs.texCoordGradScale8xv(c, tmu, texcoords);
+        c->procs.recti(c, 
+                gglFixedToIntRound(x),
+                gglFixedToIntRound(y),
+                gglFixedToIntRound(x)+w,
+                gglFixedToIntRound(y)+h);
+    }
+    return 0;
+}
+
diff --git a/libpixelflinger/raster.h b/libpixelflinger/raster.h
new file mode 100644
index 0000000..9f0f240
--- /dev/null
+++ b/libpixelflinger/raster.h
@@ -0,0 +1,33 @@
+/* libs/pixelflinger/raster.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGL_RASTER_H
+#define ANDROID_GGL_RASTER_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_raster(context_t* c);
+
+void gglCopyPixels(void* c, GGLint x, GGLint y, GGLsizei width, GGLsizei height, GGLenum type);
+void gglRasterPos2d(void* c, GGLint x, GGLint y);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_RASTER_H
diff --git a/libpixelflinger/rotate90CW_4x4_16v6.S b/libpixelflinger/rotate90CW_4x4_16v6.S
new file mode 100644
index 0000000..8e3e142
--- /dev/null
+++ b/libpixelflinger/rotate90CW_4x4_16v6.S
@@ -0,0 +1,62 @@
+/*
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+    .text
+    .align
+    
+    .global rotate90CW_4x4_16v6
+
+// Rotates 90deg CW a 4x4 block of 16bpp pixels using ARMv6
+// src and dst must be 4 pixels-aligned (2-pixels aligned might
+// actually work)
+//
+// The code below is complicated by ARM's little endianness. 
+
+rotate90CW_4x4_16v6:
+    // r0 = dst
+    // r1 = src
+    // r2 = dst stride in pixels
+    // r3 = src stride in pixels
+
+    stmfd   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, lr}
+    add     r14, r3, r3
+    add     r12, r2, r2
+
+    ldrd    r2, r3, [r1], r14
+    ldrd    r4, r5, [r1], r14
+    ldrd    r6, r7, [r1], r14
+    ldrd    r8, r9, [r1]
+
+    pkhbt   r10, r8, r6, lsl #16
+    pkhbt   r11, r4, r2, lsl #16
+    strd    r10, r11, [r0], r12  
+
+    pkhtb   r10, r6, r8, asr #16
+    pkhtb   r11, r2, r4, asr #16
+
+    strd    r10, r11, [r0], r12  
+    pkhbt   r10, r9, r7, lsl #16
+    pkhbt   r11, r5, r3, lsl #16
+
+    strd    r10, r11, [r0], r12  
+
+    pkhtb   r10, r7, r9, asr #16
+    pkhtb   r11, r3, r5, asr #16
+    strd    r10, r11, [r0]
+
+    ldmfd   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, pc}
diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp
new file mode 100644
index 0000000..f700306
--- /dev/null
+++ b/libpixelflinger/scanline.cpp
@@ -0,0 +1,1496 @@
+/* libs/pixelflinger/scanline.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#define LOG_TAG "pixelflinger"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <cutils/memory.h>
+#include <cutils/log.h>
+
+#include "buffer.h"
+#include "scanline.h"
+
+#include "codeflinger/CodeCache.h"
+#include "codeflinger/GGLAssembler.h"
+#include "codeflinger/ARMAssembler.h"
+//#include "codeflinger/ARMAssemblerOptimizer.h"
+
+// ----------------------------------------------------------------------------
+
+#define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
+#define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic 
+#define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
+#define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen
+
+#ifdef NDEBUG
+#   define ANDROID_RELEASE
+#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
+#else
+#   define ANDROID_DEBUG
+#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
+#endif
+
+#if defined(__arm__)
+#   define ANDROID_ARM_CODEGEN  1
+#else
+#   define ANDROID_ARM_CODEGEN  0
+#endif
+
+#define DEBUG__CODEGEN_ONLY     0
+
+
+#define ASSEMBLY_SCRATCH_SIZE   2048
+
+// ----------------------------------------------------------------------------
+namespace android {
+// ----------------------------------------------------------------------------
+
+static void init_y(context_t*, int32_t);
+static void init_y_noop(context_t*, int32_t);
+static void init_y_packed(context_t*, int32_t);
+static void init_y_error(context_t*, int32_t);
+
+static void step_y__generic(context_t* c);
+static void step_y__nop(context_t*);
+static void step_y__smooth(context_t* c);
+static void step_y__tmu(context_t* c);
+static void step_y__w(context_t* c);
+
+static void scanline(context_t* c);
+static void scanline_perspective(context_t* c);
+static void scanline_perspective_single(context_t* c);
+static void scanline_t32cb16blend(context_t* c);
+static void scanline_t32cb16(context_t* c);
+static void scanline_memcpy(context_t* c);
+static void scanline_memset8(context_t* c);
+static void scanline_memset16(context_t* c);
+static void scanline_memset32(context_t* c);
+static void scanline_noop(context_t* c);
+static void scanline_set(context_t* c);
+static void scanline_clear(context_t* c);
+
+static void rect_generic(context_t* c, size_t yc);
+static void rect_memcpy(context_t* c, size_t yc);
+
+extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
+extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
+
+// ----------------------------------------------------------------------------
+
+struct shortcut_t {
+    needs_filter_t  filter;
+    const char*     desc;
+    void            (*scanline)(context_t*);
+    void            (*init_y)(context_t*, int32_t);
+};
+
+// Keep in sync with needs
+static shortcut_t shortcuts[] = {
+    { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, blend", scanline_t32cb16blend, init_y_noop },
+    { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx", scanline_t32cb16, init_y_noop  },  
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
+        "(nop) alpha test", scanline_noop, init_y_noop },
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
+        "(nop) depth test", scanline_noop, init_y_noop },
+    { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
+        "(nop) logic_op", scanline_noop, init_y_noop },
+    { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
+        "(nop) color mask", scanline_noop, init_y_noop },
+    { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(set) logic_op", scanline_set, init_y_noop },
+    { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(clear) logic_op", scanline_clear, init_y_noop },
+    { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(clear) blending 0/0", scanline_clear, init_y_noop },
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
+        "(error) invalid color-buffer format", scanline_noop, init_y_error },
+};
+static const needs_filter_t noblend1to1 = {
+        // (disregard dithering, see below)
+        { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
+        { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
+};
+static  const needs_filter_t fill16noblend = {
+        { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
+};
+
+// ----------------------------------------------------------------------------
+
+#if ANDROID_ARM_CODEGEN
+static CodeCache gCodeCache(12 * 1024);
+
+class ScanlineAssembly : public Assembly {
+    AssemblyKey<needs_t> mKey;
+public:
+    ScanlineAssembly(needs_t needs, size_t size)
+        : Assembly(size), mKey(needs) { }
+    const AssemblyKey<needs_t>& key() const { return mKey; }
+};
+#endif
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_scanline(context_t* c)
+{
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+    c->scanline = scanline;
+}
+
+void ggl_uninit_scanline(context_t* c)
+{
+    if (c->state.buffers.coverage)
+        free(c->state.buffers.coverage);
+#if ANDROID_ARM_CODEGEN
+    if (c->scanline_as)
+        c->scanline_as->decStrong(c);
+#endif
+}
+
+// ----------------------------------------------------------------------------
+
+static void pick_scanline(context_t* c)
+{
+#if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))
+
+#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+    c->scanline = scanline;
+    return;
+#endif
+
+    //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
+    //    c->state.needs.n, c->state.needs.p,
+    //    c->state.needs.t[0], c->state.needs.t[1]);
+
+    // first handle the special case that we cannot test with a filter
+    const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
+    if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
+        if (c->state.needs.match(noblend1to1)) {
+            // this will match regardless of dithering state, since both
+            // src and dest have the same format anyway, there is no dithering
+            // to be done.
+            const GGLFormat* f =
+                &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
+            if ((f->components == GGL_RGB) ||
+                (f->components == GGL_RGBA) ||
+                (f->components == GGL_LUMINANCE) ||
+                (f->components == GGL_LUMINANCE_ALPHA))
+            {
+                // format must have all of RGB components
+                // (so the current color doesn't show through)
+                c->scanline = scanline_memcpy;
+                c->init_y = init_y_noop;
+                return;
+            }
+        }
+    }
+
+    if (c->state.needs.match(fill16noblend)) {
+        c->init_y = init_y_packed;
+        switch (c->formats[cb_format].size) {
+        case 1: c->scanline = scanline_memset8;  return;
+        case 2: c->scanline = scanline_memset16; return;
+        case 4: c->scanline = scanline_memset32; return;
+        }
+    }
+
+    const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
+    for (int i=0 ; i<numFilters ; i++) {
+        if (c->state.needs.match(shortcuts[i].filter)) {
+            c->scanline = shortcuts[i].scanline;
+            c->init_y = shortcuts[i].init_y;
+            return;
+        }
+    }
+
+#endif // DEBUG__CODEGEN_ONLY
+
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+
+#if ANDROID_ARM_CODEGEN
+    // we're going to have to generate some code...
+    // here, generate code for our pixel pipeline
+    const AssemblyKey<needs_t> key(c->state.needs);
+    sp<Assembly> assembly = gCodeCache.lookup(key);
+    if (assembly == 0) {
+        // create a new assembly region
+        sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs, 
+                ASSEMBLY_SCRATCH_SIZE);
+        // initialize our assembler
+        GGLAssembler assembler( new ARMAssembler(a) );
+        //GGLAssembler assembler(
+        //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
+        // generate the scanline code for the given needs
+        int err = assembler.scanline(c->state.needs, c);
+        if (ggl_likely(!err)) {
+            // finally, cache this assembly
+            err = gCodeCache.cache(a->key(), a);
+        }
+        if (ggl_unlikely(err)) {
+            LOGE("error generating or caching assembly. Reverting to NOP.");
+            c->scanline = scanline_noop;
+            c->init_y = init_y_noop;
+            c->step_y = step_y__nop;
+            return;
+        }
+        assembly = a;
+    }
+
+    // release the previous assembly
+    if (c->scanline_as) {
+        c->scanline_as->decStrong(c);
+    }
+
+    //LOGI("using generated pixel-pipeline");
+    c->scanline_as = assembly.get();
+    c->scanline_as->incStrong(c); //  hold on to assembly
+    c->scanline = (void(*)(context_t* c))assembly->base();
+#else
+//    LOGW("using generic (slow) pixel-pipeline");
+    c->scanline = scanline;
+#endif
+}
+
+void ggl_pick_scanline(context_t* c)
+{
+    pick_scanline(c);
+    if ((c->state.enables & GGL_ENABLE_W) &&
+        (c->state.enables & GGL_ENABLE_TMUS))
+    {
+        c->span = c->scanline;
+        c->scanline = scanline_perspective;
+        if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
+            // only one TMU enabled
+            c->scanline = scanline_perspective_single;
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
+static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
+        const pixel_t* src, const pixel_t* dst);
+static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
+
+#if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
+
+// no need to compile the generic-pipeline, it can't be reached
+void scanline(context_t*)
+{
+}
+
+#else
+
+void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
+{
+    if (su && sv) {
+        if (su > sv) {
+            v = ggl_expand(v, sv, su);
+            sv = su;
+        } else if (su < sv) {
+            u = ggl_expand(u, su, sv);
+            su = sv;
+        }
+    }
+}
+
+void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
+{
+    rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
+    rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
+    rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
+    rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
+
+    pixel_t sf, df;
+    blend_factor(c, &sf, c->state.blend.src, fragment, fb);
+    blend_factor(c, &df, c->state.blend.dst, fragment, fb);
+
+    fragment->c[1] =
+            gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
+    fragment->c[2] =
+            gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
+    fragment->c[3] =
+            gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
+
+    if (c->state.blend.alpha_separate) {
+        blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
+        blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
+    }
+
+    fragment->c[0] =
+            gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
+
+    // clamp to 1.0
+    if (fragment->c[0] >= (1LU<<fragment->s[0]))
+        fragment->c[0] = (1<<fragment->s[0])-1;
+    if (fragment->c[1] >= (1LU<<fragment->s[1]))
+        fragment->c[1] = (1<<fragment->s[1])-1;
+    if (fragment->c[2] >= (1LU<<fragment->s[2]))
+        fragment->c[2] = (1<<fragment->s[2])-1;
+    if (fragment->c[3] >= (1LU<<fragment->s[3]))
+        fragment->c[3] = (1<<fragment->s[3])-1;
+}
+
+static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
+{
+    if (!size)
+        return def;
+
+    // scale to 16 bits
+    if (size > 16) {
+        x >>= (size - 16);
+    } else if (size < 16) {
+        x = ggl_expand(x, size, 16);
+    }
+    x += x >> 15;
+    return x;
+}
+
+void blend_factor(context_t* c, pixel_t* r, 
+        uint32_t factor, const pixel_t* src, const pixel_t* dst)
+{
+    switch (factor) {
+        case GGL_ZERO:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = 0;
+            break;
+        case GGL_ONE:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE;
+            break;
+        case GGL_DST_COLOR:
+            r->c[1] = blendfactor(dst->c[1], dst->s[1]);
+            r->c[2] = blendfactor(dst->c[2], dst->s[2]);
+            r->c[3] = blendfactor(dst->c[3], dst->s[3]);
+            r->c[0] = blendfactor(dst->c[0], dst->s[0]);
+            break;
+        case GGL_SRC_COLOR:
+            r->c[1] = blendfactor(src->c[1], src->s[1]);
+            r->c[2] = blendfactor(src->c[2], src->s[2]);
+            r->c[3] = blendfactor(src->c[3], src->s[3]);
+            r->c[0] = blendfactor(src->c[0], src->s[0]);
+            break;
+        case GGL_ONE_MINUS_DST_COLOR:
+            r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
+            r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
+            r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
+            r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
+            break;
+        case GGL_ONE_MINUS_SRC_COLOR:
+            r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
+            r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
+            r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
+            r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
+            break;
+        case GGL_SRC_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
+            break;
+        case GGL_ONE_MINUS_SRC_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
+            break;
+        case GGL_DST_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
+            break;
+        case GGL_ONE_MINUS_DST_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
+            break;
+        case GGL_SRC_ALPHA_SATURATE:
+            // XXX: GGL_SRC_ALPHA_SATURATE
+            break;
+    }
+}
+
+static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
+{
+    GGLfixed d;
+    if (tx_wrap == GGL_REPEAT) {
+        d = (uint32_t(coord)>>16) * size;
+    } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
+        const GGLfixed clamp_min = FIXED_HALF;
+        const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
+        if (coord < clamp_min)     coord = clamp_min;
+        if (coord > clamp_max)     coord = clamp_max;
+        d = coord;
+    } else { // 1:1
+        const GGLfixed clamp_min = 0;
+        const GGLfixed clamp_max = (size << 16);
+        if (coord < clamp_min)     coord = clamp_min;
+        if (coord > clamp_max)     coord = clamp_max;
+        d = coord;
+    }
+    return d;
+}
+
+static inline
+GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
+{
+    const int32_t end = dvdx * (len-1) + v;
+    if (end < 0)
+        v -= end;
+    v &= ~(v>>31);
+    return v;
+}
+
+void scanline(context_t* c)
+{
+    const uint32_t enables = c->state.enables;
+    const int xs = c->iterators.xl;
+    const int x1 = c->iterators.xr;
+	int xc = x1 - xs;
+    const int16_t* covPtr = c->state.buffers.coverage + xs;
+
+    // All iterated values are sampled at the pixel center
+
+    // reset iterators for that scanline...
+    GGLcolor r, g, b, a;
+    iterators_t& ci = c->iterators;
+    if (enables & GGL_ENABLE_SMOOTH) {
+        r = (xs * c->shade.drdx) + ci.ydrdy;
+        g = (xs * c->shade.dgdx) + ci.ydgdy;
+        b = (xs * c->shade.dbdx) + ci.ydbdy;
+        a = (xs * c->shade.dadx) + ci.ydady;
+        r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
+        g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
+        b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
+        a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
+    } else {
+        r = ci.ydrdy;
+        g = ci.ydgdy;
+        b = ci.ydbdy;
+        a = ci.ydady;
+    }
+
+    // z iterators are 1.31
+    GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
+    GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
+
+    struct {
+        GGLfixed s, t;
+    } tc[GGL_TEXTURE_UNIT_COUNT];
+    if (enables & GGL_ENABLE_TMUS) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            if (c->state.texture[i].enable) {
+                texture_iterators_t& ti = c->state.texture[i].iterators;
+                if (enables & GGL_ENABLE_W) {
+                    tc[i].s = ti.ydsdy;
+                    tc[i].t = ti.ydtdy;
+                } else {
+                    tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
+                    tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
+                }
+            }
+        }
+    }
+
+    pixel_t fragment;
+    pixel_t texel;
+    pixel_t fb;
+
+	uint32_t x = xs;
+	uint32_t y = c->iterators.y;
+
+	while (xc--) {
+    
+        { // just a scope
+
+		// read color (convert to 8 bits by keeping only the integer part)
+        fragment.s[1] = fragment.s[2] =
+        fragment.s[3] = fragment.s[0] = 8;
+        fragment.c[1] = r >> (GGL_COLOR_BITS-8);
+        fragment.c[2] = g >> (GGL_COLOR_BITS-8);
+        fragment.c[3] = b >> (GGL_COLOR_BITS-8);
+        fragment.c[0] = a >> (GGL_COLOR_BITS-8);
+
+		// texturing
+        if (enables & GGL_ENABLE_TMUS) {
+            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+                texture_t& tx = c->state.texture[i];
+                if (!tx.enable)
+                    continue;
+                texture_iterators_t& ti = tx.iterators;
+                int32_t u, v;
+
+                // s-coordinate
+                if (tx.s_coord != GGL_ONE_TO_ONE) {
+                    const int w = tx.surface.width;
+                    u = wrapping(tc[i].s, w, tx.s_wrap);
+                    tc[i].s += ti.dsdx;
+                } else {
+                    u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
+                }
+
+                // t-coordinate
+                if (tx.t_coord != GGL_ONE_TO_ONE) {
+                    const int h = tx.surface.height;
+                    v = wrapping(tc[i].t, h, tx.t_wrap);
+                    tc[i].t += ti.dtdx;
+                } else {
+                    v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
+                }
+
+                // read texture
+                if (tx.mag_filter == GGL_NEAREST &&
+                    tx.min_filter == GGL_NEAREST)
+                {
+                    u >>= 16;
+                    v >>= 16;
+                    tx.surface.read(&tx.surface, c, u, v, &texel);
+                } else {
+                    const int w = tx.surface.width;
+                    const int h = tx.surface.height;
+                    u -= FIXED_HALF;
+                    v -= FIXED_HALF;
+                    int u0 = u >> 16;
+                    int v0 = v >> 16;
+                    int u1 = u0 + 1;
+                    int v1 = v0 + 1;
+                    if (tx.s_wrap == GGL_REPEAT) {
+                        if (u0<0)  u0 += w;
+                        if (u1<0)  u1 += w;
+                        if (u0>=w) u0 -= w;
+                        if (u1>=w) u1 -= w;
+                    } else {
+                        if (u0<0)  u0 = 0;
+                        if (u1<0)  u1 = 0;
+                        if (u0>=w) u0 = w-1;
+                        if (u1>=w) u1 = w-1;
+                    }
+                    if (tx.t_wrap == GGL_REPEAT) {
+                        if (v0<0)  v0 += h;
+                        if (v1<0)  v1 += h;
+                        if (v0>=h) v0 -= h;
+                        if (v1>=h) v1 -= h;
+                    } else {
+                        if (v0<0)  v0 = 0;
+                        if (v1<0)  v1 = 0;
+                        if (v0>=h) v0 = h-1;
+                        if (v1>=h) v1 = h-1;
+                    }
+                    pixel_t texels[4];
+                    uint32_t mm[4];
+                    tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
+                    tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
+                    tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
+                    tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
+                    u = (u >> 12) & 0xF; 
+                    v = (v >> 12) & 0xF;
+                    u += u>>3;
+                    v += v>>3;
+                    mm[0] = (0x10 - u) * (0x10 - v);
+                    mm[1] = (0x10 - u) * v;
+                    mm[2] = u * (0x10 - v);
+                    mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
+                    for (int j=0 ; j<4 ; j++) {
+                        texel.s[j] = texels[0].s[j];
+                        if (!texel.s[j]) continue;
+                        texel.s[j] += 8;
+                        texel.c[j] =    texels[0].c[j]*mm[0] +
+                                        texels[1].c[j]*mm[1] +
+                                        texels[2].c[j]*mm[2] +
+                                        texels[3].c[j]*mm[3] ;
+                    }
+                }
+
+                // Texture environnement...
+                for (int j=0 ; j<4 ; j++) {
+                    uint32_t& Cf = fragment.c[j];
+                    uint32_t& Ct = texel.c[j];
+                    uint8_t& sf  = fragment.s[j];
+                    uint8_t& st  = texel.s[j];
+                    uint32_t At = texel.c[0];
+                    uint8_t sat = texel.s[0];
+                    switch (tx.env) {
+                    case GGL_REPLACE:
+                        if (st) {
+                            Cf = Ct;
+                            sf = st;
+                        }
+                        break;
+                    case GGL_MODULATE:
+                        if (st) {
+                            uint32_t factor = Ct + (Ct>>(st-1));
+                            Cf = (Cf * factor) >> st;
+                        }
+                        break;
+                    case GGL_DECAL:
+                        if (sat) {
+                            rescale(Cf, sf, Ct, st);
+                            Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
+                        }
+                        break;
+                    case GGL_BLEND:
+                        if (st) {
+                            uint32_t Cc = tx.env_color[i];
+                            if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
+                            else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
+                            uint32_t factor = Ct + (Ct>>(st-1));
+                            Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
+                        }
+                        break;
+                    case GGL_ADD:
+                        if (st) {
+                            rescale(Cf, sf, Ct, st);
+                            Cf += Ct;
+                        }
+                        break;
+                    }
+                }
+            }
+		}
+    
+        // coverage application
+        if (enables & GGL_ENABLE_AA) {
+            int16_t cf = *covPtr++;
+            fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
+        }
+        
+        // alpha-test
+        if (enables & GGL_ENABLE_ALPHA_TEST) {
+            GGLcolor ref = c->state.alpha_test.ref;
+            GGLcolor alpha = (uint64_t(fragment.c[0]) *
+                    ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
+            switch (c->state.alpha_test.func) {
+            case GGL_NEVER:     goto discard;
+            case GGL_LESS:      if (alpha<ref)  break; goto discard;
+            case GGL_EQUAL:     if (alpha==ref) break; goto discard;
+            case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
+            case GGL_GREATER:   if (alpha>ref)  break; goto discard;
+            case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
+            case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
+            }
+        }
+        
+        // depth test
+        if (c->state.buffers.depth.format) {
+            if (enables & GGL_ENABLE_DEPTH_TEST) {
+                surface_t* cb = &(c->state.buffers.depth);
+                uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
+                uint16_t zz = uint32_t(z)>>(16);
+                uint16_t depth = *p;
+                switch (c->state.depth_test.func) {
+                case GGL_NEVER:     goto discard;
+                case GGL_LESS:      if (zz<depth)    break; goto discard;
+                case GGL_EQUAL:     if (zz==depth)   break; goto discard;
+                case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
+                case GGL_GREATER:   if (zz>depth)    break; goto discard;
+                case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
+                case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
+                }
+                // depth buffer is not enabled, if depth-test is not enabled
+/*
+        fragment.s[1] = fragment.s[2] =
+        fragment.s[3] = fragment.s[0] = 8;
+        fragment.c[1] = 
+        fragment.c[2] = 
+        fragment.c[3] = 
+        fragment.c[0] = 255 - (zz>>8);
+*/
+                if (c->state.mask.depth) {
+                    *p = zz;
+                }
+            }
+        }
+
+        // fog
+        if (enables & GGL_ENABLE_FOG) {
+            for (int i=1 ; i<=3 ; i++) {
+                GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
+                uint32_t& c = fragment.c[i];
+                uint8_t& s  = fragment.s[i];
+                c = (c * 0x10000) / ((1<<s)-1);
+                c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
+                s = 16;
+            }
+        }
+
+        // blending
+        if (enables & GGL_ENABLE_BLENDING) {
+            fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
+            fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
+            c->state.buffers.color.read(
+                    &(c->state.buffers.color), c, x, y, &fb);
+            blending( c, &fragment, &fb );
+        }
+
+		// write
+        c->state.buffers.color.write(
+                &(c->state.buffers.color), c, x, y, &fragment);
+        }
+
+discard:
+		// iterate...
+        x += 1;
+        if (enables & GGL_ENABLE_SMOOTH) {
+            r += c->shade.drdx;
+            g += c->shade.dgdx;
+            b += c->shade.dbdx;
+            a += c->shade.dadx;
+        }
+        z += c->shade.dzdx;
+        f += c->shade.dfdx;
+	}
+}
+
+#endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Scanline
+#endif
+
+template <typename T, typename U>
+static inline __attribute__((const))
+T interpolate(int y, T v0, U dvdx, U dvdy) {
+    // interpolates in pixel's centers
+    // v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
+    return (y * dvdy) + (v0 + ((dvdy + dvdx) >> 1));
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void init_y(context_t* c, int32_t ys)
+{
+    const uint32_t enables = c->state.enables;
+
+    // compute iterators...
+    iterators_t& ci = c->iterators;
+    
+    // sample in the center
+    ci.y = ys;
+
+    if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
+        ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
+        ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
+        ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
+    }
+
+    if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
+        ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
+        ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
+        ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
+        ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
+        c->step_y = step_y__smooth;
+    } else {
+        ci.ydrdy = c->shade.r0;
+        ci.ydgdy = c->shade.g0;
+        ci.ydbdy = c->shade.b0;
+        ci.ydady = c->shade.a0;
+        // XXX: do only if needed, or make sure this is fast
+        c->packed = ggl_pack_color(c, c->state.buffers.color.format,
+                ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
+        c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888, 
+                ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
+    }
+
+    // initialize the variables we need in the shader
+    generated_vars_t& gen = c->generated_vars;
+    gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
+    gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
+    gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
+    gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
+    gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
+    gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
+    gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
+    gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
+    gen.dzdx = c->shade.dzdx;
+    gen.f    = ci.ydfdy;
+    gen.dfdx = c->shade.dfdx;
+
+    if (enables & GGL_ENABLE_TMUS) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            texture_t& t = c->state.texture[i];
+            if (!t.enable) continue;
+
+            texture_iterators_t& ti = t.iterators;
+            if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
+                // we need to set all of these to 0 because in some cases
+                // step_y__generic() or step_y__tmu() will be used and
+                // therefore will update dtdy, however, in 1:1 mode
+                // this is always done by the scanline rasterizer.
+                ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
+                ti.ydsdy = t.shade.is0;
+                ti.ydtdy = t.shade.it0;
+            } else {
+                const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
+                const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
+                ti.sscale = t.shade.sscale + adjustSWrap;
+                ti.tscale = t.shade.tscale + adjustTWrap;
+                if (!(enables & GGL_ENABLE_W)) {
+                    // S coordinate
+                    const int32_t sscale = ti.sscale;
+                    const int32_t sy = interpolate(ys,
+                            t.shade.is0, t.shade.idsdx, t.shade.idsdy);
+                    if (sscale>=0) {
+                        ti.ydsdy= sy            << sscale;
+                        ti.dsdx = t.shade.idsdx << sscale; 
+                        ti.dsdy = t.shade.idsdy << sscale;
+                    } else {
+                        ti.ydsdy= sy            >> -sscale;
+                        ti.dsdx = t.shade.idsdx >> -sscale; 
+                        ti.dsdy = t.shade.idsdy >> -sscale;
+                    }
+                    // T coordinate
+                    const int32_t tscale = ti.tscale;
+                    const int32_t ty = interpolate(ys,
+                            t.shade.it0, t.shade.idtdx, t.shade.idtdy);
+                    if (tscale>=0) {
+                        ti.ydtdy= ty            << tscale;
+                        ti.dtdx = t.shade.idtdx << tscale; 
+                        ti.dtdy = t.shade.idtdy << tscale;
+                    } else {
+                        ti.ydtdy= ty            >> -tscale;
+                        ti.dtdx = t.shade.idtdx >> -tscale; 
+                        ti.dtdy = t.shade.idtdy >> -tscale;
+                    }
+                }
+            }
+            // mirror for generated code...
+            generated_tex_vars_t& gen = c->generated_vars.texture[i];
+            gen.width   = t.surface.width;
+            gen.height  = t.surface.height;
+            gen.stride  = t.surface.stride;
+            gen.data    = int32_t(t.surface.data);
+            gen.dsdx = ti.dsdx;
+            gen.dtdx = ti.dtdx;
+        }
+    }
+
+    // choose the y-stepper
+    c->step_y = step_y__nop;
+    if (enables & GGL_ENABLE_FOG) {
+        c->step_y = step_y__generic;
+    } else if (enables & GGL_ENABLE_TMUS) {
+        if (enables & GGL_ENABLE_SMOOTH) {
+            c->step_y = step_y__generic;
+        } else if (enables & GGL_ENABLE_W) {
+            c->step_y = step_y__w;
+        } else {
+            c->step_y = step_y__tmu;
+        }
+    } else {
+        if (enables & GGL_ENABLE_SMOOTH) {
+            c->step_y = step_y__smooth;
+        }
+    }
+    
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if ((c->step_y == step_y__nop) &&
+        (c->scanline == scanline_memcpy))
+    {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_packed(context_t* c, int32_t y0)
+{
+    uint8_t f = c->state.buffers.color.format;
+    c->packed = ggl_pack_color(c, f,
+            c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
+    c->iterators.y = y0;
+    c->step_y = step_y__nop;
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if (c->scanline == scanline_memcpy) {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_noop(context_t* c, int32_t y0)
+{
+    c->iterators.y = y0;
+    c->step_y = step_y__nop;
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if (c->scanline == scanline_memcpy) {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_error(context_t* c, int32_t y0)
+{
+    // woooops, shoud never happen,
+    // fail gracefully (don't display anything)
+    init_y_noop(c, y0);
+    LOGE("color-buffer has an invalid format!");
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void step_y__generic(context_t* c)
+{
+    const uint32_t enables = c->state.enables;
+
+    // iterate...
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+                
+    if (enables & GGL_ENABLE_SMOOTH) {
+        ci.ydrdy += c->shade.drdy;
+        ci.ydgdy += c->shade.dgdy;
+        ci.ydbdy += c->shade.dbdy;
+        ci.ydady += c->shade.dady;
+    }
+
+    const uint32_t mask =
+            GGL_ENABLE_DEPTH_TEST |
+            GGL_ENABLE_W |
+            GGL_ENABLE_FOG;
+    if (enables & mask) {
+        ci.ydzdy += c->shade.dzdy;
+        ci.ydwdy += c->shade.dwdy;
+        ci.ydfdy += c->shade.dfdy;
+    }
+
+    if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            if (c->state.texture[i].enable) {
+                texture_iterators_t& ti = c->state.texture[i].iterators;
+                ti.ydsdy += ti.dsdy;
+                ti.ydtdy += ti.dtdy;
+            }
+        }
+    }
+}
+
+void step_y__nop(context_t* c)
+{
+    c->iterators.y += 1;
+    c->iterators.ydzdy += c->shade.dzdy;
+}
+
+void step_y__smooth(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydrdy += c->shade.drdy;
+    ci.ydgdy += c->shade.dgdy;
+    ci.ydbdy += c->shade.dbdy;
+    ci.ydady += c->shade.dady;
+    ci.ydzdy += c->shade.dzdy;
+}
+
+void step_y__w(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydzdy += c->shade.dzdy;
+    ci.ydwdy += c->shade.dwdy;
+}
+
+void step_y__tmu(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydzdy += c->shade.dzdy;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        if (c->state.texture[i].enable) {
+            texture_iterators_t& ti = c->state.texture[i].iterators;
+            ti.ydsdy += ti.dsdy;
+            ti.ydtdy += ti.dtdy;
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void scanline_perspective(context_t* c)
+{
+    struct {
+        union {
+            struct {
+                int32_t s, sq;
+                int32_t t, tq;
+            };
+            struct {
+                int32_t v, q;
+            } st[2];
+        };
+    } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
+
+    // XXX: we should have a special case when dwdx = 0
+
+    // 32 pixels spans works okay. 16 is a lot better,
+    // but hey, it's a software renderer...
+    const uint32_t SPAN_BITS = 5; 
+    const uint32_t ys = c->iterators.y;
+    const uint32_t xs = c->iterators.xl;
+    const uint32_t x1 = c->iterators.xr;
+	const uint32_t xc = x1 - xs;
+    uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
+    uint32_t numSpans = xc >> SPAN_BITS;
+
+    const iterators_t& ci = c->iterators;
+    int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
+    int32_t q0 = gglRecipQ(w0, 30);
+    const int iwscale = 32 - gglClz(q0);
+
+    const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
+    int32_t xl = c->iterators.xl;
+
+    // We process s & t with a loop to reduce the code size
+    // (and i-cache pressure).
+
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        const texture_t& tmu = c->state.texture[i];
+        if (!tmu.enable) continue;
+        int32_t s =   tmu.shade.is0 +
+                     (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
+                     ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
+        int32_t t =   tmu.shade.it0 +
+                     (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
+                     ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
+        tc[i].s  = s;
+        tc[i].t  = t;
+        tc[i].sq = gglMulx(s, q0, iwscale);
+        tc[i].tq = gglMulx(t, q0, iwscale);
+    }
+
+    int32_t span = 0;
+    do {
+        int32_t w1;
+        if (ggl_likely(numSpans)) {
+            w1 = w0 + dwdx;
+        } else {
+            if (remainder) {
+                // finish off the scanline...
+                span = remainder;
+                w1 = (c->shade.dwdx * span) + w0;
+            } else {
+                break;
+            }
+        }
+        int32_t q1 = gglRecipQ(w1, 30);
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            texture_t& tmu = c->state.texture[i];
+            if (!tmu.enable) continue;
+            texture_iterators_t& ti = tmu.iterators;
+
+            for (int j=0 ; j<2 ; j++) {
+                int32_t v = tc[i].st[j].v;
+                if (span)   v += (tmu.shade.st[j].dx)*span;
+                else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
+                const int32_t v0 = tc[i].st[j].q;
+                const int32_t v1 = gglMulx(v, q1, iwscale);
+                int32_t dvdx = v1 - v0;
+                if (span)   dvdx /= span;
+                else        dvdx >>= SPAN_BITS;
+                tc[i].st[j].v = v;
+                tc[i].st[j].q = v1;
+
+                const int scale = ti.st[j].scale + (iwscale - 30);
+                if (scale >= 0) {
+                    ti.st[j].ydvdy = v0   << scale;
+                    ti.st[j].dvdx  = dvdx << scale;
+                } else {
+                    ti.st[j].ydvdy = v0   >> -scale;
+                    ti.st[j].dvdx  = dvdx >> -scale;
+                }
+            }
+            generated_tex_vars_t& gen = c->generated_vars.texture[i];
+            gen.dsdx = ti.st[0].dvdx;
+            gen.dtdx = ti.st[1].dvdx;
+        }
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
+        w0 = w1;
+        q0 = q1;
+        c->span(c);
+    } while(numSpans--);
+}
+
+void scanline_perspective_single(context_t* c)
+{
+    // 32 pixels spans works okay. 16 is a lot better,
+    // but hey, it's a software renderer...
+    const uint32_t SPAN_BITS = 5; 
+    const uint32_t ys = c->iterators.y;
+    const uint32_t xs = c->iterators.xl;
+    const uint32_t x1 = c->iterators.xr;
+	const uint32_t xc = x1 - xs;
+
+    const iterators_t& ci = c->iterators;
+    int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
+    int32_t iw = gglRecipQ(w, 30);
+    const int iwscale = 32 - gglClz(iw);
+
+    const int i = 31 - gglClz(c->state.enabled_tmu);
+    generated_tex_vars_t& gen = c->generated_vars.texture[i];
+    texture_t& tmu = c->state.texture[i];
+    texture_iterators_t& ti = tmu.iterators;
+    const int sscale = ti.sscale + (iwscale - 30);
+    const int tscale = ti.tscale + (iwscale - 30);
+    int32_t s =   tmu.shade.is0 +
+                 (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
+                 ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
+    int32_t t =   tmu.shade.it0 +
+                 (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
+                 ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
+    int32_t s0 = gglMulx(s, iw, iwscale);
+    int32_t t0 = gglMulx(t, iw, iwscale);
+    int32_t xl = c->iterators.xl;
+
+    int32_t sq, tq, dsdx, dtdx;
+    int32_t premainder = xc & ((1<<SPAN_BITS)-1);
+    uint32_t numSpans = xc >> SPAN_BITS;
+    if (c->shade.dwdx == 0) {
+        // XXX: we could choose to do this if the error is small enough
+        numSpans = 0;
+        premainder = xc;
+        goto no_perspective;
+    }
+
+    if (premainder) {
+        w += c->shade.dwdx   * premainder;
+        iw = gglRecipQ(w, 30);
+no_perspective:        
+        s += tmu.shade.idsdx * premainder;
+        t += tmu.shade.idtdx * premainder;
+        sq = gglMulx(s, iw, iwscale);
+        tq = gglMulx(t, iw, iwscale);
+        dsdx = (sq - s0) / premainder;
+        dtdx = (tq - t0) / premainder;
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + premainder;
+        goto finish;
+    }
+
+    while (numSpans--) {
+        w += c->shade.dwdx   << SPAN_BITS;
+        s += tmu.shade.idsdx << SPAN_BITS;
+        t += tmu.shade.idtdx << SPAN_BITS;
+        iw = gglRecipQ(w, 30);
+        sq = gglMulx(s, iw, iwscale);
+        tq = gglMulx(t, iw, iwscale);
+        dsdx = (sq - s0) >> SPAN_BITS;
+        dtdx = (tq - t0) >> SPAN_BITS;
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + (1<<SPAN_BITS);
+finish:
+        if (sscale >= 0) {
+            ti.ydsdy = s0   << sscale;
+            ti.dsdx  = dsdx << sscale;
+        } else {
+            ti.ydsdy = s0   >>-sscale;
+            ti.dsdx  = dsdx >>-sscale;
+        }
+        if (tscale >= 0) {
+            ti.ydtdy = t0   << tscale;
+            ti.dtdx  = dtdx << tscale;
+        } else {
+            ti.ydtdy = t0   >>-tscale;
+            ti.dtdx  = dtdx >>-tscale;
+        }
+        s0 = sq;
+        t0 = tq;
+        gen.dsdx = ti.dsdx;
+        gen.dtdx = ti.dtdx;
+        c->span(c);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+void scanline_t32cb16(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;    
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    union {
+        uint16_t* dst;
+        uint32_t* dst32;
+    };
+    dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
+    int sR, sG, sB;
+    uint32_t s, d;
+
+    if (ct==1 || uint32_t(dst)&2) {
+last_one:
+        s = GGL_RGBA_TO_HOST( *src++ );
+        sR = (s >> (   3))&0x1F;
+        sG = (s >> ( 8+2))&0x3F;
+        sB = (s >> (16+3))&0x1F;
+        *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
+        ct--;
+    }
+
+    while (ct >= 2) {
+        s = GGL_RGBA_TO_HOST( *src++ );
+        sR = (s >> (   3))&0x1F;
+        sG = (s >> ( 8+2))&0x3F;
+        sB = (s >> (16+3))&0x1F;
+        d = (sR<<11)|(sG<<5)|sB;
+        
+        s = GGL_RGBA_TO_HOST( *src++ );
+        sR = (s >> (   3))&0x1F;
+        sG = (s >> ( 8+2))&0x3F;
+        sB = (s >> (16+3))&0x1F;        
+        d |= ((sR<<11)|(sG<<5)|sB)<<16;
+
+#if BYTE_ORDER == BIG_ENDIAN
+        d = (d>>16) | (d<<16);
+#endif
+
+        *dst32++ = d;
+        ct -= 2;
+    }
+    
+    if (ct > 0) {
+        goto last_one;
+    }
+}
+
+void scanline_t32cb16blend(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
+
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
+    scanline_t32cb16blend_arm(dst, src, ct);
+#else
+    while (ct--) {
+        uint32_t s = *src++;
+        if (!s) {
+            dst++;
+            continue;
+        }
+        uint16_t d = *dst;
+        s = GGL_RGBA_TO_HOST(s);
+        int sR = (s >> (   3))&0x1F;
+        int sG = (s >> ( 8+2))&0x3F;
+        int sB = (s >> (16+3))&0x1F;
+        int sA = (s>>24);
+        int f = 0x100 - (sA + (sA>>7));
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR += (f*dR)>>8;
+        sG += (f*dG)>>8;
+        sB += (f*dB)>>8;
+        *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+#endif
+}
+
+void scanline_memcpy(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
+                            (u + (tex->stride * v)) * fp->size;
+
+    const size_t size = ct * fp->size;
+    memcpy(dst, src, size);
+}
+
+void scanline_memset8(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = c->packed;
+    memset(dst, packed, ct);
+}
+
+void scanline_memset16(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = c->packed;
+    android_memset16(dst, packed, ct*2);
+}
+
+void scanline_memset32(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
+    android_memset32(dst, packed, ct*4);
+}
+
+void scanline_clear(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0, size);
+}
+
+void scanline_set(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0xFF, size);
+}
+
+void scanline_noop(context_t* c)
+{
+}
+
+void rect_generic(context_t* c, size_t yc)
+{
+    do {
+        c->scanline(c);
+        c->step_y(c);
+    } while (--yc);
+}
+
+void rect_memcpy(context_t* c, size_t yc)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
+                            (u + (tex->stride * v)) * fp->size;
+
+    if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
+        memcpy(dst, src, ct * fp->size * yc);
+    } else {
+        const size_t size = ct * fp->size;
+        const size_t dbpr = cb->stride  * fp->size;
+        const size_t sbpr = tex->stride * fp->size;
+        do {
+            memcpy(dst, src, size);
+            dst += dbpr;
+            src += sbpr;        
+        } while (--yc);
+    }
+}
+// ----------------------------------------------------------------------------
+}; // namespace android
+
+using namespace android;
+extern "C" void ggl_test_codegen(uint32_t n, uint32_t p, uint32_t t0, uint32_t t1)
+{
+#if ANDROID_ARM_CODEGEN
+    GGLContext* c;
+    gglInit(&c);
+    needs_t needs;
+    needs.n = n;
+    needs.p = p;
+    needs.t[0] = t0;
+    needs.t[1] = t1;
+    sp<ScanlineAssembly> a(new ScanlineAssembly(needs, ASSEMBLY_SCRATCH_SIZE));
+    GGLAssembler assembler( new ARMAssembler(a) );
+    int err = assembler.scanline(needs, (context_t*)c);
+    if (err != 0) {
+        printf("error %08x (%s)\n", err, strerror(-err));
+    }
+    gglUninit(c);
+#else
+    printf("This test runs only on ARM\n");
+#endif
+}
+
diff --git a/libpixelflinger/scanline.h b/libpixelflinger/scanline.h
new file mode 100644
index 0000000..b6f4d37
--- /dev/null
+++ b/libpixelflinger/scanline.h
@@ -0,0 +1,32 @@
+/* libs/pixelflinger/scanline.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_SCANLINE_H
+#define ANDROID_SCANLINE_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_scanline(context_t* c);
+void ggl_uninit_scanline(context_t* c);
+void ggl_pick_scanline(context_t* c);
+
+}; // namespace android
+
+#endif
diff --git a/libpixelflinger/t32cb16blend.S b/libpixelflinger/t32cb16blend.S
new file mode 100644
index 0000000..d4b2579
--- /dev/null
+++ b/libpixelflinger/t32cb16blend.S
@@ -0,0 +1,171 @@
+/* libs/pixelflinger/t32cb16blend.S
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+	.text
+	.align
+	
+	.global scanline_t32cb16blend_arm
+
+// uses r6, r7, lr
+
+.macro pixel,   DREG, SRC, FB, OFFSET
+
+    // SRC = AARRGGBB
+    mov     r7, \SRC, lsr #24           // sA
+    add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
+    rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7))
+
+1:
+
+.if \OFFSET
+
+    // red
+    mov     lr, \DREG, lsr #(\OFFSET + 6 + 5)
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    orr     \FB, lr, lsl #(\OFFSET + 11)
+
+        // green
+        and     r6, \DREG, #(0x3F<<(\OFFSET + 5))
+        smulbt  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        orr     \FB, \FB, r6, lsl #(\OFFSET + 5)
+
+            // blue
+            and     lr, \DREG, #(0x1F << \OFFSET)
+            smulbt  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            orr     \FB, \FB, lr, lsl #\OFFSET
+
+.else
+
+    // red
+    mov     lr, \DREG, lsr #(6+5)
+    and     lr, lr, #0x1F
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    mov     \FB, lr, lsl #11
+
+        // green
+        and     r6, \DREG, #(0x3F<<5)
+        smulbb  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        orr     \FB, \FB, r6, lsl #5
+
+            // blue
+            and     lr, \DREG, #0x1F
+            smulbb  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            orr     \FB, \FB, lr
+
+.endif
+
+    .endm
+    
+
+// r0:  dst ptr
+// r1:  src ptr
+// r2:  count
+// r3:  d
+// r4:  s0
+// r5:  s1
+// r6:  pixel
+// r7:  pixel
+// r8:  free
+// r9:  free
+// r10: free
+// r11: free
+// r12: scratch
+// r14: pixel
+
+scanline_t32cb16blend_arm:
+    stmfd	sp!, {r4-r7, lr}
+
+    pld     [r0]
+    pld     [r1]
+
+    // align DST to 32 bits
+    tst     r0, #0x3
+    beq     aligned
+    subs    r2, r2, #1
+    ldmlofd	sp!, {r4-r7, lr}        // return
+    bxlo    lr
+
+last:
+    ldr     r4, [r1], #4
+    ldrh    r3, [r0]
+    pixel   r3, r4, r12, 0
+    strh    r12, [r0], #2
+
+aligned:
+    subs    r2, r2, #2
+    blo     9f
+
+    // The main loop is unrolled twice and process 4 pixels
+8:  ldmia   r1!, {r4, r5}
+    // stream the source
+    pld     [r1, #32]
+    add     r0, r0, #4
+    // it's all zero, skip this pixel
+    orrs    r3, r4, r5
+    beq     7f
+    
+    // load the destination
+    ldr     r3, [r0, #-4]
+    // stream the destination
+    pld     [r0, #32]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    // effectively, we're getting write-combining by virtue of the
+    // cpu's write-back cache.
+    str     r12, [r0, #-4]
+
+    // 2nd iterration of the loop, don't stream anything
+    subs    r2, r2, #2
+    movlt   r4, r5
+    blt     9f
+    ldmia   r1!, {r4, r5}
+    add     r0, r0, #4
+    orrs    r3, r4, r5
+    beq     7f
+    ldr     r3, [r0, #-4]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    str     r12, [r0, #-4]
+
+    
+7:  subs    r2, r2, #2
+    bhs     8b
+    mov     r4, r5
+
+9:  adds    r2, r2, #1
+    ldmlofd sp!, {r4-r7, lr}        // return
+    bxlo    lr
+    b       last
diff --git a/libpixelflinger/tests/Android.mk b/libpixelflinger/tests/Android.mk
new file mode 100644
index 0000000..6571161
--- /dev/null
+++ b/libpixelflinger/tests/Android.mk
@@ -0,0 +1 @@
+include $(all-subdir-makefiles)
diff --git a/libpixelflinger/tests/codegen/Android.mk b/libpixelflinger/tests/codegen/Android.mk
new file mode 100644
index 0000000..1bc4214
--- /dev/null
+++ b/libpixelflinger/tests/codegen/Android.mk
@@ -0,0 +1,15 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES:= \
+	codegen.cpp
+
+LOCAL_SHARED_LIBRARIES := \
+	libcutils \
+    libpixelflinger
+
+LOCAL_MODULE:= test-opengl-codegen
+
+LOCAL_MODULE_TAGS := tests
+
+include $(BUILD_EXECUTABLE)
diff --git a/libpixelflinger/tests/codegen/codegen.cpp b/libpixelflinger/tests/codegen/codegen.cpp
new file mode 100644
index 0000000..1865888
--- /dev/null
+++ b/libpixelflinger/tests/codegen/codegen.cpp
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <stdint.h>
+
+extern "C" void ggl_test_codegen(
+        uint32_t n, uint32_t p, uint32_t t0, uint32_t t1);
+
+
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        printf("usage: %s 00000117:03454504_00001501_00000000\n", argv[0]);
+        return 0;
+    }
+    uint32_t n;
+    uint32_t p;
+    uint32_t t0;
+    uint32_t t1;
+    sscanf(argv[1], "%08x:%08x_%08x_%08x", &p, &n, &t0, &t1);
+    ggl_test_codegen(n, p,  t0, t1);
+    return 0;
+}
diff --git a/libpixelflinger/tinyutils/KeyedVector.h b/libpixelflinger/tinyutils/KeyedVector.h
new file mode 100644
index 0000000..1be2094
--- /dev/null
+++ b/libpixelflinger/tinyutils/KeyedVector.h
@@ -0,0 +1,193 @@
+/*
+ *  keyed_vector.h
+ *  Android  
+ *
+ *  Created on 11/18/05.
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_KEYED_VECTOR_H
+#define ANDROID_KEYED_VECTOR_H
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/SortedVector.h"
+#include "tinyutils/TypeHelpers.h"
+
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+template <typename KEY, typename VALUE>
+class KeyedVector
+{
+public:
+    typedef KEY    key_type;
+    typedef VALUE  value_type;
+
+    inline                  KeyedVector();
+
+    /*
+     * empty the vector
+     */
+
+    inline  void            clear()                     { mVector.clear(); }
+
+    /*! 
+     * vector stats
+     */
+
+    //! returns number of items in the vector
+    inline  size_t          size() const                { return mVector.size(); }
+    //! returns wether or not the vector is empty
+    inline  bool            isEmpty() const             { return mVector.isEmpty(); }
+    //! returns how many items can be stored without reallocating the backing store
+    inline  size_t          capacity() const            { return mVector.capacity(); }
+    //! setst the capacity. capacity can never be reduced less than size()
+    inline ssize_t          setCapacity(size_t size)    { return mVector.setCapacity(size); }
+    
+    /*! 
+     * accessors
+     */
+            const VALUE&    valueFor(const KEY& key) const;
+            const VALUE&    valueAt(size_t index) const;
+            const KEY&      keyAt(size_t index) const;
+            ssize_t         indexOfKey(const KEY& key) const;
+
+    /*!
+     * modifing the array
+     */
+
+            VALUE&          editValueFor(const KEY& key);
+            VALUE&          editValueAt(size_t index);
+
+            /*! 
+             * add/insert/replace items
+             */
+             
+            ssize_t         add(const KEY& key, const VALUE& item);
+            ssize_t         replaceValueFor(const KEY& key, const VALUE& item);
+            ssize_t         replaceValueAt(size_t index, const VALUE& item);
+
+    /*!
+     * remove items
+     */
+
+            ssize_t         removeItem(const KEY& key);
+            ssize_t         removeItemsAt(size_t index, size_t count = 1);
+            
+private:
+            SortedVector< key_value_pair_t<KEY, VALUE> >    mVector;
+};
+
+// ---------------------------------------------------------------------------
+
+/**
+ * Variation of KeyedVector that holds a default value to return when
+ * valueFor() is called with a key that doesn't exist.
+ */
+template <typename KEY, typename VALUE>
+class DefaultKeyedVector : public KeyedVector<KEY, VALUE>
+{
+public:
+    inline                  DefaultKeyedVector(const VALUE& defValue = VALUE());
+            const VALUE&    valueFor(const KEY& key) const;
+
+private:
+            VALUE                                           mDefault;
+};
+
+// ---------------------------------------------------------------------------
+
+template<typename KEY, typename VALUE> inline
+KeyedVector<KEY,VALUE>::KeyedVector()
+{
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY,VALUE>::indexOfKey(const KEY& key) const {
+    return mVector.indexOf( key_value_pair_t<KEY,VALUE>(key) );
+}
+
+template<typename KEY, typename VALUE> inline
+const VALUE& KeyedVector<KEY,VALUE>::valueFor(const KEY& key) const {
+    ssize_t i = indexOfKey(key);
+    assert(i>=0);
+    return mVector.itemAt(i).value;
+}
+
+template<typename KEY, typename VALUE> inline
+const VALUE& KeyedVector<KEY,VALUE>::valueAt(size_t index) const {
+    return mVector.itemAt(index).value;
+}
+
+template<typename KEY, typename VALUE> inline
+const KEY& KeyedVector<KEY,VALUE>::keyAt(size_t index) const {
+    return mVector.itemAt(index).key;
+}
+
+template<typename KEY, typename VALUE> inline
+VALUE& KeyedVector<KEY,VALUE>::editValueFor(const KEY& key) {
+    ssize_t i = indexOfKey(key);
+    assert(i>=0);
+    return mVector.editItemAt(i).value;
+}
+
+template<typename KEY, typename VALUE> inline
+VALUE& KeyedVector<KEY,VALUE>::editValueAt(size_t index) {
+    return mVector.editItemAt(index).value;
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY,VALUE>::add(const KEY& key, const VALUE& value) {
+    return mVector.add( key_value_pair_t<KEY,VALUE>(key, value) );
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY,VALUE>::replaceValueFor(const KEY& key, const VALUE& value) {
+    key_value_pair_t<KEY,VALUE> pair(key, value);
+    mVector.remove(pair);
+    return mVector.add(pair);
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY,VALUE>::replaceValueAt(size_t index, const VALUE& item) {
+    if (index<size()) {
+        mVector.editValueAt(index).value = item;
+        return index;
+    }
+    return BAD_INDEX;
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY,VALUE>::removeItem(const KEY& key) {
+    return mVector.remove(key_value_pair_t<KEY,VALUE>(key));
+}
+
+template<typename KEY, typename VALUE> inline
+ssize_t KeyedVector<KEY, VALUE>::removeItemsAt(size_t index, size_t count) {
+    return mVector.removeItemsAt(index, count);
+}
+
+// ---------------------------------------------------------------------------
+
+template<typename KEY, typename VALUE> inline
+DefaultKeyedVector<KEY,VALUE>::DefaultKeyedVector(const VALUE& defValue)
+    : mDefault(defValue)
+{
+}
+
+template<typename KEY, typename VALUE> inline
+const VALUE& DefaultKeyedVector<KEY,VALUE>::valueFor(const KEY& key) const {
+    ssize_t i = indexOfKey(key);
+    return i >= 0 ? KeyedVector<KEY,VALUE>::valueAt(i) : mDefault;
+}
+
+}; // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_KEYED_VECTOR_H
diff --git a/libpixelflinger/tinyutils/SharedBuffer.cpp b/libpixelflinger/tinyutils/SharedBuffer.cpp
new file mode 100644
index 0000000..ef781a7
--- /dev/null
+++ b/libpixelflinger/tinyutils/SharedBuffer.cpp
@@ -0,0 +1,106 @@
+/*
+ *  SharedBuffer.cpp
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <cutils/atomic.h>
+
+#include "tinyutils/SharedBuffer.h"
+
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+SharedBuffer* SharedBuffer::alloc(size_t size)
+{
+    SharedBuffer* sb = static_cast<SharedBuffer *>(malloc(sizeof(SharedBuffer) + size));
+    if (sb) {
+        sb->mRefs = 1;
+        sb->mSize = size;
+    }
+    return sb;
+}
+
+
+ssize_t SharedBuffer::dealloc(const SharedBuffer* released)
+{
+    if (released->mRefs != 0) return -1; // XXX: invalid operation
+    free(const_cast<SharedBuffer*>(released));
+    return 0;
+}
+
+SharedBuffer* SharedBuffer::edit() const
+{
+    if (onlyOwner()) {
+        return const_cast<SharedBuffer*>(this);
+    }
+    SharedBuffer* sb = alloc(mSize);
+    if (sb) {
+        memcpy(sb->data(), data(), size());
+        release();
+    }
+    return sb;    
+}
+
+SharedBuffer* SharedBuffer::editResize(size_t newSize) const
+{
+    if (onlyOwner()) {
+        SharedBuffer* buf = const_cast<SharedBuffer*>(this);
+        if (buf->mSize == newSize) return buf;
+        buf = (SharedBuffer*)realloc(buf, sizeof(SharedBuffer) + newSize);
+        if (buf != NULL) {
+            buf->mSize = newSize;
+            return buf;
+        }
+    }
+    SharedBuffer* sb = alloc(newSize);
+    if (sb) {
+        const size_t mySize = mSize;
+        memcpy(sb->data(), data(), newSize < mySize ? newSize : mySize);
+        release();
+    }
+    return sb;    
+}
+
+SharedBuffer* SharedBuffer::attemptEdit() const
+{
+    if (onlyOwner()) {
+        return const_cast<SharedBuffer*>(this);
+    }
+    return 0;
+}
+
+SharedBuffer* SharedBuffer::reset(size_t new_size) const
+{
+    // cheap-o-reset.
+    SharedBuffer* sb = alloc(new_size);
+    if (sb) {
+        release();
+    }
+    return sb;
+}
+
+void SharedBuffer::acquire() const {
+    android_atomic_inc(&mRefs);
+}
+
+int32_t SharedBuffer::release(uint32_t flags) const
+{
+    int32_t prev = 1;
+    if (onlyOwner() || ((prev = android_atomic_dec(&mRefs)) == 1)) {
+        mRefs = 0;
+        if ((flags & eKeepStorage) == 0) {
+            free(const_cast<SharedBuffer*>(this));
+        }
+    }
+    return prev;
+}
+
+
+}; // namespace android
diff --git a/libpixelflinger/tinyutils/SharedBuffer.h b/libpixelflinger/tinyutils/SharedBuffer.h
new file mode 100644
index 0000000..9f63121
--- /dev/null
+++ b/libpixelflinger/tinyutils/SharedBuffer.h
@@ -0,0 +1,138 @@
+/*
+ *  SharedBuffer.h
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_SHARED_BUFFER_H
+#define ANDROID_SHARED_BUFFER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+class SharedBuffer
+{
+public:
+
+    /* flags to use with release() */
+    enum {
+        eKeepStorage = 0x00000001
+    };
+
+    /*! allocate a buffer of size 'size' and acquire() it.
+     *  call release() to free it.
+     */
+    static          SharedBuffer*           alloc(size_t size);
+    
+    /*! free the memory associated with the SharedBuffer.
+     * Fails if there are any users associated with this SharedBuffer.
+     * In other words, the buffer must have been release by all its
+     * users.
+     */
+    static          ssize_t                 dealloc(const SharedBuffer* released);
+    
+    //! get the SharedBuffer from the data pointer
+    static  inline  const SharedBuffer*     sharedBuffer(const void* data);
+
+    //! access the data for read
+    inline          const void*             data() const;
+    
+    //! access the data for read/write
+    inline          void*                   data();
+
+    //! get size of the buffer
+    inline          size_t                  size() const;
+ 
+    //! get back a SharedBuffer object from its data
+    static  inline  SharedBuffer*           bufferFromData(void* data);
+    
+    //! get back a SharedBuffer object from its data
+    static  inline  const SharedBuffer*     bufferFromData(const void* data);
+
+    //! get the size of a SharedBuffer object from its data
+    static  inline  size_t                  sizeFromData(const void* data);
+    
+    //! edit the buffer (get a writtable, or non-const, version of it)
+                    SharedBuffer*           edit() const;
+
+    //! edit the buffer, resizing if needed
+                    SharedBuffer*           editResize(size_t size) const;
+
+    //! like edit() but fails if a copy is required
+                    SharedBuffer*           attemptEdit() const;
+    
+    //! resize and edit the buffer, loose it's content.
+                    SharedBuffer*           reset(size_t size) const;
+
+    //! acquire/release a reference on this buffer
+                    void                    acquire() const;
+                    
+    /*! release a reference on this buffer, with the option of not
+     * freeing the memory associated with it if it was the last reference
+     * returns the previous reference count
+     */     
+                    int32_t                 release(uint32_t flags = 0) const;
+    
+    //! returns wether or not we're the only owner
+    inline          bool                    onlyOwner() const;
+    
+
+private:
+        inline SharedBuffer() { }
+        inline ~SharedBuffer() { }
+        inline SharedBuffer(const SharedBuffer&);
+ 
+        // 16 bytes. must be sized to preserve correct alingment.
+        mutable int32_t        mRefs;
+                size_t         mSize;
+                uint32_t       mReserved[2];
+};
+
+// ---------------------------------------------------------------------------
+
+const SharedBuffer* SharedBuffer::sharedBuffer(const void* data) {
+    return data ? reinterpret_cast<const SharedBuffer *>(data)-1 : 0;
+}
+
+const void* SharedBuffer::data() const {
+    return this + 1;
+}
+
+void* SharedBuffer::data() {
+    return this + 1;
+}
+
+size_t SharedBuffer::size() const {
+    return mSize;
+}
+
+SharedBuffer* SharedBuffer::bufferFromData(void* data)
+{
+    return ((SharedBuffer*)data)-1;
+}
+    
+const SharedBuffer* SharedBuffer::bufferFromData(const void* data)
+{
+    return ((const SharedBuffer*)data)-1;
+}
+
+size_t SharedBuffer::sizeFromData(const void* data)
+{
+    return (((const SharedBuffer*)data)-1)->mSize;
+}
+
+bool SharedBuffer::onlyOwner() const {
+    return (mRefs == 1);
+}
+
+}; // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_VECTOR_H
diff --git a/libpixelflinger/tinyutils/TypeHelpers.h b/libpixelflinger/tinyutils/TypeHelpers.h
new file mode 100644
index 0000000..9500c90
--- /dev/null
+++ b/libpixelflinger/tinyutils/TypeHelpers.h
@@ -0,0 +1,245 @@
+/*
+ *  TypeHelpers.h
+ *  
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_TYPE_HELPERS_H
+#define ANDROID_TYPE_HELPERS_H
+
+#include <new>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+/*
+ * Types traits
+ */
+    
+template <typename T> struct trait_trivial_ctor  { enum { value = false }; };
+template <typename T> struct trait_trivial_dtor  { enum { value = false }; };
+template <typename T> struct trait_trivial_copy  { enum { value = false }; };
+template <typename T> struct trait_trivial_assign{ enum { value = false }; };
+
+template <typename T> struct trait_pointer     { enum { value = false }; };    
+template <typename T> struct trait_pointer<T*> { enum { value = true }; };
+
+#define ANDROID_BASIC_TYPES_TRAITS( T )                                       \
+    template<> struct trait_trivial_ctor< T >  { enum { value = true }; };    \
+    template<> struct trait_trivial_dtor< T >  { enum { value = true }; };    \
+    template<> struct trait_trivial_copy< T >  { enum { value = true }; };    \
+    template<> struct trait_trivial_assign< T >{ enum { value = true }; }; 
+
+#define ANDROID_TYPE_TRAITS( T, ctor, dtor, copy, assign )                    \
+    template<> struct trait_trivial_ctor< T >  { enum { value = ctor }; };    \
+    template<> struct trait_trivial_dtor< T >  { enum { value = dtor }; };    \
+    template<> struct trait_trivial_copy< T >  { enum { value = copy }; };    \
+    template<> struct trait_trivial_assign< T >{ enum { value = assign }; }; 
+
+template <typename TYPE>
+struct traits {
+    enum {
+        is_pointer          = trait_pointer<TYPE>::value,
+        has_trivial_ctor    = is_pointer || trait_trivial_ctor<TYPE>::value,
+        has_trivial_dtor    = is_pointer || trait_trivial_dtor<TYPE>::value,
+        has_trivial_copy    = is_pointer || trait_trivial_copy<TYPE>::value,
+        has_trivial_assign  = is_pointer || trait_trivial_assign<TYPE>::value   
+    };
+};
+
+template <typename T, typename U>
+struct aggregate_traits {
+    enum {
+        is_pointer          = false,
+        has_trivial_ctor    = traits<T>::has_trivial_ctor && traits<U>::has_trivial_ctor,
+        has_trivial_dtor    = traits<T>::has_trivial_dtor && traits<U>::has_trivial_dtor,
+        has_trivial_copy    = traits<T>::has_trivial_copy && traits<U>::has_trivial_copy,
+        has_trivial_assign  = traits<T>::has_trivial_assign && traits<U>::has_trivial_assign
+    };
+};
+
+// ---------------------------------------------------------------------------
+
+/*
+ * basic types traits
+ */
+ 
+ANDROID_BASIC_TYPES_TRAITS( void );
+ANDROID_BASIC_TYPES_TRAITS( bool );
+ANDROID_BASIC_TYPES_TRAITS( char );
+ANDROID_BASIC_TYPES_TRAITS( unsigned char );
+ANDROID_BASIC_TYPES_TRAITS( short );
+ANDROID_BASIC_TYPES_TRAITS( unsigned short );
+ANDROID_BASIC_TYPES_TRAITS( int );
+ANDROID_BASIC_TYPES_TRAITS( unsigned int );
+ANDROID_BASIC_TYPES_TRAITS( long );
+ANDROID_BASIC_TYPES_TRAITS( unsigned long );
+ANDROID_BASIC_TYPES_TRAITS( long long );
+ANDROID_BASIC_TYPES_TRAITS( unsigned long long );
+ANDROID_BASIC_TYPES_TRAITS( float );
+ANDROID_BASIC_TYPES_TRAITS( double );
+
+// ---------------------------------------------------------------------------
+
+    
+/*
+ * compare and order types
+ */
+
+template<typename TYPE> inline
+int strictly_order_type(const TYPE& lhs, const TYPE& rhs) {
+    return (lhs < rhs) ? 1 : 0;
+}
+
+template<typename TYPE> inline
+int compare_type(const TYPE& lhs, const TYPE& rhs) {
+    return strictly_order_type(rhs, lhs) - strictly_order_type(lhs, rhs);
+}
+
+/*
+ * create, destroy, copy and assign types...
+ */
+ 
+template<typename TYPE> inline
+void construct_type(TYPE* p, size_t n) {
+    if (!traits<TYPE>::has_trivial_ctor) {
+        while (n--) {
+            new(p++) TYPE;
+        }
+    }
+}
+
+template<typename TYPE> inline
+void destroy_type(TYPE* p, size_t n) {
+    if (!traits<TYPE>::has_trivial_dtor) {
+        while (n--) {
+            p->~TYPE();
+            p++;
+        }
+    }
+}
+
+template<typename TYPE> inline
+void copy_type(TYPE* d, const TYPE* s, size_t n) {
+    if (!traits<TYPE>::has_trivial_copy) {
+        while (n--) {
+            new(d) TYPE(*s);
+            d++, s++;
+        }
+    } else {
+        memcpy(d,s,n*sizeof(TYPE));
+    }
+}
+
+template<typename TYPE> inline
+void assign_type(TYPE* d, const TYPE* s, size_t n) {
+    if (!traits<TYPE>::has_trivial_assign) {
+        while (n--) {
+            *d++ = *s++;
+        }
+    } else {
+        memcpy(d,s,n*sizeof(TYPE));
+    }
+}
+
+template<typename TYPE> inline
+void splat_type(TYPE* where, const TYPE* what, size_t n) {
+    if (!traits<TYPE>::has_trivial_copy) {
+        while (n--) {
+            new(where) TYPE(*what);
+            where++;
+        }
+    } else {
+         while (n--) {
+             *where++ = *what;
+        }
+    }
+}
+
+template<typename TYPE> inline
+void move_forward_type(TYPE* d, const TYPE* s, size_t n = 1) {
+    if (!traits<TYPE>::has_trivial_copy || !traits<TYPE>::has_trivial_dtor) {
+        d += n;
+        s += n;
+        while (n--) {
+            --d, --s;
+            if (!traits<TYPE>::has_trivial_copy) {
+                new(d) TYPE(*s);
+            } else {
+                *d = *s;
+            }
+            if (!traits<TYPE>::has_trivial_dtor) {
+                s->~TYPE();
+            }
+        }
+    } else {
+        memmove(d,s,n*sizeof(TYPE));
+    }
+}
+
+template<typename TYPE> inline
+void move_backward_type(TYPE* d, const TYPE* s, size_t n = 1) {
+    if (!traits<TYPE>::has_trivial_copy || !traits<TYPE>::has_trivial_dtor) {
+        while (n--) {
+            if (!traits<TYPE>::has_trivial_copy) {
+                new(d) TYPE(*s);
+            } else {
+                *d = *s;
+            }
+            if (!traits<TYPE>::has_trivial_dtor) {
+                s->~TYPE();
+            }
+            d++, s++;
+        }
+    } else {
+        memmove(d,s,n*sizeof(TYPE));
+    }
+}
+// ---------------------------------------------------------------------------
+
+/*
+ * a key/value pair
+ */
+
+template <typename KEY, typename VALUE>
+struct key_value_pair_t {
+    KEY     key;
+    VALUE   value;
+    key_value_pair_t() { }
+    key_value_pair_t(const key_value_pair_t& o) : key(o.key), value(o.value) { }
+    key_value_pair_t(const KEY& k, const VALUE& v) : key(k), value(v)  { }
+    key_value_pair_t(const KEY& k) : key(k) { }
+    inline bool operator < (const key_value_pair_t& o) const {
+        return strictly_order_type(key, o.key);
+    }
+};
+
+template<>
+template <typename K, typename V>
+struct trait_trivial_ctor< key_value_pair_t<K, V> >
+{ enum { value = aggregate_traits<K,V>::has_trivial_ctor }; };
+template<> 
+template <typename K, typename V>
+struct trait_trivial_dtor< key_value_pair_t<K, V> >
+{ enum { value = aggregate_traits<K,V>::has_trivial_dtor }; };
+template<> 
+template <typename K, typename V>
+struct trait_trivial_copy< key_value_pair_t<K, V> >
+{ enum { value = aggregate_traits<K,V>::has_trivial_copy }; };
+template<> 
+template <typename K, typename V>
+struct trait_trivial_assign< key_value_pair_t<K, V> >
+{ enum { value = aggregate_traits<K,V>::has_trivial_assign};};
+
+// ---------------------------------------------------------------------------
+
+}; // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_TYPE_HELPERS_H
diff --git a/libpixelflinger/tinyutils/Vector.h b/libpixelflinger/tinyutils/Vector.h
new file mode 100644
index 0000000..182bc7b
--- /dev/null
+++ b/libpixelflinger/tinyutils/Vector.h
@@ -0,0 +1,352 @@
+/*
+ *  vector.h
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_VECTOR_H
+#define ANDROID_VECTOR_H
+
+#include <new>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+
+#include "tinyutils/VectorImpl.h"
+#include "tinyutils/TypeHelpers.h"
+
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+/*!
+ * The main templated vector class ensuring type safety
+ * while making use of VectorImpl.
+ * This is the class users want to use.
+ */
+
+template <class TYPE>
+class Vector : private VectorImpl
+{
+public:
+            typedef TYPE    value_type;
+    
+    /*! 
+     * Constructors and destructors
+     */
+    
+                            Vector();
+                            Vector(const Vector<TYPE>& rhs);
+    virtual                 ~Vector();
+
+    /*! copy operator */
+            const Vector<TYPE>&     operator = (const Vector<TYPE>& rhs) const;
+            Vector<TYPE>&           operator = (const Vector<TYPE>& rhs);    
+
+    /*
+     * empty the vector
+     */
+
+    inline  void            clear()             { VectorImpl::clear(); }
+
+    /*! 
+     * vector stats
+     */
+
+    //! returns number of items in the vector
+    inline  size_t          size() const                { return VectorImpl::size(); }
+    //! returns wether or not the vector is empty
+    inline  bool            isEmpty() const             { return VectorImpl::isEmpty(); }
+    //! returns how many items can be stored without reallocating the backing store
+    inline  size_t          capacity() const            { return VectorImpl::capacity(); }
+    //! setst the capacity. capacity can never be reduced less than size()
+    inline  ssize_t         setCapacity(size_t size)    { return VectorImpl::setCapacity(size); }
+
+    /*! 
+     * C-style array access
+     */
+     
+    //! read-only C-style access 
+    inline  const TYPE*     array() const;
+    //! read-write C-style access
+            TYPE*           editArray();
+    
+    /*! 
+     * accessors
+     */
+
+    //! read-only access to an item at a given index
+    inline  const TYPE&     operator [] (size_t index) const;
+    //! alternate name for operator []
+    inline  const TYPE&     itemAt(size_t index) const;
+    //! stack-usage of the vector. returns the top of the stack (last element)
+            const TYPE&     top() const;
+    //! same as operator [], but allows to access the vector backward (from the end) with a negative index
+            const TYPE&     mirrorItemAt(ssize_t index) const;
+
+    /*!
+     * modifing the array
+     */
+
+    //! copy-on write support, grants write access to an item
+            TYPE&           editItemAt(size_t index);
+    //! grants right acces to the top of the stack (last element)
+            TYPE&           editTop();
+
+            /*! 
+             * append/insert another vector
+             */
+            
+    //! insert another vector at a given index
+            ssize_t         insertVectorAt(const Vector<TYPE>& vector, size_t index);
+
+    //! append another vector at the end of this one
+            ssize_t         appendVector(const Vector<TYPE>& vector);
+
+
+            /*! 
+             * add/insert/replace items
+             */
+             
+    //! insert one or several items initialized with their default constructor
+    inline  ssize_t         insertAt(size_t index, size_t numItems = 1);
+    //! insert on onr several items initialized from a prototype item
+            ssize_t         insertAt(const TYPE& prototype_item, size_t index, size_t numItems = 1);
+    //! pop the top of the stack (removes the last element). No-op if the stack's empty
+    inline  void            pop();
+    //! pushes an item initialized with its default constructor
+    inline  void            push();
+    //! pushes an item on the top of the stack
+            void            push(const TYPE& item);
+    //! same as push() but returns the index the item was added at (or an error)
+    inline  ssize_t         add();
+    //! same as push() but returns the index the item was added at (or an error)
+            ssize_t         add(const TYPE& item);            
+    //! replace an item with a new one initialized with its default constructor
+    inline  ssize_t         replaceAt(size_t index);
+    //! replace an item with a new one
+            ssize_t         replaceAt(const TYPE& item, size_t index);
+
+    /*!
+     * remove items
+     */
+
+    //! remove several items
+    inline  ssize_t         removeItemsAt(size_t index, size_t count = 1);
+    //! remove one item
+    inline  ssize_t         removeAt(size_t index)  { return removeItemsAt(index); }
+
+    /*!
+     * sort (stable) the array
+     */
+     
+     typedef int (*compar_t)(const TYPE* lhs, const TYPE* rhs);
+     typedef int (*compar_r_t)(const TYPE* lhs, const TYPE* rhs, void* state);
+     
+     inline status_t        sort(compar_t cmp);
+     inline status_t        sort(compar_r_t cmp, void* state);
+
+protected:
+    virtual void    do_construct(void* storage, size_t num) const;
+    virtual void    do_destroy(void* storage, size_t num) const;
+    virtual void    do_copy(void* dest, const void* from, size_t num) const;
+    virtual void    do_splat(void* dest, const void* item, size_t num) const;
+    virtual void    do_move_forward(void* dest, const void* from, size_t num) const;
+    virtual void    do_move_backward(void* dest, const void* from, size_t num) const;
+};
+
+
+// ---------------------------------------------------------------------------
+// No user serviceable parts from here...
+// ---------------------------------------------------------------------------
+
+template<class TYPE> inline
+Vector<TYPE>::Vector()
+    : VectorImpl(sizeof(TYPE),
+                ((traits<TYPE>::has_trivial_ctor   ? HAS_TRIVIAL_CTOR   : 0)
+                |(traits<TYPE>::has_trivial_dtor   ? HAS_TRIVIAL_DTOR   : 0)
+                |(traits<TYPE>::has_trivial_copy   ? HAS_TRIVIAL_COPY   : 0)
+                |(traits<TYPE>::has_trivial_assign ? HAS_TRIVIAL_ASSIGN : 0))
+                )
+{
+}
+
+template<class TYPE> inline
+Vector<TYPE>::Vector(const Vector<TYPE>& rhs)
+    : VectorImpl(rhs) {
+}
+
+template<class TYPE> inline
+Vector<TYPE>::~Vector() {
+    finish_vector();
+}
+
+template<class TYPE> inline
+Vector<TYPE>& Vector<TYPE>::operator = (const Vector<TYPE>& rhs) {
+    VectorImpl::operator = (rhs);
+    return *this; 
+}
+
+template<class TYPE> inline
+const Vector<TYPE>& Vector<TYPE>::operator = (const Vector<TYPE>& rhs) const {
+    VectorImpl::operator = (rhs);
+    return *this; 
+}
+
+template<class TYPE> inline
+const TYPE* Vector<TYPE>::array() const {
+    return static_cast<const TYPE *>(arrayImpl());
+}
+
+template<class TYPE> inline
+TYPE* Vector<TYPE>::editArray() {
+    return static_cast<TYPE *>(editArrayImpl());
+}
+
+
+template<class TYPE> inline
+const TYPE& Vector<TYPE>::operator[](size_t index) const {
+    LOG_FATAL_IF( index>=size(),
+                  "itemAt: index %d is past size %d", (int)index, (int)size() );
+    return *(array() + index);
+}
+
+template<class TYPE> inline
+const TYPE& Vector<TYPE>::itemAt(size_t index) const {
+    return operator[](index);
+}
+
+template<class TYPE> inline
+const TYPE& Vector<TYPE>::mirrorItemAt(ssize_t index) const {
+    LOG_FATAL_IF( (index>0 ? index : -index)>=size(),
+                  "mirrorItemAt: index %d is past size %d",
+                  (int)index, (int)size() );
+    return *(array() + ((index<0) ? (size()-index) : index));
+}
+
+template<class TYPE> inline
+const TYPE& Vector<TYPE>::top() const {
+    return *(array() + size() - 1);
+}
+
+template<class TYPE> inline
+TYPE& Vector<TYPE>::editItemAt(size_t index) {
+    return *( static_cast<TYPE *>(editItemLocation(index)) );
+}
+
+template<class TYPE> inline
+TYPE& Vector<TYPE>::editTop() {
+    return *( static_cast<TYPE *>(editItemLocation(size()-1)) );
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::insertVectorAt(const Vector<TYPE>& vector, size_t index) {
+    return VectorImpl::insertVectorAt(reinterpret_cast<const VectorImpl&>(vector), index);
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::appendVector(const Vector<TYPE>& vector) {
+    return VectorImpl::appendVector(reinterpret_cast<const VectorImpl&>(vector));
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::insertAt(const TYPE& item, size_t index, size_t numItems) {
+    return VectorImpl::insertAt(&item, index, numItems);
+}
+
+template<class TYPE> inline
+void Vector<TYPE>::push(const TYPE& item) {
+    return VectorImpl::push(&item);
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::add(const TYPE& item) {
+    return VectorImpl::add(&item);
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::replaceAt(const TYPE& item, size_t index) {
+    return VectorImpl::replaceAt(&item, index);
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::insertAt(size_t index, size_t numItems) {
+    return VectorImpl::insertAt(index, numItems);
+}
+
+template<class TYPE> inline
+void Vector<TYPE>::pop() {
+    VectorImpl::pop();
+}
+
+template<class TYPE> inline
+void Vector<TYPE>::push() {
+    VectorImpl::push();
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::add() {
+    return VectorImpl::add();
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::replaceAt(size_t index) {
+    return VectorImpl::replaceAt(index);
+}
+
+template<class TYPE> inline
+ssize_t Vector<TYPE>::removeItemsAt(size_t index, size_t count) {
+    return VectorImpl::removeItemsAt(index, count);
+}
+
+template<class TYPE> inline
+status_t Vector<TYPE>::sort(Vector<TYPE>::compar_t cmp) {
+    return VectorImpl::sort((VectorImpl::compar_t)cmp);
+}
+
+template<class TYPE> inline
+status_t Vector<TYPE>::sort(Vector<TYPE>::compar_r_t cmp, void* state) {
+    return VectorImpl::sort((VectorImpl::compar_r_t)cmp, state);
+}
+
+// ---------------------------------------------------------------------------
+
+template<class TYPE>
+void Vector<TYPE>::do_construct(void* storage, size_t num) const {
+    construct_type( reinterpret_cast<TYPE*>(storage), num );
+}
+
+template<class TYPE>
+void Vector<TYPE>::do_destroy(void* storage, size_t num) const {
+    destroy_type( reinterpret_cast<TYPE*>(storage), num );
+}
+
+template<class TYPE>
+void Vector<TYPE>::do_copy(void* dest, const void* from, size_t num) const {
+    copy_type( reinterpret_cast<TYPE*>(dest), reinterpret_cast<const TYPE*>(from), num );
+}
+
+template<class TYPE>
+void Vector<TYPE>::do_splat(void* dest, const void* item, size_t num) const {
+    splat_type( reinterpret_cast<TYPE*>(dest), reinterpret_cast<const TYPE*>(item), num );
+}
+
+template<class TYPE>
+void Vector<TYPE>::do_move_forward(void* dest, const void* from, size_t num) const {
+    move_forward_type( reinterpret_cast<TYPE*>(dest), reinterpret_cast<const TYPE*>(from), num );
+}
+
+template<class TYPE>
+void Vector<TYPE>::do_move_backward(void* dest, const void* from, size_t num) const {
+    move_backward_type( reinterpret_cast<TYPE*>(dest), reinterpret_cast<const TYPE*>(from), num );
+}
+
+}; // namespace android
+
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_VECTOR_H
diff --git a/libpixelflinger/tinyutils/VectorImpl.cpp b/libpixelflinger/tinyutils/VectorImpl.cpp
new file mode 100644
index 0000000..a049706
--- /dev/null
+++ b/libpixelflinger/tinyutils/VectorImpl.cpp
@@ -0,0 +1,552 @@
+/*
+ *  vector_impl.cpp
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#define LOG_TAG "Vector"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+
+#include <cutils/log.h>
+
+#include "tinyutils/SharedBuffer.h"
+#include "tinyutils/VectorImpl.h"
+
+/*****************************************************************************/
+
+
+namespace android {
+
+enum {
+    NO_ERROR          = 0,    // No errors.
+    NO_MEMORY           = -ENOMEM,
+    BAD_VALUE           = -EINVAL,
+    BAD_INDEX           = -EOVERFLOW,
+    NAME_NOT_FOUND      = -ENOENT,
+};
+
+// ----------------------------------------------------------------------------
+
+const size_t kMinVectorCapacity = 4;
+
+static inline size_t max(size_t a, size_t b) {
+    return a>b ? a : b;
+}
+
+// ----------------------------------------------------------------------------
+
+VectorImpl::VectorImpl(size_t itemSize, uint32_t flags)
+    : mStorage(0), mCount(0), mFlags(flags), mItemSize(itemSize)
+{
+}
+
+VectorImpl::VectorImpl(const VectorImpl& rhs)
+    :   mStorage(rhs.mStorage), mCount(rhs.mCount),
+        mFlags(rhs.mFlags), mItemSize(rhs.mItemSize)
+{
+    if (mStorage) {
+        SharedBuffer::sharedBuffer(mStorage)->acquire();
+    }
+}
+
+VectorImpl::~VectorImpl()
+{
+    LOG_ASSERT(!mCount,
+        "[%p] "
+        "subclasses of VectorImpl must call finish_vector()"
+        " in their destructor. Leaking %d bytes.",
+        this, (int)(mCount*mItemSize));
+    // We can't call _do_destroy() here because the vtable is already gone. 
+}
+
+VectorImpl& VectorImpl::operator = (const VectorImpl& rhs)
+{
+    LOG_ASSERT(mItemSize == rhs.mItemSize,
+        "Vector<> have different types (this=%p, rhs=%p)", this, &rhs);
+    if (this != &rhs) {
+        release_storage();
+        if (rhs.mCount) {
+            mStorage = rhs.mStorage;
+            mCount = rhs.mCount;
+            SharedBuffer::sharedBuffer(mStorage)->acquire();
+        } else {
+            mStorage = 0;
+            mCount = 0;
+        }
+    }
+    return *this;
+}
+
+void* VectorImpl::editArrayImpl()
+{
+    if (mStorage) {
+        SharedBuffer* sb = SharedBuffer::sharedBuffer(mStorage)->attemptEdit();
+        if (sb == 0) {
+            sb = SharedBuffer::alloc(capacity() * mItemSize);
+            if (sb) {
+                _do_copy(sb->data(), mStorage, mCount);
+                release_storage();
+                mStorage = sb->data();
+            }
+        }
+    }
+    return mStorage;
+}
+
+size_t VectorImpl::capacity() const
+{
+    if (mStorage) {
+        return SharedBuffer::sharedBuffer(mStorage)->size() / mItemSize;
+    }
+    return 0;
+}
+
+ssize_t VectorImpl::insertVectorAt(const VectorImpl& vector, size_t index)
+{
+    if (index > size())
+        return BAD_INDEX;
+    void* where = _grow(index, vector.size());
+    if (where) {
+        _do_copy(where, vector.arrayImpl(), vector.size());
+    }
+    return where ? index : (ssize_t)NO_MEMORY;
+}
+
+ssize_t VectorImpl::appendVector(const VectorImpl& vector)
+{
+    return insertVectorAt(vector, size());
+}
+
+ssize_t VectorImpl::insertAt(size_t index, size_t numItems)
+{
+    return insertAt(0, index, numItems);
+}
+
+ssize_t VectorImpl::insertAt(const void* item, size_t index, size_t numItems)
+{
+    if (index > size())
+        return BAD_INDEX;
+    void* where = _grow(index, numItems);
+    if (where) {
+        if (item) {
+            _do_splat(where, item, numItems);
+        } else {
+            _do_construct(where, numItems);
+        }
+    }
+    return where ? index : (ssize_t)NO_MEMORY;
+}
+
+void VectorImpl::pop()
+{
+    if (size())
+        removeItemsAt(size()-1, 1);
+}
+
+void VectorImpl::push()
+{
+    push(0);
+}
+
+void VectorImpl::push(const void* item)
+{
+    insertAt(item, size());
+}
+
+ssize_t VectorImpl::add()
+{
+    return add(0);
+}
+
+ssize_t VectorImpl::add(const void* item)
+{
+    return insertAt(item, size());
+}
+
+ssize_t VectorImpl::replaceAt(size_t index)
+{
+    return replaceAt(0, index);
+}
+
+ssize_t VectorImpl::replaceAt(const void* prototype, size_t index)
+{
+    LOG_ASSERT(index<size(),
+        "[%p] replace: index=%d, size=%d", this, (int)index, (int)size());
+
+    void* item = editItemLocation(index);
+    if (item == 0)
+        return NO_MEMORY;
+    _do_destroy(item, 1);
+    if (prototype == 0) {
+        _do_construct(item, 1);
+    } else {
+        _do_copy(item, prototype, 1);
+    }
+    return ssize_t(index);
+}
+
+ssize_t VectorImpl::removeItemsAt(size_t index, size_t count)
+{
+    LOG_ASSERT((index+count)<=size(),
+        "[%p] remove: index=%d, count=%d, size=%d",
+               this, (int)index, (int)count, (int)size());
+
+    if ((index+count) > size())
+        return BAD_VALUE;
+   _shrink(index, count);
+   return index;
+}
+
+void VectorImpl::finish_vector()
+{
+    release_storage();
+    mStorage = 0;
+    mCount = 0;
+}
+
+void VectorImpl::clear()
+{
+    _shrink(0, mCount);
+}
+
+void* VectorImpl::editItemLocation(size_t index)
+{
+    LOG_ASSERT(index<capacity(),
+        "[%p] itemLocation: index=%d, capacity=%d, count=%d",
+        this, (int)index, (int)capacity(), (int)mCount);
+            
+    void* buffer = editArrayImpl();
+    if (buffer)
+        return reinterpret_cast<char*>(buffer) + index*mItemSize;
+    return 0;
+}
+
+const void* VectorImpl::itemLocation(size_t index) const
+{
+    LOG_ASSERT(index<capacity(),
+        "[%p] editItemLocation: index=%d, capacity=%d, count=%d",
+        this, (int)index, (int)capacity(), (int)mCount);
+
+    const  void* buffer = arrayImpl();
+    if (buffer)
+        return reinterpret_cast<const char*>(buffer) + index*mItemSize;
+    return 0;
+}
+
+ssize_t VectorImpl::setCapacity(size_t new_capacity)
+{
+    size_t current_capacity = capacity();
+    ssize_t amount = new_capacity - size();
+    if (amount <= 0) {
+        // we can't reduce the capacity
+        return current_capacity;
+    } 
+    SharedBuffer* sb = SharedBuffer::alloc(new_capacity * mItemSize);
+    if (sb) {
+        void* array = sb->data();
+        _do_copy(array, mStorage, size());
+        release_storage();
+        mStorage = const_cast<void*>(array);
+    } else {
+        return NO_MEMORY;
+    }
+    return new_capacity;
+}
+
+void VectorImpl::release_storage()
+{
+    if (mStorage) {
+        const SharedBuffer* sb = SharedBuffer::sharedBuffer(mStorage);
+        if (sb->release(SharedBuffer::eKeepStorage) == 1) {
+            _do_destroy(mStorage, mCount);
+            SharedBuffer::dealloc(sb);
+        } 
+    }
+}
+
+void* VectorImpl::_grow(size_t where, size_t amount)
+{
+//    LOGV("_grow(this=%p, where=%d, amount=%d) count=%d, capacity=%d",
+//        this, (int)where, (int)amount, (int)mCount, (int)capacity());
+
+    if (where > mCount)
+        where = mCount;
+      
+    const size_t new_size = mCount + amount;
+    if (capacity() < new_size) {
+        const size_t new_capacity = max(kMinVectorCapacity, ((new_size*3)+1)/2);
+//        LOGV("grow vector %p, new_capacity=%d", this, (int)new_capacity);
+        if ((mStorage) &&
+            (mCount==where) &&
+            (mFlags & HAS_TRIVIAL_COPY) &&
+            (mFlags & HAS_TRIVIAL_DTOR))
+        {
+            const SharedBuffer* cur_sb = SharedBuffer::sharedBuffer(mStorage);
+            SharedBuffer* sb = cur_sb->editResize(new_capacity * mItemSize);
+            mStorage = sb->data();
+        } else {
+            SharedBuffer* sb = SharedBuffer::alloc(new_capacity * mItemSize);
+            if (sb) {
+                void* array = sb->data();
+                if (where>0) {
+                    _do_copy(array, mStorage, where);
+                }
+                if (mCount>where) {
+                    const void* from = reinterpret_cast<const uint8_t *>(mStorage) + where*mItemSize;
+                    void* dest = reinterpret_cast<uint8_t *>(array) + (where+amount)*mItemSize;
+                    _do_copy(dest, from, mCount-where);
+                }
+                release_storage();
+                mStorage = const_cast<void*>(array);
+            }
+        }
+    } else {
+        ssize_t s = mCount-where;
+        if (s>0) {
+            void* array = editArrayImpl();    
+            void* to = reinterpret_cast<uint8_t *>(array) + (where+amount)*mItemSize;
+            const void* from = reinterpret_cast<const uint8_t *>(array) + where*mItemSize;
+            _do_move_forward(to, from, s);
+        }
+    }
+    mCount += amount;
+    void* free_space = const_cast<void*>(itemLocation(where));
+    return free_space;
+}
+
+void VectorImpl::_shrink(size_t where, size_t amount)
+{
+    if (!mStorage)
+        return;
+
+//    LOGV("_shrink(this=%p, where=%d, amount=%d) count=%d, capacity=%d",
+//        this, (int)where, (int)amount, (int)mCount, (int)capacity());
+
+    if (where >= mCount)
+        where = mCount - amount;
+
+    const size_t new_size = mCount - amount;
+    if (new_size*3 < capacity()) {
+        const size_t new_capacity = max(kMinVectorCapacity, new_size*2);
+//        LOGV("shrink vector %p, new_capacity=%d", this, (int)new_capacity);
+        if ((where == mCount-amount) &&
+            (mFlags & HAS_TRIVIAL_COPY) &&
+            (mFlags & HAS_TRIVIAL_DTOR))
+        {
+            const SharedBuffer* cur_sb = SharedBuffer::sharedBuffer(mStorage);
+            SharedBuffer* sb = cur_sb->editResize(new_capacity * mItemSize);
+            mStorage = sb->data();
+        } else {
+            SharedBuffer* sb = SharedBuffer::alloc(new_capacity * mItemSize);
+            if (sb) {
+                void* array = sb->data();
+                if (where>0) {
+                    _do_copy(array, mStorage, where);
+                }
+                if (mCount > where+amount) {
+                    const void* from = reinterpret_cast<const uint8_t *>(mStorage) + (where+amount)*mItemSize;
+                    void* dest = reinterpret_cast<uint8_t *>(array) + where*mItemSize;
+                    _do_copy(dest, from, mCount-(where+amount));
+                }
+                release_storage();
+                mStorage = const_cast<void*>(array);
+            }
+        }
+    } else {
+        void* array = editArrayImpl();    
+        void* to = reinterpret_cast<uint8_t *>(array) + where*mItemSize;
+        _do_destroy(to, amount);
+        ssize_t s = mCount-(where+amount);
+        if (s>0) {
+            const void* from = reinterpret_cast<uint8_t *>(array) + (where+amount)*mItemSize;
+            _do_move_backward(to, from, s);
+        }
+    }
+
+    // adjust the number of items...
+    mCount -= amount;
+}
+
+size_t VectorImpl::itemSize() const {
+    return mItemSize;
+}
+
+void VectorImpl::_do_construct(void* storage, size_t num) const
+{
+    if (!(mFlags & HAS_TRIVIAL_CTOR)) {
+        do_construct(storage, num);
+    }
+}
+
+void VectorImpl::_do_destroy(void* storage, size_t num) const
+{
+    if (!(mFlags & HAS_TRIVIAL_DTOR)) {
+        do_destroy(storage, num);
+    }
+}
+
+void VectorImpl::_do_copy(void* dest, const void* from, size_t num) const
+{
+    if (!(mFlags & HAS_TRIVIAL_COPY)) {
+        do_copy(dest, from, num);
+    } else {
+        memcpy(dest, from, num*itemSize());
+    }
+}
+
+void VectorImpl::_do_splat(void* dest, const void* item, size_t num) const {
+    do_splat(dest, item, num);
+}
+
+void VectorImpl::_do_move_forward(void* dest, const void* from, size_t num) const {
+    do_move_forward(dest, from, num);
+}
+
+void VectorImpl::_do_move_backward(void* dest, const void* from, size_t num) const {
+    do_move_backward(dest, from, num);
+}
+
+void VectorImpl::reservedVectorImpl1() { }
+void VectorImpl::reservedVectorImpl2() { }
+void VectorImpl::reservedVectorImpl3() { }
+void VectorImpl::reservedVectorImpl4() { }
+void VectorImpl::reservedVectorImpl5() { }
+void VectorImpl::reservedVectorImpl6() { }
+void VectorImpl::reservedVectorImpl7() { }
+void VectorImpl::reservedVectorImpl8() { }
+
+/*****************************************************************************/
+
+SortedVectorImpl::SortedVectorImpl(size_t itemSize, uint32_t flags)
+    : VectorImpl(itemSize, flags)
+{
+}
+
+SortedVectorImpl::SortedVectorImpl(const VectorImpl& rhs)
+: VectorImpl(rhs)
+{
+}
+
+SortedVectorImpl::~SortedVectorImpl()
+{
+}
+
+SortedVectorImpl& SortedVectorImpl::operator = (const SortedVectorImpl& rhs)
+{
+    return static_cast<SortedVectorImpl&>( VectorImpl::operator = (static_cast<const VectorImpl&>(rhs)) );
+}
+
+ssize_t SortedVectorImpl::indexOf(const void* item) const
+{
+    return _indexOrderOf(item);
+}
+
+size_t SortedVectorImpl::orderOf(const void* item) const
+{
+    size_t o;
+    _indexOrderOf(item, &o);
+    return o;
+}
+
+ssize_t SortedVectorImpl::_indexOrderOf(const void* item, size_t* order) const
+{
+    // binary search
+    ssize_t err = NAME_NOT_FOUND;
+    ssize_t l = 0;
+    ssize_t h = size()-1;
+    ssize_t mid;
+    const void* a = arrayImpl();
+    const size_t s = itemSize();
+    while (l <= h) {
+        mid = l + (h - l)/2;
+        const void* const curr = reinterpret_cast<const char *>(a) + (mid*s);
+        const int c = do_compare(curr, item);
+        if (c == 0) {
+            err = l = mid;
+            break;
+        } else if (c < 0) {
+            l = mid + 1;
+        } else {
+            h = mid - 1;
+        }
+    }
+    if (order) *order = l;
+    return err;
+}
+
+ssize_t SortedVectorImpl::add(const void* item)
+{
+    size_t order;
+    ssize_t index = _indexOrderOf(item, &order);
+    if (index < 0) {
+        index = VectorImpl::insertAt(item, order, 1);
+    } else {
+        index = VectorImpl::replaceAt(item, index);
+    }
+    return index;
+}
+
+ssize_t SortedVectorImpl::merge(const VectorImpl& vector)
+{
+    // naive merge...
+    if (!vector.isEmpty()) {
+        const void* buffer = vector.arrayImpl();
+        const size_t is = itemSize();
+        size_t s = vector.size();
+        for (size_t i=0 ; i<s ; i++) {
+            ssize_t err = add( reinterpret_cast<const char*>(buffer) + i*is );
+            if (err<0) {
+                return err;
+            }
+        }
+    }
+    return NO_ERROR;
+}
+
+ssize_t SortedVectorImpl::merge(const SortedVectorImpl& vector)
+{
+    // we've merging a sorted vector... nice!
+    ssize_t err = NO_ERROR;
+    if (!vector.isEmpty()) {
+        // first take care of the case where the vectors are sorted together
+        if (do_compare(vector.itemLocation(vector.size()-1), arrayImpl()) <= 0) {
+            err = VectorImpl::insertVectorAt(static_cast<const VectorImpl&>(vector), 0);
+        } else if (do_compare(vector.arrayImpl(), itemLocation(size()-1)) >= 0) {
+            err = VectorImpl::appendVector(static_cast<const VectorImpl&>(vector));
+        } else {
+            // this could be made a little better
+            err = merge(static_cast<const VectorImpl&>(vector));
+        }
+    }
+    return err;
+}
+
+ssize_t SortedVectorImpl::remove(const void* item)
+{
+    ssize_t i = indexOf(item);
+    if (i>=0) {
+        VectorImpl::removeItemsAt(i, 1);
+    }
+    return i;
+}
+
+void SortedVectorImpl::reservedSortedVectorImpl1() { };
+void SortedVectorImpl::reservedSortedVectorImpl2() { };
+void SortedVectorImpl::reservedSortedVectorImpl3() { };
+void SortedVectorImpl::reservedSortedVectorImpl4() { };
+void SortedVectorImpl::reservedSortedVectorImpl5() { };
+void SortedVectorImpl::reservedSortedVectorImpl6() { };
+void SortedVectorImpl::reservedSortedVectorImpl7() { };
+void SortedVectorImpl::reservedSortedVectorImpl8() { };
+
+
+/*****************************************************************************/
+
+}; // namespace android
+
diff --git a/libpixelflinger/tinyutils/VectorImpl.h b/libpixelflinger/tinyutils/VectorImpl.h
new file mode 100644
index 0000000..e868eca
--- /dev/null
+++ b/libpixelflinger/tinyutils/VectorImpl.h
@@ -0,0 +1,185 @@
+/*
+ *  vector_impl.h
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_VECTOR_IMPL_H
+#define ANDROID_VECTOR_IMPL_H
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+// ---------------------------------------------------------------------------
+// No user serviceable parts in here...
+// ---------------------------------------------------------------------------
+
+namespace android {
+
+/*!
+ * Implementation of the guts of the vector<> class
+ * this ensures backward binary compatibility and
+ * reduces code size.
+ * For performance reasons, we expose mStorage and mCount
+ * so these fields are set in stone.
+ *
+ */
+
+class VectorImpl
+{
+public:
+    enum { // flags passed to the ctor
+        HAS_TRIVIAL_CTOR    = 0x00000001,
+        HAS_TRIVIAL_DTOR    = 0x00000002,
+        HAS_TRIVIAL_COPY    = 0x00000004,
+        HAS_TRIVIAL_ASSIGN  = 0x00000008
+    };
+
+                            VectorImpl(size_t itemSize, uint32_t flags);
+                            VectorImpl(const VectorImpl& rhs);
+    virtual                 ~VectorImpl();
+
+    /*! must be called from subclasses destructor */
+            void            finish_vector();
+
+            VectorImpl&     operator = (const VectorImpl& rhs);    
+            
+    /*! C-style array access */
+    inline  const void*     arrayImpl() const       { return mStorage; }
+            void*           editArrayImpl();
+            
+    /*! vector stats */
+    inline  size_t          size() const        { return mCount; }
+    inline  bool            isEmpty() const     { return mCount == 0; }
+            size_t          capacity() const;
+            ssize_t         setCapacity(size_t size);
+
+            /*! append/insert another vector */
+            ssize_t         insertVectorAt(const VectorImpl& vector, size_t index);
+            ssize_t         appendVector(const VectorImpl& vector);
+            
+            /*! add/insert/replace items */
+            ssize_t         insertAt(size_t where, size_t numItems = 1);
+            ssize_t         insertAt(const void* item, size_t where, size_t numItems = 1);
+            void            pop();
+            void            push();
+            void            push(const void* item);
+            ssize_t         add();
+            ssize_t         add(const void* item);
+            ssize_t         replaceAt(size_t index);
+            ssize_t         replaceAt(const void* item, size_t index);
+
+            /*! remove items */
+            ssize_t         removeItemsAt(size_t index, size_t count = 1);
+            void            clear();
+
+            const void*     itemLocation(size_t index) const;
+            void*           editItemLocation(size_t index);
+
+protected:
+            size_t          itemSize() const;
+            void            release_storage();
+
+    virtual void            do_construct(void* storage, size_t num) const = 0;
+    virtual void            do_destroy(void* storage, size_t num) const = 0;
+    virtual void            do_copy(void* dest, const void* from, size_t num) const = 0;
+    virtual void            do_splat(void* dest, const void* item, size_t num) const = 0;
+    virtual void            do_move_forward(void* dest, const void* from, size_t num) const = 0;
+    virtual void            do_move_backward(void* dest, const void* from, size_t num) const = 0;
+
+    // take care of FBC...
+    virtual void            reservedVectorImpl1();
+    virtual void            reservedVectorImpl2();
+    virtual void            reservedVectorImpl3();
+    virtual void            reservedVectorImpl4();
+    virtual void            reservedVectorImpl5();
+    virtual void            reservedVectorImpl6();
+    virtual void            reservedVectorImpl7();
+    virtual void            reservedVectorImpl8();
+    
+private:
+        void* _grow(size_t where, size_t amount);
+        void  _shrink(size_t where, size_t amount);
+
+        inline void _do_construct(void* storage, size_t num) const;
+        inline void _do_destroy(void* storage, size_t num) const;
+        inline void _do_copy(void* dest, const void* from, size_t num) const;
+        inline void _do_splat(void* dest, const void* item, size_t num) const;
+        inline void _do_move_forward(void* dest, const void* from, size_t num) const;
+        inline void _do_move_backward(void* dest, const void* from, size_t num) const;
+
+            // These 2 fields are exposed in the inlines below,
+            // so they're set in stone.
+            void *      mStorage;   // base address of the vector
+            size_t      mCount;     // number of items
+
+    const   uint32_t    mFlags;
+    const   size_t      mItemSize;
+};
+
+
+
+class SortedVectorImpl : public VectorImpl
+{
+public:
+                            SortedVectorImpl(size_t itemSize, uint32_t flags);
+                            SortedVectorImpl(const VectorImpl& rhs);
+    virtual                 ~SortedVectorImpl();
+    
+    SortedVectorImpl&     operator = (const SortedVectorImpl& rhs);    
+
+    //! finds the index of an item
+            ssize_t         indexOf(const void* item) const;
+
+    //! finds where this item should be inserted
+            size_t          orderOf(const void* item) const;
+
+    //! add an item in the right place (or replaces it if there is one)
+            ssize_t         add(const void* item);
+
+    //! merges a vector into this one
+            ssize_t         merge(const VectorImpl& vector);
+            ssize_t         merge(const SortedVectorImpl& vector);
+             
+    //! removes an item
+            ssize_t         remove(const void* item);
+        
+protected:
+    virtual int             do_compare(const void* lhs, const void* rhs) const = 0;
+
+    // take care of FBC...
+    virtual void            reservedSortedVectorImpl1();
+    virtual void            reservedSortedVectorImpl2();
+    virtual void            reservedSortedVectorImpl3();
+    virtual void            reservedSortedVectorImpl4();
+    virtual void            reservedSortedVectorImpl5();
+    virtual void            reservedSortedVectorImpl6();
+    virtual void            reservedSortedVectorImpl7();
+    virtual void            reservedSortedVectorImpl8();
+
+private:
+            ssize_t         _indexOrderOf(const void* item, size_t* order = 0) const;
+
+            // these are made private, because they can't be used on a SortedVector
+            // (they don't have an implementation either)
+            ssize_t         add();
+            void            pop();
+            void            push();
+            void            push(const void* item);
+            ssize_t         insertVectorAt(const VectorImpl& vector, size_t index);
+            ssize_t         appendVector(const VectorImpl& vector);
+            ssize_t         insertAt(size_t where, size_t numItems = 1);
+            ssize_t         insertAt(const void* item, size_t where, size_t numItems = 1);
+            ssize_t         replaceAt(size_t index);
+            ssize_t         replaceAt(const void* item, size_t index);
+};
+
+}; // namespace android
+
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_VECTOR_IMPL_H
diff --git a/libpixelflinger/tinyutils/smartpointer.h b/libpixelflinger/tinyutils/smartpointer.h
new file mode 100644
index 0000000..88032d7
--- /dev/null
+++ b/libpixelflinger/tinyutils/smartpointer.h
@@ -0,0 +1,170 @@
+/*
+ *  smartpointer.h
+ *  Android  
+ *
+ *  Copyright 2005 The Android Open Source Project
+ *
+ */
+
+#ifndef ANDROID_SMART_POINTER_H
+#define ANDROID_SMART_POINTER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+// ---------------------------------------------------------------------------
+namespace android {
+
+// ---------------------------------------------------------------------------
+
+#define COMPARE(_op_)                                           \
+inline bool operator _op_ (const sp<T>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+inline bool operator _op_ (const T* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const sp<U>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const U* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}
+
+// ---------------------------------------------------------------------------
+
+template <typename T>
+class sp
+{
+public:
+    inline sp() : m_ptr(0) { }
+
+    sp(T* other);
+    sp(const sp<T>& other);
+    template<typename U> sp(U* other);
+    template<typename U> sp(const sp<U>& other);
+
+    ~sp();
+    
+    // Assignment
+
+    sp& operator = (T* other);
+    sp& operator = (const sp<T>& other);
+    
+    template<typename U> sp& operator = (const sp<U>& other);
+    template<typename U> sp& operator = (U* other);
+    
+    // Reset
+    void clear();
+    
+    // Accessors
+
+    inline  T&      operator* () const  { return *m_ptr; }
+    inline  T*      operator-> () const { return m_ptr;  }
+    inline  T*      get() const         { return m_ptr; }
+
+    // Operators
+        
+    COMPARE(==)
+    COMPARE(!=)
+    COMPARE(>)
+    COMPARE(<)
+    COMPARE(<=)
+    COMPARE(>=)
+
+private:    
+    template<typename Y> friend class sp;
+
+    T*              m_ptr;
+};
+
+// ---------------------------------------------------------------------------
+// No user serviceable parts below here.
+
+template<typename T>
+sp<T>::sp(T* other)
+    : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T>
+sp<T>::sp(const sp<T>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(U* other) : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(const sp<U>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T>
+sp<T>::~sp()
+{
+    if (m_ptr) m_ptr->decStrong(this);
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (const sp<T>& other) {
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (T* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (const sp<U>& other)
+{
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (U* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T>
+void sp<T>::clear()
+{
+    if (m_ptr) {
+        m_ptr->decStrong(this);
+        m_ptr = 0;
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+}; // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_SMART_POINTER_H
diff --git a/libpixelflinger/trap.cpp b/libpixelflinger/trap.cpp
new file mode 100644
index 0000000..30b633f
--- /dev/null
+++ b/libpixelflinger/trap.cpp
@@ -0,0 +1,1173 @@
+/* libs/pixelflinger/trap.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "trap.h"
+#include "picker.h"
+
+#include <cutils/log.h>
+#include <cutils/memory.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// enable to see triangles edges
+#define DEBUG_TRANGLES  0
+
+// ----------------------------------------------------------------------------
+
+static void pointx_validate(void *con, const GGLcoord* c, GGLcoord r);
+static void pointx(void *con, const GGLcoord* c, GGLcoord r);
+static void aa_pointx(void *con, const GGLcoord* c, GGLcoord r);
+static void aa_nice_pointx(void *con, const GGLcoord* c, GGLcoord r);
+
+static void linex_validate(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+static void linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+static void aa_linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+
+static void recti_validate(void* c, GGLint l, GGLint t, GGLint r, GGLint b); 
+static void recti(void* c, GGLint l, GGLint t, GGLint r, GGLint b); 
+
+static void trianglex_validate(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_small(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_big(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void aa_trianglex(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_debug(void* con,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+
+static void aapolyx(void* con,
+        const GGLcoord* pts, int count);
+
+static inline int min(int a, int b) CONST;
+static inline int max(int a, int b) CONST;
+static inline int min(int a, int b, int c) CONST;
+static inline int max(int a, int b, int c) CONST;
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Tools
+#endif
+
+inline int min(int a, int b) {
+    return a<b ? a : b;
+}
+inline int max(int a, int b) {
+    return a<b ? b : a;
+}
+inline int min(int a, int b, int c) {
+    return min(a,min(b,c));
+}
+inline int max(int a, int b, int c) {
+    return max(a,max(b,c));
+}
+
+template <typename T>
+static inline void swap(T& a, T& b) {
+    T t(a);
+    a = b;
+    b = t;
+}
+
+static void
+triangle_dump_points( const GGLcoord*  v0,
+                      const GGLcoord*  v1,
+				 	  const GGLcoord*  v2 )
+{
+    float tri = 1.0f / TRI_ONE;
+  LOGD(     "  P0=(%.3f, %.3f)  [%08x, %08x]\n"
+            "  P1=(%.3f, %.3f)  [%08x, %08x]\n"
+            "  P2=(%.3f, %.3f)  [%08x, %08x]\n",
+		v0[0]*tri, v0[1]*tri, v0[0], v0[1],
+		v1[0]*tri, v1[1]*tri, v1[0], v1[1],
+		v2[0]*tri, v2[1]*tri, v2[0], v2[1] );
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Misc
+#endif
+
+void ggl_init_trap(context_t* c)
+{
+    ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE|GGL_TMU_STATE|GGL_CB_STATE);
+}
+
+void ggl_state_changed(context_t* c, int flags)
+{
+    if (ggl_likely(!c->dirty)) {
+        c->procs.pointx     = pointx_validate;
+        c->procs.linex      = linex_validate;
+        c->procs.recti      = recti_validate;
+        c->procs.trianglex  = trianglex_validate;
+    }
+    c->dirty |= uint32_t(flags);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Point
+#endif
+
+void pointx_validate(void *con, const GGLcoord* v, GGLcoord rad)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        if (c->state.enables & GGL_ENABLE_POINT_AA_NICE) {
+            c->procs.pointx = aa_nice_pointx;
+        } else {
+            c->procs.pointx = aa_pointx;
+        }
+    } else {
+        c->procs.pointx = pointx;
+    }
+    c->procs.pointx(con, v, rad);
+}
+
+void pointx(void *con, const GGLcoord* v, GGLcoord rad)
+{
+    GGL_CONTEXT(c, con);
+    GGLcoord halfSize = TRI_ROUND(rad) >> 1;
+    if (halfSize == 0)
+        halfSize = TRI_HALF;
+    GGLcoord xc = v[0]; 
+    GGLcoord yc = v[1];
+    if (halfSize & TRI_HALF) { // size odd
+        xc = TRI_FLOOR(xc) + TRI_HALF;
+        yc = TRI_FLOOR(yc) + TRI_HALF;
+    } else { // size even
+        xc = TRI_ROUND(xc);
+        yc = TRI_ROUND(yc);
+    }
+    GGLint l = (xc - halfSize) >> TRI_FRACTION_BITS;
+    GGLint t = (yc - halfSize) >> TRI_FRACTION_BITS;
+    GGLint r = (xc + halfSize) >> TRI_FRACTION_BITS;
+    GGLint b = (yc + halfSize) >> TRI_FRACTION_BITS;
+    recti(c, l, t, r, b);
+}
+
+// This way of computing the coverage factor, is more accurate and gives
+// better results for small circles, but it is also a lot slower.
+// Here we use super-sampling.
+static int32_t coverageNice(GGLcoord x, GGLcoord y, 
+        GGLcoord rmin, GGLcoord rmax, GGLcoord rr)
+{
+    const GGLcoord d2 = x*x + y*y;
+    if (d2 >= rmax) return 0;
+    if (d2 < rmin)  return 0x7FFF;
+
+    const int kSamples              =  4;
+    const int kInc                  =  4;    // 1/4 = 0.25
+    const int kCoverageUnit         =  1;    // 1/(4^2) = 0.0625
+    const GGLcoord kCoordOffset     = -6;    // -0.375
+
+    int hits = 0;
+    int x_sample = x + kCoordOffset;
+    for (int i=0 ; i<kSamples ; i++, x_sample += kInc) {
+        const int xval = rr - (x_sample * x_sample);
+        int y_sample = y + kCoordOffset;
+        for (int j=0 ; j<kSamples ; j++, y_sample += kInc) {
+            if (xval - (y_sample * y_sample) > 0)
+                hits += kCoverageUnit;
+        }
+    }
+    return min(0x7FFF, hits << (15 - kSamples));
+}
+
+
+void aa_nice_pointx(void *con, const GGLcoord* v, GGLcoord size)
+{
+    GGL_CONTEXT(c, con);
+
+    GGLcoord rad = ((size + 1)>>1);
+    GGLint l = (v[0] - rad) >> TRI_FRACTION_BITS;
+    GGLint t = (v[1] - rad) >> TRI_FRACTION_BITS;
+    GGLint r = (v[0] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLint b = (v[1] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLcoord xstart = TRI_FROM_INT(l) - v[0] + TRI_HALF; 
+    GGLcoord ystart = TRI_FROM_INT(t) - v[1] + TRI_HALF; 
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left)) {
+        xstart += TRI_FROM_INT(c->state.scissor.left-l);
+        l = GGLint(c->state.scissor.left);
+    }
+    if (t < GGLint(c->state.scissor.top)) {
+        ystart += TRI_FROM_INT(c->state.scissor.top-t);
+        t = GGLint(c->state.scissor.top);
+    }
+    if (r > GGLint(c->state.scissor.right)) {
+        r = GGLint(c->state.scissor.right);
+    }
+    if (b > GGLint(c->state.scissor.bottom)) {
+        b = GGLint(c->state.scissor.bottom);
+    }
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        int16_t* covPtr = c->state.buffers.coverage;
+        const int32_t sqr2Over2 = 0xC; // rounded up
+        GGLcoord rr = rad*rad;
+        GGLcoord rmin = (rad - sqr2Over2)*(rad - sqr2Over2);
+        GGLcoord rmax = (rad + sqr2Over2)*(rad + sqr2Over2);
+        GGLcoord y = ystart;
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+        do {
+            // compute coverage factors for each pixel
+            GGLcoord x = xstart;
+            for (int i=l ; i<r ; i++) {
+                covPtr[i] = coverageNice(x, y, rmin, rmax, rr);
+                x += TRI_ONE;
+            }
+            y += TRI_ONE;
+            c->scanline(c);
+            c->step_y(c);
+        } while (--yc);
+    }
+}
+
+// This is a cheap way of computing the coverage factor for a circle.
+// We just lerp between the circles of radii r-sqrt(2)/2 and r+sqrt(2)/2
+static inline int32_t coverageFast(GGLcoord x, GGLcoord y,
+        GGLcoord rmin, GGLcoord rmax, GGLcoord scale)
+{
+    const GGLcoord d2 = x*x + y*y;
+    if (d2 >= rmax) return 0;
+    if (d2 < rmin)  return 0x7FFF;
+    return 0x7FFF - (d2-rmin)*scale;
+}
+
+void aa_pointx(void *con, const GGLcoord* v, GGLcoord size)
+{
+    GGL_CONTEXT(c, con);
+
+    GGLcoord rad = ((size + 1)>>1);
+    GGLint l = (v[0] - rad) >> TRI_FRACTION_BITS;
+    GGLint t = (v[1] - rad) >> TRI_FRACTION_BITS;
+    GGLint r = (v[0] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLint b = (v[1] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLcoord xstart = TRI_FROM_INT(l) - v[0] + TRI_HALF; 
+    GGLcoord ystart = TRI_FROM_INT(t) - v[1] + TRI_HALF; 
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left)) {
+        xstart += TRI_FROM_INT(c->state.scissor.left-l);
+        l = GGLint(c->state.scissor.left);
+    }
+    if (t < GGLint(c->state.scissor.top)) {
+        ystart += TRI_FROM_INT(c->state.scissor.top-t);
+        t = GGLint(c->state.scissor.top);
+    }
+    if (r > GGLint(c->state.scissor.right)) {
+        r = GGLint(c->state.scissor.right);
+    }
+    if (b > GGLint(c->state.scissor.bottom)) {
+        b = GGLint(c->state.scissor.bottom);
+    }
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        int16_t* covPtr = c->state.buffers.coverage;
+        rad <<= 4;
+        const int32_t sqr2Over2 = 0xB5;    // fixed-point 24.8
+        GGLcoord rmin = rad - sqr2Over2;
+        GGLcoord rmax = rad + sqr2Over2;
+        GGLcoord scale;
+        rmin *= rmin;
+        rmax *= rmax;
+        scale = 0x800000 / (rmax - rmin);
+        rmin >>= 8;
+        rmax >>= 8;
+
+        GGLcoord y = ystart;
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+
+        do {
+            // compute coverage factors for each pixel
+            GGLcoord x = xstart;
+            for (int i=l ; i<r ; i++) {
+                covPtr[i] = coverageFast(x, y, rmin, rmax, scale);
+                x += TRI_ONE;
+            }
+            y += TRI_ONE;
+            c->scanline(c);
+            c->step_y(c);
+        } while (--yc);
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Line
+#endif
+
+void linex_validate(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        c->procs.linex = aa_linex;
+    } else {
+        c->procs.linex = linex;
+    }
+    c->procs.linex(con, v0, v1, w);
+}
+
+static void linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord width)
+{
+    GGL_CONTEXT(c, con);
+    GGLcoord v[4][2];
+    v[0][0] = v0[0];    v[0][1] = v0[1];
+    v[1][0] = v1[0];    v[1][1] = v1[1];
+    v0 = v[0];
+    v1 = v[1];
+    const GGLcoord dx = abs(v0[0] - v1[0]);
+    const GGLcoord dy = abs(v0[1] - v1[1]);
+    GGLcoord nx, ny;
+    nx = ny = 0;
+
+    GGLcoord halfWidth = TRI_ROUND(width) >> 1;
+    if (halfWidth == 0)
+        halfWidth = TRI_HALF;
+
+    ((dx > dy) ? ny : nx) = halfWidth;
+    v[2][0] = v1[0];    v[2][1] = v1[1];
+    v[3][0] = v0[0];    v[3][1] = v0[1];
+    v[0][0] += nx;      v[0][1] += ny;
+    v[1][0] += nx;      v[1][1] += ny;
+    v[2][0] -= nx;      v[2][1] -= ny;
+    v[3][0] -= nx;      v[3][1] -= ny;
+    trianglex_big(con, v[0], v[1], v[2]);
+    trianglex_big(con, v[0], v[2], v[3]);
+}
+
+static void aa_linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord width)
+{
+    GGL_CONTEXT(c, con);
+    GGLcoord v[4][2];
+    v[0][0] = v0[0];    v[0][1] = v0[1];
+    v[1][0] = v1[0];    v[1][1] = v1[1];
+    v0 = v[0];
+    v1 = v[1];
+    
+    const GGLcoord dx = v0[0] - v1[0];
+    const GGLcoord dy = v0[1] - v1[1];
+    GGLcoord nx = -dy;
+    GGLcoord ny =  dx;
+
+    // generally, this will be well below 1.0
+    const GGLfixed norm = gglMulx(width, gglSqrtRecipx(nx*nx+ny*ny), 4);
+    nx = gglMulx(nx, norm, 21);
+    ny = gglMulx(ny, norm, 21);
+    
+    v[2][0] = v1[0];    v[2][1] = v1[1];
+    v[3][0] = v0[0];    v[3][1] = v0[1];
+    v[0][0] += nx;      v[0][1] += ny;
+    v[1][0] += nx;      v[1][1] += ny;
+    v[2][0] -= nx;      v[2][1] -= ny;
+    v[3][0] -= nx;      v[3][1] -= ny;
+    aapolyx(con, v[0], 4);        
+}
+
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Rect
+#endif
+
+void recti_validate(void *con, GGLint l, GGLint t, GGLint r, GGLint b)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    c->procs.recti = recti;
+    c->procs.recti(con, l, t, r, b);
+}
+
+void recti(void* con, GGLint l, GGLint t, GGLint r, GGLint b)
+{
+    GGL_CONTEXT(c, con);
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left))
+        l = GGLint(c->state.scissor.left);
+    if (t < GGLint(c->state.scissor.top))
+        t = GGLint(c->state.scissor.top);
+    if (r > GGLint(c->state.scissor.right))
+        r = GGLint(c->state.scissor.right);
+    if (b > GGLint(c->state.scissor.bottom))
+        b = GGLint(c->state.scissor.bottom);
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+        c->rect(c, yc);
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Triangle / Debugging
+#endif
+
+static void scanline_set(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0xFF, size);
+}
+
+static void trianglex_debug(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        aa_trianglex(con,v0,v1,v2);
+    } else {
+        trianglex_big(con,v0,v1,v2);
+    }
+	void (*save_scanline)(context_t*)  = c->scanline;
+    c->scanline = scanline_set;
+    linex(con, v0, v1, TRI_ONE);
+    linex(con, v1, v2, TRI_ONE);
+    linex(con, v2, v0, TRI_ONE);
+    c->scanline = save_scanline;
+}
+
+static void trianglex_xor(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    trianglex_big(con,v0,v1,v2);
+    trianglex_small(con,v0,v1,v2);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Triangle
+#endif
+
+void trianglex_validate(void *con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        c->procs.trianglex = DEBUG_TRANGLES ? trianglex_debug : aa_trianglex;
+    } else {
+        c->procs.trianglex = DEBUG_TRANGLES ? trianglex_debug : trianglex_big;
+    }
+    c->procs.trianglex(con, v0, v1, v2);
+}
+
+// ----------------------------------------------------------------------------
+
+void trianglex_small(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+
+    // vertices are in 28.4 fixed point, which allows
+    // us to use 32 bits multiplies below.
+    int32_t x0 = v0[0];
+    int32_t y0 = v0[1];
+    int32_t x1 = v1[0];
+    int32_t y1 = v1[1];
+    int32_t x2 = v2[0];
+    int32_t y2 = v2[1];
+
+    int32_t dx01 = x0 - x1;
+    int32_t dy20 = y2 - y0;
+    int32_t dy01 = y0 - y1;
+    int32_t dx20 = x2 - x0;
+
+    // The code below works only with CCW triangles
+    // so if we get a CW triangle, we need to swap two of its vertices
+    if (dx01*dy20 < dy01*dx20) {
+        swap(x0, x1);
+        swap(y0, y1);
+        dx01 = x0 - x1;
+        dy01 = y0 - y1;
+        dx20 = x2 - x0;
+        dy20 = y2 - y0;
+    }
+    int32_t dx12 = x1 - x2;
+    int32_t dy12 = y1 - y2;
+
+    // bounding box & scissor
+    const int32_t bminx = TRI_FLOOR(min(x0, x1, x2)) >> TRI_FRACTION_BITS;
+    const int32_t bminy = TRI_FLOOR(min(y0, y1, y2)) >> TRI_FRACTION_BITS;
+    const int32_t bmaxx = TRI_CEIL( max(x0, x1, x2)) >> TRI_FRACTION_BITS;
+    const int32_t bmaxy = TRI_CEIL( max(y0, y1, y2)) >> TRI_FRACTION_BITS;
+    const int32_t minx = max(bminx, c->state.scissor.left);
+    const int32_t miny = max(bminy, c->state.scissor.top);
+    const int32_t maxx = min(bmaxx, c->state.scissor.right);
+    const int32_t maxy = min(bmaxy, c->state.scissor.bottom);
+    if ((minx >= maxx) || (miny >= maxy))
+        return; // too small or clipped out...
+
+    // step equations to the bounding box and snap to pixel center
+    const int32_t my = (miny << TRI_FRACTION_BITS) + TRI_HALF;
+    const int32_t mx = (minx << TRI_FRACTION_BITS) + TRI_HALF;
+    int32_t ey0 = dy01 * (x0 - mx) - dx01 * (y0 - my);
+    int32_t ey1 = dy12 * (x1 - mx) - dx12 * (y1 - my);
+    int32_t ey2 = dy20 * (x2 - mx) - dx20 * (y2 - my);
+
+    // right-exclusive fill rule, to avoid rare cases
+    // of over drawing
+    if (dy01<0 || (dy01 == 0 && dx01>0)) ey0++;
+    if (dy12<0 || (dy12 == 0 && dx12>0)) ey1++;
+    if (dy20<0 || (dy20 == 0 && dx20>0)) ey2++;
+    
+    c->init_y(c, miny);
+    for (int32_t y = miny; y < maxy; y++) {
+        register int32_t ex0 = ey0;
+        register int32_t ex1 = ey1;
+        register int32_t ex2 = ey2;    
+        register int32_t xl, xr;
+        for (xl=minx ; xl<maxx ; xl++) {
+            if (ex0>0 && ex1>0 && ex2>0)
+                break; // all strictly positive
+            ex0 -= dy01 << TRI_FRACTION_BITS;
+            ex1 -= dy12 << TRI_FRACTION_BITS;
+            ex2 -= dy20 << TRI_FRACTION_BITS;
+        }
+        xr = xl;
+        for ( ; xr<maxx ; xr++) {
+            if (!(ex0>0 && ex1>0 && ex2>0))
+                break; // not all strictly positive
+            ex0 -= dy01 << TRI_FRACTION_BITS;
+            ex1 -= dy12 << TRI_FRACTION_BITS;
+            ex2 -= dy20 << TRI_FRACTION_BITS;
+        }
+
+        if (xl < xr) {
+            c->iterators.xl = xl;
+            c->iterators.xr = xr;
+            c->scanline(c);
+        }
+        c->step_y(c);
+
+        ey0 += dx01 << TRI_FRACTION_BITS;
+        ey1 += dx12 << TRI_FRACTION_BITS;
+        ey2 += dx20 << TRI_FRACTION_BITS;
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+// the following routine fills a triangle via edge stepping, which
+// unfortunately requires divisions in the setup phase to get right,
+// it should probably only be used for relatively large trianges
+
+
+// x = y*DX/DY    (ou DX and DY are constants, DY > 0, et y >= 0)
+// 
+// for an equation of the type:
+//      x' = y*K/2^p     (with K and p constants "carefully chosen")
+// 
+// We can now do a DDA without precision loss. We define 'e' by:
+//      x' - x = y*(DX/DY - K/2^p) = y*e
+// 
+// If we choose K = round(DX*2^p/DY) then,
+//      abs(e) <= 1/2^(p+1) by construction
+// 
+// therefore abs(x'-x) = y*abs(e) <= y/2^(p+1) <= DY/2^(p+1) <= DMAX/2^(p+1)
+// 
+// which means that if DMAX <= 2^p, therefore abs(x-x') <= 1/2, including
+// at the last line. In fact, it's even a strict inequality except in one
+// extrem case (DY == DMAX et e = +/- 1/2)
+// 
+// Applying that to our coordinates, we need 2^p >= 4096*16 = 65536
+// so p = 16 is enough, we're so lucky!
+
+const int TRI_ITERATORS_BITS = 16;
+
+struct Edge
+{
+  int32_t  x;      // edge position in 16.16 coordinates
+  int32_t  x_incr; // on each step, increment x by that amount
+  int32_t  y_top;  // starting scanline, 16.4 format
+  int32_t  y_bot;
+};
+
+static void
+edge_dump( Edge*  edge )
+{
+  LOGI( "  top=%d (%.3f)  bot=%d (%.3f)  x=%d (%.3f)  ix=%d (%.3f)",
+        edge->y_top, edge->y_top/float(TRI_ONE),
+		edge->y_bot, edge->y_bot/float(TRI_ONE),
+		edge->x, edge->x/float(FIXED_ONE),
+		edge->x_incr, edge->x_incr/float(FIXED_ONE) );
+}
+
+static void
+triangle_dump_edges( Edge*  edges,
+                     int            count )
+{ 
+    LOGI( "%d edge%s:\n", count, count == 1 ? "" : "s" );
+	for ( ; count > 0; count--, edges++ )
+	  edge_dump( edges );
+}
+
+// the following function sets up an edge, it assumes
+// that ymin and ymax are in already in the 'reduced'
+// format
+static __attribute__((noinline))
+void edge_setup(
+        Edge*           edges,
+        int*            pcount,
+        const GGLcoord* p1,
+        const GGLcoord* p2,
+        int32_t         ymin,
+        int32_t         ymax )
+{
+	const GGLfixed*  top = p1;
+	const GGLfixed*  bot = p2;
+	Edge*    edge = edges + *pcount;
+
+	if (top[1] > bot[1]) {
+        swap(top, bot);
+	}
+
+	int  y1 = top[1] | 1;
+	int  y2 = bot[1] | 1;
+	int  dy = y2 - y1;
+
+	if ( dy == 0 || y1 > ymax || y2 < ymin )
+		return;
+
+	if ( y1 > ymin )
+		ymin = TRI_SNAP_NEXT_HALF(y1);
+	
+	if ( y2 < ymax )
+		ymax = TRI_SNAP_PREV_HALF(y2);
+
+	if ( ymin > ymax )  // when the edge doesn't cross any scanline
+	  return;
+
+	const int x1 = top[0];
+	const int dx = bot[0] - x1;
+    const int shift = TRI_ITERATORS_BITS - TRI_FRACTION_BITS;
+
+	// setup edge fields
+    // We add 0.5 to edge->x here because it simplifies the rounding
+    // in triangle_sweep_edges() -- this doesn't change the ordering of 'x'
+	edge->x      = (x1 << shift) + (1LU << (TRI_ITERATORS_BITS-1));
+	edge->x_incr = 0;
+	edge->y_top  = ymin;
+	edge->y_bot  = ymax;
+
+	if (ggl_likely(ymin <= ymax && dx)) {
+        edge->x_incr = gglDivQ16(dx, dy);
+    }
+    if (ggl_likely(y1 < ymin)) {
+        int32_t xadjust = (edge->x_incr * (ymin-y1)) >> TRI_FRACTION_BITS;
+        edge->x += xadjust;
+    }
+  
+	++*pcount;
+}
+
+
+static void
+triangle_sweep_edges( Edge*  left,
+                      Edge*  right,
+					  int            ytop,
+					  int            ybot,
+					  context_t*     c )
+{
+    int count = ((ybot - ytop)>>TRI_FRACTION_BITS) + 1;
+    if (count<=0) return;
+
+    // sort the edges horizontally
+    if ((left->x > right->x) || 
+        ((left->x == right->x) && (left->x_incr > right->x_incr))) {
+        swap(left, right);
+    }
+
+    int left_x = left->x;
+    int right_x = right->x;
+    const int left_xi = left->x_incr;
+    const int right_xi  = right->x_incr;
+    left->x  += left_xi * count;
+    right->x += right_xi * count;
+
+	const int xmin = c->state.scissor.left;
+	const int xmax = c->state.scissor.right;
+    do {
+        // horizontal scissoring
+        const int32_t xl = max(left_x  >> TRI_ITERATORS_BITS, xmin);
+        const int32_t xr = min(right_x >> TRI_ITERATORS_BITS, xmax);
+        left_x  += left_xi;
+        right_x += right_xi;
+        // invoke the scanline rasterizer
+        if (ggl_likely(xl < xr)) {
+            c->iterators.xl = xl;
+            c->iterators.xr = xr;
+            c->scanline(c);
+        }
+		c->step_y(c);
+	} while (--count);
+}
+
+
+void trianglex_big(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+
+    Edge edges[3];
+	int num_edges = 0;
+	int32_t ymin = TRI_FROM_INT(c->state.scissor.top)    + TRI_HALF;
+	int32_t ymax = TRI_FROM_INT(c->state.scissor.bottom) - TRI_HALF;
+	    
+	edge_setup( edges, &num_edges, v0, v1, ymin, ymax );
+	edge_setup( edges, &num_edges, v0, v2, ymin, ymax );
+	edge_setup( edges, &num_edges, v1, v2, ymin, ymax );
+
+    if (ggl_unlikely(num_edges<2))  // for really tiny triangles that don't
+		return;                     // cross any scanline centers
+
+    Edge* left  = &edges[0];
+    Edge* right = &edges[1];
+    Edge* other = &edges[2];
+    int32_t y_top = min(left->y_top, right->y_top);
+    int32_t y_bot = max(left->y_bot, right->y_bot);
+
+	if (ggl_likely(num_edges==3)) {
+        y_top = min(y_top, edges[2].y_top);
+        y_bot = max(y_bot, edges[2].y_bot);
+		if (edges[0].y_top > y_top) {
+            other = &edges[0];
+            left  = &edges[2];
+		} else if (edges[1].y_top > y_top) {
+            other = &edges[1];
+            right = &edges[2];
+		}
+    }
+
+    c->init_y(c, y_top >> TRI_FRACTION_BITS);
+
+    int32_t y_mid = min(left->y_bot, right->y_bot);
+    triangle_sweep_edges( left, right, y_top, y_mid, c );
+
+    // second scanline sweep loop, if necessary
+    y_mid += TRI_ONE;
+    if (y_mid <= y_bot) {
+        ((left->y_bot == y_bot) ? right : left) = other;
+        if (other->y_top < y_mid) {
+            other->x += other->x_incr;
+        }
+        triangle_sweep_edges( left, right, y_mid, y_bot, c );
+    }
+}
+
+void aa_trianglex(void* con,
+        const GGLcoord* a, const GGLcoord* b, const GGLcoord* c)
+{
+    GGLcoord pts[6] = { a[0], a[1], b[0], b[1], c[0], c[1] };
+    aapolyx(con, pts, 3);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+struct AAEdge
+{
+    GGLfixed x;         // edge position in 12.16 coordinates
+    GGLfixed x_incr;    // on each y step, increment x by that amount
+    GGLfixed y_incr;    // on each x step, increment y by that amount
+    int16_t y_top;      // starting scanline, 12.4 format
+    int16_t y_bot;      // starting scanline, 12.4 format
+    void dump();
+};
+
+void AAEdge::dump()
+{
+    float tri  = 1.0f / TRI_ONE;
+    float iter = 1.0f / (1<<TRI_ITERATORS_BITS);
+    float fix  = 1.0f / FIXED_ONE;
+    LOGD(   "x=%08x (%.3f), "
+            "x_incr=%08x (%.3f), y_incr=%08x (%.3f), "
+            "y_top=%08x (%.3f), y_bot=%08x (%.3f) ",
+        x, x*fix,
+        x_incr, x_incr*iter,
+        y_incr, y_incr*iter,
+        y_top, y_top*tri,
+        y_bot, y_bot*tri );
+}
+
+// the following function sets up an edge, it assumes
+// that ymin and ymax are in already in the 'reduced'
+// format
+static __attribute__((noinline))
+void aa_edge_setup(
+        AAEdge*         edges,
+        int*            pcount,
+        const GGLcoord* p1,
+        const GGLcoord* p2,
+        int32_t         ymin,
+        int32_t         ymax )
+{
+    const GGLfixed*  top = p1;
+    const GGLfixed*  bot = p2;
+    AAEdge* edge = edges + *pcount;
+
+    if (top[1] > bot[1])
+        swap(top, bot);
+
+    int  y1 = top[1];
+    int  y2 = bot[1];
+    int  dy = y2 - y1;
+
+    if (dy==0 || y1>ymax || y2<ymin)
+        return;
+
+    if (y1 > ymin)
+        ymin = y1;
+    
+    if (y2 < ymax)
+        ymax = y2;
+
+    const int x1 = top[0];
+    const int dx = bot[0] - x1;
+    const int shift = FIXED_BITS - TRI_FRACTION_BITS;
+
+    // setup edge fields
+    edge->x      = x1 << shift;
+    edge->x_incr = 0;
+    edge->y_top  = ymin;
+    edge->y_bot  = ymax;
+    edge->y_incr = 0x7FFFFFFF;
+
+    if (ggl_likely(ymin <= ymax && dx)) {
+        edge->x_incr = gglDivQ16(dx, dy);
+        if (dx != 0) {
+            edge->y_incr = abs(gglDivQ16(dy, dx));
+        }
+    }
+    if (ggl_likely(y1 < ymin)) {
+        int32_t xadjust = (edge->x_incr * (ymin-y1))
+                >> (TRI_FRACTION_BITS + TRI_ITERATORS_BITS - FIXED_BITS);
+        edge->x += xadjust;
+    }
+  
+    ++*pcount;
+}
+
+
+typedef int (*compar_t)(const void*, const void*);
+static int compare_edges(const AAEdge *e0, const AAEdge *e1) {
+    if (e0->y_top > e1->y_top)      return 1;
+    if (e0->y_top < e1->y_top)      return -1;
+    if (e0->x > e1->x)              return 1;
+    if (e0->x < e1->x)              return -1;
+    if (e0->x_incr > e1->x_incr)    return 1;
+    if (e0->x_incr < e1->x_incr)    return -1;
+    return 0; // same edges, should never happen
+}
+
+static inline 
+void SET_COVERAGE(int16_t*& p, int32_t value, ssize_t n)
+{
+    android_memset16((uint16_t*)p, value, n*2);
+    p += n;
+}
+
+static inline 
+void ADD_COVERAGE(int16_t*& p, int32_t value)
+{
+    value = *p + value;
+    if (value >= 0x8000)
+        value = 0x7FFF;
+    *p++ = value;
+}
+
+static inline
+void SUB_COVERAGE(int16_t*& p, int32_t value)
+{
+    value = *p - value;
+    value &= ~(value>>31);
+    *p++ = value;
+}
+
+void aapolyx(void* con,
+        const GGLcoord* pts, int count)
+{
+    /*
+     * NOTE: This routine assumes that the polygon has been clipped to the
+     * viewport already, that is, no vertex lies outside of the framebuffer.
+     * If this happens, the code below won't corrupt memory but the 
+     * coverage values may not be correct.
+     */
+    
+    GGL_CONTEXT(c, con);
+
+    // we do only quads for now (it's used for thick lines)
+    if ((count>4) || (count<2)) return;
+
+    // take scissor into account
+    const int xmin = c->state.scissor.left;
+    const int xmax = c->state.scissor.right;
+    if (xmin >= xmax) return;
+
+    // generate edges from the vertices
+    int32_t ymin = TRI_FROM_INT(c->state.scissor.top);
+    int32_t ymax = TRI_FROM_INT(c->state.scissor.bottom);
+    if (ymin >= ymax) return;
+
+    AAEdge edges[4];
+    int num_edges = 0;
+    GGLcoord const * p = pts;
+    for (int i=0 ; i<count-1 ; i++, p+=2) {
+        aa_edge_setup(edges, &num_edges, p, p+2, ymin, ymax);
+    }
+    aa_edge_setup(edges, &num_edges, p, pts, ymin, ymax );
+    if (ggl_unlikely(num_edges<2))
+        return;
+
+    // sort the edge list top to bottom, left to right.
+    qsort(edges, num_edges, sizeof(AAEdge), (compar_t)compare_edges);
+
+    int16_t* const covPtr = c->state.buffers.coverage;
+    memset(covPtr+xmin, 0, (xmax-xmin)*sizeof(*covPtr));
+
+    // now, sweep all edges in order
+    // start with the 2 first edges. We know that they share their top
+    // vertex, by construction.
+    int i = 2;
+    AAEdge* left  = &edges[0];
+    AAEdge* right = &edges[1];
+    int32_t yt = left->y_top;
+    GGLfixed l = left->x;
+    GGLfixed r = right->x;
+    int retire = 0;
+    int16_t* coverage;
+
+    // at this point we can initialize the rasterizer    
+    c->init_y(c, yt>>TRI_FRACTION_BITS);
+    c->iterators.xl = xmax;
+    c->iterators.xr = xmin;
+
+    do {
+        int32_t y = min(min(left->y_bot, right->y_bot), TRI_FLOOR(yt + TRI_ONE));
+        const int32_t shift = TRI_FRACTION_BITS + TRI_ITERATORS_BITS - FIXED_BITS;
+        const int cf_shift = (1 + TRI_FRACTION_BITS*2 + TRI_ITERATORS_BITS - 15);
+
+        // compute xmin and xmax for the left edge
+        GGLfixed l_min = gglMulAddx(left->x_incr, y - left->y_top, left->x, shift);
+        GGLfixed l_max = l;
+        l = l_min;
+        if (l_min > l_max)
+            swap(l_min, l_max);
+
+        // compute xmin and xmax for the right edge
+        GGLfixed r_min = gglMulAddx(right->x_incr, y - right->y_top, right->x, shift);
+        GGLfixed r_max = r;
+        r = r_min;
+        if (r_min > r_max)
+            swap(r_min, r_max);
+
+        // make sure we're not touching coverage values outside of the
+        // framebuffer
+        l_min &= ~(l_min>>31);
+        r_min &= ~(r_min>>31);
+        l_max &= ~(l_max>>31);
+        r_max &= ~(r_max>>31);
+        if (gglFixedToIntFloor(l_min) >= xmax) l_min = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntFloor(r_min) >= xmax) r_min = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntCeil(l_max) >= xmax)  l_max = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntCeil(r_max) >= xmax)  r_max = gglIntToFixed(xmax)-1;
+
+        // compute the integer versions of the above
+        const GGLfixed l_min_i = gglFloorx(l_min);
+        const GGLfixed l_max_i = gglCeilx (l_max);
+        const GGLfixed r_min_i = gglFloorx(r_min);
+        const GGLfixed r_max_i = gglCeilx (r_max);
+
+        // clip horizontally using the scissor
+        const int xml = max(xmin, gglFixedToIntFloor(l_min_i));
+        const int xmr = min(xmax, gglFixedToIntFloor(r_max_i));
+
+        // if we just stepped to a new scanline, render the previous one.
+        // and clear the coverage buffer
+        if (retire) {
+            if (c->iterators.xl < c->iterators.xr)
+                c->scanline(c);
+            c->step_y(c);
+            memset(covPtr+xmin, 0, (xmax-xmin)*sizeof(*covPtr));
+            c->iterators.xl = xml;
+            c->iterators.xr = xmr;
+        } else {
+            // update the horizontal range of this scanline
+            c->iterators.xl = min(c->iterators.xl, xml);
+            c->iterators.xr = max(c->iterators.xr, xmr);
+        }
+
+        coverage = covPtr + gglFixedToIntFloor(l_min_i);
+        if (l_min_i == gglFloorx(l_max)) {
+            
+            /*
+             *  fully traverse this pixel vertically
+             *       l_max
+             *  +-----/--+  yt
+             *  |    /   |  
+             *  |   /    |
+             *  |  /     |
+             *  +-/------+  y
+             *   l_min  (l_min_i + TRI_ONE)
+             */
+              
+            GGLfixed dx = l_max - l_min;
+            int32_t dy = y - yt;
+            int cf = gglMulx((dx >> 1) + (l_min_i + FIXED_ONE - l_max), dy,
+                FIXED_BITS + TRI_FRACTION_BITS - 15);
+            ADD_COVERAGE(coverage, cf);
+            // all pixels on the right have cf = 1.0
+        } else {
+            /*
+             *  spans several pixels in one scanline
+             *            l_max
+             *  +--------+--/-----+  yt
+             *  |        |/       |
+             *  |       /|        |
+             *  |     /  |        |
+             *  +---/----+--------+  y
+             *   l_min (l_min_i + TRI_ONE)
+             */
+
+            // handle the first pixel separately...
+            const int32_t y_incr = left->y_incr;
+            int32_t dx = TRI_FROM_FIXED(l_min_i - l_min) + TRI_ONE;
+            int32_t cf = (dx * dx * y_incr) >> cf_shift;
+            ADD_COVERAGE(coverage, cf);
+
+            // following pixels get covered by y_incr, but we need
+            // to fix-up the cf to account for previous partial pixel
+            dx = TRI_FROM_FIXED(l_min - l_min_i);
+            cf -= (dx * dx * y_incr) >> cf_shift;
+            for (int x = l_min_i+FIXED_ONE ; x < l_max_i-FIXED_ONE ; x += FIXED_ONE) {
+                cf += y_incr >> (TRI_ITERATORS_BITS-15);
+                ADD_COVERAGE(coverage, cf);
+            }
+            
+            // and the last pixel
+            dx = TRI_FROM_FIXED(l_max - l_max_i) - TRI_ONE;
+            cf += (dx * dx * y_incr) >> cf_shift;
+            ADD_COVERAGE(coverage, cf);
+        }
+        
+        // now, fill up all fully covered pixels
+        coverage = covPtr + gglFixedToIntFloor(l_max_i);
+        int cf = ((y - yt) << (15 - TRI_FRACTION_BITS));
+        if (ggl_likely(cf >= 0x8000)) {
+            SET_COVERAGE(coverage, 0x7FFF, ((r_max - l_max_i)>>FIXED_BITS)+1);
+        } else {
+            for (int x=l_max_i ; x<r_max ; x+=FIXED_ONE) {
+                ADD_COVERAGE(coverage, cf);
+            }
+        }
+        
+        // subtract the coverage of the right edge
+        coverage = covPtr + gglFixedToIntFloor(r_min_i); 
+        if (r_min_i == gglFloorx(r_max)) {
+            GGLfixed dx = r_max - r_min;
+            int32_t dy = y - yt;
+            int cf = gglMulx((dx >> 1) + (r_min_i + FIXED_ONE - r_max), dy,
+                FIXED_BITS + TRI_FRACTION_BITS - 15);
+            SUB_COVERAGE(coverage, cf);
+            // all pixels on the right have cf = 1.0
+        } else {
+            // handle the first pixel separately...
+            const int32_t y_incr = right->y_incr;
+            int32_t dx = TRI_FROM_FIXED(r_min_i - r_min) + TRI_ONE;
+            int32_t cf = (dx * dx * y_incr) >> cf_shift;
+            SUB_COVERAGE(coverage, cf);
+            
+            // following pixels get covered by y_incr, but we need
+            // to fix-up the cf to account for previous partial pixel
+            dx = TRI_FROM_FIXED(r_min - r_min_i);
+            cf -= (dx * dx * y_incr) >> cf_shift;
+            for (int x = r_min_i+FIXED_ONE ; x < r_max_i-FIXED_ONE ; x += FIXED_ONE) {
+                cf += y_incr >> (TRI_ITERATORS_BITS-15);
+                SUB_COVERAGE(coverage, cf);
+            }
+            
+            // and the last pixel
+            dx = TRI_FROM_FIXED(r_max - r_max_i) - TRI_ONE;
+            cf += (dx * dx * y_incr) >> cf_shift;
+            SUB_COVERAGE(coverage, cf);
+        }
+
+        // did we reach the end of an edge? if so, get a new one.
+        if (y == left->y_bot || y == right->y_bot) {
+            // bail out if we're done
+            if (i>=num_edges)
+                break;
+            if (y == left->y_bot)
+                left = &edges[i++];
+            if (y == right->y_bot)
+                right = &edges[i++];
+        }
+
+        // next scanline
+        yt = y;
+        
+        // did we just finish a scanline?        
+        retire = (y << (32-TRI_FRACTION_BITS)) == 0;
+    } while (true);
+
+    // render the last scanline
+    if (c->iterators.xl < c->iterators.xr)
+        c->scanline(c);
+}
+
+}; // namespace android
diff --git a/libpixelflinger/trap.h b/libpixelflinger/trap.h
new file mode 100644
index 0000000..7cce7b3
--- /dev/null
+++ b/libpixelflinger/trap.h
@@ -0,0 +1,31 @@
+/* libs/pixelflinger/trap.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_TRAP_H
+#define ANDROID_TRAP_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_trap(context_t* c);
+void ggl_state_changed(context_t* c, int flags);
+
+}; // namespace android
+
+#endif