9 files changed, 975 insertions, 1517 deletions
diff --git a/libs/rs/java/tests/src/com/android/rs/test/math.rs b/libs/rs/java/tests/src/com/android/rs/test/math.rs
index 02993fe..8cad82b 100644
--- a/libs/rs/java/tests/src/com/android/rs/test/math.rs
+++ b/libs/rs/java/tests/src/com/android/rs/test/math.rs
@@ -12,6 +12,31 @@ volatile int2 i2;
 volatile int3 i3;
 volatile int4 i4;
 
+volatile uint ui1;
+volatile uint2 ui2;
+volatile uint3 ui3;
+volatile uint4 ui4;
+
+volatile short s1;
+volatile short2 s2;
+volatile short3 s3;
+volatile short4 s4;
+
+volatile ushort us1;
+volatile ushort2 us2;
+volatile ushort3 us3;
+volatile ushort4 us4;
+
+volatile char c1;
+volatile char2 c2;
+volatile char3 c3;
+volatile char4 c4;
+
+volatile uchar uc1;
+volatile uchar2 uc2;
+volatile uchar3 uc3;
+volatile uchar4 uc4;
+
 #define TEST_FN_FUNC_FN(fnc)        \
     rsDebug("Testing " #fnc, 0);    \
     f1 = fnc(f1);                   \
@@ -168,9 +193,124 @@ static bool test_fp_math(uint32_t index) {
     return failed;
 }
 
+#define DECL_INT(prefix)            \
+volatile char prefix##_c_1 = 1;     \
+volatile char2 prefix##_c_2 = 1;    \
+volatile char3 prefix##_c_3 = 1;    \
+volatile char4 prefix##_c_4 = 1;    \
+volatile uchar prefix##_uc_1 = 1;   \
+volatile uchar2 prefix##_uc_2 = 1;  \
+volatile uchar3 prefix##_uc_3 = 1;  \
+volatile uchar4 prefix##_uc_4 = 1;  \
+volatile short prefix##_s_1 = 1;    \
+volatile short2 prefix##_s_2 = 1;   \
+volatile short3 prefix##_s_3 = 1;   \
+volatile short4 prefix##_s_4 = 1;   \
+volatile ushort prefix##_us_1 = 1;  \
+volatile ushort2 prefix##_us_2 = 1; \
+volatile ushort3 prefix##_us_3 = 1; \
+volatile ushort4 prefix##_us_4 = 1; \
+volatile int prefix##_i_1 = 1;      \
+volatile int2 prefix##_i_2 = 1;     \
+volatile int3 prefix##_i_3 = 1;     \
+volatile int4 prefix##_i_4 = 1;     \
+volatile uint prefix##_ui_1 = 1;    \
+volatile uint2 prefix##_ui_2 = 1;   \
+volatile uint3 prefix##_ui_3 = 1;   \
+volatile uint4 prefix##_ui_4 = 1;   \
+volatile long prefix##_l_1 = 1;     \
+volatile ulong prefix##_ul_1 = 1;
+
+#define TEST_INT_OP_TYPE(op, type)                      \
+rsDebug("Testing " #op " for " #type "1", i++);         \
+res_##type##_1 = src1_##type##_1 op src2_##type##_1;    \
+rsDebug("Testing " #op " for " #type "2", i++);         \
+res_##type##_2 = src1_##type##_2 op src2_##type##_2;    \
+rsDebug("Testing " #op " for " #type "3", i++);         \
+res_##type##_3 = src1_##type##_3 op src2_##type##_3;    \
+rsDebug("Testing " #op " for " #type "4", i++);         \
+res_##type##_4 = src1_##type##_4 op src2_##type##_4;
+
+#define TEST_INT_OP(op)                     \
+TEST_INT_OP_TYPE(op, c)                     \
+TEST_INT_OP_TYPE(op, uc)                    \
+TEST_INT_OP_TYPE(op, s)                     \
+TEST_INT_OP_TYPE(op, us)                    \
+TEST_INT_OP_TYPE(op, i)                     \
+TEST_INT_OP_TYPE(op, ui)                    \
+rsDebug("Testing " #op " for l1", i++);     \
+res_l_1 = src1_l_1 op src2_l_1;             \
+rsDebug("Testing " #op " for ul1", i++);    \
+res_ul_1 = src1_ul_1 op src2_ul_1;
+
+DECL_INT(res)
+DECL_INT(src1)
+DECL_INT(src2)
+
+static bool test_basic_operators() {
+    bool failed = false;
+    int i = 0;
+
+    TEST_INT_OP(+);
+    TEST_INT_OP(-);
+    TEST_INT_OP(*);
+    TEST_INT_OP(/);
+    TEST_INT_OP(%);
+    TEST_INT_OP(<<);
+    TEST_INT_OP(>>);
+
+    if (failed) {
+        rsDebug("test_basic_operators FAILED", 0);
+    }
+    else {
+        rsDebug("test_basic_operators PASSED", 0);
+    }
+
+    return failed;
+}
+
+#define TEST_CVT(to, from, type)                        \
+rsDebug("Testing convert from " #from " to " #to, 0);   \
+to##1 = from##1;                                        \
+to##2 = convert_##type##2(from##2);                     \
+to##3 = convert_##type##3(from##3);                     \
+to##4 = convert_##type##4(from##4);
+
+#define TEST_CVT_MATRIX(to, type)   \
+TEST_CVT(to, c, type);              \
+TEST_CVT(to, uc, type);             \
+TEST_CVT(to, s, type);              \
+TEST_CVT(to, us, type);             \
+TEST_CVT(to, i, type);              \
+TEST_CVT(to, ui, type);             \
+TEST_CVT(to, f, type);              \
+
+static bool test_convert() {
+    bool failed = false;
+
+    TEST_CVT_MATRIX(c, char);
+    TEST_CVT_MATRIX(uc, uchar);
+    TEST_CVT_MATRIX(s, short);
+    TEST_CVT_MATRIX(us, ushort);
+    TEST_CVT_MATRIX(i, int);
+    TEST_CVT_MATRIX(ui, uint);
+    TEST_CVT_MATRIX(f, float);
+
+    if (failed) {
+        rsDebug("test_convert FAILED", 0);
+    }
+    else {
+        rsDebug("test_convert PASSED", 0);
+    }
+
+    return failed;
+}
+
 void math_test(uint32_t index, int test_num) {
     bool failed = false;
+    failed |= test_convert();
     failed |= test_fp_math(index);
+    failed |= test_basic_operators();
 
     if (failed) {
         rsSendToClientBlocking(RS_MSG_TEST_FAILED);
diff --git a/libs/rs/rsContext.cpp b/libs/rs/rsContext.cpp
index 3acb624..40cb5c7 100644
--- a/libs/rs/rsContext.cpp
+++ b/libs/rs/rsContext.cpp
@@ -201,9 +201,9 @@ bool Context::initGLThread() {
     mGL.mExtensions = glGetString(GL_EXTENSIONS);
 
     //LOGV("EGL Version %i %i", mEGL.mMajorVersion, mEGL.mMinorVersion);
-    LOGV("GL Version %s", mGL.mVersion);
+    //LOGV("GL Version %s", mGL.mVersion);
     //LOGV("GL Vendor %s", mGL.mVendor);
-    LOGV("GL Renderer %s", mGL.mRenderer);
+    //LOGV("GL Renderer %s", mGL.mRenderer);
     //LOGV("GL Extensions %s", mGL.mExtensions);
 
     const char *verptr = NULL;
@@ -468,7 +468,6 @@ void * Context::threadProc(void *vrsc) {
          return NULL;
      }
 
-     rsc->mScriptC.init(rsc);
      if (rsc->mIsGraphicsContext) {
          rsc->mStateRaster.init(rsc);
          rsc->setProgramRaster(NULL);
@@ -528,7 +527,7 @@ void * Context::threadProc(void *vrsc) {
 }
 
 void Context::destroyWorkerThreadResources() {
-    LOGV("destroyWorkerThreadResources 1");
+    //LOGV("destroyWorkerThreadResources 1");
     if (mIsGraphicsContext) {
          mRaster.clear();
          mFragment.clear();
@@ -544,7 +543,7 @@ void Context::destroyWorkerThreadResources() {
          mShaderCache.cleanupAll();
     }
     ObjectBase::zeroAllUserRef(this);
-    LOGV("destroyWorkerThreadResources 2");
+    //LOGV("destroyWorkerThreadResources 2");
     mExit = true;
 }
 
@@ -552,7 +551,7 @@ void * Context::helperThreadProc(void *vrsc) {
      Context *rsc = static_cast<Context *>(vrsc);
      uint32_t idx = (uint32_t)android_atomic_inc(&rsc->mWorkers.mLaunchCount);
 
-     LOGV("RS helperThread starting %p idx=%i", rsc, idx);
+     //LOGV("RS helperThread starting %p idx=%i", rsc, idx);
 
      rsc->mWorkers.mLaunchSignals[idx].init();
      rsc->mWorkers.mNativeThreadId[idx] = gettid();
@@ -573,7 +572,7 @@ void * Context::helperThreadProc(void *vrsc) {
          LOGE("pthread_setspecific %i", status);
      }
 
-     while (rsc->mRunning) {
+     while (!rsc->mExit) {
          rsc->mWorkers.mLaunchSignals[idx].wait();
          if (rsc->mWorkers.mLaunchCallback) {
             rsc->mWorkers.mLaunchCallback(rsc->mWorkers.mLaunchData, idx);
@@ -582,7 +581,7 @@ void * Context::helperThreadProc(void *vrsc) {
          rsc->mWorkers.mCompleteSignal.set();
      }
 
-     LOGV("RS helperThread exiting %p idx=%i", rsc, idx);
+     //LOGV("RS helperThread exited %p idx=%i", rsc, idx);
      return NULL;
 }
 
@@ -730,6 +729,18 @@ Context::~Context() {
     mIO.shutdown();
     int status = pthread_join(mThreadId, &res);
 
+    // Cleanup compute threads.
+    mWorkers.mLaunchData = NULL;
+    mWorkers.mLaunchCallback = NULL;
+    mWorkers.mRunningCount = (int)mWorkers.mCount;
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        int status = pthread_join(mWorkers.mThreadId[ct], &res);
+    }
+    rsAssert(!mWorkers.mRunningCount);
+
     // Global structure cleanup.
     pthread_mutex_lock(&gInitMutex);
     if (mDev) {
diff --git a/libs/rs/rsLocklessFifo.cpp b/libs/rs/rsLocklessFifo.cpp
index eb2af1c..3f88543 100644
--- a/libs/rs/rsLocklessFifo.cpp
+++ b/libs/rs/rsLocklessFifo.cpp
@@ -76,7 +76,8 @@ uint32_t LocklessCommandFifo::getFreeSpace() const {
 }
 
 bool LocklessCommandFifo::isEmpty() const {
-    return mPut == mGet;
+    uint32_t p = android_atomic_acquire_load((int32_t *)&mPut);  // NOTE(review): mPut is loaded through an int32_t*, so this presumably relies on 32-bit pointers -- confirm
+    return ((uint8_t *)p) == mGet;
 }
 
 
@@ -155,7 +156,9 @@ const void * LocklessCommandFifo::get(uint32_t *command, uint32_t *bytesData) {
 
 void LocklessCommandFifo::next() {
     uint32_t bytes = reinterpret_cast<const uint16_t *>(mGet)[1];
-    mGet += ((bytes + 3) & ~3) + 4;
+
+    android_atomic_add(((bytes + 3) & ~3) + 4, (int32_t *)&mGet);
+    //mGet += ((bytes + 3) & ~3) + 4;
     if (isEmpty()) {
         mSignalToControl.set();
     }
diff --git a/libs/rs/rsScriptC.cpp b/libs/rs/rsScriptC.cpp
index eecfa16..3858e1c 100644
--- a/libs/rs/rsScriptC.cpp
+++ b/libs/rs/rsScriptC.cpp
@@ -421,21 +421,9 @@ void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, uint32_t len
 }
 
 ScriptCState::ScriptCState() {
-    mScript.clear();
 }
 
 ScriptCState::~ScriptCState() {
-    mScript.clear();
-}
-
-void ScriptCState::init(Context *rsc) {
-    clear(rsc);
-}
-
-void ScriptCState::clear(Context *rsc) {
-    rsAssert(rsc);
-    mScript.clear();
-    mScript.set(new ScriptC(rsc));
 }
 
 static void* symbolLookup(void* pContext, char const* name) {
@@ -608,8 +596,6 @@ namespace android {
 namespace renderscript {
 
 void rsi_ScriptCBegin(Context * rsc) {
-    ScriptCState *ss = &rsc->mScriptC;
-    ss->clear(rsc);
 }
 
 void rsi_ScriptCSetText(Context *rsc, const char *text, uint32_t len) {
@@ -618,8 +604,8 @@ void rsi_ScriptCSetText(Context *rsc, const char *text, uint32_t len) {
     char *t = (char *)malloc(len + 1);
     memcpy(t, text, len);
     t[len] = 0;
-    ss->mScript->mEnviroment.mScriptText = t;
-    ss->mScript->mEnviroment.mScriptTextLength = len;
+    ss->mScriptText = t;
+    ss->mScriptLen = len;
 }
 
 
@@ -630,17 +616,19 @@ RsScript rsi_ScriptCCreate(Context *rsc,
 {
     ScriptCState *ss = &rsc->mScriptC;
 
-    ObjectBaseRef<ScriptC> s(ss->mScript);
-    ss->mScript.clear();
+    ScriptC *s = new ScriptC(rsc);
+    s->mEnviroment.mScriptText = ss->mScriptText;
+    s->mEnviroment.mScriptTextLength = ss->mScriptLen;
+    ss->mScriptText = NULL;
+    ss->mScriptLen = 0;
     s->incUserRef();
 
-    if (!ss->runCompiler(rsc, s.get(), resName, cacheDir)) {
+    if (!ss->runCompiler(rsc, s, resName, cacheDir)) {
         // Error during compile, destroy s and return null.
-        s->zeroUserRef();
+        delete s;
         return NULL;
     }
-    ss->clear(rsc);
-    return s.get();
+    return s;
 }
 
 }
diff --git a/libs/rs/rsScriptC.h b/libs/rs/rsScriptC.h
index 612e38a..7143c67 100644
--- a/libs/rs/rsScriptC.h
+++ b/libs/rs/rsScriptC.h
@@ -76,11 +76,9 @@ public:
     ScriptCState();
     ~ScriptCState();
 
-    ObjectBaseRef<ScriptC> mScript;
+    char * mScriptText;
+    size_t mScriptLen;
 
-    void init(Context *rsc);
-
-    void clear(Context *rsc);
     bool runCompiler(Context *rsc, ScriptC *s, const char *resName, const char *cacheDir);
 
     struct SymbolTable_t {
@@ -88,7 +86,6 @@ public:
         void * mPtr;
         bool threadable;
     };
-    //static SymbolTable_t gSyms[];
     static const SymbolTable_t * lookupSymbol(const char *);
     static const SymbolTable_t * lookupSymbolCL(const char *);
     static const SymbolTable_t * lookupSymbolGL(const char *);
diff --git a/libs/rs/rsScriptC_Lib.cpp b/libs/rs/rsScriptC_Lib.cpp
index f550d98..8a85f6e 100644
--- a/libs/rs/rsScriptC_Lib.cpp
+++ b/libs/rs/rsScriptC_Lib.cpp
@@ -305,6 +305,14 @@ int SC_modsi3(int a, int b) {
     return a % b;
 }
 
+unsigned int SC_udivsi3(unsigned int a, unsigned int b) {
+    return a / b;
+}
+
+unsigned int SC_umodsi3(unsigned int a, unsigned int b) {
+    return a % b;
+}
+
 int SC_getAllocation(const void *ptr) {
     GET_TLS();
     const Allocation *alloc = sc->ptrToAllocation(ptr);
@@ -339,6 +347,489 @@ void SC_ForEach2(RsScript vs,
     s->runForEach(rsc, ain, aout, usr, call);
 }
 
+
+//////////////////////////////////////////////////////////////////////////////
+// Heavy math functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
+
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
+
+static inline void
+rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v) {
+    m->m[row * 4 + col] = v;
+}
+
+static inline float
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col) {
+    return m->m[row * 4 + col];
+}
+
+static inline void
+rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v) {
+    m->m[row * 3 + col] = v;
+}
+
+static inline float
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col) {
+    return m->m[row * 3 + col];
+}
+
+static inline void
+rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v) {
+    m->m[row * 2 + col] = v;
+}
+
+static inline float
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col) {
+    return m->m[row * 2 + col];
+}
+
+
+// Overwrites *m with the 4x4 identity matrix.
+//
+// All 16 elements are written, so the previous contents of *m are
+// irrelevant. m is dereferenced without a null check.
+static void SC_MatrixLoadIdentity_4x4(rs_matrix4x4 *m) {
+    // Ones on the diagonal (indices 0, 5, 10, 15), zeros elsewhere.
+    static const float kIdentity[16] = {
+        1.f, 0.f, 0.f, 0.f,
+        0.f, 1.f, 0.f, 0.f,
+        0.f, 0.f, 1.f, 0.f,
+        0.f, 0.f, 0.f, 1.f,
+    };
+
+    // Copy the table element by element, in index order.
+    for (int i = 0; i < 16; ++i) {
+        m->m[i] = kIdentity[i];
+    }
+}
+
+// Overwrites all nine elements of *m with the 3x3 identity matrix.
+static void SC_MatrixLoadIdentity_3x3(rs_matrix3x3 *m) {
+    static const float kIdentity[9] = {
+        1.f, 0.f, 0.f,
+        0.f, 1.f, 0.f,
+        0.f, 0.f, 1.f,
+    };
+    for (int i = 0; i < 9; ++i) {
+        m->m[i] = kIdentity[i];
+    }
+}
+
+static void SC_MatrixLoadIdentity_2x2(rs_matrix2x2 *m) {
+    // Ones on the diagonal (indices 0 and 3), zeros off it.
+    for (int i = 0; i < 4; ++i) {
+        m->m[i] = (i % 3 == 0) ? 1.f : 0.f;
+    }
+}
+
+static void SC_MatrixLoad_4x4_f(rs_matrix4x4 *m, const float *v) {
+    m->m[0] = v[0];
+    m->m[1] = v[1];
+    m->m[2] = v[2];
+    m->m[3] = v[3];
+    m->m[4] = v[4];
+    m->m[5] = v[5];
+    m->m[6] = v[6];
+    m->m[7] = v[7];
+    m->m[8] = v[8];
+    m->m[9] = v[9];
+    m->m[10] = v[10];
+    m->m[11] = v[11];
+    m->m[12] = v[12];
+    m->m[13] = v[13];
+    m->m[14] = v[14];
+    m->m[15] = v[15];
+}
+
+static void SC_MatrixLoad_3x3_f(rs_matrix3x3 *m, const float *v) {
+    m->m[0] = v[0];
+    m->m[1] = v[1];
+    m->m[2] = v[2];
+    m->m[3] = v[3];
+    m->m[4] = v[4];
+    m->m[5] = v[5];
+    m->m[6] = v[6];
+    m->m[7] = v[7];
+    m->m[8] = v[8];
+}
+
+static void SC_MatrixLoad_2x2_f(rs_matrix2x2 *m, const float *v) {
+    // Copy the first four floats of v into the matrix, in index order.
+    for (int i = 0; i < 4; ++i) {
+        m->m[i] = v[i];
+    }
+}
+
+static void SC_MatrixLoad_4x4_4x4(rs_matrix4x4 *m, const rs_matrix4x4 *v) {
+    m->m[0] = v->m[0];
+    m->m[1] = v->m[1];
+    m->m[2] = v->m[2];
+    m->m[3] = v->m[3];
+    m->m[4] = v->m[4];
+    m->m[5] = v->m[5];
+    m->m[6] = v->m[6];
+    m->m[7] = v->m[7];
+    m->m[8] = v->m[8];
+    m->m[9] = v->m[9];
+    m->m[10] = v->m[10];
+    m->m[11] = v->m[11];
+    m->m[12] = v->m[12];
+    m->m[13] = v->m[13];
+    m->m[14] = v->m[14];
+    m->m[15] = v->m[15];
+}
+
+// Expands a 3x3 matrix into a 4x4 one.
+//
+// The nine source elements land in the upper-left 3x3 corner of *m; the
+// fourth row and column are filled from the identity (zeros, with a one
+// in the last element).
+static void SC_MatrixLoad_4x4_3x3(rs_matrix4x4 *m, const rs_matrix3x3 *v) {
+    for (int row = 0; row < 3; ++row) {
+        for (int col = 0; col < 3; ++col) {
+            m->m[row * 4 + col] = v->m[row * 3 + col];
+        }
+        m->m[row * 4 + 3] = 0.f;
+    }
+    // Bottom row comes from the identity.
+    m->m[12] = 0.f;
+    m->m[13] = 0.f;
+    m->m[14] = 0.f;
+    m->m[15] = 1.f;
+}
+
+static void SC_MatrixLoad_4x4_2x2(rs_matrix4x4 *m, const rs_matrix2x2 *v) {
+    m->m[0] = v->m[0];
+    m->m[1] = v->m[1];
+    m->m[2] = 0.f;
+    m->m[3] = 0.f;
+    m->m[4] = v->m[2];
+    m->m[5] = v->m[3];
+    m->m[6] = 0.f;
+    m->m[7] = 0.f;
+    m->m[8] = 0.f;
+    m->m[9] = 0.f;
+    m->m[10] = 1.f;
+    m->m[11] = 0.f;
+    m->m[12] = 0.f;
+    m->m[13] = 0.f;
+    m->m[14] = 0.f;
+    m->m[15] = 1.f;
+}
+
+static void SC_MatrixLoad_3x3_3x3(rs_matrix3x3 *m, const rs_matrix3x3 *v) {
+    m->m[0] = v->m[0];
+    m->m[1] = v->m[1];
+    m->m[2] = v->m[2];
+    m->m[3] = v->m[3];
+    m->m[4] = v->m[4];
+    m->m[5] = v->m[5];
+    m->m[6] = v->m[6];
+    m->m[7] = v->m[7];
+    m->m[8] = v->m[8];
+}
+
+static void SC_MatrixLoad_2x2_2x2(rs_matrix2x2 *m, const rs_matrix2x2 *v) {
+    // Element-wise copy of all four entries of *v into *m, in index order.
+    for (int i = 0; i < 4; ++i) {
+        m->m[i] = v->m[i];
+    }
+}
+
+static void SC_MatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z) {
+    float c, s;
+    m->m[3] = 0;
+    m->m[7] = 0;
+    m->m[11]= 0;
+    m->m[12]= 0;
+    m->m[13]= 0;
+    m->m[14]= 0;
+    m->m[15]= 1;
+    rot *= (float)(M_PI / 180.0f);
+    c = cos(rot);
+    s = sin(rot);
+
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    const float nc = 1.0f - c;
+    const float xy = x * y;
+    const float yz = y * z;
+    const float zx = z * x;
+    const float xs = x * s;
+    const float ys = y * s;
+    const float zs = z * s;
+    m->m[ 0] = x*x*nc +  c;
+    m->m[ 4] =  xy*nc - zs;
+    m->m[ 8] =  zx*nc + ys;
+    m->m[ 1] =  xy*nc + zs;
+    m->m[ 5] = y*y*nc +  c;
+    m->m[ 9] =  yz*nc - xs;
+    m->m[ 2] =  zx*nc - ys;
+    m->m[ 6] =  yz*nc + xs;
+    m->m[10] = z*z*nc +  c;
+}
+
+static void SC_MatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z) {
+    SC_MatrixLoadIdentity_4x4(m);  // start from identity, then overwrite three diagonal entries
+    rsMatrixSet(m, 0, 0, x);
+    rsMatrixSet(m, 1, 1, y);
+    rsMatrixSet(m, 2, 2, z);
+}
+
+static void SC_MatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z) {
+    SC_MatrixLoadIdentity_4x4(m);  // start from identity, then store the offsets at indices 12..14
+    rsMatrixSet(m, 3, 0, x);
+    rsMatrixSet(m, 3, 1, y);
+    rsMatrixSet(m, 3, 2, z);
+}
+
+static void SC_MatrixLoadMultiply_4x4_4x4_4x4(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
+    for (int i=0 ; i<4 ; i++) {
+        float ri0 = 0;
+        float ri1 = 0;
+        float ri2 = 0;
+        float ri3 = 0;
+        for (int j=0 ; j<4 ; j++) {
+            const float rhs_ij = rsMatrixGet(rhs, i,j);
+            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
+            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
+            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
+            ri3 += rsMatrixGet(lhs, j, 3) * rhs_ij;
+        }
+        rsMatrixSet(m, i, 0, ri0);
+        rsMatrixSet(m, i, 1, ri1);
+        rsMatrixSet(m, i, 2, ri2);
+        rsMatrixSet(m, i, 3, ri3);
+    }
+}
+
+static void SC_MatrixMultiply_4x4_4x4(rs_matrix4x4 *m, const rs_matrix4x4 *rhs) {
+    rs_matrix4x4 product;  // temporary, so *m can be both an input and the destination
+    SC_MatrixLoadMultiply_4x4_4x4_4x4(&product, m, rhs);
+    SC_MatrixLoad_4x4_4x4(m, &product);
+}
+
+static void SC_MatrixLoadMultiply_3x3_3x3_3x3(rs_matrix3x3 *m, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
+    for (int i=0 ; i<3 ; i++) {
+        float ri0 = 0;
+        float ri1 = 0;
+        float ri2 = 0;
+        for (int j=0 ; j<3 ; j++) {
+            const float rhs_ij = rsMatrixGet(rhs, i,j);
+            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
+            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
+            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
+        }
+        rsMatrixSet(m, i, 0, ri0);
+        rsMatrixSet(m, i, 1, ri1);
+        rsMatrixSet(m, i, 2, ri2);
+    }
+}
+
+static void SC_MatrixMultiply_3x3_3x3(rs_matrix3x3 *m, const rs_matrix3x3 *rhs) {
+    rs_matrix3x3 product;  // temporary, so *m can be both an input and the destination
+    SC_MatrixLoadMultiply_3x3_3x3_3x3(&product, m, rhs);
+    SC_MatrixLoad_3x3_3x3(m, &product);
+}
+
+static void SC_MatrixLoadMultiply_2x2_2x2_2x2(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
+    for (int i=0 ; i<2 ; i++) {
+        float ri0 = 0;
+        float ri1 = 0;
+        for (int j=0 ; j<2 ; j++) {
+            const float rhs_ij = rsMatrixGet(rhs, i,j);
+            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
+            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
+        }
+        rsMatrixSet(m, i, 0, ri0);
+        rsMatrixSet(m, i, 1, ri1);
+    }
+}
+
+static void SC_MatrixMultiply_2x2_2x2(rs_matrix2x2 *m, const rs_matrix2x2 *rhs) {
+    rs_matrix2x2 product;  // temporary, so *m can be both an input and the destination
+    SC_MatrixLoadMultiply_2x2_2x2_2x2(&product, m, rhs);
+    SC_MatrixLoad_2x2_2x2(m, &product);
+}
+
+static void SC_MatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z) {
+    rs_matrix4x4 rotation;  // scratch matrix holding only the rotation
+    SC_MatrixLoadRotate(&rotation, rot, x, y, z);
+    SC_MatrixMultiply_4x4_4x4(m, &rotation);
+}
+
+static void SC_MatrixScale(rs_matrix4x4 *m, float x, float y, float z) {
+    rs_matrix4x4 scaling;  // scratch matrix holding only the scale
+    SC_MatrixLoadScale(&scaling, x, y, z);
+    SC_MatrixMultiply_4x4_4x4(m, &scaling);
+}
+
+static void SC_MatrixTranslate(rs_matrix4x4 *m, float x, float y, float z) {
+    rs_matrix4x4 translation;  // scratch matrix holding only the translation
+    SC_MatrixLoadTranslate(&translation, x, y, z);
+    SC_MatrixMultiply_4x4_4x4(m, &translation);
+}
+
+static void SC_MatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far) {
+    SC_MatrixLoadIdentity_4x4(m);
+    m->m[0] = 2.f / (right - left);
+    m->m[5] = 2.f / (top - bottom);
+    m->m[10]= -2.f / (far - near);
+    m->m[12]= -(right + left) / (right - left);
+    m->m[13]= -(top + bottom) / (top - bottom);
+    m->m[14]= -(far + near) / (far - near);
+}
+
+static void SC_MatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far) {
+    SC_MatrixLoadIdentity_4x4(m);
+    m->m[0] = 2.f * near / (right - left);
+    m->m[5] = 2.f * near / (top - bottom);
+    m->m[8] = (right + left) / (right - left);
+    m->m[9] = (top + bottom) / (top - bottom);
+    m->m[10]= -(far + near) / (far - near);
+    m->m[11]= -1.f;
+    m->m[14]= -2.f * far * near / (far - near);
+    m->m[15]= 0.f;
+}
+
+static void SC_MatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far) {
+    // fovy is treated as degrees: fovy * pi / 360 is half that angle in
+    // radians, so halfHeight is near * tan(fovy / 2).
+    const float halfHeight = near * tan((float) (fovy * M_PI / 360.0f));
+    const float halfWidth = halfHeight * aspect;
+    SC_MatrixLoadFrustum(m, -halfWidth, halfWidth, -halfHeight, halfHeight, near, far);
+}
+
+
+// Returns true if the matrix was successfully inverted
+static bool SC_MatrixInverse_4x4(rs_matrix4x4 *m) {
+    rs_matrix4x4 result;
+
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+            // computeCofactor for int i, int j
+            int c0 = (i+1) % 4;
+            int c1 = (i+2) % 4;
+            int c2 = (i+3) % 4;
+            int r0 = (j+1) % 4;
+            int r1 = (j+2) % 4;
+            int r2 = (j+3) % 4;
+
+            float minor = (m->m[c0 + 4*r0] * (m->m[c1 + 4*r1] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r1]))
+                         - (m->m[c0 + 4*r1] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r0]))
+                         + (m->m[c0 + 4*r2] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r1] - m->m[c1 + 4*r1] * m->m[c2 + 4*r0]));
+
+            float cofactor = (i+j) & 1 ? -minor : minor;
+
+            result.m[4*i + j] = cofactor;
+        }
+    }
+
+    // Dot product of 0th column of source and 0th row of result
+    float det = m->m[0]*result.m[0] + m->m[4]*result.m[1] +
+                 m->m[8]*result.m[2] + m->m[12]*result.m[3];
+
+    if (fabs(det) < 1e-6) {
+        return false;
+    }
+
+    det = 1.0f / det;
+    for (i = 0; i < 16; ++i) {
+        m->m[i] = result.m[i] * det;
+    }
+
+    return true;
+}
+
+// Returns true if the matrix was successfully inverted
+static bool SC_MatrixInverseTranspose_4x4(rs_matrix4x4 *m) {
+    rs_matrix4x4 result;
+
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+            // computeCofactor for int i, int j
+            int c0 = (i+1) % 4;
+            int c1 = (i+2) % 4;
+            int c2 = (i+3) % 4;
+            int r0 = (j+1) % 4;
+            int r1 = (j+2) % 4;
+            int r2 = (j+3) % 4;
+
+            float minor = (m->m[c0 + 4*r0] * (m->m[c1 + 4*r1] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r1]))
+                         - (m->m[c0 + 4*r1] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r0]))
+                         + (m->m[c0 + 4*r2] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r1] - m->m[c1 + 4*r1] * m->m[c2 + 4*r0]));
+
+            float cofactor = (i+j) & 1 ? -minor : minor;
+
+            result.m[4*j + i] = cofactor;
+        }
+    }
+
+    // Dot product of 0th column of source and 0th column of result
+    float det = m->m[0]*result.m[0] + m->m[4]*result.m[4] +
+                 m->m[8]*result.m[8] + m->m[12]*result.m[12];
+
+    if (fabs(det) < 1e-6) {
+        return false;
+    }
+
+    det = 1.0f / det;
+    for (i = 0; i < 16; ++i) {
+        m->m[i] = result.m[i] * det;
+    }
+
+    return true;
+}
+
+static void SC_MatrixTranspose_4x4(rs_matrix4x4 *m) {
+    // In-place transpose: exchange each element above the diagonal with
+    // the element mirrored across it.
+    for (int row = 0; row < 3; ++row) {
+        for (int col = row + 1; col < 4; ++col) {
+            const float upper = m->m[row*4 + col];
+            m->m[row*4 + col] = m->m[col*4 + row];
+            m->m[col*4 + row] = upper;
+        }
+    }
+}
+
+static void SC_MatrixTranspose_3x3(rs_matrix3x3 *m) {
+    // In-place transpose: swap every element above the diagonal with its
+    // mirror below it. Both indices must use the 3x3 row stride of 3.
+    for (int i = 0; i < 2; ++i) {
+        for (int j = i + 1; j < 3; ++j) {
+            const float temp = m->m[i*3 + j];
+            m->m[i*3 + j] = m->m[j*3 + i];
+            m->m[j*3 + i] = temp;
+        }
+    }
+}
+
+static void SC_MatrixTranspose_2x2(rs_matrix2x2 *m) {
+    const float saved = m->m[1];  // only the two off-diagonal entries trade places
+    m->m[1] = m->m[2];
+    m->m[2] = saved;
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 // Class implementation
 //////////////////////////////////////////////////////////////////////////////
@@ -363,6 +854,8 @@ void SC_ForEach2(RsScript vs,
 static ScriptCState::SymbolTable_t gSyms[] = {
     { "__divsi3", (void *)&SC_divsi3, true },
     { "__modsi3", (void *)&SC_modsi3, true },
+    { "__udivsi3", (void *)&SC_udivsi3, true },
+    { "__umodsi3", (void *)&SC_umodsi3, true },
 
     // allocation
     { "_Z19rsAllocationGetDimX13rs_allocation", (void *)&SC_allocGetDimX, true },
@@ -463,6 +956,45 @@ static ScriptCState::SymbolTable_t gSyms[] = {
     { "_Z22rsSendToClientBlockingi", (void *)&SC_toClientBlocking, false },
     { "_Z22rsSendToClientBlockingiPKvj", (void *)&SC_toClientBlocking2, false },
 
+    // matrix
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix4x4", (void *)&SC_MatrixLoadIdentity_4x4, false },
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix3x3", (void *)&SC_MatrixLoadIdentity_3x3, false },
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix2x2", (void *)&SC_MatrixLoadIdentity_2x2, false },
+
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PKf", (void *)&SC_MatrixLoad_4x4_f, false },
+    { "_Z12rsMatrixLoadP12rs_matrix3x3PKf", (void *)&SC_MatrixLoad_3x3_f, false },
+    { "_Z12rsMatrixLoadP12rs_matrix2x2PKf", (void *)&SC_MatrixLoad_2x2_f, false },
+
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PKS_", (void *)&SC_MatrixLoad_4x4_4x4, false },
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PK12rs_matrix3x3", (void *)&SC_MatrixLoad_4x4_3x3, false },
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PK12rs_matrix2x2", (void *)&SC_MatrixLoad_4x4_2x2, false },
+    { "_Z12rsMatrixLoadP12rs_matrix3x3PKS_", (void *)&SC_MatrixLoad_3x3_3x3, false },
+    { "_Z12rsMatrixLoadP12rs_matrix2x2PKS_", (void *)&SC_MatrixLoad_2x2_2x2, false },
+
+    { "_Z18rsMatrixLoadRotateP12rs_matrix4x4ffff", (void *)&SC_MatrixLoadRotate, false },
+    { "_Z17rsMatrixLoadScaleP12rs_matrix4x4fff", (void *)&SC_MatrixLoadScale, false },
+    { "_Z21rsMatrixLoadTranslateP12rs_matrix4x4fff", (void *)&SC_MatrixLoadTranslate, false },
+    { "_Z14rsMatrixRotateP12rs_matrix4x4ffff", (void *)&SC_MatrixRotate, false },
+    { "_Z13rsMatrixScaleP12rs_matrix4x4fff", (void *)&SC_MatrixScale, false },
+    { "_Z17rsMatrixTranslateP12rs_matrix4x4fff", (void *)&SC_MatrixTranslate, false },
+
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix4x4PKS_S2_", (void *)&SC_MatrixLoadMultiply_4x4_4x4_4x4, false },
+    { "_Z16rsMatrixMultiplyP12rs_matrix4x4PKS_", (void *)&SC_MatrixMultiply_4x4_4x4, false },
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix3x3PKS_S2_", (void *)&SC_MatrixLoadMultiply_3x3_3x3_3x3, false },
+    { "_Z16rsMatrixMultiplyP12rs_matrix3x3PKS_", (void *)&SC_MatrixMultiply_3x3_3x3, false },
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix2x2PKS_S2_", (void *)&SC_MatrixLoadMultiply_2x2_2x2_2x2, false },
+    { "_Z16rsMatrixMultiplyP12rs_matrix2x2PKS_", (void *)&SC_MatrixMultiply_2x2_2x2, false },
+
+    { "_Z17rsMatrixLoadOrthoP12rs_matrix4x4ffffff", (void *)&SC_MatrixLoadOrtho, false },
+    { "_Z19rsMatrixLoadFrustumP12rs_matrix4x4ffffff", (void *)&SC_MatrixLoadFrustum, false },
+    { "_Z23rsMatrixLoadPerspectiveP12rs_matrix4x4ffff", (void *)&SC_MatrixLoadPerspective, false },
+
+    { "_Z15rsMatrixInverseP12rs_matrix4x4", (void *)&SC_MatrixInverse_4x4, false },
+    { "_Z24rsMatrixInverseTransposeP12rs_matrix4x4", (void *)&SC_MatrixInverseTranspose_4x4, false },
+    { "_Z17rsMatrixTransposeP12rs_matrix4x4", (void *)&SC_MatrixTranspose_4x4, false },
+    { "_Z17rsMatrixTransposeP12rs_matrix3x3", (void *)&SC_MatrixTranspose_3x3, false },  // keyed by the 3x3 overload's own mangled name
+    { "_Z17rsMatrixTransposeP12rs_matrix2x2", (void *)&SC_MatrixTranspose_2x2, false },  // keyed by the 2x2 overload's own mangled name
+
     { "_Z9rsForEach9rs_script13rs_allocationS0_PKv", (void *)&SC_ForEach, false },
     //{ "_Z9rsForEach9rs_script13rs_allocationS0_PKv", (void *)&SC_ForEach2, true },
 
diff --git a/libs/rs/rsScriptC_LibCL.cpp b/libs/rs/rsScriptC_LibCL.cpp
index 02d33b7..57855db 100644
--- a/libs/rs/rsScriptC_LibCL.cpp
+++ b/libs/rs/rsScriptC_LibCL.cpp
@@ -195,7 +195,7 @@ static ScriptCState::SymbolTable_t gSyms[] = {
     { "_Z4logbf", (void *)&logbf, true },
     { "_Z3madfff", (void *)&SC_mad, true },
     { "_Z4modffPf", (void *)&modff, true },
-    //{ "nan", (void *)&, true },
+    //{ "_Z3nanj", (void *)&SC_nan, true },
     { "_Z9nextafterff", (void *)&nextafterf, true },
     { "_Z3powff", (void *)&powf, true },
     { "_Z9remainderff", (void *)&remainderf, true },
@@ -210,7 +210,7 @@ static ScriptCState::SymbolTable_t gSyms[] = {
     { "_Z4sqrtf", (void *)&sqrtf, true },
     { "_Z3tanf", (void *)&tanf, true },
     { "_Z4tanhf", (void *)&tanhf, true },
-    { "_Z6tgammaf", (void *)&lgammaf, true }, // FIXME!!! NEEDS TO USE tgammaf
+    { "_Z6tgammaf", (void *)&tgammaf, true },
     { "_Z5truncf", (void *)&truncf, true },
 
     // OpenCL Int
diff --git a/libs/rs/scriptc/rs_cl.rsh b/libs/rs/scriptc/rs_cl.rsh
index 3c0496d..d78e62e 100644
--- a/libs/rs/scriptc/rs_cl.rsh
+++ b/libs/rs/scriptc/rs_cl.rsh
@@ -1,30 +1,17 @@
 #ifndef __RS_CL_RSH__
 #define __RS_CL_RSH__
 
-#ifdef BCC_PREPARE_BC
-#define _RS_STATIC  extern
-#else
-#define _RS_STATIC  static
-#endif
+#define _RS_RUNTIME  extern
 
 // Conversions
 #define CVT_FUNC_2(typeout, typein)                             \
-_RS_STATIC typeout##2 __attribute__((overloadable))             \
-        convert_##typeout##2(typein##2 v) {                     \
-    typeout##2 r = {(typeout)v.x, (typeout)v.y};                \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC typeout##3 __attribute__((overloadable))             \
-        convert_##typeout##3(typein##3 v) {                     \
-    typeout##3 r = {(typeout)v.x, (typeout)v.y, (typeout)v.z};  \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC typeout##4 __attribute__((overloadable))             \
-        convert_##typeout##4(typein##4 v) {                     \
-    typeout##4 r = {(typeout)v.x, (typeout)v.y, (typeout)v.z,   \
-                    (typeout)v.w};                              \
-    return r;                                                   \
-}
+_RS_RUNTIME typeout##2 __attribute__((overloadable))             \
+        convert_##typeout##2(typein##2 v);                      \
+_RS_RUNTIME typeout##3 __attribute__((overloadable))             \
+        convert_##typeout##3(typein##3 v);                      \
+_RS_RUNTIME typeout##4 __attribute__((overloadable))             \
+        convert_##typeout##4(typein##4 v);
+
 
 #define CVT_FUNC(type)  CVT_FUNC_2(type, uchar)     \
                         CVT_FUNC_2(type, char)      \
@@ -45,279 +32,63 @@ CVT_FUNC(float)
 // Float ops, 6.11.2
 
 #define FN_FUNC_FN(fnc)                                         \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v) { \
-    float2 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v) { \
-    float3 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v) { \
-    float4 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v);  \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v);  \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v);
 
 #define IN_FUNC_FN(fnc)                                         \
-_RS_STATIC int2 __attribute__((overloadable)) fnc(float2 v) {   \
-    int2 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC int3 __attribute__((overloadable)) fnc(float3 v) {   \
-    int3 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-_RS_STATIC int4 __attribute__((overloadable)) fnc(float4 v) {   \
-    int4 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
+_RS_RUNTIME int2 __attribute__((overloadable)) fnc(float2 v);    \
+_RS_RUNTIME int3 __attribute__((overloadable)) fnc(float3 v);    \
+_RS_RUNTIME int4 __attribute__((overloadable)) fnc(float4 v);
 
 #define FN_FUNC_FN_FN(fnc)                                                  \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    r.w = fnc(v1.w, v2.w);                                                  \
-    return r;                                                               \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2);  \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2);  \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2);
 
 #define FN_FUNC_FN_F(fnc)                                                   \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v1, float v2) {  \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v1, float v2) {  \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v1, float v2) {  \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    r.w = fnc(v1.w, v2);                                                    \
-    return r;                                                               \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float v2);   \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float v2);   \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, float v2);
 
 #define FN_FUNC_FN_IN(fnc)                                                  \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) {   \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) {   \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) {   \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    r.w = fnc(v1.w, v2.w);                                                  \
-    return r;                                                               \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2);    \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2);    \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2);
 
 #define FN_FUNC_FN_I(fnc)                                                   \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v1, int v2) {    \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v1, int v2) {    \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v1, int v2) {    \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    r.w = fnc(v1.w, v2);                                                    \
-    return r;                                                               \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int v2);     \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int v2);     \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int v2);
 
 #define FN_FUNC_FN_PFN(fnc)                     \
-_RS_STATIC float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 *v2) {            \
-    float2 r;                                   \
-    float t[2];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    return r;                                   \
-}                                               \
-_RS_STATIC float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 *v2) {            \
-    float3 r;                                   \
-    float t[3];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    r.z = fnc(v1.z, &t[2]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    v2->z = t[2];                               \
-    return r;                                   \
-}                                               \
-_RS_STATIC float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 *v2) {            \
-    float4 r;                                   \
-    float t[4];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    r.z = fnc(v1.z, &t[2]);                     \
-    r.w = fnc(v1.w, &t[3]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    v2->z = t[2];                               \
-    v2->w = t[3];                               \
-    return r;                                   \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) \
+        fnc(float2 v1, float2 *v2);             \
+_RS_RUNTIME float3 __attribute__((overloadable)) \
+        fnc(float3 v1, float3 *v2);             \
+_RS_RUNTIME float4 __attribute__((overloadable)) \
+        fnc(float4 v1, float4 *v2);
 
 #define FN_FUNC_FN_PIN(fnc)                                                 \
-_RS_STATIC float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) {  \
-    float2 r;                                                               \
-    int t[2];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) {  \
-    float3 r;                                                               \
-    int t[3];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    r.z = fnc(v1.z, &t[2]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    v2->z = t[2];                                                           \
-    return r;                                                               \
-}                                                                           \
-_RS_STATIC float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) {  \
-    float4 r;                                                               \
-    int t[4];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    r.z = fnc(v1.z, &t[2]);                                                 \
-    r.w = fnc(v1.w, &t[3]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    v2->z = t[2];                                                           \
-    v2->w = t[3];                                                           \
-    return r;                                                               \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2);   \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2);   \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2);
 
 #define FN_FUNC_FN_FN_FN(fnc)                   \
-_RS_STATIC float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, float2 v3) {  \
-    float2 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    return r;                                   \
-}                                               \
-_RS_STATIC float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, float3 v3) {  \
-    float3 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    r.z = fnc(v1.z, v2.z, v3.z);                \
-    return r;                                   \
-}                                               \
-_RS_STATIC float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 v2, float4 v3) {  \
-    float4 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    r.z = fnc(v1.z, v2.z, v3.z);                \
-    r.w = fnc(v1.w, v2.w, v3.w);                \
-    return r;                                   \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) \
+        fnc(float2 v1, float2 v2, float2 v3);   \
+_RS_RUNTIME float3 __attribute__((overloadable)) \
+        fnc(float3 v1, float3 v2, float3 v3);   \
+_RS_RUNTIME float4 __attribute__((overloadable)) \
+        fnc(float4 v1, float4 v2, float4 v3);
 
 #define FN_FUNC_FN_FN_PIN(fnc)                  \
-_RS_STATIC float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, int2 *v3) {   \
-    float2 r;                                   \
-    int t[2];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    return r;                                   \
-}                                               \
-_RS_STATIC float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, int3 *v3) {   \
-    float3 r;                                   \
-    int t[3];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    r.z = fnc(v1.z, v2.z, &t[2]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    v3->z = t[2];                               \
-    return r;                                   \
-}                                               \
-_RS_STATIC float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 v2, int4 *v3) {   \
-    float4 r;                                   \
-    int t[4];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    r.z = fnc(v1.z, v2.z, &t[2]);               \
-    r.w = fnc(v1.w, v2.w, &t[3]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    v3->z = t[2];                               \
-    v3->w = t[3];                               \
-    return r;                                   \
-}
+_RS_RUNTIME float2 __attribute__((overloadable)) \
+        fnc(float2 v1, float2 v2, int2 *v3);    \
+_RS_RUNTIME float3 __attribute__((overloadable)) \
+        fnc(float3 v1, float3 v2, int3 *v3);    \
+_RS_RUNTIME float4 __attribute__((overloadable)) \
+        fnc(float4 v1, float4 v2, int4 *v3);
 
 
 extern float __attribute__((overloadable)) acos(float);
@@ -326,9 +97,9 @@ FN_FUNC_FN(acos)
 extern float __attribute__((overloadable)) acosh(float);
 FN_FUNC_FN(acosh)
 
-_RS_STATIC float __attribute__((overloadable)) acospi(float v) {
-    return acos(v) / M_PI;
-}
+_RS_RUNTIME float __attribute__((overloadable)) acospi(float v);
+
+
 FN_FUNC_FN(acospi)
 
 extern float __attribute__((overloadable)) asin(float);
@@ -337,9 +108,8 @@ FN_FUNC_FN(asin)
 extern float __attribute__((overloadable)) asinh(float);
 FN_FUNC_FN(asinh)
 
-_RS_STATIC float __attribute__((overloadable)) asinpi(float v) {
-    return asin(v) / M_PI;
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) asinpi(float v);
 FN_FUNC_FN(asinpi)
 
 extern float __attribute__((overloadable)) atan(float);
@@ -351,14 +121,12 @@ FN_FUNC_FN_FN(atan2)
 extern float __attribute__((overloadable)) atanh(float);
 FN_FUNC_FN(atanh)
 
-_RS_STATIC float __attribute__((overloadable)) atanpi(float v) {
-    return atan(v) / M_PI;
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) atanpi(float v);
 FN_FUNC_FN(atanpi)
 
-_RS_STATIC float __attribute__((overloadable)) atan2pi(float y, float x) {
-    return atan2(y, x) / M_PI;
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) atan2pi(float y, float x);
 FN_FUNC_FN_FN(atan2pi)
 
 extern float __attribute__((overloadable)) cbrt(float);
@@ -376,9 +144,8 @@ FN_FUNC_FN(cos)
 extern float __attribute__((overloadable)) cosh(float);
 FN_FUNC_FN(cosh)
 
-_RS_STATIC float __attribute__((overloadable)) cospi(float v) {
-    return cos(v * M_PI);
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) cospi(float v);
 FN_FUNC_FN(cospi)
 
 extern float __attribute__((overloadable)) erfc(float);
@@ -394,9 +161,8 @@ extern float __attribute__((overloadable)) exp2(float);
 FN_FUNC_FN(exp2)
 
 extern float __attribute__((overloadable)) pow(float, float);
-_RS_STATIC float __attribute__((overloadable)) exp10(float v) {
-    return pow(10.f, v);
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) exp10(float v);
 FN_FUNC_FN(exp10)
 
 extern float __attribute__((overloadable)) expm1(float);
@@ -425,11 +191,8 @@ FN_FUNC_FN_F(fmin);
 extern float __attribute__((overloadable)) fmod(float, float);
 FN_FUNC_FN_FN(fmod)
 
-_RS_STATIC float __attribute__((overloadable)) fract(float v, float *iptr) {
-    int i = (int)floor(v);
-    iptr[0] = i;
-    return fmin(v - i, 0x1.fffffep-1f);
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) fract(float v, float *iptr);
 FN_FUNC_FN_PFN(fract)
 
 extern float __attribute__((overloadable)) frexp(float, int *);
@@ -457,9 +220,8 @@ FN_FUNC_FN(log)
 extern float __attribute__((overloadable)) log10(float);
 FN_FUNC_FN(log10)
 
-_RS_STATIC float __attribute__((overloadable)) log2(float v) {
-    return log10(v) / log10(2.f);
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) log2(float v);
 FN_FUNC_FN(log2)
 
 extern float __attribute__((overloadable)) log1p(float);
@@ -481,31 +243,15 @@ FN_FUNC_FN_FN(nextafter)
 
 FN_FUNC_FN_FN(pow)
 
-_RS_STATIC float __attribute__((overloadable)) pown(float v, int p) {
-    return pow(v, (float)p);
-}
-_RS_STATIC float2 __attribute__((overloadable)) pown(float2 v, int2 p) {
-    return pow(v, (float2)p);
-}
-_RS_STATIC float3 __attribute__((overloadable)) pown(float3 v, int3 p) {
-    return pow(v, (float3)p);
-}
-_RS_STATIC float4 __attribute__((overloadable)) pown(float4 v, int4 p) {
-    return pow(v, (float4)p);
-}
-
-_RS_STATIC float __attribute__((overloadable)) powr(float v, float p) {
-    return pow(v, p);
-}
-_RS_STATIC float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
-    return pow(v, p);
-}
-_RS_STATIC float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
-    return pow(v, p);
-}
-_RS_STATIC float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
-    return pow(v, p);
-}
+_RS_RUNTIME float __attribute__((overloadable)) pown(float v, int p);
+_RS_RUNTIME float2 __attribute__((overloadable)) pown(float2 v, int2 p);
+_RS_RUNTIME float3 __attribute__((overloadable)) pown(float3 v, int3 p);
+_RS_RUNTIME float4 __attribute__((overloadable)) pown(float4 v, int4 p);
+
+_RS_RUNTIME float __attribute__((overloadable)) powr(float v, float p);
+_RS_RUNTIME float2 __attribute__((overloadable)) powr(float2 v, float2 p);
+_RS_RUNTIME float3 __attribute__((overloadable)) powr(float3 v, float3 p);
+_RS_RUNTIME float4 __attribute__((overloadable)) powr(float4 v, float4 p);
 
 extern float __attribute__((overloadable)) remainder(float, float);
 FN_FUNC_FN_FN(remainder)
@@ -516,57 +262,33 @@ FN_FUNC_FN_FN_PIN(remquo)
 extern float __attribute__((overloadable)) rint(float);
 FN_FUNC_FN(rint)
 
-_RS_STATIC float __attribute__((overloadable)) rootn(float v, int r) {
-    return pow(v, 1.f / r);
-}
-_RS_STATIC float2 __attribute__((overloadable)) rootn(float2 v, int2 r) {
-    float2 t = {1.f / r.x, 1.f / r.y};
-    return pow(v, t);
-}
-_RS_STATIC float3 __attribute__((overloadable)) rootn(float3 v, int3 r) {
-    float3 t = {1.f / r.x, 1.f / r.y, 1.f / r.z};
-    return pow(v, t);
-}
-_RS_STATIC float4 __attribute__((overloadable)) rootn(float4 v, int4 r) {
-    float4 t = {1.f / r.x, 1.f / r.y, 1.f / r.z, 1.f / r.w};
-    return pow(v, t);
-}
+
+_RS_RUNTIME float __attribute__((overloadable)) rootn(float v, int r);
+_RS_RUNTIME float2 __attribute__((overloadable)) rootn(float2 v, int2 r);
+_RS_RUNTIME float3 __attribute__((overloadable)) rootn(float3 v, int3 r);
+_RS_RUNTIME float4 __attribute__((overloadable)) rootn(float4 v, int4 r);
+
 
 extern float __attribute__((overloadable)) round(float);
 FN_FUNC_FN(round)
 
+
 extern float __attribute__((overloadable)) sqrt(float);
-_RS_STATIC float __attribute__((overloadable)) rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
+_RS_RUNTIME float __attribute__((overloadable)) rsqrt(float v);
 FN_FUNC_FN(rsqrt)
 
 extern float __attribute__((overloadable)) sin(float);
 FN_FUNC_FN(sin)
 
-_RS_STATIC float __attribute__((overloadable)) sincos(float v, float *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-_RS_STATIC float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-_RS_STATIC float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-_RS_STATIC float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
+_RS_RUNTIME float __attribute__((overloadable)) sincos(float v, float *cosptr);
+_RS_RUNTIME float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr);
+_RS_RUNTIME float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr);
+_RS_RUNTIME float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr);
 
 extern float __attribute__((overloadable)) sinh(float);
 FN_FUNC_FN(sinh)
 
-_RS_STATIC float __attribute__((overloadable)) sinpi(float v) {
-    return sin(v * M_PI);
-}
+_RS_RUNTIME float __attribute__((overloadable)) sinpi(float v);
 FN_FUNC_FN(sinpi)
 
 FN_FUNC_FN(sqrt)
@@ -577,11 +299,10 @@ FN_FUNC_FN(tan)
 extern float __attribute__((overloadable)) tanh(float);
 FN_FUNC_FN(tanh)
 
-_RS_STATIC float __attribute__((overloadable)) tanpi(float v) {
-    return tan(v * M_PI);
-}
+_RS_RUNTIME float __attribute__((overloadable)) tanpi(float v);
 FN_FUNC_FN(tanpi)
 
+
 extern float __attribute__((overloadable)) tgamma(float);
 FN_FUNC_FN(tgamma)
 
@@ -592,27 +313,9 @@ FN_FUNC_FN(trunc)
 
 #define XN_FUNC_YN(typeout, fnc, typein)                                \
 extern typeout __attribute__((overloadable)) fnc(typein);               \
-_RS_STATIC typeout##2 __attribute__((overloadable)) fnc(typein##2 v) {  \
-    typeout##2 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    return r;                                                           \
-}                                                                       \
-_RS_STATIC typeout##3 __attribute__((overloadable)) fnc(typein##3 v) {  \
-    typeout##3 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    r.z = fnc(v.z);                                                     \
-    return r;                                                           \
-}                                                                       \
-_RS_STATIC typeout##4 __attribute__((overloadable)) fnc(typein##4 v) {  \
-    typeout##4 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    r.z = fnc(v.z);                                                     \
-    r.w = fnc(v.w);                                                     \
-    return r;                                                           \
-}
+_RS_RUNTIME typeout##2 __attribute__((overloadable)) fnc(typein##2 v);   \
+_RS_RUNTIME typeout##3 __attribute__((overloadable)) fnc(typein##3 v);   \
+_RS_RUNTIME typeout##4 __attribute__((overloadable)) fnc(typein##4 v);
 
 #define UIN_FUNC_IN(fnc)          \
 XN_FUNC_YN(uchar, fnc, char)      \
@@ -627,35 +330,16 @@ XN_FUNC_YN(short, fnc, short)     \
 XN_FUNC_YN(uint, fnc, uint)       \
 XN_FUNC_YN(int, fnc, int)
 
+
 #define XN_FUNC_XN_XN_BODY(type, fnc, body)         \
-_RS_STATIC type __attribute__((overloadable))       \
-        fnc(type v1, type v2) {                     \
-    return body;                                    \
-}                                                   \
-_RS_STATIC type##2 __attribute__((overloadable))    \
-        fnc(type##2 v1, type##2 v2) {               \
-    type##2 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    return r;                                       \
-}                                                   \
-_RS_STATIC type##3 __attribute__((overloadable))    \
-        fnc(type##3 v1, type##3 v2) {               \
-    type##3 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    r.z = fnc(v1.z, v2.z);                          \
-    return r;                                       \
-}                                                   \
-_RS_STATIC type##4 __attribute__((overloadable))    \
-        fnc(type##4 v1, type##4 v2) {               \
-    type##4 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    r.z = fnc(v1.z, v2.z);                          \
-    r.w = fnc(v1.w, v2.w);                          \
-    return r;                                       \
-}
+_RS_RUNTIME type __attribute__((overloadable))       \
+        fnc(type v1, type v2);                      \
+_RS_RUNTIME type##2 __attribute__((overloadable))    \
+        fnc(type##2 v1, type##2 v2);                \
+_RS_RUNTIME type##3 __attribute__((overloadable))    \
+        fnc(type##3 v1, type##3 v2);                \
+_RS_RUNTIME type##4 __attribute__((overloadable))    \
+        fnc(type##4 v1, type##4 v2);
 
 #define IN_FUNC_IN_IN_BODY(fnc, body) \
 XN_FUNC_XN_XN_BODY(uchar, fnc, body)  \
@@ -677,129 +361,35 @@ FN_FUNC_FN_F(max)
 
 // 6.11.4
 
-_RS_STATIC float __attribute__((overloadable)) clamp(float amount, float low, float high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-_RS_STATIC float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
-    float2 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    return r;
-}
-_RS_STATIC float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
-    float3 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    return r;
-}
-_RS_STATIC float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
-    float4 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
-    return r;
-}
-_RS_STATIC float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
-    float2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-_RS_STATIC float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
-    float3 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    return r;
-}
-_RS_STATIC float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
-    float4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
-
-_RS_STATIC float __attribute__((overloadable)) degrees(float radians) {
-    return radians * (180.f / M_PI);
-}
+_RS_RUNTIME float __attribute__((overloadable)) clamp(float amount, float low, float high);
+_RS_RUNTIME float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high);
+_RS_RUNTIME float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high);
+_RS_RUNTIME float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
+_RS_RUNTIME float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high);
+_RS_RUNTIME float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high);
+_RS_RUNTIME float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high);
+
+_RS_RUNTIME float __attribute__((overloadable)) degrees(float radians);
 FN_FUNC_FN(degrees)
 
-_RS_STATIC float __attribute__((overloadable)) mix(float start, float stop, float amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-_RS_STATIC float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-
-_RS_STATIC float __attribute__((overloadable)) radians(float degrees) {
-    return degrees * (M_PI / 180.f);
-}
+_RS_RUNTIME float __attribute__((overloadable)) mix(float start, float stop, float amount);
+_RS_RUNTIME float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount);
+_RS_RUNTIME float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount);
+_RS_RUNTIME float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount);
+_RS_RUNTIME float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount);
+_RS_RUNTIME float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount);
+_RS_RUNTIME float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount);
+
+_RS_RUNTIME float __attribute__((overloadable)) radians(float degrees);
 FN_FUNC_FN(radians)
 
-_RS_STATIC float __attribute__((overloadable)) step(float edge, float v) {
-    return (v < edge) ? 0.f : 1.f;
-}
-_RS_STATIC float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
-    float2 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    return r;
-}
-_RS_STATIC float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
-    float3 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    r.z = (v.z < edge.z) ? 0.f : 1.f;
-    return r;
-}
-_RS_STATIC float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
-    float4 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    r.z = (v.z < edge.z) ? 0.f : 1.f;
-    r.w = (v.w < edge.w) ? 0.f : 1.f;
-    return r;
-}
-_RS_STATIC float2 __attribute__((overloadable)) step(float2 edge, float v) {
-    float2 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    return r;
-}
-_RS_STATIC float3 __attribute__((overloadable)) step(float3 edge, float v) {
-    float3 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    r.z = (v < edge.z) ? 0.f : 1.f;
-    return r;
-}
-_RS_STATIC float4 __attribute__((overloadable)) step(float4 edge, float v) {
-    float4 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    r.z = (v < edge.z) ? 0.f : 1.f;
-    r.w = (v < edge.w) ? 0.f : 1.f;
-    return r;
-}
+_RS_RUNTIME float __attribute__((overloadable)) step(float edge, float v);
+_RS_RUNTIME float2 __attribute__((overloadable)) step(float2 edge, float2 v);
+_RS_RUNTIME float3 __attribute__((overloadable)) step(float3 edge, float3 v);
+_RS_RUNTIME float4 __attribute__((overloadable)) step(float4 edge, float4 v);
+_RS_RUNTIME float2 __attribute__((overloadable)) step(float2 edge, float v);
+_RS_RUNTIME float3 __attribute__((overloadable)) step(float3 edge, float v);
+_RS_RUNTIME float4 __attribute__((overloadable)) step(float4 edge, float v);
 
 extern float __attribute__((overloadable)) smoothstep(float, float, float);
 extern float2 __attribute__((overloadable)) smoothstep(float2, float2, float2);
@@ -809,82 +399,33 @@ extern float2 __attribute__((overloadable)) smoothstep(float, float, float2);
 extern float3 __attribute__((overloadable)) smoothstep(float, float, float3);
 extern float4 __attribute__((overloadable)) smoothstep(float, float, float4);
 
-_RS_STATIC float __attribute__((overloadable)) sign(float v) {
-    if (v > 0) return 1.f;
-    if (v < 0) return -1.f;
-    return v;
-}
+_RS_RUNTIME float __attribute__((overloadable)) sign(float v);
 FN_FUNC_FN(sign)
 
 // 6.11.5
-_RS_STATIC float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
-    float3 r;
-    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
-    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
-    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
-    return r;
-}
-
-_RS_STATIC float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
-    float4 r;
-    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
-    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
-    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
-    r.w = 0.f;
-    return r;
-}
-
-_RS_STATIC float __attribute__((overloadable)) dot(float lhs, float rhs) {
-    return lhs * rhs;
-}
-_RS_STATIC float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y;
-}
-_RS_STATIC float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
-}
-_RS_STATIC float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
-}
-
-_RS_STATIC float __attribute__((overloadable)) length(float v) {
-    return v;
-}
-_RS_STATIC float __attribute__((overloadable)) length(float2 v) {
-    return sqrt(v.x*v.x + v.y*v.y);
-}
-_RS_STATIC float __attribute__((overloadable)) length(float3 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-_RS_STATIC float __attribute__((overloadable)) length(float4 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
-_RS_STATIC float __attribute__((overloadable)) distance(float lhs, float rhs) {
-    return length(lhs - rhs);
-}
-_RS_STATIC float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
-    return length(lhs - rhs);
-}
-_RS_STATIC float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
-    return length(lhs - rhs);
-}
-_RS_STATIC float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
-    return length(lhs - rhs);
-}
-
-_RS_STATIC float __attribute__((overloadable)) normalize(float v) {
-    return 1.f;
-}
-_RS_STATIC float2 __attribute__((overloadable)) normalize(float2 v) {
-    return v / length(v);
-}
-_RS_STATIC float3 __attribute__((overloadable)) normalize(float3 v) {
-    return v / length(v);
-}
-_RS_STATIC float4 __attribute__((overloadable)) normalize(float4 v) {
-    return v / length(v);
-}
+_RS_RUNTIME float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs);
+
+_RS_RUNTIME float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs);
+
+_RS_RUNTIME float __attribute__((overloadable)) dot(float lhs, float rhs);
+_RS_RUNTIME float __attribute__((overloadable)) dot(float2 lhs, float2 rhs);
+_RS_RUNTIME float __attribute__((overloadable)) dot(float3 lhs, float3 rhs);
+_RS_RUNTIME float __attribute__((overloadable)) dot(float4 lhs, float4 rhs);
+
+_RS_RUNTIME float __attribute__((overloadable)) length(float v);
+_RS_RUNTIME float __attribute__((overloadable)) length(float2 v);
+_RS_RUNTIME float __attribute__((overloadable)) length(float3 v);
+_RS_RUNTIME float __attribute__((overloadable)) length(float4 v);
+
+_RS_RUNTIME float __attribute__((overloadable)) distance(float lhs, float rhs);
+_RS_RUNTIME float __attribute__((overloadable)) distance(float2 lhs, float2 rhs);
+_RS_RUNTIME float __attribute__((overloadable)) distance(float3 lhs, float3 rhs);
+_RS_RUNTIME float __attribute__((overloadable)) distance(float4 lhs, float4 rhs);
+
+_RS_RUNTIME float __attribute__((overloadable)) normalize(float v);
+_RS_RUNTIME float2 __attribute__((overloadable)) normalize(float2 v);
+_RS_RUNTIME float3 __attribute__((overloadable)) normalize(float3 v);
+_RS_RUNTIME float4 __attribute__((overloadable)) normalize(float4 v);
 
 #undef CVT_FUNC
 #undef CVT_FUNC_2
@@ -903,6 +444,6 @@ _RS_STATIC float4 __attribute__((overloadable)) normalize(float4 v) {
 #undef IN_FUNC_IN
 #undef XN_FUNC_XN_XN_BODY
 #undef IN_FUNC_IN_IN_BODY
-#undef _RS_STATIC
+#undef _RS_RUNTIME
 
 #endif
diff --git a/libs/rs/scriptc/rs_core.rsh b/libs/rs/scriptc/rs_core.rsh
index f3e0ab0..e32d435 100644
--- a/libs/rs/scriptc/rs_core.rsh
+++ b/libs/rs/scriptc/rs_core.rsh
@@ -1,11 +1,7 @@
 #ifndef __RS_CORE_RSH__
 #define __RS_CORE_RSH__
 
-#ifdef BCC_PREPARE_BC
-#define _RS_STATIC  extern
-#else
-#define _RS_STATIC  static
-#endif
+#define _RS_RUNTIME extern
 
 // Debugging, print to the LOG a description string and a value.
 extern void __attribute__((overloadable))
@@ -41,56 +37,19 @@ extern void __attribute__((overloadable))
 #define RS_DEBUG(a) rsDebug(#a, a)
 #define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
 
-_RS_STATIC void __attribute__((overloadable)) rsDebug(const char *s, float2 v) {
-    rsDebug(s, v.x, v.y);
-}
-_RS_STATIC void __attribute__((overloadable)) rsDebug(const char *s, float3 v) {
-    rsDebug(s, v.x, v.y, v.z);
-}
-_RS_STATIC void __attribute__((overloadable)) rsDebug(const char *s, float4 v) {
-    rsDebug(s, v.x, v.y, v.z, v.w);
-}
-
-_RS_STATIC uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
-{
-    uchar4 c;
-    c.x = (uchar)(r * 255.f);
-    c.y = (uchar)(g * 255.f);
-    c.z = (uchar)(b * 255.f);
-    c.w = 255;
-    return c;
-}
-
-_RS_STATIC uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
-{
-    uchar4 c;
-    c.x = (uchar)(r * 255.f);
-    c.y = (uchar)(g * 255.f);
-    c.z = (uchar)(b * 255.f);
-    c.w = (uchar)(a * 255.f);
-    return c;
-}
-
-_RS_STATIC uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-{
-    color *= 255.f;
-    uchar4 c = {color.x, color.y, color.z, 255};
-    return c;
-}
-
-_RS_STATIC uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
-{
-    color *= 255.f;
-    uchar4 c = {color.x, color.y, color.z, color.w};
-    return c;
-}
-
-_RS_STATIC float4 rsUnpackColor8888(uchar4 c)
-{
-    float4 ret = (float4)0.0039156862745f;
-    ret *= convert_float4(c);
-    return ret;
-}
+_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float2 v);
+_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float3 v);
+_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float4 v);
+
+_RS_RUNTIME uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b);
+
+_RS_RUNTIME uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a);
+
+_RS_RUNTIME uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color);
+
+_RS_RUNTIME uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color);
+
+_RS_RUNTIME float4 rsUnpackColor8888(uchar4 c);
 
 //extern uchar4 __attribute__((overloadable)) rsPackColorTo565(float r, float g, float b);
 //extern uchar4 __attribute__((overloadable)) rsPackColorTo565(float3);
@@ -101,830 +60,117 @@ _RS_STATIC float4 rsUnpackColor8888(uchar4 c)
 // Matrix ops
 /////////////////////////////////////////////////////
 
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 4 + col] = v;
-}
-
-_RS_STATIC float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 4 + col];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 3 + col] = v;
-}
-
-_RS_STATIC float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 3 + col];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 2 + col] = v;
-}
-
-_RS_STATIC float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 2 + col];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix4x4 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = 0.f;
-    m->m[5] = 1.f;
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = 0.f;
-    m->m[9] = 0.f;
-    m->m[10] = 1.f;
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix3x3 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = 1.f;
-    m->m[5] = 0.f;
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = 1.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix2x2 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 1.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const float *v) {
-    m->m[0] = v[0];
-    m->m[1] = v[1];
-    m->m[2] = v[2];
-    m->m[3] = v[3];
-    m->m[4] = v[4];
-    m->m[5] = v[5];
-    m->m[6] = v[6];
-    m->m[7] = v[7];
-    m->m[8] = v[8];
-    m->m[9] = v[9];
-    m->m[10] = v[10];
-    m->m[11] = v[11];
-    m->m[12] = v[12];
-    m->m[13] = v[13];
-    m->m[14] = v[14];
-    m->m[15] = v[15];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix3x3 *m, const float *v) {
-    m->m[0] = v[0];
-    m->m[1] = v[1];
-    m->m[2] = v[2];
-    m->m[3] = v[3];
-    m->m[4] = v[4];
-    m->m[5] = v[5];
-    m->m[6] = v[6];
-    m->m[7] = v[7];
-    m->m[8] = v[8];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix2x2 *m, const float *v) {
-    m->m[0] = v[0];
-    m->m[1] = v[1];
-    m->m[2] = v[2];
-    m->m[3] = v[3];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = v->m[2];
-    m->m[3] = v->m[3];
-    m->m[4] = v->m[4];
-    m->m[5] = v->m[5];
-    m->m[6] = v->m[6];
-    m->m[7] = v->m[7];
-    m->m[8] = v->m[8];
-    m->m[9] = v->m[9];
-    m->m[10] = v->m[10];
-    m->m[11] = v->m[11];
-    m->m[12] = v->m[12];
-    m->m[13] = v->m[13];
-    m->m[14] = v->m[14];
-    m->m[15] = v->m[15];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = v->m[2];
-    m->m[3] = 0.f;
-    m->m[4] = v->m[3];
-    m->m[5] = v->m[4];
-    m->m[6] = v->m[5];
-    m->m[7] = 0.f;
-    m->m[8] = v->m[6];
-    m->m[9] = v->m[7];
-    m->m[10] = v->m[8];
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = v->m[3];
-    m->m[5] = v->m[4];
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = v->m[6];
-    m->m[9] = v->m[7];
-    m->m[10] = 1.f;
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix3x3 *m, const rs_matrix3x3 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = v->m[2];
-    m->m[3] = v->m[3];
-    m->m[4] = v->m[4];
-    m->m[5] = v->m[5];
-    m->m[6] = v->m[6];
-    m->m[7] = v->m[7];
-    m->m[8] = v->m[8];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix2x2 *m, const rs_matrix2x2 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = v->m[2];
-    m->m[3] = v->m[3];
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z) {
-    float c, s;
-    m->m[3] = 0;
-    m->m[7] = 0;
-    m->m[11]= 0;
-    m->m[12]= 0;
-    m->m[13]= 0;
-    m->m[14]= 0;
-    m->m[15]= 1;
-    rot *= (float)(M_PI / 180.0f);
-    c = cos(rot);
-    s = sin(rot);
-
-    const float len = x*x + y*y + z*z;
-    if (len != 1) {
-        const float recipLen = 1.f / sqrt(len);
-        x *= recipLen;
-        y *= recipLen;
-        z *= recipLen;
-    }
-    const float nc = 1.0f - c;
-    const float xy = x * y;
-    const float yz = y * z;
-    const float zx = z * x;
-    const float xs = x * s;
-    const float ys = y * s;
-    const float zs = z * s;
-    m->m[ 0] = x*x*nc +  c;
-    m->m[ 4] =  xy*nc - zs;
-    m->m[ 8] =  zx*nc + ys;
-    m->m[ 1] =  xy*nc + zs;
-    m->m[ 5] = y*y*nc +  c;
-    m->m[ 9] =  yz*nc - xs;
-    m->m[ 2] =  zx*nc - ys;
-    m->m[ 6] =  yz*nc + xs;
-    m->m[10] = z*z*nc +  c;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z) {
-    rsMatrixLoadIdentity(m);
-    m->m[0] = x;
-    m->m[5] = y;
-    m->m[10] = z;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z) {
-    rsMatrixLoadIdentity(m);
-    m->m[12] = x;
-    m->m[13] = y;
-    m->m[14] = z;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
-    for (int i=0 ; i<4 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        float ri2 = 0;
-        float ri3 = 0;
-        for (int j=0 ; j<4 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i,j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
-            ri3 += rsMatrixGet(lhs, j, 3) * rhs_ij;
-        }
-        rsMatrixSet(m, i, 0, ri0);
-        rsMatrixSet(m, i, 1, ri1);
-        rsMatrixSet(m, i, 2, ri2);
-        rsMatrixSet(m, i, 3, ri3);
-    }
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *rhs) {
-    rs_matrix4x4 mt;
-    rsMatrixLoadMultiply(&mt, m, rhs);
-    rsMatrixLoad(m, &mt);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
-    for (int i=0 ; i<3 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        float ri2 = 0;
-        for (int j=0 ; j<3 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i,j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
-        }
-        rsMatrixSet(m, i, 0, ri0);
-        rsMatrixSet(m, i, 1, ri1);
-        rsMatrixSet(m, i, 2, ri2);
-    }
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *rhs) {
-    rs_matrix3x3 mt;
-    rsMatrixLoadMultiply(&mt, m, rhs);
-    rsMatrixLoad(m, &mt);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
-    for (int i=0 ; i<2 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        for (int j=0 ; j<2 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i,j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-        }
-        rsMatrixSet(m, i, 0, ri0);
-        rsMatrixSet(m, i, 1, ri1);
-    }
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *rhs) {
-    rs_matrix2x2 mt;
-    rsMatrixLoadMultiply(&mt, m, rhs);
-    rsMatrixLoad(m, &mt);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z) {
-    rs_matrix4x4 m1;
-    rsMatrixLoadRotate(&m1, rot, x, y, z);
-    rsMatrixMultiply(m, &m1);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixScale(rs_matrix4x4 *m, float x, float y, float z) {
-    rs_matrix4x4 m1;
-    rsMatrixLoadScale(&m1, x, y, z);
-    rsMatrixMultiply(m, &m1);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixTranslate(rs_matrix4x4 *m, float x, float y, float z) {
-    rs_matrix4x4 m1;
-    rsMatrixLoadTranslate(&m1, x, y, z);
-    rsMatrixMultiply(m, &m1);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far) {
-    rsMatrixLoadIdentity(m);
-    m->m[0] = 2.f / (right - left);
-    m->m[5] = 2.f / (top - bottom);
-    m->m[10]= -2.f / (far - near);
-    m->m[12]= -(right + left) / (right - left);
-    m->m[13]= -(top + bottom) / (top - bottom);
-    m->m[14]= -(far + near) / (far - near);
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far) {
-    rsMatrixLoadIdentity(m);
-    m->m[0] = 2.f * near / (right - left);
-    m->m[5] = 2.f * near / (top - bottom);
-    m->m[8] = (right + left) / (right - left);
-    m->m[9] = (top + bottom) / (top - bottom);
-    m->m[10]= -(far + near) / (far - near);
-    m->m[11]= -1.f;
-    m->m[14]= -2.f * far * near / (far - near);
-    m->m[15]= 0.f;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far) {
-    float top = near * tan((float) (fovy * M_PI / 360.0f));
-    float bottom = -top;
-    float left = bottom * aspect;
-    float right = top * aspect;
-    rsMatrixLoadFrustum(m, left, right, bottom, top, near, far);
-}
-
-_RS_STATIC float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float4 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + (m->m[8] * in.z) + (m->m[12] * in.w);
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + (m->m[9] * in.z) + (m->m[13] * in.w);
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + (m->m[10] * in.z) + (m->m[14] * in.w);
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + (m->m[11] * in.z) + (m->m[15] * in.w);
-    return ret;
-}
-
-_RS_STATIC float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float3 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + (m->m[8] * in.z) + m->m[12];
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + (m->m[9] * in.z) + m->m[13];
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + (m->m[10] * in.z) + m->m[14];
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + (m->m[11] * in.z) + m->m[15];
-    return ret;
-}
-
-_RS_STATIC float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float2 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + m->m[12];
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + m->m[13];
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + m->m[14];
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + m->m[15];
-    return ret;
-}
-
-_RS_STATIC float3 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *m, float3 in) {
-    float3 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[3] * in.y) + (m->m[6] * in.z);
-    ret.y = (m->m[1] * in.x) + (m->m[4] * in.y) + (m->m[7] * in.z);
-    ret.z = (m->m[2] * in.x) + (m->m[5] * in.y) + (m->m[8] * in.z);
-    return ret;
-}
-
-_RS_STATIC float3 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *m, float2 in) {
-    float3 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[3] * in.y);
-    ret.y = (m->m[1] * in.x) + (m->m[4] * in.y);
-    ret.z = (m->m[2] * in.x) + (m->m[5] * in.y);
-    return ret;
-}
-
-_RS_STATIC float2 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix2x2 *m, float2 in) {
-    float2 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[2] * in.y);
-    ret.y = (m->m[1] * in.x) + (m->m[3] * in.y);
-    return ret;
-}
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v);
 
-// Returns true if the matrix was successfully inversed
-_RS_STATIC bool __attribute__((overloadable))
-rsMatrixInverse(rs_matrix4x4 *m) {
-    rs_matrix4x4 result;
-
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-        for (j = 0; j < 4; ++j) {
-            // computeCofactor for int i, int j
-            int c0 = (i+1) % 4;
-            int c1 = (i+2) % 4;
-            int c2 = (i+3) % 4;
-            int r0 = (j+1) % 4;
-            int r1 = (j+2) % 4;
-            int r2 = (j+3) % 4;
-
-            float minor = (m->m[c0 + 4*r0] * (m->m[c1 + 4*r1] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r1]))
-                         - (m->m[c0 + 4*r1] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r0]))
-                         + (m->m[c0 + 4*r2] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r1] - m->m[c1 + 4*r1] * m->m[c2 + 4*r0]));
-
-            float cofactor = (i+j) & 1 ? -minor : minor;
-
-            result.m[4*i + j] = cofactor;
-        }
-    }
-
-    // Dot product of 0th column of source and 0th row of result
-    float det = m->m[0]*result.m[0] + m->m[4]*result.m[1] +
-                 m->m[8]*result.m[2] + m->m[12]*result.m[3];
-
-    if (fabs(det) < 1e-6) {
-        return false;
-    }
-
-    det = 1.0f / det;
-    for (i = 0; i < 16; ++i) {
-        m->m[i] = result.m[i] * det;
-    }
-
-    return true;
-}
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col);
 
-// Returns true if the matrix was successfully inversed
-_RS_STATIC bool __attribute__((overloadable))
-rsMatrixInverseTranspose(rs_matrix4x4 *m) {
-    rs_matrix4x4 result;
-
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-        for (j = 0; j < 4; ++j) {
-            // computeCofactor for int i, int j
-            int c0 = (i+1) % 4;
-            int c1 = (i+2) % 4;
-            int c2 = (i+3) % 4;
-            int r0 = (j+1) % 4;
-            int r1 = (j+2) % 4;
-            int r2 = (j+3) % 4;
-
-            float minor = (m->m[c0 + 4*r0] * (m->m[c1 + 4*r1] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r1]))
-                         - (m->m[c0 + 4*r1] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r2] - m->m[c1 + 4*r2] * m->m[c2 + 4*r0]))
-                         + (m->m[c0 + 4*r2] * (m->m[c1 + 4*r0] * m->m[c2 + 4*r1] - m->m[c1 + 4*r1] * m->m[c2 + 4*r0]));
-
-            float cofactor = (i+j) & 1 ? -minor : minor;
-
-            result.m[4*j + i] = cofactor;
-        }
-    }
-
-    // Dot product of 0th column of source and 0th column of result
-    float det = m->m[0]*result.m[0] + m->m[4]*result.m[4] +
-                 m->m[8]*result.m[8] + m->m[12]*result.m[12];
-
-    if (fabs(det) < 1e-6) {
-        return false;
-    }
-
-    det = 1.0f / det;
-    for (i = 0; i < 16; ++i) {
-        m->m[i] = result.m[i] * det;
-    }
-
-    return true;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixTranspose(rs_matrix4x4 *m) {
-    int i, j;
-    float temp;
-    for (i = 0; i < 3; ++i) {
-        for (j = i + 1; j < 4; ++j) {
-            temp = m->m[i*4 + j];
-            m->m[i*4 + j] = m->m[j*4 + i];
-            m->m[j*4 + i] = temp;
-        }
-    }
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixTranspose(rs_matrix3x3 *m) {
-    int i, j;
-    float temp;
-    for (i = 0; i < 2; ++i) {
-        for (j = i + 1; j < 3; ++j) {
-            temp = m->m[i*3 + j];
-            m->m[i*3 + j] = m->m[j*4 + i];
-            m->m[j*3 + i] = temp;
-        }
-    }
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsMatrixTranspose(rs_matrix2x2 *m) {
-    float temp = m->m[1];
-    m->m[1] = m->m[2];
-    m->m[2] = temp;
-}
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v);
 
-/////////////////////////////////////////////////////
-// quaternion ops
-/////////////////////////////////////////////////////
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col);
 
-_RS_STATIC void __attribute__((overloadable))
-rsQuaternionSet(rs_quaternion *q, float w, float x, float y, float z) {
-    q->w = w;
-    q->x = x;
-    q->y = y;
-    q->z = z;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsQuaternionSet(rs_quaternion *q, const rs_quaternion *rhs) {
-    q->w = rhs->w;
-    q->x = rhs->x;
-    q->y = rhs->y;
-    q->z = rhs->z;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsQuaternionMultiply(rs_quaternion *q, float s) {
-    q->w *= s;
-    q->x *= s;
-    q->y *= s;
-    q->z *= s;
-}
-
-_RS_STATIC void __attribute__((overloadable))
-rsQuaternionMultiply(rs_quaternion *q, const rs_quaternion *rhs) {
-    q->w = -q->x*rhs->x - q->y*rhs->y - q->z*rhs->z + q->w*rhs->w;
-    q->x =  q->x*rhs->w + q->y*rhs->z - q->z*rhs->y + q->w*rhs->x;
-    q->y = -q->x*rhs->z + q->y*rhs->w + q->z*rhs->z + q->w*rhs->y;
-    q->z =  q->x*rhs->y - q->y*rhs->x + q->z*rhs->w + q->w*rhs->z;
-}
-
-_RS_STATIC void
-rsQuaternionAdd(rs_quaternion *q, const rs_quaternion *rhs) {
-    q->w *= rhs->w;
-    q->x *= rhs->x;
-    q->y *= rhs->y;
-    q->z *= rhs->z;
-}
-
-_RS_STATIC void
-rsQuaternionLoadRotateUnit(rs_quaternion *q, float rot, float x, float y, float z) {
-    rot *= (float)(M_PI / 180.0f) * 0.5f;
-    float c = cos(rot);
-    float s = sin(rot);
-
-    q->w = c;
-    q->x = x * s;
-    q->y = y * s;
-    q->z = z * s;
-}
-
-_RS_STATIC void
-rsQuaternionLoadRotate(rs_quaternion *q, float rot, float x, float y, float z) {
-    const float len = x*x + y*y + z*z;
-    if (len != 1) {
-        const float recipLen = 1.f / sqrt(len);
-        x *= recipLen;
-        y *= recipLen;
-        z *= recipLen;
-    }
-    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
-}
-
-_RS_STATIC void
-rsQuaternionConjugate(rs_quaternion *q) {
-    q->x = -q->x;
-    q->y = -q->y;
-    q->z = -q->z;
-}
-
-_RS_STATIC float
-rsQuaternionDot(const rs_quaternion *q0, const rs_quaternion *q1) {
-    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
-}
-
-_RS_STATIC void
-rsQuaternionNormalize(rs_quaternion *q) {
-    const float len = rsQuaternionDot(q, q);
-    if (len != 1) {
-        const float recipLen = 1.f / sqrt(len);
-        rsQuaternionMultiply(q, recipLen);
-    }
-}
-
-_RS_STATIC void
-rsQuaternionSlerp(rs_quaternion *q, const rs_quaternion *q0, const rs_quaternion *q1, float t) {
-    if (t <= 0.0f) {
-        rsQuaternionSet(q, q0);
-        return;
-    }
-    if (t >= 1.0f) {
-        rsQuaternionSet(q, q1);
-        return;
-    }
-
-    rs_quaternion tempq0, tempq1;
-    rsQuaternionSet(&tempq0, q0);
-    rsQuaternionSet(&tempq1, q1);
-
-    float angle = rsQuaternionDot(q0, q1);
-    if (angle < 0) {
-        rsQuaternionMultiply(&tempq0, -1.0f);
-        angle *= -1.0f;
-    }
-
-    float scale, invScale;
-    if (angle + 1.0f > 0.05f) {
-        if (1.0f - angle >= 0.05f) {
-            float theta = acos(angle);
-            float invSinTheta = 1.0f / sin(theta);
-            scale = sin(theta * (1.0f - t)) * invSinTheta;
-            invScale = sin(theta * t) * invSinTheta;
-        } else {
-            scale = 1.0f - t;
-            invScale = t;
-        }
-    } else {
-        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
-        scale = sin(M_PI * (0.5f - t));
-        invScale = sin(M_PI * t);
-    }
-
-    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
-                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
-}
-
-_RS_STATIC void rsQuaternionGetMatrixUnit(rs_matrix4x4 *m, const rs_quaternion *q) {
-    float x2 = 2.0f * q->x * q->x;
-    float y2 = 2.0f * q->y * q->y;
-    float z2 = 2.0f * q->z * q->z;
-    float xy = 2.0f * q->x * q->y;
-    float wz = 2.0f * q->w * q->z;
-    float xz = 2.0f * q->x * q->z;
-    float wy = 2.0f * q->w * q->y;
-    float wx = 2.0f * q->w * q->x;
-    float yz = 2.0f * q->y * q->z;
-
-    m->m[0] = 1.0f - y2 - z2;
-    m->m[1] = xy - wz;
-    m->m[2] = xz + wy;
-    m->m[3] = 0.0f;
-
-    m->m[4] = xy + wz;
-    m->m[5] = 1.0f - x2 - z2;
-    m->m[6] = yz - wx;
-    m->m[7] = 0.0f;
-
-    m->m[8] = xz - wy;
-    m->m[9] = yz - wx;
-    m->m[10] = 1.0f - x2 - y2;
-    m->m[11] = 0.0f;
-
-    m->m[12] = 0.0f;
-    m->m[13] = 0.0f;
-    m->m[14] = 0.0f;
-    m->m[15] = 1.0f;
-}
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v);
 
-/////////////////////////////////////////////////////
-// utility funcs
-/////////////////////////////////////////////////////
-__inline__ _RS_STATIC void __attribute__((overloadable, always_inline))
-rsExtractFrustumPlanes(const rs_matrix4x4 *modelViewProj,
-                         float4 *left, float4 *right,
-                         float4 *top, float4 *bottom,
-                         float4 *near, float4 *far) {
-    // x y z w = a b c d in the plane equation
-    left->x = modelViewProj->m[3] + modelViewProj->m[0];
-    left->y = modelViewProj->m[7] + modelViewProj->m[4];
-    left->z = modelViewProj->m[11] + modelViewProj->m[8];
-    left->w = modelViewProj->m[15] + modelViewProj->m[12];
-
-    right->x = modelViewProj->m[3] - modelViewProj->m[0];
-    right->y = modelViewProj->m[7] - modelViewProj->m[4];
-    right->z = modelViewProj->m[11] - modelViewProj->m[8];
-    right->w = modelViewProj->m[15] - modelViewProj->m[12];
-
-    top->x = modelViewProj->m[3] - modelViewProj->m[1];
-    top->y = modelViewProj->m[7] - modelViewProj->m[5];
-    top->z = modelViewProj->m[11] - modelViewProj->m[9];
-    top->w = modelViewProj->m[15] - modelViewProj->m[13];
-
-    bottom->x = modelViewProj->m[3] + modelViewProj->m[1];
-    bottom->y = modelViewProj->m[7] + modelViewProj->m[5];
-    bottom->z = modelViewProj->m[11] + modelViewProj->m[9];
-    bottom->w = modelViewProj->m[15] + modelViewProj->m[13];
-
-    near->x = modelViewProj->m[3] + modelViewProj->m[2];
-    near->y = modelViewProj->m[7] + modelViewProj->m[6];
-    near->z = modelViewProj->m[11] + modelViewProj->m[10];
-    near->w = modelViewProj->m[15] + modelViewProj->m[14];
-
-    far->x = modelViewProj->m[3] - modelViewProj->m[2];
-    far->y = modelViewProj->m[7] - modelViewProj->m[6];
-    far->z = modelViewProj->m[11] - modelViewProj->m[10];
-    far->w = modelViewProj->m[15] - modelViewProj->m[14];
-
-    float len = length(left->xyz);
-    *left /= len;
-    len = length(right->xyz);
-    *right /= len;
-    len = length(top->xyz);
-    *top /= len;
-    len = length(bottom->xyz);
-    *bottom /= len;
-    len = length(near->xyz);
-    *near /= len;
-    len = length(far->xyz);
-    *far /= len;
-}
-
-__inline__ _RS_STATIC bool __attribute__((overloadable, always_inline))
-rsIsSphereInFrustum(float4 *sphere,
-                      float4 *left, float4 *right,
-                      float4 *top, float4 *bottom,
-                      float4 *near, float4 *far) {
-
-    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
-    if (distToCenter < -sphere->w) {
-        return false;
-    }
-    return true;
-}
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col);
+
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix4x4 *m);
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix3x3 *m);
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix2x2 *m);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const float *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const float *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const float *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const rs_matrix3x3 *v);
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const rs_matrix2x2 *v);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *rhs);
+
+extern void __attribute__((overloadable))
+rsMatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixScale(rs_matrix4x4 *m, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixTranslate(rs_matrix4x4 *m, float x, float y, float z);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
 
+extern void __attribute__((overloadable))
+rsMatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
+
+extern void __attribute__((overloadable))
+rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float4 in);
+
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float3 in);
+
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float2 in);
+
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float3 in);
+
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float2 in);
+
+_RS_RUNTIME float2 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, float2 in);
+
+// Returns true if the matrix was successfully inverted
+extern bool __attribute__((overloadable)) rsMatrixInverse(rs_matrix4x4 *m);
+extern bool __attribute__((overloadable)) rsMatrixInverseTranspose(rs_matrix4x4 *m);
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix4x4 *m);
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix3x3 *m);
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix2x2 *m);
 
 /////////////////////////////////////////////////////
 // int ops
 /////////////////////////////////////////////////////
 
-__inline__ _RS_STATIC uint __attribute__((overloadable, always_inline)) rsClamp(uint amount, uint low, uint high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-__inline__ _RS_STATIC int __attribute__((overloadable, always_inline)) rsClamp(int amount, int low, int high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-__inline__ _RS_STATIC ushort __attribute__((overloadable, always_inline)) rsClamp(ushort amount, ushort low, ushort high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-__inline__ _RS_STATIC short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-__inline__ _RS_STATIC uchar __attribute__((overloadable, always_inline)) rsClamp(uchar amount, uchar low, uchar high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-__inline__ _RS_STATIC char __attribute__((overloadable, always_inline)) rsClamp(char amount, char low, char high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-
-#undef _RS_STATIC
+_RS_RUNTIME uint __attribute__((overloadable, always_inline)) rsClamp(uint amount, uint low, uint high);
+_RS_RUNTIME int __attribute__((overloadable, always_inline)) rsClamp(int amount, int low, int high);
+_RS_RUNTIME ushort __attribute__((overloadable, always_inline)) rsClamp(ushort amount, ushort low, ushort high);
+_RS_RUNTIME short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
+_RS_RUNTIME uchar __attribute__((overloadable, always_inline)) rsClamp(uchar amount, uchar low, uchar high);
+_RS_RUNTIME char __attribute__((overloadable, always_inline)) rsClamp(char amount, char low, char high);
 
-#endif
+#undef _RS_RUNTIME
 
+#endif