#pragma once #include #include #include "private.h" #include "types.h" #ifndef GL_FORCE_INLINE #define GL_NO_INSTRUMENT inline __attribute__((no_instrument_function)) #define GL_INLINE_DEBUG GL_NO_INSTRUMENT __attribute__((always_inline)) #define GL_FORCE_INLINE static GL_INLINE_DEBUG #endif #define PREFETCH(addr) __builtin_prefetch((addr)) GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) { if(!len) { return dest; } const uint8_t *s = (uint8_t *)src; uint8_t *d = (uint8_t *)dest; uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated // Underflow would be like adding a negative offset // Can use 'd' as a scratch reg now asm volatile ( "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn ".align 2\n" "0:\n\t" "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1) "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2) "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2) " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1) : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs : [offset] "z" (diff) // inputs : "t", "memory" // clobbers ); return dest; } #define PT_ALPHA_REF 0x011c static inline void GPUSetAlphaCutOff(uint8_t val) { PVR_SET(PT_ALPHA_REF, val); } typedef struct { uint32_t cmd; uint32_t mode1; uint32_t mode2; uint32_t mode3; uint32_t d1; uint32_t d2; uint32_t d3; uint32_t d4; } PolyHeader; void SceneListSubmit(Vertex* v2, int n); static inline int DimensionFlag(const int w) { switch(w) { case 16: return 1; case 32: return 2; case 64: return 3; case 128: return 4; case 256: return 5; case 512: return 6; case 1024: return 7; case 8: default: return 0; } }