diff options
Diffstat (limited to 'third_party/gldc/src/sh4_math.h')
-rw-r--r-- | third_party/gldc/src/sh4_math.h | 1820 |
1 file changed, 1820 insertions, 0 deletions
diff --git a/third_party/gldc/src/sh4_math.h b/third_party/gldc/src/sh4_math.h new file mode 100644 index 0000000..c12c30d --- /dev/null +++ b/third_party/gldc/src/sh4_math.h @@ -0,0 +1,1820 @@ +// ---- sh4_math.h - SH7091 Math Module ---- +// +// Version 1.1.3 +// +// This file is part of the DreamHAL project, a hardware abstraction library +// primarily intended for use on the SH7091 found in hardware such as the SEGA +// Dreamcast game console. +// +// This math module is hereby released into the public domain in the hope that it +// may prove useful. Now go hit 60 fps! :) +// +// --Moopthehedgehog +// + +// Notes: +// - GCC 4 users have a different return type for the fsca functions due to an +// internal compiler error regarding complex numbers; no issue under GCC 9.2.0 +// - Using -m4 instead of -m4-single-only completely breaks the matrix and +// vector operations +// - Function inlining must be enabled and not blocked by compiler options such +// as -ffunction-sections, as blocking inlining will result in significant +// performance degradation for the vector and matrix functions employing a +// RETURN_VECTOR_STRUCT return type. I have added compiler hints and attributes +// "static inline __attribute__((always_inline))" to mitigate this, so in most +// cases the functions should be inlined regardless. If in doubt, check the +// compiler asm output! 
+// + +#ifndef __SH4_MATH_H_ +#define __SH4_MATH_H_ + +#define GNUC_FSCA_ERROR_VERSION 4 + +// +// Fast SH4 hardware math functions +// +// +// High-accuracy users beware, the fsrra functions have an error of +/- 2^-21 +// per http://www.shared-ptr.com/sh_insns.html +// + +//============================================================================== +// Definitions +//============================================================================== +// +// Structures, useful definitions, and reference comments +// + +// Front matrix format: +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ fr0 fr4 fr8 fr12 ] +// [ fr1 fr5 fr9 fr13 ] +// [ fr2 fr6 fr10 fr14 ] +// [ fr3 fr7 fr11 fr15 ] +// +// Back matrix, XMTRX, is similar, although it has no FVn vector groups: +// +// [ xf0 xf4 xf8 xf12 ] +// [ xf1 xf5 xf9 xf13 ] +// [ xf2 xf6 xf10 xf14 ] +// [ xf3 xf7 xf11 xf15 ] +// + +typedef struct __attribute__((aligned(32))) { + float fr0; + float fr1; + float fr2; + float fr3; + float fr4; + float fr5; + float fr6; + float fr7; + float fr8; + float fr9; + float fr10; + float fr11; + float fr12; + float fr13; + float fr14; + float fr15; +} ALL_FLOATS_STRUCT; + +// Return structs should be defined locally so that GCC optimizes them into +// register usage instead of memory accesses. 
// Generic 4-float return value. Per the SH4 calling convention this is passed
// back in FR0-FR3 when GCC keeps it in registers (hence the note above about
// defining return structs locally). The meaning of z1-z4 depends on the
// producing function -- e.g. for the cross product functions z1-z3 are the
// orthogonal output vector ('i', 'j', 'k') and z4 carries the bonus multiply.
typedef struct {
  float z1;
  float z2;
  float z3;
  float z4;
} RETURN_VECTOR_STRUCT;

// Sine/cosine pair returned by the fsca functions, but only for GCC <= 4:
// those compilers have an internal error using _Complex float with
// -m4-single-only (see the file header notes), so they get a plain struct
// instead. Newer GCC versions return _Complex float and never see this type.
#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
typedef struct {
  float sine;
  float cosine;
} RETURN_FSCA_STRUCT;
#endif

// Identity Matrix
//
// FV0 FV4 FV8 FV12
// --- --- --- ----
// [ 1 0 0 0 ]
// [ 0 1 0 0 ]
// [ 0 0 1 0 ]
// [ 0 0 0 1 ]
//

// Stored in ALL_FLOATS_STRUCT register order (fr0..fr15), i.e. one 4x1 column
// (FVn) at a time per the front matrix diagram above, so these 16 values form
// the 4x4 identity.
static const ALL_FLOATS_STRUCT MATH_identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f};

// Constants (single-precision literals; truncated to float accuracy)
#define MATH_pi 3.14159265358979323846264338327950288419716939937510f
#define MATH_e 2.71828182845904523536028747135266249775724709369995f
#define MATH_phi 1.61803398874989484820458683436563811772030917980576f

//==============================================================================
// Basic math functions
//==============================================================================
//
// The following functions are available.
// Please see their definitions for other usage info, otherwise they may not
// work for you.
+// +/* + // |x| + float MATH_fabs(float x) + + // sqrt(x) + float MATH_fsqrt(float x) + + // a*b+c + float MATH_fmac(float a, float b, float c) + + // a*b-c + float MATH_fmac_Dec(float a, float b, float c) + + // fminf() - return the min of two floats + // This doesn't check for NaN + float MATH_Fast_Fminf(float a, float b) + + // fmaxf() - return the max of two floats + // This doesn't check for NaN + float MATH_Fast_Fmaxf(float a, float b) + + // Fast floorf() - return the nearest integer <= x as a float + // This doesn't check for NaN + float MATH_Fast_Floorf(float x) + + // Fast ceilf() - return the nearest integer >= x as a float + // This doesn't check for NaN + float MATH_Fast_Ceilf(float x) + + // Very fast floorf() - return the nearest integer <= x as a float + // Inspired by a cool trick I came across here: + // https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions + // This doesn't check for NaN + float MATH_Very_Fast_Floorf(float x) + + // Very fast ceilf() - return the nearest integer >= x as a float + // Inspired by a cool trick I came across here: + // https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions + // This doesn't check for NaN + float MATH_Very_Fast_Ceilf(float x) +*/ + +// |x| +// This one works on ARM and x86, too! +static inline __attribute__((always_inline)) float MATH_fabs(float x) +{ + asm volatile ("fabs %[floatx]\n" + : [floatx] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// sqrt(x) +// This one works on ARM and x86, too! +// NOTE: There is a much faster version (MATH_Fast_Sqrt()) in the fsrra section of +// this file. Chances are you probably want that one. 
static inline __attribute__((always_inline)) float MATH_fsqrt(float x)
{
  // Single 'fsqrt' on the input register; "+f" makes x both input and output.
  asm volatile ("fsqrt %[floatx]\n"
  : [floatx] "+f" (x) // outputs, "+" means r/w
  : // no inputs
  : // no clobbers
  );

  return x;
}

// a*b+c
// The SH4 'fmac' instruction always uses FR0 as the first multiplicand, so
// the "w" constraint (SH FP0 register class) pins 'a' into FR0.
static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, float c)
{
  asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
  : [floatc] "+f" (c) // outputs, "+" means r/w
  : "w" (a), [floatb] "f" (b) // inputs
  : // no clobbers
  );

  return c;
}

// a*b-c
// Computed as a*b + (-c): c is negated first, then fed to 'fmac'. The "&"
// (earlyclobber) is required because c is written before 'a' and 'b' are read.
static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c)
{
  asm volatile ("fneg %[floatc]\n\t"
  "fmac fr0, %[floatb], %[floatc]\n"
  : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
  : "w" (a), [floatb] "f" (b) // inputs
  : // no clobbers
  );

  return c;
}

// Fast fminf() - return the min of two floats
// This doesn't check for NaN: the compare is false for NaN inputs, so 'b' is
// returned in that case. Equal inputs also return 'b' (either is a valid min).
// The delay slot after 'bt.s' executes whether or not the branch is taken, so
// 'a' is staged first and overwritten with 'b' only on the fall-through path.
static inline __attribute__((always_inline)) float MATH_Fast_Fminf(float a, float b)
{
  float output_float;

  asm volatile (
  "fcmp/gt %[floata], %[floatb]\n\t" // b > a (NaN evaluates to !GT; 0 -> T)
  "bt.s 1f\n\t" // yes, a is smaller
  " fmov %[floata], %[float_out]\n\t" // so return a (delay slot: runs either way)
  "fmov %[floatb], %[float_out]\n" // no, either b is smaller or they're equal and it doesn't matter
  "1:\n"
  : [float_out] "=&f" (output_float) // outputs
  : [floata] "f" (a), [floatb] "f" (b) // inputs
  : "t" // clobbers
  );

  return output_float;
}

// Fast fmaxf() - return the max of two floats
// This doesn't check for NaN: the compare is false for NaN inputs, so 'a' is
// returned in that case. Same delay-slot staging trick as MATH_Fast_Fminf().
static inline __attribute__((always_inline)) float MATH_Fast_Fmaxf(float a, float b)
{
  float output_float;

  asm volatile (
  "fcmp/gt %[floata], %[floatb]\n\t" // b > a (NaN evaluates to !GT; 0 -> T)
  "bt.s 1f\n\t" // yes, a is smaller
  " fmov %[floatb], %[float_out]\n\t" // so return b (delay slot: runs either way)
  "fmov %[floata], %[float_out]\n" // no, either a is bigger or they're equal and it doesn't matter
  "1:\n"
  :
[float_out] "=&f" (output_float) // outputs + : [floata] "f" (a), [floatb] "f" (b) // inputs + : "t" // clobbers + ); + + return output_float; +} + +// Fast floorf() - return the nearest integer <= x as a float +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Floorf(float x) +{ + float output_float; + + // To hold -1.0f + float minus_one; + + asm volatile ( + "fldi1 %[minus_1]\n\t" + "fneg %[minus_1]\n\t" + "fcmp/gt %[minus_1], %[floatx]\n\t" // x >= 0 + "ftrc %[floatx], fpul\n\t" // convert float to int + "bt.s 1f\n\t" + " float fpul, %[float_out]\n\t" // convert int to float + "fadd %[minus_1], %[float_out]\n" // if input x < 0, subtract 1.0 + "1:\n" + : [minus_1] "=&f" (minus_one), [float_out] "=f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Fast ceilf() - return the nearest integer >= x as a float +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Ceilf(float x) +{ + float output_float; + + // To hold 0.0f and 1.0f + float zero_one; + + asm volatile ( + "fldi0 %[zero_1]\n\t" + "fcmp/gt %[zero_1], %[floatx]\n\t" // x > 0 + "ftrc %[floatx], fpul\n\t" // convert float to int + "bf.s 1f\n\t" + " float fpul, %[float_out]\n\t" // convert int to float + "fldi1 %[zero_1]\n\t" + "fadd %[zero_1], %[float_out]\n" // if input x > 0, add 1.0 + "1:\n" + : [zero_1] "=&f" (zero_one), [float_out] "=f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Very fast floorf() - return the nearest integer <= x as a float +// Inspired by a cool trick I came across here: +// https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Very_Fast_Floorf(float x) +{ + float output_float; + unsigned int scratch_reg; + unsigned int scratch_reg2; + + // 0x4f000000 == 2^31 in float -- 0x4f << 24 is INT_MAX + 1.0f + // 0x80000000 
== -2^31 == INT_MIN == -(INT_MAX + 1.0f) + + // floor = (float)( (int)(x + (float)2^31) - 2^31) + + asm volatile ( + "mov #0x4f, %[scratch]\n\t" // Build float INT_MAX + 1 as a float using only regs (EX) + "shll16 %[scratch]\n\t" // (EX) + "shll8 %[scratch]\n\t" // (EX) + "lds %[scratch], fpul\n\t" // move float INT_MAX + 1 to float regs (LS) + "mov #1, %[scratch2]\n\t" // Build INT_MIN from scratch in parallel (EX) + "fsts fpul, %[float_out]\n\t" // (LS) + "fadd %[floatx], %[float_out]\n\t" // float-add float INT_MAX + 1 to x (FE) + "rotr %[scratch2]\n\t" // rotate the 1 in bit 0 from LSB to MSB for INT_MIN, clobber T (EX) + "ftrc %[float_out], fpul\n\t" // convert float to int (FE) -- ftrc -> sts is special combo + "sts fpul, %[scratch]\n\t" // move back to int regs (LS) + "add %[scratch2], %[scratch]\n\t" // Add INT_MIN to int (EX) + "lds %[scratch], fpul\n\t" // (LS) -- lds -> float is a special combo + "float fpul, %[float_out]\n" // convert back to float (FE) + : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Very fast ceilf() - return the nearest integer >= x as a float +// Inspired by a cool trick I came across here: +// https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Very_Fast_Ceilf(float x) +{ + float output_float; + unsigned int scratch_reg; + unsigned int scratch_reg2; + + // 0x4f000000 == 2^31 in float -- 0x4f << 24 is INT_MAX + 1.0f + // 0x80000000 == -2^31 == INT_MIN == -(INT_MAX + 1.0f) + + // Ceiling is the inverse of floor such that f^-1(x) = -f(-x) + // To make very fast ceiling have as wide a range as very fast floor, + // use this property to subtract x from INT_MAX + 1 and get the negative of the + // ceiling, and then negate the final output. 
This allows ceiling to use + // -2^31 and have the same range as very fast floor. + + // Given: + // floor = (float)( (int)(x + (float)2^31) - 2^31 ) + // We can do: + // ceiling = -( (float)( (int)((float)2^31 - x) - 2^31 ) ) + // or (slower on SH4 since 'fneg' is faster than 'neg'): + // ceiling = (float) -( (int)((float)2^31 - x) - 2^31 ) + // Since mathematically these functions are related by f^-1(x) = -f(-x). + + asm volatile ( + "mov #0x4f, %[scratch]\n\t" // Build float INT_MAX + 1 as a float using only regs (EX) + "shll16 %[scratch]\n\t" // (EX) + "shll8 %[scratch]\n\t" // (EX) + "lds %[scratch], fpul\n\t" // move float INT_MAX + 1 to float regs (LS) + "mov #1, %[scratch2]\n\t" // Build INT_MIN from scratch in parallel (EX) + "fsts fpul, %[float_out]\n\t" // (LS) + "fsub %[floatx], %[float_out]\n\t" // float-sub x from float INT_MAX + 1 (FE) + "rotr %[scratch2]\n\t" // rotate the 1 in bit 0 from LSB to MSB for INT_MIN, clobber T (EX) + "ftrc %[float_out], fpul\n\t" // convert float to int (FE) -- ftrc -> sts is special combo + "sts fpul, %[scratch]\n\t" // move back to int regs (LS) + "add %[scratch2], %[scratch]\n\t" // Add INT_MIN to int (EX) + "lds %[scratch], fpul\n\t" // (LS) -- lds -> float is a special combo + "float fpul, %[float_out]\n\t" // convert back to float (FE) + "fneg %[float_out]\n" + : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +//============================================================================== +// Fun with fsrra, which does 1/sqrt(x) in one cycle +//============================================================================== +// +// Error of 'fsrra' is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html +// +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. 
//
/*
  // 1/sqrt(x)
  float MATH_fsrra(float x)

  // 1/x
  float MATH_Fast_Invert(float x)

  // A faster divide than the 'fdiv' instruction
  float MATH_Fast_Divide(float numerator, float denominator)

  // A faster square root than the 'fsqrt' instruction
  float MATH_Fast_Sqrt(float x)

  // Standard, accurate, and slow float divide. Use this if MATH_Fast_Divide() gives you issues.
  float MATH_Slow_Divide(float numerator, float denominator)
*/

// 1/sqrt(x)
// Single-cycle approximate reciprocal square root; error is +/- 2^-21 (see
// the section header above). Negative x is not a valid fsrra input -- callers
// in this file square their argument first to guarantee x >= 0.
static inline __attribute__((always_inline)) float MATH_fsrra(float x)
{
  asm volatile ("fsrra %[one_div_sqrt]\n"
  : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w
  : // no inputs
  : // no clobbers
  );

  return x;
}

// 1/x
// 1.0f / sqrt(x^2)
// Squaring makes the fsrra argument non-negative; the branch restores the
// sign afterwards. Note x == 0.0f (and -0.0f, since 0 < 0.0f is false) comes
// back as +INF, and the fsrra error bound applies to the result.
static inline __attribute__((always_inline)) float MATH_Fast_Invert(float x)
{
  int neg = 0;

  if(x < 0.0f)
  {
    neg = 1;
  }

  x = MATH_fsrra(x*x); // 1.0f / sqrt(x^2)

  if(neg)
  {
    return -x;
  }
  else
  {
    return x;
  }
}

// It's faster to do this than to use 'fdiv'.
// Only fdiv can do doubles, however.
// Of course, not having to divide at all is generally the best way to go. :P
// Computes numerator * (1/denominator); accuracy and the divide-by-zero
// behavior are those of MATH_Fast_Invert() above.
static inline __attribute__((always_inline)) float MATH_Fast_Divide(float numerator, float denominator)
{
  denominator = MATH_Fast_Invert(denominator);
  return numerator * denominator;
}

// fast sqrt(x)
// Crazy thing: invert(fsrra(x)) is actually about 3x faster than fsqrt.
// Uses sqrt(x) == 1 / (1/sqrt(x)); inherits the fsrra error bound, unlike the
// full-precision MATH_fsqrt() earlier in this file.
static inline __attribute__((always_inline)) float MATH_Fast_Sqrt(float x)
{
  return MATH_Fast_Invert(MATH_fsrra(x));
}

// Standard, accurate, and slow float divide. Use this if MATH_Fast_Divide() gives you issues.
// This DOES work on negatives.
+static inline __attribute__((always_inline)) float MATH_Slow_Divide(float numerator, float denominator) +{ + asm volatile ("fdiv %[div_denom], %[div_numer]\n" + : [div_numer] "+f" (numerator) // outputs, "+" means r/w + : [div_denom] "f" (denominator) // inputs + : // clobbers + ); + + return numerator; +} + +//============================================================================== +// Fun with fsca, which does simultaneous sine and cosine in 3 cycles +//============================================================================== +// +// NOTE: GCC 4.7 has a bug that prevents it from working with fsca and complex +// numbers in m4-single-only mode, so GCC 4 users will get a RETURN_FSCA_STRUCT +// instead of a _Complex float. This may be much slower in some instances. +// +// VERY IMPORTANT USAGE INFORMATION (sine and cosine functions): +// +// Due to the nature in which the fsca instruction behaves, you MUST do the +// following in your code to get sine and cosine from these functions: +// +// _Complex float sine_cosine = [Call the fsca function here] +// float sine_value = __real__ sine_cosine; +// float cosine_value = __imag__ sine_cosine; +// Your output is now in sine_value and cosine_value. +// +// This is necessary because fsca outputs both sine and cosine simultaneously +// and uses a double register to do so. The fsca functions do not actually +// return a double--they return two floats--and using a complex float here is +// just a bit of hacking the C language to make GCC do something that's legal in +// assembly according to the SH4 calling convention (i.e. multiple return values +// stored in floating point registers FR0-FR3). This is better than using a +// struct of floats for optimization purposes--this will operate at peak +// performance even at -O0, whereas a struct will not be fast at low +// optimization levels due to memory accesses. 
+// +// Technically you may be able to use the complex return values as a complex +// number if you wanted to, but that's probably not what you're after and they'd +// be flipped anyways (in mathematical convention, sine is the imaginary part). +// + +// Notes: +// - From http://www.shared-ptr.com/sh_insns.html: +// The input angle is specified as a signed fraction in twos complement. +// The result of sin and cos is a single-precision floating-point number. +// 0x7FFFFFFF to 0x00000001: 360×2^15−360/2^16 to 360/2^16 degrees +// 0x00000000: 0 degree +// 0xFFFFFFFF to 0x80000000: −360/2^16 to −360×2^15 degrees +// - fsca format is 2^16 is 360 degrees, so a value of 1 is actually +// 1/182.044444444 of a degree or 1/10430.3783505 of a radian +// - fsca does a %360 automatically for values over 360 degrees +// +// Also: +// In order to make the best use of fsca units, a program must expect them from +// the outset and not "make them" by dividing radians or degrees to get them, +// otherwise it's just giving the 'fsca' instruction radians or degrees! +// + +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. 
+// +/* + // For integer input in native fsca units (fastest) + _Complex float MATH_fsca_Int(unsigned int input_int) + + // For integer input in degrees + _Complex float MATH_fsca_Int_Deg(unsigned int input_int) + + // For integer input in radians + _Complex float MATH_fsca_Int_Rad(unsigned int input_int) + + // For float input in native fsca units + _Complex float MATH_fsca_Float(float input_float) + + // For float input in degrees + _Complex float MATH_fsca_Float_Deg(float input_float) + + // For float input in radians + _Complex float MATH_fsca_Float_Rad(float input_float) +*/ + +//------------------------------------------------------------------------------ +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for old versions of GCC. +// See later for functions for newer versions of GCC. +// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int(unsigned int input_int) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, 
DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// +// Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float(float input_float) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. 
takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +//------------------------------------------------------------------------------ +#else +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for newer versions of GCC. They +// work fine under GCC 9.2.0. 
+// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int(unsigned int input_int) +{ + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// +// Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float(float input_float) +{ + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. 
takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +//------------------------------------------------------------------------------ +#endif +//------------------------------------------------------------------------------ + +//============================================================================== +// Hardware vector and matrix operations +//============================================================================== +// +// These functions each have very specific usage instructions. Please be sure to +// read them before use or else they won't seem to work right! +// +// The following functions are available. +// Please see their definitions for important usage info, otherwise they may not +// work for you. 
+// +/* + + //------------------------------------------------------------------------------ + // Vector and matrix math operations + //------------------------------------------------------------------------------ + + // Inner/dot product (4x1 vec . 4x1 vec = scalar) + float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Sum of Squares (w^2 + x^2 + y^2 + z^2) + float MATH_Sum_of_Squares(float w, float x, float y, float z) + + // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c) + RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b) + + // Cross product (vec X vec = orthogonal vec) + RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3) + + // Outer product (vec (X) vec = 4x4 matrix) + void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec) + RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4) + + // 4x4 Matrix transpose (XMTRX^T) + void MATH_Matrix_Transpose(void) + + // 4x4 Matrix product (XMTRX and one from memory) + void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix) + + // 4x4 Matrix product (two from memory) + void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2) + + //------------------------------------------------------------------------------ + // Matrix load and store operations + //------------------------------------------------------------------------------ + + // Load 4x4 XMTRX from memory + void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix) + + // Store 4x4 XMTRX to memory + ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination) +*/ + +//------------------------------------------------------------------------------ +// Vector and matrix math operations 
+//------------------------------------------------------------------------------
+
+// Inner/dot product: vec . vec = scalar
+//                         _    _
+//                        |  y1  |
+// [ x1 x2 x3 x4 ]   .    |  y2  |   =   scalar
+//                        |  y3  |
+//                        |_ y4 _|
+//
+// SH4 calling convention states we get 8 float arguments. Perfect!
+// Computes x1*y1 + x2*y2 + x3*y3 + x4*y4 with a single 'fipr FV4, FV8' instruction.
+static inline __attribute__((always_inline)) float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+{
+  // FR4-FR11 are the regs that are passed in, aka vectors FV4 and FV8.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx4 = x4;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float ty4 = y4;
+
+  // vector FV4
+  register float __x1 __asm__("fr4") = tx1;
+  register float __x2 __asm__("fr5") = tx2;
+  register float __x3 __asm__("fr6") = tx3;
+  register float __x4 __asm__("fr7") = tx4;
+
+  // vector FV8
+  register float __y1 __asm__("fr8") = ty1;
+  register float __y2 __asm__("fr9") = ty2;
+  register float __y3 __asm__("fr10") = ty3;
+  register float __y4 __asm__("fr11") = ty4;
+
+  // take care of all the floats in one fell swoop
+  asm volatile ("fipr FV4, FV8\n"
+    : "+f" (__y4) // output (gets written to FR11)
+    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3) // inputs
+    : // clobbers
+  );
+
+  return __y4;
+}
+
+// Sum of Squares
+//                         _   _
+//                        |  w  |
+// [ w x y z ]   .        |  x  |   =   w^2 + x^2 + y^2 + z^2 = scalar
+//                        |  y  |
+//                        |_ z _|
+//
+// (This is just MATH_fipr with both 'fipr' operands being the same vector, FV4.)
+static inline __attribute__((always_inline)) float MATH_Sum_of_Squares(float w, float x, float y, float z)
+{
+  // FR4-FR7 are the regs that are passed in, aka vector FV4.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tw = w;
+  float tx = x;
+  float ty = y;
+  float tz = z;
+
+  // vector FV4
+  register float __w __asm__("fr4") = tw;
+  register float __x __asm__("fr5") = tx;
+  register float __y __asm__("fr6") = ty;
+  register float __z __asm__("fr7") = tz;
+
+  // take care of all the floats in one fell swoop
+  asm volatile ("fipr FV4, FV4\n"
+    : "+f" (__z) // output (gets written to FR7)
+    : "f" (__w), "f" (__x), "f" (__y) // inputs
+    : // clobbers
+  );
+
+  return __z;
+}
+
+// Cross product: vec X vec = orthogonal vec
+//      _    _       _    _       _    _
+//     |  x1  |     |  y1  |     |  z1  |
+//     |  x2  |  X  |  y2  |  =  |  z2  |
+//     |_ x3 _|     |_ y3 _|     |_ z3 _|
+//
+// With bonus multiply:
+//
+//      a   *   b   =   c
+//
+// IMPORTANT USAGE INFORMATION (cross product):
+//
+// Return vector struct maps as below to the above diagram:
+//
+// typedef struct {
+//   float z1;
+//   float z2;
+//   float z3;
+//   float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0)
+// } RETURN_VECTOR_STRUCT;
+//
+// For people familiar with the unit vector notation, z1 == 'i', z2 == 'j',
+// and z3 == 'k'.
+//
+// The cross product matrix will also be stored in XMTRX after this, so calling
+// MATH_Matrix_Transform() on a vector after using this function will do a cross
+// product with the same x1-x3 values and a multiply with the same 'a' value
+// as used in this function. In this situation, 'a' will be multiplied with
+// the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult'
+// version of the cross product function.
+//
+// For reference, XMTRX will look like this:
+//
+//  [  0  -x3  x2   0 ]
+//  [  x3   0 -x1   0 ]
+//  [ -x2  x1   0   0 ]
+//  [  0    0   0   a ]  (<-- a = 0 if not using 'with mult')
+//
+// Similarly to how the sine and cosine functions use fsca and return 2 floats,
+// the cross product functions actually return 4 floats. The first 3 are the
+// cross product output, and the 4th is a*b. The SH4 only multiplies 4x4
+// matrices with 4x1 vectors, which is why the output is like that--but it means
+// we also get a bonus float multiplication while we do our cross product!
+//
+
+// Please do not call this function directly (notice the weird syntax); call
+// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
+// The parameter order matches the register layout the asm needs (x3->fr4, a->fr5,
+// y3->fr6, b->fr7, x2->fr8, x1->fr9, y1->fr10, y2->fr11), which is why it is "weird".
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2)
+{
+  // FR4-FR11 are the regs that are passed in, in that order.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float ta = a;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float tb = b;
+
+  register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9)
+  register float __x2 __asm__("fr8") = tx2; // in place for matrix (need to move to fr2 then negate fr2)
+  register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4)
+  register float __a __asm__("fr5") = ta;
+
+  register float __y1 __asm__("fr10") = ty1;
+  register float __y2 __asm__("fr11") = ty2;
+  register float __y3 __asm__("fr6") = ty3;
+  register float __b __asm__("fr7") = tb;
+
+  register float __z1 __asm__("fr0") = 0.0f; // z1
+  register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed)
+  register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2)
+  register float __c __asm__("fr3") = 0.0f; // c
+
+  // This actually does a matrix transform to do the cross product.
+  // It's this:
+  //                          _    _      _              _
+  //  [  0  -x3  x2   0 ]    |  y1  |    |  -x3y2 + x2y3  |
+  //  [  x3   0 -x1   0 ]    |  y2  |  = |   x3y1 - x1y3  |
+  //  [ -x2  x1   0   0 ]    |  y3  |    |  -x2y1 + x1y2  |
+  //  [  0    0   0   a ]    |_ b  _|    |_      c       _|
+  //
+
+  asm volatile (
+    // set up back bank's FV0
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+
+    // Save FR12-FR15, which are supposed to be preserved across function calls.
+    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
+    // (The final 'frchg' below swaps them back into the front bank.)
+    "fmov DR12, XD12\n\t"
+    "fmov DR14, XD14\n\t"
+
+    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
+    "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3)
+
+    // pair move zeros for some speed in setting up front bank for matrix
+    "fmov DR0, DR10\n\t" // clear FR10, FR11
+    "fmov DR0, DR12\n\t" // clear FR12, FR13
+    "fschg\n\t" // switch back to single moves
+    // prepare front bank for XMTRX
+    "fmov FR5, FR15\n\t" // fmov 'a' into position
+    "fmov FR0, FR14\n\t" // clear out FR14
+    "fmov FR0, FR7\n\t" // clear out FR7
+    "fmov FR0, FR5\n\t" // clear out FR5
+
+    "fneg FR2\n\t" // set up 'x2'
+    "fmov FR9, FR6\n\t" // set up 'x1'
+    "fneg FR9\n\t"
+    "fmov FR4, FR1\n\t" // set up 'x3'
+    "fneg FR4\n\t"
+    // flip banks and matrix multiply
+    "frchg\n\t"
+    "ftrv XMTRX, FV0\n"
+    : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
+    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs
+    : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
+  return output;
+}
+
+// Please do not call this function directly (notice the weird syntax); call
+// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
+// Cross product without the bonus multiply: builds the same skew-symmetric matrix
+// in XMTRX (with a = 0) and transforms (y1, y2, y3, 0) by it.
+// Note: x1 and x1_2 both receive the same x1 value -- the asm needs it in both
+// fr6 and fr9 (one copy stays positive, the other gets negated).
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2)
+{
+  // FR4-FR11 are the regs that are passed in, in that order.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx1_2 = x1_2;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float tzero = zero;
+
+  register float __x1 __asm__("fr6") = tx1; // in place
+  register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2)
+  register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4)
+
+  register float __zero __asm__("fr5") = tzero; // in place
+  register float __x1_2 __asm__("fr9") = tx1_2; // need to negate
+
+  register float __y1 __asm__("fr10") = ty1;
+  register float __y2 __asm__("fr11") = ty2;
+  // no __y3 needed in this function
+
+  register float __z1 __asm__("fr0") = tzero; // z1
+  register float __z2 __asm__("fr1") = tzero; // z2
+  register float __z3 __asm__("fr2") = ty3; // z3
+  register float __c __asm__("fr3") = tzero; // c
+
+  // This actually does a matrix transform to do the cross product.
+  // It's this:
+  //                          _    _      _              _
+  //  [  0  -x3  x2   0 ]    |  y1  |    |  -x3y2 + x2y3  |
+  //  [  x3   0 -x1   0 ]    |  y2  |  = |   x3y1 - x1y3  |
+  //  [ -x2  x1   0   0 ]    |  y3  |    |  -x2y1 + x1y2  |
+  //  [  0    0   0   0 ]    |_ 0  _|    |_      0       _|
+  //
+
+  asm volatile (
+    // zero out FR7. For some reason, if this is done in C after __z3 is set:
+    // register float __y3 __asm__("fr7") = tzero;
+    // then GCC will emit a spurious stack push (pushing FR12). So just zero it here.
+    "fmov FR5, FR7\n\t"
+    // set up back bank's FV0
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+
+    // Save FR12-FR15, which are supposed to be preserved across function calls.
+    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
+    // (The final 'frchg' below swaps them back into the front bank.)
+    "fmov DR12, XD12\n\t"
+    "fmov DR14, XD14\n\t"
+
+    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
+    "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3)
+
+    // pair move zeros for some speed in setting up front bank for matrix
+    "fmov DR0, DR10\n\t" // clear FR10, FR11
+    "fmov DR0, DR12\n\t" // clear FR12, FR13
+    "fmov DR0, DR14\n\t" // clear FR14, FR15
+    "fschg\n\t" // switch back to single moves
+    // prepare front bank for XMTRX
+    "fneg FR9\n\t" // set up 'x1'
+    "fmov FR8, FR2\n\t" // set up 'x2'
+    "fneg FR2\n\t"
+    "fmov FR4, FR1\n\t" // set up 'x3'
+    "fneg FR4\n\t"
+    // flip banks and matrix multiply
+    "frchg\n\t"
+    "ftrv XMTRX, FV0\n"
+    : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
+    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs
+    : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
+  return output;
+}
+
+//------------------------------------------------------------------------------
+// Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make
+// it easier to organize parameters
+//------------------------------------------------------------------------------
+
+// Cross product with a bonus float multiply (c = a * b)
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
+{
+  return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2);
+}
+
+// Plain cross product; does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0)
+// This is a tiny bit faster than 'with_mult' (about 2 cycles faster)
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
+{
+  // x1 is passed twice on purpose: the helper needs a copy in both fr6 and fr9.
+  return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2);
+}
+
+// Outer product: vec (X) vec = matrix
+//      _    _
+//     |  x1  |
+//     |  x2  |   (X)   [ y1 y2 y3 y4 ]   =   4x4 matrix
+//     |  x3  |
+//     |_ x4 _|
+//
+// This returns the floats in the back bank (XF0-15), which are inaccessible
+// outside of using frchg or paired-move fmov. It's meant to set up a matrix for
+// use with other matrix functions. GCC also does not touch the XFn bank.
+// This will also wipe out anything stored in the float registers, as it uses the
+// whole FPU register file (all 32 of the float registers).
+static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+{
+  // FR4-FR11 are the regs that are passed in, in that order.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx4 = x4;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float ty4 = y4;
+
+  // vector FV4
+  register float __x1 __asm__("fr4") = tx1;
+  register float __x2 __asm__("fr5") = tx2;
+  register float __x3 __asm__("fr6") = tx3;
+  register float __x4 __asm__("fr7") = tx4;
+
+  // vector FV8
+  register float __y1 __asm__("fr8") = ty1;
+  register float __y2 __asm__("fr9") = ty2;
+  register float __y3 __asm__("fr10") = ty3; // in place already
+  register float __y4 __asm__("fr11") = ty4;
+
+  // This actually does a 4x4 matrix multiply to do the outer product.
+  // It's this (element [i][j] of the result is xi*yj):
+  //
+  //  [ x1 x1 x1 x1 ]   [ y1  0  0  0 ]     [ x1y1 x1y2 x1y3 x1y4 ]
+  //  [ x2 x2 x2 x2 ]   [  0 y2  0  0 ]  =  [ x2y1 x2y2 x2y3 x2y4 ]
+  //  [ x3 x3 x3 x3 ]   [  0  0 y3  0 ]     [ x3y1 x3y2 x3y3 x3y4 ]
+  //  [ x4 x4 x4 x4 ]   [  0  0  0 y4 ]     [ x4y1 x4y2 x4y3 x4y4 ]
+  //
+
+  asm volatile (
+    // zero out unoccupied front floats to make a double 0 in DR2
+    "fldi0 FR2\n\t"
+    "fmov FR2, FR3\n\t"
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+    // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13)
+    "fmov DR4, XD0\n\t"
+    "fmov DR4, XD4\n\t"
+    "fmov DR4, XD8\n\t"
+    "fmov DR4, XD12\n\t"
+    // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15)
+    "fmov DR6, XD2\n\t"
+    "fmov DR6, XD6\n\t"
+    "fmov DR6, XD10\n\t"
+    "fmov DR6, XD14\n\t"
+    // set up front floats (y1-y4)
+    "fmov DR8, DR0\n\t"
+    "fmov DR8, DR4\n\t"
+    "fmov DR10, DR14\n\t"
+    // finish zeroing out front floats
+    "fmov DR2, DR6\n\t"
+    "fmov DR2, DR8\n\t"
+    "fmov DR2, DR12\n\t"
+    "fschg\n\t" // switch back to single-move mode
+    // zero out remaining values and matrix multiply 4x4
+    "fmov FR2, FR1\n\t"
+    "ftrv XMTRX, FV0\n\t"
+
+    "fmov FR6, FR4\n\t"
+    "ftrv XMTRX, FV4\n\t"
+
+    "fmov FR8, FR11\n\t"
+    "ftrv XMTRX, FV8\n\t"
+
+    "fmov FR12, FR14\n\t"
+    "ftrv XMTRX, FV12\n\t"
+    // Save output in XF regs
+    "frchg\n"
+    : // no outputs
+    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs
+    : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it
+  );
+  // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank.
+}
+
+// Matrix transform: matrix * vector = vector
+//                     _    _       _    _
+//  [ ----------- ]   |  x1  |     |  z1  |
+//  [ ---XMTRX--- ]   |  x2  |  =  |  z2  |
+//  [ ----------- ]   |  x3  |     |  z3  |
+//  [ ----------- ]   |_ x4 _|     |_ z4 _|
+//
+// IMPORTANT USAGE INFORMATION (matrix transform):
+//
+// Return vector struct maps 1:1 to the above diagram:
+//
+// typedef struct {
+//   float z1;
+//   float z2;
+//   float z3;
+//   float z4;
+// } RETURN_VECTOR_STRUCT;
+//
+// Similarly to how the sine and cosine functions use fsca and return 2 floats,
+// the matrix transform function actually returns 4 floats. The SH4 only multiplies
+// 4x4 matrices with 4x1 vectors, which is why the output is like that.
+//
+// Multiply a matrix stored in the back bank (XMTRX) with an input vector
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
+{
+  // The floats comprising FV4 are the regs that are passed in.
+  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx4 = x4;
+
+  // output vector FV0
+  register float __z1 __asm__("fr0") = tx1;
+  register float __z2 __asm__("fr1") = tx2;
+  register float __z3 __asm__("fr2") = tx3;
+  register float __z4 __asm__("fr3") = tx4;
+
+  asm volatile ("ftrv XMTRX, FV0\n\t"
+    // have to do this to obey SH4 calling convention--output returned in FV0
+    : "+w" (__z1), "+f" (__z2), "+f" (__z3), "+f" (__z4) // outputs, "+" means r/w
+    : // no inputs
+    : // no clobbers
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
+  return output;
+}
+
+// Matrix Transpose
+//
+// This does a matrix transpose on the matrix in XMTRX, which swaps rows with
+// columns as follows (math notation is [XMTRX]^T):
+//
+//  [ a b c d ] T    [ a e i m ]
+//  [ e f g h ]   =  [ b f j n ]
+//  [ i j k l ]      [ c g k o ]
+//  [ m n o p ]      [ d h l p ]
+//
+// PLEASE NOTE: It is faster to avoid the need for a transpose altogether by
+// structuring matrices and vectors accordingly.
+// Transpose XMTRX in place. FPUL is used as the scratch register for each of the
+// six 2-register swaps; the diagonal (FR0, FR5, FR10, FR15) stays put.
+static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void)
+{
+  asm volatile (
+    "frchg\n\t" // fmov for singles only works on front bank
+    // FR0, FR5, FR10, and FR15 are already in place
+    // swap FR1 and FR4
+    "flds FR1, FPUL\n\t"
+    "fmov FR4, FR1\n\t"
+    "fsts FPUL, FR4\n\t"
+    // swap FR2 and FR8
+    "flds FR2, FPUL\n\t"
+    "fmov FR8, FR2\n\t"
+    "fsts FPUL, FR8\n\t"
+    // swap FR3 and FR12
+    "flds FR3, FPUL\n\t"
+    "fmov FR12, FR3\n\t"
+    "fsts FPUL, FR12\n\t"
+    // swap FR6 and FR9
+    "flds FR6, FPUL\n\t"
+    "fmov FR9, FR6\n\t"
+    "fsts FPUL, FR9\n\t"
+    // swap FR7 and FR13
+    "flds FR7, FPUL\n\t"
+    "fmov FR13, FR7\n\t"
+    "fsts FPUL, FR13\n\t"
+    // swap FR11 and FR14
+    "flds FR11, FPUL\n\t"
+    "fmov FR14, FR11\n\t"
+    "fsts FPUL, FR14\n\t"
+    // restore XMTRX to back bank
+    "frchg\n"
+    : // no outputs
+    : // no inputs
+    : "fpul" // clobbers
+  );
+}
+
+// Matrix product: matrix * matrix = matrix
+//
+// These use the whole dang floating point unit.
+//
+//  [ ----------- ]   [ ----------- ]     [ ----------- ]
+//  [ ---Back---- ]   [ ---Front--- ]  =  [ ---XMTRX--- ]
+//  [ ---Matrix-- ]   [ ---Matrix-- ]     [ ----------- ]
+//  [ --(XMTRX)-- ]   [ ----------- ]     [ ----------- ]
+//
+// Multiply a matrix stored in the back bank with a matrix loaded from memory
+// Output is stored in the back bank (XMTRX); the front bank is fully clobbered.
+// 'front_matrix' must point to a readable 64-byte ALL_FLOATS_STRUCT.
+static inline __attribute__((always_inline)) void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix)
+{
+  /*
+  // This prefetching should help a bit if placed suitably far enough in advance (not here)
+  // Possibly right before this function call. Change the "front_matrix" variable appropriately.
+  // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address.
+  __builtin_prefetch(front_matrix);
+  */
+
+  unsigned int prefetch_scratch;
+
+  asm volatile (
+    "mov %[fmtrx], %[pref_scratch]\n\t" // parallel-load address for prefetching (MT)
+    "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+    "fschg\n\t" // switch fmov to paired moves (FE)
+    "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+    // interleave loads and matrix multiply 4x4
+    "fmov.d @%[fmtrx]+, DR0\n\t" // (LS)
+    "fmov.d @%[fmtrx]+, DR2\n\t"
+    "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
+    "ftrv XMTRX, FV0\n\t" // (FE)
+
+    "fmov.d @%[fmtrx]+, DR6\n\t"
+    "fmov.d @%[fmtrx]+, DR8\n\t" // prefetch should work for here
+    "ftrv XMTRX, FV4\n\t"
+
+    "fmov.d @%[fmtrx]+, DR10\n\t"
+    "fmov.d @%[fmtrx]+, DR12\n\t"
+    "ftrv XMTRX, FV8\n\t"
+
+    "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
+    "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
+    "ftrv XMTRX, FV12\n\t" // (FE)
+    // Save output in XF regs
+    "frchg\n"
+    : [fmtrx] "+r" ((unsigned int)front_matrix), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w
+    : // no inputs
+    : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+// Load two 4x4 matrices and multiply them, storing the output into the back bank (XMTRX)
+//
+// MATH_Load_Matrix_Product() is slightly faster than doing this:
+// MATH_Load_XMTRX(matrix1)
+// MATH_Matrix_Product(matrix2)
+// as it saves having to do 2 extraneous 'fschg' instructions.
+//
+// Computes XMTRX = [matrix1] * [matrix2]. Both pointers must reference readable
+// 64-byte ALL_FLOATS_STRUCTs; the front bank (FR0-FR15) is fully clobbered.
+static inline __attribute__((always_inline)) void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2)
+{
+  /*
+  // This prefetching should help a bit if placed suitably far enough in advance (not here)
+  // Possibly right before this function call. Change the "matrix1" variable appropriately.
+  // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address.
+  __builtin_prefetch(matrix1);
+  */
+
+  unsigned int prefetch_scratch;
+
+  asm volatile (
+    "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
+    "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+    "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+    // back matrix
+    "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
+    "fmov.d @%[bmtrx]+, XD2\n\t"
+    "fmov.d @%[bmtrx]+, XD4\n\t"
+    "fmov.d @%[bmtrx]+, XD6\n\t"
+    "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
+    "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
+    "fmov.d @%[bmtrx]+, XD10\n\t"
+    "fmov.d @%[bmtrx]+, XD12\n\t"
+    "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
+    "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+    "fmov.d @%[bmtrx], XD14\n\t"
+    "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+    // front matrix
+    // interleave loads and matrix multiply 4x4
+    "fmov.d @%[fmtrx]+, DR0\n\t"
+    "fmov.d @%[fmtrx]+, DR2\n\t"
+    "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
+    "ftrv XMTRX, FV0\n\t" // (FE)
+
+    "fmov.d @%[fmtrx]+, DR6\n\t"
+    "fmov.d @%[fmtrx]+, DR8\n\t"
+    "ftrv XMTRX, FV4\n\t"
+
+    "fmov.d @%[fmtrx]+, DR10\n\t"
+    "fmov.d @%[fmtrx]+, DR12\n\t"
+    "ftrv XMTRX, FV8\n\t"
+
+    "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
+    "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
+    "ftrv XMTRX, FV12\n\t" // (FE)
+    // Save output in XF regs
+    "frchg\n"
+    : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+    : // no inputs
+    : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+//------------------------------------------------------------------------------
+// Matrix load and store operations
+//------------------------------------------------------------------------------
+
+// Load a matrix from memory into the back bank (XMTRX)
+// Loads xf0 from offset 0 upward, i.e. XD0 <- bytes 0-7, ..., XD14 <- bytes 56-63.
+static inline __attribute__((always_inline)) void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix)
+{
+  /*
+  // This prefetching should help a bit if placed suitably far enough in advance (not here)
+  // Possibly right before this function call. Change the "back_matrix" variable appropriately.
+  // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address.
+  __builtin_prefetch(back_matrix);
+  */
+
+  unsigned int prefetch_scratch;
+
+  asm volatile (
+    "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
+    "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+    "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+    "fmov.d @%[bmtrx]+, XD0\n\t"
+    "fmov.d @%[bmtrx]+, XD2\n\t"
+    "fmov.d @%[bmtrx]+, XD4\n\t"
+    "fmov.d @%[bmtrx]+, XD6\n\t"
+    "fmov.d @%[bmtrx]+, XD8\n\t"
+    "fmov.d @%[bmtrx]+, XD10\n\t"
+    "fmov.d @%[bmtrx]+, XD12\n\t"
+    "fmov.d @%[bmtrx], XD14\n\t"
+    "fschg\n" // switch back to single moves
+    : [bmtrx] "+r" ((unsigned int)back_matrix), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w
+    : // no inputs
+    : // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+// Store XMTRX to memory
+// Writes the full 64-byte matrix to 'destination' in the same layout that
+// MATH_Load_XMTRX() reads (XD0 at bytes 0-7, ..., XD14 at bytes 56-63), so
+// MATH_Load_XMTRX(MATH_Store_XMTRX(m)) round-trips. Returns 'destination'.
+static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination)
+{
+  /*
+  // This prefetching should help a bit if placed suitably far enough in advance (not here)
+  // Possibly right before this function call. Change the "destination" variable appropriately.
+  // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address.
+  __builtin_prefetch( (ALL_FLOATS_STRUCT*)((unsigned char*)destination + 32) ); // Store works backwards, so note the '+32' here
+  */
+
+  // BUGFIX: this used to be 'sizeof(ALL_FLOATS_STRUCT) + 8', which made the eight
+  // pre-decrement stores below cover destination+8 .. destination+71 -- an 8-byte
+  // out-of-bounds write past the 64-byte struct, with nothing stored at offset 0.
+  // Starting one-past-the-end (destination + 64) makes them cover bytes 0-63 exactly.
+  char * output = ((char*)destination) + sizeof(ALL_FLOATS_STRUCT); // ALL_FLOATS_STRUCT should be 64 bytes
+
+  asm volatile (
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+    "pref @%[dest_base]\n\t" // Prefetch the first half of the 64-byte area; since the store walks backwards, it is written last (LS)
+    // BUGFIX: store order was XD0-first, which (combined with the pre-decrement
+    // addressing) laid the register pairs out back-to-front in memory. Storing
+    // XD14 first puts XD14 at bytes 56-63 and XD0 at bytes 0-7, matching the
+    // layout MATH_Load_XMTRX() expects.
+    "fmov.d XD14, @-%[out_mtrx]\n\t" // These do *(--output) = XDn (LS)
+    "fmov.d XD12, @-%[out_mtrx]\n\t"
+    "fmov.d XD10, @-%[out_mtrx]\n\t"
+    "fmov.d XD8, @-%[out_mtrx]\n\t"
+    "fmov.d XD6, @-%[out_mtrx]\n\t"
+    "fmov.d XD4, @-%[out_mtrx]\n\t"
+    "fmov.d XD2, @-%[out_mtrx]\n\t"
+    "fmov.d XD0, @-%[out_mtrx]\n\t"
+    "fschg\n" // switch back to single moves
+    : [out_mtrx] "+&r" ((unsigned int)output) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+    : [dest_base] "r" ((unsigned int)destination) // inputs
+    : "memory" // clobbers
+  );
+
+  return destination;
+}
+
+
+// In general, writing the entire required math routine in one asm function is
+// the best way to go for performance reasons anyways, and in that situation one
+// can just throw calling convention to the wind until returning back to C.
+
+//==============================================================================
+// Miscellaneous Functions
+//==============================================================================
+//
+// The following functions are provided as examples of ways in which these math
+// functions can be used.
+//
+// Reminder: 1 fsca unit = 1/182.044444444 of a degree or 1/10430.3783505 of a radian
+// In order to make the best use of fsca units, a program must expect them from
+// the outset and not "make them" by dividing radians or degrees to get them,
+// otherwise it's just giving the 'fsca' instruction radians or degrees!
+//
+/*
+
+  //------------------------------------------------------------------------------
+  // Commonly useful functions
+  //------------------------------------------------------------------------------
+
+  // Returns 1 if point 't' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
+  int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
+
+  //------------------------------------------------------------------------------
+  // Interpolation
+  //------------------------------------------------------------------------------
+
+  // Linear interpolation
+  float MATH_Lerp(float a, float b, float t)
+
+  // Spherical interpolation ('theta' in fsca units)
+  float MATH_Slerp(float a, float b, float t, float theta)
+
+  //------------------------------------------------------------------------------
+  // Fast Sinc functions (unnormalized, sin(x)/x version)
+  //------------------------------------------------------------------------------
+  // Just pass in MATH_pi * x for normalized versions :)
+
+  // Sinc function (fsca units)
+  float MATH_Fast_Sincf(float x)
+
+  // Sinc function (degrees)
+  float MATH_Fast_Sincf_Deg(float x)
+
+  // Sinc function (rads)
+  float MATH_Fast_Sincf_Rad(float x)
+
+*/
+
+//------------------------------------------------------------------------------
+// Commonly useful functions
+//------------------------------------------------------------------------------
+
+// Returns 1 if point 'pt' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
+// Determines point membership using a barycentric coordinate transformation
+// Adapted from: https://stackoverflow.com/questions/2049582/how-to-determine-if-a-point-is-in-a-2d-triangle
+// Specifically the answer by user 'adreasdr' in addition to the comment by user 'urraka' on the answer from user 'Andreas Brinck'
+//
+// The notation here assumes v0x is the x-component of v0, v0y is the y-component of v0, etc.
+//
+// Note: the comparisons below are strict, so points exactly on an edge or vertex
+// of the triangle return 0 (only strictly-interior points return 1).
+static inline __attribute__((always_inline)) int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
+{
+  // Each of these is a signed-area-style expression evaluated with one 'fipr' dot product
+  float sdot = MATH_fipr(v0y, -v0x, v2y - v0y, v0x - v2x, v2x, v2y, ptx, pty);
+  float tdot = MATH_fipr(v0x, -v0y, v0y - v1y, v1x - v0x, v1y, v1x, ptx, pty);
+
+  float areadot = MATH_fipr(-v1y, v0y, v0x, v1x, v2x, -v1x + v2x, v1y - v2y, v2y);
+
+  // 'areadot' could be negative depending on the winding of the triangle
+  if(areadot < 0.0f)
+  {
+    sdot *= -1.0f;
+    tdot *= -1.0f;
+    areadot *= -1.0f;
+  }
+
+  if( (sdot > 0.0f) && (tdot > 0.0f) && (areadot > (sdot + tdot)) )
+  {
+    return 1;
+  }
+
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Interpolation
+//------------------------------------------------------------------------------
+
+// Linear interpolation: returns a + t * (b - a) via a single multiply-accumulate
+static inline __attribute__((always_inline)) float MATH_Lerp(float a, float b, float t)
+{
+  return MATH_fmac(t, (b-a), a);
+}
+
+// Spherical interpolation ('theta' in fsca units)
+static inline __attribute__((always_inline)) float MATH_Slerp(float a, float b, float t, float theta)
+{
+  // a is an element of v0, b is an element of v1
+  // v = ( v0 * sin(theta - t * theta) + v1 * sin(t * theta) ) / sin(theta)
+  // by using sine/cosine identities and properties, this can be optimized to:
+  // v = v0 * cos(-t * theta) + ( v0 * ( cos(theta) * sin(-t * theta) ) - sin(-t * theta) * v1 ) / sin(theta)
+  // which only requires two calls to fsca.
+  // Specifically, sin(a + b) = sin(a)cos(b) + cos(a)sin(b) & sin(-a) = -sin(a)
+
+  // MATH_fsca_* functions return reverse-ordered complex numbers for speed reasons (i.e. normally sine is the imaginary part)
+  // This could be made even faster by using MATH_fsca_Int() with 'theta' and 't' as unsigned ints
+
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+
+  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(theta);
+  float sine_value_theta = sine_cosine.sine;
+  float cosine_value_theta = sine_cosine.cosine;
+
+  RETURN_FSCA_STRUCT sine_cosine2 = MATH_fsca_Float(-t * theta);
+  float sine_value_minus_t_theta = sine_cosine2.sine;
+  float cosine_value_minus_t_theta = sine_cosine2.cosine;
+
+#else
+
+  // Per the note above: real part is the sine, imaginary part is the cosine
+  _Complex float sine_cosine = MATH_fsca_Float(theta);
+  float sine_value_theta = __real__ sine_cosine;
+  float cosine_value_theta = __imag__ sine_cosine;
+
+  _Complex float sine_cosine2 = MATH_fsca_Float(-t * theta);
+  float sine_value_minus_t_theta = __real__ sine_cosine2;
+  float cosine_value_minus_t_theta = __imag__ sine_cosine2;
+
+#endif
+
+  float numer = a * cosine_value_theta * sine_value_minus_t_theta - sine_value_minus_t_theta * b;
+  float output_float = a * cosine_value_minus_t_theta + MATH_Fast_Divide(numer, sine_value_theta);
+
+  return output_float;
+}
+
+//------------------------------------------------------------------------------
+// Fast Sinc (unnormalized, sin(x)/x version)
+//------------------------------------------------------------------------------
+//
+// Just pass in MATH_pi * x for normalized versions :)
+//
+
+// Sinc function (fsca units); returns 1.0f at x == 0 (the limit of sin(x)/x)
+static inline __attribute__((always_inline)) float MATH_Fast_Sincf(float x)
+{
+  if(x == 0.0f)
+  {
+    return 1.0f;
+  }
+
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+
+  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(x);
+  float sine_value = sine_cosine.sine;
+
+#else
+
+  _Complex float sine_cosine = MATH_fsca_Float(x);
+  float sine_value = __real__ sine_cosine;
+
+#endif
+
+  return MATH_Fast_Divide(sine_value, x);
+}
+
+// Sinc function (degrees)
+static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Deg(float x)
+{
+  if(x == 0.0f)
+  {
+    return 1.0f;
+  }
+
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+
+  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Deg(x);
+  float sine_value = sine_cosine.sine;
+
+#else
+
+  _Complex float sine_cosine = MATH_fsca_Float_Deg(x);
+  float sine_value = __real__ sine_cosine;
+
+#endif
+
+  return MATH_Fast_Divide(sine_value, x);
+}
+
+// Sinc function (rads)
+static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Rad(float x)
+{
+  if(x == 0.0f)
+  {
+    return 1.0f;
+  }
+
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+
+  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Rad(x);
+  float sine_value = sine_cosine.sine;
+
+#else
+
+  _Complex float sine_cosine = MATH_fsca_Float_Rad(x);
+  float sine_value = __real__ sine_cosine;
+
+#endif
+
+  return MATH_Fast_Divide(sine_value, x);
+}
+
+//==============================================================================
+// Miscellaneous Snippets
+//==============================================================================
+//
+// The following snippets are best implemented manually in user code (they can't
+// be put into their own functions without incurring performance penalties).
+//
+// They also serve as examples of how one might use the functions in this header.
+//
+/*
+  Normalize a vector (x, y, z) and get its pre-normalized magnitude (length)
+*/
+
+//
+// Normalize a vector (x, y, z) and get its pre-normalized magnitude (length)
+//
+// magnitude = sqrt(x^2 + y^2 + z^2)
+// (x, y, z) = 1/magnitude * (x, y, z)
+//
+// x, y, z, and magnitude are assumed already existing floats
+//
+
+/* -- start --
+
+  // Don't need an 'else' with this (if length is 0, x = y = z = 0)
+  magnitude = 0;
+
+  if(__builtin_expect(x || y || z, 1))
+  {
+    temp = MATH_Sum_of_Squares(x, y, z, 0); // temp = x^2 + y^2 + z^2 + 0^2
+    float normalizer = MATH_fsrra(temp); // 1/sqrt(temp)
+    x = normalizer * x;
+    y = normalizer * y;
+    z = normalizer * z;
+    magnitude = MATH_Fast_Invert(normalizer);
+  }
+
+-- end -- */
+
+
+#endif /* __SH4_MATH_H_ */