1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
|
! =========================================================
! ======================== PROCESSOR INFO =================
! =========================================================
! The SH4 can dual issue (i.e. parallel execution) two instructions
! as long as the groups of the two instructions are different:
! * LS - most APU and FPU register load/stores
! * EX - most APU arithmetic instructions
! * MT - TST, CMP, NOP, MOV Rm,Rn
! * FE - most FPU arithmetic instructions
! * CO - other instructions (NOTE: Cannot be executed in parallel)
! The following general aspects of instructions are important to note per the SH4 manual:
! * Issue rate: Interval between the issue of an instruction and that of the next instruction
! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
! (although different cases may either increase or decrease Latency)
!
! =========================================================
! ======================== REGISTER USAGES ================
! =========================================================
! SH4 C ABI:
! - R0 to R3 are return values (can be overwritten)
! - R4 to R7 are input arguments (can be overwritten)
! - R8 to R13 are non-volatile (must be restored at end)
! - R14 is the frame pointer (must be restored at end)
! - R15 is the stack pointer (must be restored at end)
! - FR0 to FR3 are return values (can be overwritten)
! - FR4 to FR11 are input arguments (can be overwritten)
! - FR12 to FR15 are non-volatile (must be restored at end)
!r0 = clip flags
!r1 = GPU command
!r2 = temp
!r3 = prefetch address
!r4 = src pointer ARG
!r5 = dst pointer ARG
!r6 = quads count ARG
!r7 = ?
!fr0 = temp
!fr1 = u
!fr2 = v
!fr3 = c
!fr4 = x
!fr5 = y
!fr6 = z
!fr7 = w
!fr8 = VIEWPORT_HWIDTH
!fr9 = VIEWPORT_HHEIGHT
!fr10 = VIEWPORT_X_PLUS_HWIDTH
!fr11 = VIEWPORT_Y_PLUS_HHEIGHT
!fv4 = XYZW
! =========================================================
! ========================= VERTEX LOADING ================
! =========================================================
! ---------------------------------------------------------
! LoadColouredVertex
!   Reads one untextured source vertex (x, y, z, colour = 16 bytes)
!   through r4 and transforms its position by XMTRX.
!   In:   r4 = source vertex, r3 = prefetch cursor, r5 = dst cursor
!   Out:  fv4 (fr4-fr7) = transformed (x, y, z, 1), fr3 = colour word
!         r4 += 16, r3 += 16, r5 += 64
!   Note: fr1/fr2 (u, v) are left untouched by this macro.
!         The ProcessVertexN macros store 32 bytes downwards through
!         r5, so the +64 here nets out to one vertex forward when the
!         two are paired (presumably the intended use - confirm at
!         the call sites).
! ---------------------------------------------------------
.macro LoadColouredVertex
    ! -- position: x, y, z come from memory, w is forced to 1.0 --
    fmov.s  @r4+, fr4           ! LS, fr4 = src->x
    fmov.s  @r4+, fr5           ! LS, fr5 = src->y
    fmov.s  @r4+, fr6           ! LS, fr6 = src->z
    fldi1   fr7                 ! LS, fr7 = 1.0 (w)
    ! -- bookkeeping, slotted in ahead of the transform --
    add     #16, r3             ! EX, prefetch cursor += source vertex stride
    pref    @r3                 ! LS, prefetch the line at r3 (next vertex)
    add     #64, r5             ! EX, dst cursor += 2 * sizeof(VERTEX)
    ! -- fv4 = XMTRX * fv4 --
    ftrv    xmtrx, fv4          ! FE, transform (x, y, z, w)
    ! -- remaining attribute --
    fmov.s  @r4+, fr3           ! LS, fr3 = src->color
.endm
! ---------------------------------------------------------
! LoadTexturedVertex
!   Reads one textured source vertex (x, y, z, colour, u, v = 24 bytes)
!   through r4 and transforms its position by XMTRX.
!   In:   r4 = source vertex, r3 = prefetch cursor, r5 = dst cursor
!   Out:  fv4 (fr4-fr7) = transformed (x, y, z, 1)
!         fr3 = colour word, fr1 = u, fr2 = v
!         r4 += 24, r3 += 24, r5 += 64
!   Note: The ProcessVertexN macros store 32 bytes downwards through
!         r5, so the +64 here nets out to one vertex forward when the
!         two are paired (presumably the intended use - confirm at
!         the call sites).
! ---------------------------------------------------------
.macro LoadTexturedVertex
    ! -- position: x, y, z come from memory, w is forced to 1.0 --
    fmov.s  @r4+, fr4           ! LS, fr4 = src->x
    fmov.s  @r4+, fr5           ! LS, fr5 = src->y
    fmov.s  @r4+, fr6           ! LS, fr6 = src->z
    fldi1   fr7                 ! LS, fr7 = 1.0 (w)
    ! -- bookkeeping, slotted in ahead of the transform --
    add     #24, r3             ! EX, prefetch cursor += source vertex stride
    pref    @r3                 ! LS, prefetch the line at r3 (next vertex)
    add     #64, r5             ! EX, dst cursor += 2 * sizeof(VERTEX)
    ! -- fv4 = XMTRX * fv4 --
    ftrv    xmtrx, fv4          ! FE, transform (x, y, z, w)
    ! -- remaining attributes --
    fmov.s  @r4+, fr3           ! LS, fr3 = src->color
    fmov.s  @r4+, fr1           ! LS, fr1 = src->u
    fmov.s  @r4+, fr2           ! LS, fr2 = src->v
.endm
! =========================================================
! ========================= VERTEX OUTPUT =================
! =========================================================
! To take advantage of SH4 dual instruction processing,
! clipflag calculation and vertex output are interleaved
! ProcessVertex1: write out vertex 1 and start the clip flags.
! The 32-byte output vertex is stored downwards through r5 (pre-decrement)
! in the order w, c, v, u, z, y, x, flags, so r5 finishes 32 bytes lower,
! pointing at dst->flags.
! In:  fr4-fr7 = X, Y, Z, W   fr1/fr2 = U/V   fr3 = C
!      r1 = command word stored into dst->flags
!      r5 = address just past the end of the destination vertex
! Out: r0 = 1 if Z > -W, otherwise 0 (this overwrites any previous CLIPFLAGS)
! Clobbers: fr7 (negated after it has been stored), T bit
.macro ProcessVertex1
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W (W is already stored, fr7 is now only the compare operand)
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W)
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r0 ! EX, CLIPFLAGS = T (bit 0)
fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
.endm
! ProcessVertex2: write out vertex 2 and record its clip result in bit 1.
! The 32-byte output vertex is stored downwards through r5 (pre-decrement)
! in the order w, c, v, u, z, y, x, flags, so r5 finishes 32 bytes lower,
! pointing at dst->flags.
! In:  fr4-fr7 = X, Y, Z, W   fr1/fr2 = U/V   fr3 = C
!      r0 = CLIPFLAGS so far, r1 = command word stored into dst->flags
!      r5 = address just past the end of the destination vertex
! Out: r0 |= (Z > -W) << 1
! Clobbers: r2, fr7 (negated after it has been stored), T bit
.macro ProcessVertex2
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W (W is already stored, fr7 is now only the compare operand)
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W)
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
add r2,r2 ! EX, tmp = tmp + tmp (T << 1)
fmov.s fr5,@-r5 ! LS, dst->y = Y
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1)
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
.endm
! ProcessVertex3: write out vertex 3 and record its clip result in bit 2.
! The 32-byte output vertex is stored downwards through r5 (pre-decrement)
! in the order w, c, v, u, z, y, x, flags, so r5 finishes 32 bytes lower,
! pointing at dst->flags.
! In:  fr4-fr7 = X, Y, Z, W   fr1/fr2 = U/V   fr3 = C
!      r0 = CLIPFLAGS so far, r1 = command word stored into dst->flags
!      r5 = address just past the end of the destination vertex
! Out: r0 |= (Z > -W) << 2
! Clobbers: r2, fr7 (negated after it has been stored), T bit
.macro ProcessVertex3
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W (W is already stored, fr7 is now only the compare operand)
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W)
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr4,@-r5 ! LS, dst->x = X
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2)
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
.endm
! ProcessVertex4 eos_addr: write out vertex 4, record its clip result in
! bit 3, and fold the finished CLIPFLAGS into the command word of this vertex.
! The 32-byte output vertex is stored downwards through r5 (pre-decrement)
! in the order w, c, v, u, z, y, x, flags, so r5 finishes 32 bytes lower,
! pointing at dst->flags.
! Args: eos_addr = operand of the mov.l that fetches the GPU EOS command word
! In:  fr4-fr7 = X, Y, Z, W   fr1/fr2 = U/V   fr3 = C
!      r0 = CLIPFLAGS so far
!      r5 = address just past the end of the destination vertex
! Out: r0 |= (Z > -W) << 3
!      r1 = EOS command | CLIPFLAGS (also stored into dst->flags)
! Clobbers: r1, r2, fr7 (negated after it has been stored), T bit
! NOTE(review): r1 no longer holds the plain vertex command afterwards -
! presumably the caller reloads it before the next quad, confirm at call sites.
.macro ProcessVertex4 eos_addr
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W (W is already stored, fr7 is now only the compare operand)
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W)
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr5,@-r5 ! LS, dst->y = Y
add r2,r2 ! EX, tmp = tmp + tmp, so tmp is now T << 3
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l \eos_addr, r1 ! LS, r1 = GPU EOS command
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3)
or r0,r1 ! EX, r1 |= CLIPFLAGS
mov.l r1,@-r5 ! LS, dst->flags = GPU EOS | CLIPFLAGS
.endm
! =========================================================
! ====================== VIEWPORT TRANSFORM ===============
! =========================================================
!r2 = return addr
!r0 = temp
!r5 = dst pointer
!fr0 = temp
!fr4 = temp
!fr5 = temp
!fr5 = temp
!fr8 = VIEWPORT_HWIDTH
!fr9 = VIEWPORT_HHEIGHT
!fr10 = VIEWPORT_X_PLUS_HWIDTH
!fr11 = VIEWPORT_Y_PLUS_HHEIGHT
! ---------------------------------------------------------
! ViewportTransformSetup vp_addr
!   Loads four consecutive floats starting at vp_addr into fr8-fr11:
!     fr8  = VIEWPORT_HWIDTH
!     fr9  = VIEWPORT_HHEIGHT
!     fr10 = VIEWPORT_X_PLUS_HWIDTH
!     fr11 = VIEWPORT_Y_PLUS_HHEIGHT
!   Clobbers: r0 (left pointing 16 bytes past vp_addr)
!   Size:     exactly 6 instructions - the closing nop keeps the
!             expansion an even number of instructions long.
! ---------------------------------------------------------
.macro ViewportTransformSetup vp_addr
    mova    \vp_addr, r0        ! EX, r0 = &VIEWPORT
    fmov.s  @r0+, fr8           ! LS, fr8  = half width
    fmov.s  @r0+, fr9           ! LS, fr9  = half height
    fmov.s  @r0+, fr10          ! LS, fr10 = x + half width
    fmov.s  @r0+, fr11          ! LS, fr11 = y + half height
    nop                         ! MT, padding to an even instruction count
.endm
! ViewportTransformVertex: perspective-divide and viewport-map one stored
! vertex in place, then step r5 to the following vertex.
! In:  r5 = start of the vertex (fields used: x at +4, y at +8, z at +12, w at +28)
!      fr8  = VIEWPORT_HWIDTH          fr9  = VIEWPORT_HHEIGHT
!      fr10 = VIEWPORT_X_PLUS_HWIDTH   fr11 = VIEWPORT_Y_PLUS_HHEIGHT
! Out: with F = 1 / sqrt(w * w):
!        vertex->x = x * hwidth  * F + x_plus_hwidth
!        vertex->y = y * hheight * F + y_plus_hheight
!        vertex->z = F
!      r5 += 32 in total (28 - 24 + 4 + 4 + 20)
! Clobbers: fr0, fr4, fr5
! NOTE(review): F is 1 / |w|, so the sign of w is lost - presumably this only
! runs on vertices whose w is positive, confirm against the callers.
.macro ViewportTransformVertex
! INVERSE W CALCULATION
add #28, r5 ! EX, r5 = &vertex->w
fmov.s @r5,fr0 ! LS, fr0 = vertex->w
fmul fr0,fr0 ! FE, fr0 = fr0 * fr0
add #-24, r5 ! EX, r5 = &vertex->x
fsrra fr0 ! FE, fr0 = 1 / sqrt(fr0) -> 1 / |vertex->w|
! TRANSFORM X
fmov.s @r5,fr4 ! LS, fr4 = vertex->x
fmov fr10,fr5 ! LS, fr5 = VIEWPORT_X_PLUS_HWIDTH
fmul fr8,fr4 ! FE, fr4 = VIEWPORT_HWIDTH * vertex->x
fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (X * F * hwidth) + x_plus_hwidth
fmov.s fr5,@r5 ! LS, vertex->x = fr5
add #4, r5 ! EX, r5 = &vertex->y
! TRANSFORM Y
fmov.s @r5,fr4 ! LS, fr4 = vertex->y
fmov fr11,fr5 ! LS, fr5 = VIEWPORT_Y_PLUS_HHEIGHT
fmul fr9,fr4 ! FE, fr4 = VIEWPORT_HHEIGHT * vertex->y
fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (Y * F * hheight) + y_plus_hheight
fmov.s fr5,@r5 ! LS, vertex->y = fr5
add #4, r5 ! EX, r5 = &vertex->z
! ASSIGN Z
fmov.s fr0,@r5 ! LS, vertex->z = fr0 (the reciprocal F, the original z is overwritten)
add #20, r5 ! EX, r5 += 20 (points to start of next vertex)
.endm
|