1 /*****************************************************************************
2 * Program: C_Patterns.c
3 * Function: C specification of complex operations for the simd optimization.
4 * The order in this file of the functions is IMPORTANT since an
5 * earlier defined function has a higher scheduling priority!
6 * Author: Andreas Schoesser
8 *****************************************************************************/
12 #include "define_operation.h"
16 /* Vector load, 4 components, single-precision float */
/*
 * Pattern: load of a float vector from memory into an xmm register.
 *
 *   Arg_0 (param):  declared with ("vector", "memory", "gp").
 *   Res   (result): declared with ("vector", "register", "xmm").
 *
 * Only the operand declarations and the emit template are visible in this
 * view; the statements that follow them are not shown here.
 * NOTE(review): movaps is the aligned packed-single move, so the address in
 * %S0 presumably has to be 16-byte aligned -- confirm against the callers.
 */
17 void vload_4f_32(void)
19 float *param = Arg_0("vector", "memory", "gp");
20 float *result = Res("vector", "register", "xmm");
21 Emit(". movaps (%S0), %D0");
/*
 * Fragment of a byte-vector load pattern (its function header is not visible
 * in this view). Both operands are unsigned char pointers: the source is
 * declared ("vector", "memory", "gp"), the result ("vector", "register",
 * "xmm"). The emit template uses lddqu.
 *
 * The visible assignments copy elements 10..15 from param to result.
 * Presumably elements 0..9 are copied by lines not shown here -- TODO confirm.
 */
34 unsigned char *param = Arg_0("vector", "memory", "gp");
35 unsigned char *result = Res("vector", "register", "xmm");
36 Emit(". lddqu (%S0), %D0");
50 result[10] = param[10];
51 result[11] = param[11];
52 result[12] = param[12];
53 result[13] = param[13];
54 result[14] = param[14];
55 result[15] = param[15];
61 /* Vector load, 4 components, 32-bit integer */
/*
 * Fragment of an integer-vector load pattern (function header and the
 * statements after the emit template are not visible in this view).
 * Source operand is declared ("vector", "memory", "gp"), the result
 * ("vector", "register", "xmm"); the emit template uses movdqu.
 */
64 sse_32_2 *param = Arg_0("vector", "memory", "gp");
65 sse_32_2 *result = Res("vector", "register", "xmm");
66 Emit(". movdqu (%S0), %D0");
/*
 * Fragment of a second integer-vector load pattern (function header and the
 * statements after the emit template are not visible in this view).
 * Same operand declarations as the movdqu load, but the emit template uses
 * movq, which moves 64 bits.
 */
78 sse_32_2 *param = Arg_0("vector", "memory", "gp");
79 sse_32_2 *result = Res("vector", "register", "xmm");
80 Emit(". movq (%S0), %D0");
/*
 * Fragment of an integer vector-add pattern (function header not visible in
 * this view). Both inputs are declared ("vector", "register", "xmm"); the
 * emit template uses paddd. The two assignments state the element-wise sum
 * for elements 0 and 1.
 *
 * NOTE(review): the result is declared "in_r0" here, while the mulps/maxps
 * patterns in this file use "in_r1" with the same "%S1, %S0" operand order --
 * confirm which register the result descriptor is meant to name.
 */
90 sse_32_2 *param0 = Arg_0("vector", "register", "xmm");
91 sse_32_2 *param1 = Arg_1("vector", "register", "xmm");
92 sse_32_2 *result = Res("vector", "register", "in_r0");
93 Emit(". paddd %S1, %S0");
97 result[0] = param0[0] + param1[0];
98 result[1] = param0[1] + param1[1];
102 /** Register mode **/
/*
 * Pattern: element-wise product of two 4-float vectors, both operands held
 * in xmm registers.
 *
 *   Arg_0 (param0), Arg_1 (param1): declared ("vector", "register", "xmm").
 *   Res   (result):                 declared ("vector", "register", "in_r1").
 *
 * The four assignments are the C statement of what the emitted mulps
 * computes: result[i] = param0[i] * param1[i] for i = 0..3.
 */
103 void mulps_4_32(void)
105 float *param0 = Arg_0("vector", "register", "xmm");
106 float *param1 = Arg_1("vector", "register", "xmm");
107 float *result = Res("vector", "register", "in_r1");
108 Emit(". mulps %S1, %S0");
111 result[0] = param0[0] * param1[0];
112 result[1] = param0[1] * param1[1];
113 result[2] = param0[2] * param1[2];
114 result[3] = param0[3] * param1[3];
117 /** Memory mode: right-hand operand is read from memory **/
/*
 * Pattern: element-wise product of two 4-float vectors where the second
 * operand is declared as a memory operand.
 *
 *   Arg_0 (param0): declared ("vector", "register", "xmm").
 *   Arg_1 (param1): declared ("vector", "memory", "gp").
 *   Res   (result): declared ("vector", "register", "in_r1").
 *
 * The C statement is identical to the register form:
 * result[i] = param0[i] * param1[i] for i = 0..3.
 *
 * NOTE(review): the template writes the memory operand as plain %S1, whereas
 * the load patterns in this file write a memory source as (%S0). Confirm
 * that the emitter adds the addressing syntax for a "memory" operand;
 * otherwise this template would need (%S1).
 */
118 void mulps_4_32_am(void)
120 float *param0 = Arg_0("vector", "register", "xmm");
121 float *param1 = Arg_1("vector", "memory", "gp");
122 float *result = Res("vector", "register", "in_r1");
123 Emit(". mulps %S1, %S0");
127 result[0] = param0[0] * param1[0];
128 result[1] = param0[1] * param1[1];
129 result[2] = param0[2] * param1[2];
130 result[3] = param0[3] * param1[3];
/*
 * Pattern: horizontal sum of a 4-float vector.
 *
 *   Arg_0 (param):  declared ("vector", "register", "xmm").
 *   Res   (result): declared ("vector", "register", "in_r1").
 *
 * The template issues haddps on %S0 with itself twice. The C statement
 * defines only result[0], as the sum of the four input elements.
 */
135 void add_horz_4_32(void)
137 float *param = Arg_0("vector", "register", "xmm");
138 float *result = Res("vector", "register", "in_r1");
139 Emit(". haddps %S0, %S0\\n. haddps %S0, %S0");
142 result[0] = param[0] + param[1] + param[2] + param[3];
146 /************************************************************************/
148 /************************************************************************/
/*
 * Fragment of a float-vector pattern whose template uses maxps (function
 * header and the C statement of the result are not visible in this view).
 * Both inputs are declared ("vector", "register", "xmm"); the result is
 * declared ("vector", "register", "in_r1").
 */
152 float *a = Arg_0("vector", "register", "xmm");
153 float *b = Arg_1("vector", "register", "xmm");
154 float *r = Res("vector", "register", "in_r1");
155 Emit(". maxps %S1, %S0");
/*
 * Fragment of a sum-of-absolute-differences pattern over 16 unsigned bytes
 * (function header not visible in this view).
 *
 *   Arg_0 (a), Arg_1 (b): unsigned char vectors in xmm registers.
 *   Res   (r):            unsigned int, declared ("vector", "register",
 *                         "in_r1"); only r[0] is defined below.
 *
 * The template issues psadbw followed by two phaddd on %S0 with itself.
 * The C statement adds, for each of the 16 element pairs, the larger value
 * minus the smaller one, so no term is ever negative.
 */
181 unsigned char *a = Arg_0("vector", "register", "xmm");
182 unsigned char *b = Arg_1("vector", "register", "xmm");
183 unsigned int *r = Res("vector", "register", "in_r1");
184 Emit(". psadbw %S1, %S0\\n. phaddd %S0, %S0\\n. phaddd %S0, %S0");
187 r[0] = ((a[0] > b[0]) ? (a[0] - b[0]) : (b[0] - a[0])) +
188 ((a[1] > b[1]) ? (a[1] - b[1]) : (b[1] - a[1])) +
189 ((a[2] > b[2]) ? (a[2] - b[2]) : (b[2] - a[2])) +
190 ((a[3] > b[3]) ? (a[3] - b[3]) : (b[3] - a[3])) +
191 ((a[4] > b[4]) ? (a[4] - b[4]) : (b[4] - a[4])) +
192 ((a[5] > b[5]) ? (a[5] - b[5]) : (b[5] - a[5])) +
193 ((a[6] > b[6]) ? (a[6] - b[6]) : (b[6] - a[6])) +
194 ((a[7] > b[7]) ? (a[7] - b[7]) : (b[7] - a[7])) +
195 ((a[8] > b[8]) ? (a[8] - b[8]) : (b[8] - a[8])) +
196 ((a[9] > b[9]) ? (a[9] - b[9]) : (b[9] - a[9])) +
197 ((a[10] > b[10]) ? (a[10] - b[10]) : (b[10] - a[10])) +
198 ((a[11] > b[11]) ? (a[11] - b[11]) : (b[11] - a[11])) +
199 ((a[12] > b[12]) ? (a[12] - b[12]) : (b[12] - a[12])) +
200 ((a[13] > b[13]) ? (a[13] - b[13]) : (b[13] - a[13])) +
201 ((a[14] > b[14]) ? (a[14] - b[14]) : (b[14] - a[14])) +
202 ((a[15] > b[15]) ? (a[15] - b[15]) : (b[15] - a[15]));
/*
 * Fragment of a float-vector pattern (function header not visible in this
 * view). Both inputs are declared ("vector", "register", "xmm"); the result
 * is declared ("vector", "register", "in_r1").
 *
 * The C statement defines r[0] as the sum of the four signed differences
 * a[i] - b[i]; no absolute value is taken.
 * NOTE(review): no Emit() call is visible between the declarations and the
 * statement in this view -- confirm whether this pattern has a template.
 */
207 float *a = Arg_0("vector", "register", "xmm");
208 float *b = Arg_1("vector", "register", "xmm");
209 float *r = Res("vector", "register", "in_r1");
211 r[0] = (a[0] - b[0]) + (a[1] - b[1]) + (a[2] - b[2]) + (a[3] - b[3]);
/*
 * Fragment of a float-vector store pattern (function header not visible in
 * this view).
 *
 *   Arg_0 (param):  source vector, declared ("vector", "register", "xmm").
 *   Arg_1 (result): destination, declared ("vector", "memory", "gp"); note
 *                   that it is obtained through Arg_1, not Res().
 *
 * The template stores with movaps; the C statement copies elements 0..3.
 * NOTE(review): movaps is the aligned form, so the address in %S1 presumably
 * has to be 16-byte aligned -- confirm against the callers.
 */
219 float *param = Arg_0("vector", "register", "xmm");
220 float *result = Arg_1("vector", "memory", "gp");
221 Emit(". movaps %S0, (%S1)");
224 result[0] = param[0];
225 result[1] = param[1];
226 result[2] = param[2];
227 result[3] = param[3];
233 /************************************************************************/
235 /* Input: Vector register 1 v1, memory pointer p1 */
237 /* Operation: Store the components of v1 at memory location p1. */
238 /************************************************************************/
/*
 * Pattern: store a vector of four 32-bit integers from an xmm register to
 * memory.
 *
 *   Arg_0 (param):  source vector, declared ("vector", "register", "xmm").
 *   Arg_1 (result): destination, declared ("vector", "memory", "gp"); it is
 *                   obtained through Arg_1, not Res().
 *
 * The template previously used movq, which transfers only the low 64 bits of
 * the xmm register, i.e. two of the four elements the C statement below
 * stores. It now uses movdqu so that all 128 bits are written, mirroring the
 * movdqu used by the 4 x 32-bit integer load pattern in this file. movdqu
 * places no alignment requirement on the destination address.
 */
240 void vstore_4_32(void)
242 sse_32_2 *param = Arg_0("vector", "register", "xmm");
243 sse_32_2 *result = Arg_1("vector", "memory", "gp");
244 Emit(". movdqu %S0, (%S1)");
/* C statement of the store: all four elements reach memory. */
247 result[0] = param[0];
248 result[1] = param[1];
249 result[2] = param[2];
250 result[3] = param[3];
/*
 * Pattern: store the first two elements of an integer vector from an xmm
 * register to memory.
 *
 *   Arg_0 (param):  source vector, declared ("vector", "register", "xmm").
 *   Arg_1 (result): destination, declared ("vector", "memory", "gp"); it is
 *                   obtained through Arg_1, not Res().
 *
 * The template stores with movq (a 64-bit move); the C statement copies
 * elements 0 and 1.
 */
254 void vstore_2_32(void)
256 sse_32_2 *param = Arg_0("vector", "register", "xmm");
257 sse_32_2 *result = Arg_1("vector", "memory", "gp");
258 Emit(". movq %S0, (%S1)");
261 result[0] = param[0];
262 result[1] = param[1];
/*
 * Pattern: take a scalar float out of a float vector; both the vector input
 * and the scalar result are declared in the "xmm" register class.
 *
 * Registered with PRIORITY_CLEANUP. The template passed to Emit() is the
 * empty string: the movd text after the `//` is a source comment and is not
 * part of the string. The C statement of the result is not visible in this
 * view.
 */
267 void component_0f(void)
269 float *b = Arg_0("vector", "register", "xmm");
270 float *r = Res("scalar", "register", "xmm");
271 Priority(PRIORITY_CLEANUP);
272 Emit("");//. movd %S0, %D0");
/*
 * Pattern: move a scalar int out of an int vector held in an xmm register
 * into a result declared ("scalar", "register", "gp").
 *
 * Registered with PRIORITY_CLEANUP; the template uses movd from %S0 to %D0.
 * The C statement of the result is not visible in this view.
 */
280 void component_0Iu(void)
282 int *b = Arg_0("vector", "register", "xmm");
283 int *r = Res("scalar", "register", "gp");
284 Priority(PRIORITY_CLEANUP);
285 Emit(". movd %S0, %D0");
/*
 * Pattern: move a scalar out of an sse_32_2 vector held in an xmm register
 * into a result declared ("scalar", "register", "gp").
 *
 * Registered with PRIORITY_CLEANUP; the template is a single movd from %S0
 * to %D0, with no preceding shift. The C statement of the result is not
 * visible in this view.
 */
290 void component_0(void)
292 sse_32_2 *b = Arg_0("vector", "register", "xmm");
293 sse_32_2 *r = Res("scalar", "register", "gp");
294 Priority(PRIORITY_CLEANUP);
295 Emit(". movd %S0, %D0");
/*
 * Pattern: move a scalar out of an sse_32_2 vector held in an xmm register
 * into a result declared ("scalar", "register", "gp"), after shifting the
 * vector right by 4 bytes.
 *
 * Registered with PRIORITY_CLEANUP. The C statement of the result is not
 * visible in this view.
 * NOTE(review): psrldq has %S0 as its destination, so the template shifts
 * the input register in place before the movd -- confirm that the framework
 * treats the input as overwritten.
 */
300 void component_1(void)
302 sse_32_2 *b = Arg_0("vector", "register", "xmm");
303 sse_32_2 *r = Res("scalar", "register", "gp");
305 Priority(PRIORITY_CLEANUP);
306 Emit(". psrldq \\$4, %S0 \\n. movd %S0, %D0");
/*
 * Pattern: move a scalar out of an sse_32_2 vector held in an xmm register
 * into a result declared ("scalar", "register", "gp"), after shifting the
 * vector right by 8 bytes.
 *
 * Registered with PRIORITY_CLEANUP. The C statement of the result is not
 * visible in this view.
 * NOTE(review): psrldq has %S0 as its destination, so the template shifts
 * the input register in place before the movd -- confirm that the framework
 * treats the input as overwritten.
 */
311 void component_2(void)
313 sse_32_2 *b = Arg_0("vector", "register", "xmm");
314 sse_32_2 *r = Res("scalar", "register", "gp");
316 Priority(PRIORITY_CLEANUP);
317 Emit(". psrldq \\$8, %S0 \\n. movd %S0, %D0");
/*
 * Pattern: move a scalar out of an sse_32_2 vector held in an xmm register
 * into a result declared ("scalar", "register", "gp"), after shifting the
 * vector right by 12 bytes.
 *
 * Registered with PRIORITY_CLEANUP. The C statement of the result is not
 * visible in this view.
 * NOTE(review): psrldq has %S0 as its destination, so the template shifts
 * the input register in place before the movd -- confirm that the framework
 * treats the input as overwritten.
 */
322 void component_3(void)
324 sse_32_2 *b = Arg_0("vector", "register", "xmm");
325 sse_32_2 *r = Res("scalar", "register", "gp");
327 Priority(PRIORITY_CLEANUP);
328 Emit(". psrldq \\$12, %S0 \\n. movd %S0, %D0");
334 /********************************
336 ********************************/
339 void packed_add_8_32(void)
341 sse_32_2 *a = Arg_0("vector", "register", "xmm");
342 sse_32_2 *b = Arg_1("vector", "register", "xmm");
343 sse_32_2 *r = Res("vector", "register", "in_r0");
345 Emit(". haddps %S1, %S0");