--- /dev/null
+/*****************************************************************************
+ * Program: sad.c
+ * Function: New implementation of the intel application note
+ * AP-940: "Block matching in Motion estimation Algorithms
+ * using Streaming SIMD Extensions 3"
+ * We changed:
+ * - We used local arrays instead of pointer arithmetic
+ * because of the limited capability of the memory disambiguator
+ * - Used if/else instead of abs function since we can't
+ * use function calls in specification
+ * - Unrolled the inner loop manually since our loop
+ * unroller does not work so well.
+ * Used as a test for the simd optimization.
+ * TODO: - Maybe use the "restrict" keyword to implement pointer
+ * arithmetic
+ * Author: Andreas Schoesser
+ * Date: 2007-08-06
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <malloc.h>
+#include <stdlib.h>
+#include <time.h>
+
+unsigned int sad(int test_blockx, int test_blocky, int *best_block_x, int *best_block_y, int iterations);
+
+main()
+{
+ int best_block_x, best_block_y;
+ unsigned int min_diff;
+ int iterations = 100;
+
+ printf("PSADBW Example\n--------------\n\n");
+
+ printf("Executing 'motion estimation' %d times...\n\n", iterations);
+ min_diff = sad(0, 0, &best_block_x, &best_block_y, iterations);
+
+ printf("MinDiff: %u\nBest X: %d\nBest Y: %d\n", min_diff, best_block_x, best_block_y);
+}
+
+unsigned int sad(int test_blockx, int test_blocky, int *best_block_x, int *best_block_y, int iterations)
+{
+ unsigned char b[256][256];
+
+ unsigned char a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;
+ unsigned char b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15;
+
+ int i, x, y, blocky;
+ unsigned tmp_diff, min_diff = 0xFFFFFFFF; // MAX_UINT
+
+ clock_t t_time_bev, t_time_after, t_clocks_dauer;
+ double d_zeitdauer;
+
+ // Fill in some random values to compare
+ for(x = 0; x < 256; x++)
+ for(y = 0; y < 256; y++)
+ b[y][x] = (unsigned char) rand() % 255;
+
+ // Start time measurement
+ t_time_bev = clock();
+
+ // Execute Block matching 100 times
+ for(i = 0; i < iterations; i++)
+ {
+ // Iterate over whole frame, x,y=coords of current block
+ for(x = 1; x < 256 - 16; x++)
+ for(y = 0; y < 256 - 16; y++)
+ {
+ tmp_diff = 0;
+
+ // Compare current Block with reference block
+ for(blocky = 0; blocky < 16; blocky++)
+ {
+ // Vektor Loads
+ a0 = b[blocky][0]; a1 = b[blocky][1]; a2 = b[blocky][2]; a3 = b[blocky][3]; a4 = b[blocky][4]; a5 = b[blocky][5]; a6 = b[blocky][6]; a7 = b[blocky][7]; a8 = b[blocky][8]; a9 = b[blocky][9]; a10 = b[blocky][10]; a11 = b[blocky][11]; a12 = b[blocky][12]; a13 = b[blocky][13]; a14 = b[blocky][14]; a15 = b[blocky][15];
+ b0 = b[blocky + y][x + 0]; b1 = b[blocky + y][x + 1]; b2 = b[blocky + y][x + 2]; b3 = b[blocky + y][x + 3]; b4 = b[blocky + y][x + 4]; b5 = b[blocky + y][x + 5]; b6 = b[blocky + y][x + 6]; b7 = b[blocky + y][x + 7]; b8 = b[blocky + y][x + 8]; b9 = b[blocky + y][x + 9]; b10 = b[blocky + y][x + 10]; b11 = b[blocky + y][x + 11]; b12 = b[blocky + y][x + 12]; b13 = b[blocky + y][x + 13]; b14 = b[blocky + y][x + 14]; b15 = b[blocky + y][x + 15];
+
+ // psadpw, would be nice if this could be done by loop unrolling
+ tmp_diff += ((a0 > b0) ? (a0 - b0) : (b0 - a0)) +
+ ((a1 > b1) ? (a1 - b1) : (b1 - a1)) +
+ ((a2 > b2) ? (a2 - b2) : (b2 - a2)) +
+ ((a3 > b3) ? (a3 - b3) : (b3 - a3)) +
+ ((a4 > b4) ? (a4 - b4) : (b4 - a4)) +
+ ((a5 > b5) ? (a5 - b5) : (b5 - a5)) +
+ ((a6 > b6) ? (a6 - b6) : (b6 - a6)) +
+ ((a7 > b7) ? (a7 - b7) : (b7 - a7)) +
+ ((a8 > b8) ? (a8 - b8) : (b8 - a8)) +
+ ((a9 > b9) ? (a9 - b9) : (b9 - a9)) +
+ ((a10 > b10) ? (a10 - b10) : (b10 - a10)) +
+ ((a11 > b11) ? (a11 - b11) : (b11 - a11)) +
+ ((a12 > b12) ? (a12 - b12) : (b12 - a12)) +
+ ((a13 > b13) ? (a13 - b13) : (b13 - a13)) +
+ ((a14 > b14) ? (a14 - b14) : (b14 - a14)) +
+ ((a15 > b15) ? (a15 - b15) : (b15 - a15));
+ }
+
+ // Check if the current block is least different
+ if(min_diff > tmp_diff)
+ {
+ min_diff = tmp_diff;
+ *best_block_x = x;
+ *best_block_y = y;
+ }
+ }
+ }
+
+ // End time measurement
+ t_time_after = clock();
+
+ t_clocks_dauer = (t_time_after-t_time_bev);
+ d_zeitdauer = (double) (t_time_after-t_time_bev) / CLOCKS_PER_SEC;
+
+ #ifdef __GNUC__
+ printf("Zeitdauer %g s\n", d_zeitdauer);
+ #else
+ printf("Zeitdauer %g ms\n", d_zeitdauer);
+ #endif
+
+ return(min_diff);
+}
--- /dev/null
+/************************************************************************
+ * Program: scalar_product.c
+ * Function: Calculates the scalar product of vector lying in memory
+ * Used as a test for the simd optimization.
+ * Author: Andreas Schoesser
+ * Date: 2007-06-13
+ ************************************************************************/
+
+#include <stdio.h>
+#include <malloc.h>
+#include <stdlib.h>
+#include <time.h>
+
+float scalar_product(float *a, float *b, unsigned int max_elements);
+
+main()
+{
+ float res;
+ int i, max_elements = 100000000;
+ clock_t t_time_bev, t_time_after, t_clocks_dauer;
+ double d_zeitdauer;
+
+ // Allocate memory and make sure pointers are aligned to 16 byte addresses
+ char *a = malloc(16 + max_elements * sizeof(float));
+ char *b = malloc(16 + max_elements * sizeof(float));
+ float c;
+ char *ca = &a[0] + 16 - (unsigned int) ((unsigned int) &a[0] % 16);
+ char *cb = &b[0] + 16 - (unsigned int) ((unsigned int) &b[0] % 16);
+
+ float *aa = (float *) ca;
+ float *ab = (float *) cb;
+
+ printf("Scalar product\n==============\n\n");
+
+ //printf("Array Position: %u, %u, %u, %u\n", a, b, aa, ba/*(unsigned int) &aa[0] % 16, (unsigned int) &ba[0] % 16*/);
+
+ // Fill both arrays with random values
+ for(i = 0; i < max_elements; i++)
+ {
+ aa[i] = (float) (rand() % 10);
+ ab[i] = (float) (rand() % 10);
+
+ //printf("(%g * %g) + ", a[i], b[i]);
+ }
+
+ // Start time measurement
+ t_time_bev = clock();
+
+ //for(i = 0; i < max_elements - 4; i += 4)
+ res = scalar_product(aa, ab, max_elements);
+
+ // Stop time measurement
+ t_time_after = clock();
+ t_clocks_dauer = (t_time_after-t_time_bev);
+ d_zeitdauer = (double) (t_time_after-t_time_bev) / CLOCKS_PER_SEC;
+
+ #ifdef __GNUC__
+ printf("Zeitdauer %g s\n", d_zeitdauer);
+ #else
+ printf("Zeitdauer %g ms\n", d_zeitdauer);
+ #endif
+
+ printf("\nResult: %g\n", res);
+}
+
+
+float scalar_product(float * a, float * b, unsigned int max_elements)
+{
+ float res;
+ int i;
+
+ /*for(i = 0; i < 4; i++)
+ {
+ a[i] = (float) (rand() % 10);
+ b[i] = (float) (rand() % 10);
+
+ printf("(%g * %g) + ", a[i], b[i]);
+ }*/
+
+ for(i = 0; i < max_elements; i += 4)
+ res += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
+
+ return(res);
+}