1 /*****************************************************************************
3 * Function: New implementation of the Intel application note
4 * AP-940: "Block matching in Motion estimation Algorithms
5 * using Streaming SIMD Extensions 3"
7 * - We used local arrays instead of pointer arithmetic
8 * because of the limited capability of the memory disambiguator
9 * - Used if/else instead of abs function since we can't
10 * use function calls in specification
11 * - Unrolled the inner loop manually since our loop
12 * unroller does not work so well.
13 * Used as a test for the simd optimization.
14 * TODO: - Maybe use the "restrict" keyword to allow pointer arithmetic instead of local arrays
16 * Author: Andreas Schoesser
18 *****************************************************************************/
// Forward declaration of the block-matching routine; required because the
// call site appears before the definition.
//   test_blockx, test_blocky   - coordinates of the reference block
//                                (NOTE(review): not read in the visible part
//                                of the definition, which always compares
//                                against rows 0..15 / columns 0..15 - confirm)
//   best_block_x, best_block_y - out-pointers; presumably receive the position
//                                of the best match - TODO confirm
//   iterations                 - number of times the whole-frame search is repeated
// Returns an unsigned difference value (presumably the smallest sum of
// absolute differences found - TODO confirm against the end of the definition).
25 unsigned int sad(int test_blockx, int test_blocky, int *best_block_x, int *best_block_y, int iterations);
29 int best_block_x, best_block_y;
30 unsigned int min_diff;
33 printf("PSADBW Example\n--------------\n\n");
35 printf("Executing 'motion estimation' %d times...\n\n", iterations);
36 min_diff = sad(0, 0, &best_block_x, &best_block_y, iterations);
38 printf("MinDiff: %u\nBest X: %d\nBest Y: %d\n", min_diff, best_block_x, best_block_y);
41 unsigned int sad(int test_blockx, int test_blocky, int *best_block_x, int *best_block_y, int iterations)
43 unsigned char b[256][256];
45 unsigned char a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;
46 unsigned char b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15;
49 unsigned tmp_diff, min_diff = 0xFFFFFFFF; // MAX_UINT
51 // Fill in some random values to compare
52 for(x = 0; x < 256; x++)
53 for(y = 0; y < 256; y++)
54 b[y][x] = (unsigned char) rand() % 255;
56 // Execute block matching 'iterations' times
57 for(i = 0; i < iterations; i++)
59 // Iterate over whole frame, x,y=coords of current block
60 for(x = 1; x < 256 - 16; x++)
61 for(y = 0; y < 256 - 16; y++)
65 // Compare current Block with reference block
66 for(blocky = 0; blocky < 16; blocky++)
69 a0 = b[blocky][0]; a1 = b[blocky][1]; a2 = b[blocky][2]; a3 = b[blocky][3]; a4 = b[blocky][4]; a5 = b[blocky][5]; a6 = b[blocky][6]; a7 = b[blocky][7]; a8 = b[blocky][8]; a9 = b[blocky][9]; a10 = b[blocky][10]; a11 = b[blocky][11]; a12 = b[blocky][12]; a13 = b[blocky][13]; a14 = b[blocky][14]; a15 = b[blocky][15];
70 b0 = b[blocky + y][x + 0]; b1 = b[blocky + y][x + 1]; b2 = b[blocky + y][x + 2]; b3 = b[blocky + y][x + 3]; b4 = b[blocky + y][x + 4]; b5 = b[blocky + y][x + 5]; b6 = b[blocky + y][x + 6]; b7 = b[blocky + y][x + 7]; b8 = b[blocky + y][x + 8]; b9 = b[blocky + y][x + 9]; b10 = b[blocky + y][x + 10]; b11 = b[blocky + y][x + 11]; b12 = b[blocky + y][x + 12]; b13 = b[blocky + y][x + 13]; b14 = b[blocky + y][x + 14]; b15 = b[blocky + y][x + 15];
72 // psadbw, would be nice if this could be done by loop unrolling
73 tmp_diff += ((a0 > b0) ? (a0 - b0) : (b0 - a0)) +
74 ((a1 > b1) ? (a1 - b1) : (b1 - a1)) +
75 ((a2 > b2) ? (a2 - b2) : (b2 - a2)) +
76 ((a3 > b3) ? (a3 - b3) : (b3 - a3)) +
77 ((a4 > b4) ? (a4 - b4) : (b4 - a4)) +
78 ((a5 > b5) ? (a5 - b5) : (b5 - a5)) +
79 ((a6 > b6) ? (a6 - b6) : (b6 - a6)) +
80 ((a7 > b7) ? (a7 - b7) : (b7 - a7)) +
81 ((a8 > b8) ? (a8 - b8) : (b8 - a8)) +
82 ((a9 > b9) ? (a9 - b9) : (b9 - a9)) +
83 ((a10 > b10) ? (a10 - b10) : (b10 - a10)) +
84 ((a11 > b11) ? (a11 - b11) : (b11 - a11)) +
85 ((a12 > b12) ? (a12 - b12) : (b12 - a12)) +
86 ((a13 > b13) ? (a13 - b13) : (b13 - a13)) +
87 ((a14 > b14) ? (a14 - b14) : (b14 - a14)) +
88 ((a15 > b15) ? (a15 - b15) : (b15 - a15));
91 // Check if the current block is least different
92 if(min_diff > tmp_diff)