I was aware of Single Instruction, Multiple Data (SIMD) optimizations, but I wasn't sure how to take advantage of them, or whether I needed to use them explicitly rather than rely on the compiler to apply them for me.
So naturally I wrote some code to explore.
Results: Both GCC and Clang can perform simple SIMD optimization when -O3 is enabled. More difficult SIMD optimization problems were not explored.
test-gcc-debug sum: 99897982687.000000 time: 0.446810
test-gcc-o3 sum: 99897982687.000000 time: 0.157358
test-gcc-simd sum: 99897982687.000000 time: 0.157701
test-gcc-debug-simd sum: 99897982687.000000 time: 0.213039
test-clang-debug sum: 99897982687.000000 time: 0.410983
test-clang-o3 sum: 99897982687.000000 time: 0.159946
test-clang-simd sum: 99897982687.000000 time: 0.160412
test-clang-debug-simd sum: 99897982687.000000 time: 0.204966
This is the C code that was being timed:
#ifdef USE_SIMD
    for (int i = 0; i < size; i += 4) {
        v4si *a_ptr = (v4si *) &a[i];
        v4si *b_ptr = (v4si *) &b[i];
        v4si *c_ptr = (v4si *) &c[i];
        *c_ptr = *a_ptr + *b_ptr;
    }
#else
    for (int i = 0; i < size; i += 1) {
        c[i] = a[i] + b[i];
    }
#endif
What follows is the assembly that this C code compiles to under each of these compiler settings:
- test-gcc-debug - no optimizations, no explicit SIMD
- test-gcc-o3 - optimization level 3, no explicit SIMD
- test-gcc-simd - optimization level 3, explicit SIMD
- test-gcc-debug-simd - no optimizations, explicit SIMD
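You can tell from the assembly that the loop generated with -O3 and no explicit SIMD is the same as the loop generated with -O3 and explicit SIMD: both load 16 bytes with movdqa, add with paddd, store with movaps, and advance the index by 16 bytes per iteration.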
test-gcc-debug
41:test.c **** for (int i = 0; i < size; i += 1) { 165 .loc 1 41 0 166 0123 C745F800 movl $0, -8(%rbp) #, i 166 000000 167 012a EB2A jmp .L8 # 168 .L9: 42:test.c **** c[i] = a[i] + b[i]; 169 .loc 1 42 0 discriminator 3 170 012c 8B45F8 movl -8(%rbp), %eax # i, tmp120 171 012f 4898 cltq 172 0131 8B148500 movl a(,%rax,4), %edx # a, D.3584 172 000000 173 0138 8B45F8 movl -8(%rbp), %eax # i, tmp122 174 013b 4898 cltq 175 013d 8B048500 movl b(,%rax,4), %eax # b, D.3584 175 000000 176 0144 01C2 addl %eax, %edx # D.3584, D.3584 177 0146 8B45F8 movl -8(%rbp), %eax # i, tmp124 178 0149 4898 cltq 179 014b 89148500 movl %edx, c(,%rax,4) # D.3584, c 179 000000 41:test.c **** for (int i = 0; i < size; i += 1) { 180 .loc 1 41 0 discriminator 3 181 0152 8345F801 addl $1, -8(%rbp) #, i 182 .L8: 41:test.c **** for (int i = 0; i < size; i += 1) { 183 .loc 1 41 0 is_stmt 0 discriminator 1 184 0156 817DF8FF cmpl $99999999, -8(%rbp) #, i 184 E0F505 185 015d 7ECD jle .L9 #, 186 .LBE3: 43:test.c **** }
test-gcc-o3
163 .L4: 44:test.c **** for (int i = 0; i < size; i += 1) { 45:test.c **** c[i] = a[i] + b[i]; 167 .loc 1 45 0 discriminator 3 168 00c0 660F6F80 movdqa a(%rax), %xmm0 # MEM[symbol: a, index: ivtmp.29_77, offset: 0B], MEM[symbol: a, index: ivtm 168 00000000 169 00c8 4883C010 addq $16, %rax #, ivtmp.29 170 00cc 660FFE80 paddd b-16(%rax), %xmm0 # MEM[symbol: b, index: ivtmp.29_77, offset: 0B], vect__33.14 170 00000000 171 00d4 0F298000 movaps %xmm0, c-16(%rax) # vect__33.14, MEM[symbol: c, index: ivtmp.29_77, offset: 0B] 171 000000 172 00db 483D0084 cmpq $400000000, %rax #, ivtmp.29 172 D717 173 00e1 75DD jne .L4 #,
test-gcc-simd
163 .L4: 34:test.c **** 35:test.c **** #ifdef USE_SIMD 36:test.c **** for (int i = 0; i < size; i += 4) { 37:test.c **** v4si *a_ptr = (v4si *) &a[i]; 38:test.c **** v4si *b_ptr = (v4si *) &b[i]; 39:test.c **** v4si *c_ptr = (v4si *) &c[i]; 40:test.c **** *c_ptr = *a_ptr + *b_ptr; 168 .loc 1 40 0 discriminator 3 169 00c0 660F6F80 movdqa a(%rax), %xmm0 # MEM[symbol: a, index: ivtmp.25_86, offset: 0B], MEM[symbol: a, index: ivtm 169 00000000 170 00c8 4883C010 addq $16, %rax #, ivtmp.25 171 .LVL10: 172 00cc 660FFE80 paddd b-16(%rax), %xmm0 # MEM[symbol: b, index: ivtmp.25_86, offset: 0B], D.2962 172 00000000 173 00d4 0F298000 movaps %xmm0, c-16(%rax) # D.2962, MEM[symbol: c, index: ivtmp.25_86, offset: 0B] 173 000000 174 .LBE26: 36:test.c **** for (int i = 0; i < size; i += 4) { 175 .loc 1 36 0 discriminator 3 176 00db 483D0084 cmpq $400000000, %rax #, ivtmp.25 176 D717 177 00e1 75DD jne .L4 #,
test-gcc-debug-simd
168 .L9: 169 .LBB4: 37:test.c **** v4si *a_ptr = (v4si *) &a[i]; 170 .loc 1 37 0 discriminator 3 171 012c 8B45F8 movl -8(%rbp), %eax # i, tmp120 172 012f 4898 cltq 173 0131 48C1E002 salq $2, %rax #, tmp121 174 0135 48050000 addq $a, %rax #, tmp122 174 0000 175 013b 488945D8 movq %rax, -40(%rbp) # tmp122, a_ptr 38:test.c **** v4si *b_ptr = (v4si *) &b[i]; 176 .loc 1 38 0 discriminator 3 177 013f 8B45F8 movl -8(%rbp), %eax # i, tmp124 178 0142 4898 cltq 179 0144 48C1E002 salq $2, %rax #, tmp125 180 0148 48050000 addq $b, %rax #, tmp126 180 0000 181 014e 488945D0 movq %rax, -48(%rbp) # tmp126, b_ptr 39:test.c **** v4si *c_ptr = (v4si *) &c[i]; 182 .loc 1 39 0 discriminator 3 183 0152 8B45F8 movl -8(%rbp), %eax # i, tmp128 184 0155 4898 cltq 185 0157 48C1E002 salq $2, %rax #, tmp129 186 015b 48050000 addq $c, %rax #, tmp130 186 0000 187 0161 488945C8 movq %rax, -56(%rbp) # tmp130, c_ptr 40:test.c **** *c_ptr = *a_ptr + *b_ptr; 188 .loc 1 40 0 discriminator 3 189 0165 488B45D8 movq -40(%rbp), %rax # a_ptr, tmp131 190 0169 660F6F08 movdqa (%rax), %xmm1 # *a_ptr_38, D.2868 191 016d 488B45D0 movq -48(%rbp), %rax # b_ptr, tmp132 192 0171 660F6F00 movdqa (%rax), %xmm0 # *b_ptr_39, D.2868 193 0175 660FFEC1 paddd %xmm1, %xmm0 # D.2868, D.2868 194 0179 488B45C8 movq -56(%rbp), %rax # c_ptr, tmp133 195 017d 0F2900 movaps %xmm0, (%rax) # D.2868, *c_ptr_40 196 .LBE4: 36:test.c **** for (int i = 0; i < size; i += 4) { 197 .loc 1 36 0 discriminator 3 198 0180 8345F804 addl $4, -8(%rbp) #, i 199 .L8: 36:test.c **** for (int i = 0; i < size; i += 4) { 200 .loc 1 36 0 is_stmt 0 discriminator 1 201 0184 817DF8FF cmpl $99999999, -8(%rbp) #, i 201 E0F505 202 018b 7E9F jle .L9 #,
How to get that nice assembly output
gcc -std=c11 -S -D_POSIX_C_SOURCE=199309L -fverbose-asm -g -D USE_SIMD test.c -o test.s
as -alhnd test.s