Genesis Digital Audio Workstation

SIMD Test

2015 Oct 21

I was aware of Single Instruction Multiple Data (SIMD) optimizations, but I wasn't sure how to take advantage of them, or whether I needed to use them explicitly rather than relying on the compiler to apply them for me.

So naturally I wrote some code to explore.

Results: Both GCC and Clang can perform simple SIMD optimization when -O3 is enabled. More difficult SIMD optimization problems were not explored.

test-gcc-debug
sum: 99897982687.000000  time: 0.446810
test-gcc-o3
sum: 99897982687.000000  time: 0.157358
test-gcc-simd
sum: 99897982687.000000  time: 0.157701
test-gcc-debug-simd
sum: 99897982687.000000  time: 0.213039
test-clang-debug
sum: 99897982687.000000  time: 0.410983
test-clang-o3
sum: 99897982687.000000  time: 0.159946
test-clang-simd
sum: 99897982687.000000  time: 0.160412
test-clang-debug-simd
sum: 99897982687.000000  time: 0.204966

This is the C code that was being timed:

#ifdef USE_SIMD
    for (int i = 0; i < size; i += 4) {
        v4si *a_ptr = (v4si *) &a[i];
        v4si *b_ptr = (v4si *) &b[i];
        v4si *c_ptr = (v4si *) &c[i];
        *c_ptr = *a_ptr + *b_ptr;
    }

#else
    for (int i = 0; i < size; i += 1) {
        c[i] = a[i] + b[i];
    }
#endif

What follows is the assembly that this C code compiles to under each of the GCC configurations listed above.

You can tell from the assembly that the loop generated with -O3 and no explicit SIMD uses the same instructions as the loop generated with -O3 and explicit SIMD.

test-gcc-debug

  41:test.c        ****     for (int i = 0; i < size; i += 1) {
 165              		.loc 1 41 0
 166 0123 C745F800 		movl	$0, -8(%rbp)	#, i
 166      000000
 167 012a EB2A     		jmp	.L8	#
 168              	.L9:
  42:test.c        ****         c[i] = a[i] + b[i];
 169              		.loc 1 42 0 discriminator 3
 170 012c 8B45F8   		movl	-8(%rbp), %eax	# i, tmp120
 171 012f 4898     		cltq
 172 0131 8B148500 		movl	a(,%rax,4), %edx	# a, D.3584
 172      000000
 173 0138 8B45F8   		movl	-8(%rbp), %eax	# i, tmp122
 174 013b 4898     		cltq
 175 013d 8B048500 		movl	b(,%rax,4), %eax	# b, D.3584
 175      000000
 176 0144 01C2     		addl	%eax, %edx	# D.3584, D.3584
 177 0146 8B45F8   		movl	-8(%rbp), %eax	# i, tmp124
 178 0149 4898     		cltq
 179 014b 89148500 		movl	%edx, c(,%rax,4)	# D.3584, c
 179      000000
  41:test.c        ****     for (int i = 0; i < size; i += 1) {
 180              		.loc 1 41 0 discriminator 3
 181 0152 8345F801 		addl	$1, -8(%rbp)	#, i
 182              	.L8:
  41:test.c        ****     for (int i = 0; i < size; i += 1) {
 183              		.loc 1 41 0 is_stmt 0 discriminator 1
 184 0156 817DF8FF 		cmpl	$99999999, -8(%rbp)	#, i
 184      E0F505
 185 015d 7ECD     		jle	.L9	#,
 186              	.LBE3:
  43:test.c        ****     }

test-gcc-o3

 163              	.L4:
  44:test.c        ****     for (int i = 0; i < size; i += 1) {
  45:test.c        ****         c[i] = a[i] + b[i];
 167              		.loc 1 45 0 discriminator 3
 168 00c0 660F6F80 		movdqa	a(%rax), %xmm0	# MEM[symbol: a, index: ivtmp.29_77, offset: 0B], MEM[symbol: a, index: ivtm
 168      00000000 
 169 00c8 4883C010 		addq	$16, %rax	#, ivtmp.29
 170 00cc 660FFE80 		paddd	b-16(%rax), %xmm0	# MEM[symbol: b, index: ivtmp.29_77, offset: 0B], vect__33.14
 170      00000000 
 171 00d4 0F298000 		movaps	%xmm0, c-16(%rax)	# vect__33.14, MEM[symbol: c, index: ivtmp.29_77, offset: 0B]
 171      000000
 172 00db 483D0084 		cmpq	$400000000, %rax	#, ivtmp.29
 172      D717
 173 00e1 75DD     		jne	.L4	#,

test-gcc-simd

 163              	.L4:
  34:test.c        **** 
  35:test.c        **** #ifdef USE_SIMD
  36:test.c        ****     for (int i = 0; i < size; i += 4) {
  37:test.c        ****         v4si *a_ptr = (v4si *) &a[i];
  38:test.c        ****         v4si *b_ptr = (v4si *) &b[i];
  39:test.c        ****         v4si *c_ptr = (v4si *) &c[i];
  40:test.c        ****         *c_ptr = *a_ptr + *b_ptr;
 168              		.loc 1 40 0 discriminator 3
 169 00c0 660F6F80 		movdqa	a(%rax), %xmm0	# MEM[symbol: a, index: ivtmp.25_86, offset: 0B], MEM[symbol: a, index: ivtm
 169      00000000 
 170 00c8 4883C010 		addq	$16, %rax	#, ivtmp.25
 171              	.LVL10:
 172 00cc 660FFE80 		paddd	b-16(%rax), %xmm0	# MEM[symbol: b, index: ivtmp.25_86, offset: 0B], D.2962
 172      00000000 
 173 00d4 0F298000 		movaps	%xmm0, c-16(%rax)	# D.2962, MEM[symbol: c, index: ivtmp.25_86, offset: 0B]
 173      000000
 174              	.LBE26:
  36:test.c        ****     for (int i = 0; i < size; i += 4) {
 175              		.loc 1 36 0 discriminator 3
 176 00db 483D0084 		cmpq	$400000000, %rax	#, ivtmp.25
 176      D717
 177 00e1 75DD     		jne	.L4	#,

test-gcc-debug-simd

 168              	.L9:
 169              	.LBB4:
  37:test.c        ****         v4si *a_ptr = (v4si *) &a[i];
 170              		.loc 1 37 0 discriminator 3
 171 012c 8B45F8   		movl	-8(%rbp), %eax	# i, tmp120
 172 012f 4898     		cltq
 173 0131 48C1E002 		salq	$2, %rax	#, tmp121
 174 0135 48050000 		addq	$a, %rax	#, tmp122
 174      0000
 175 013b 488945D8 		movq	%rax, -40(%rbp)	# tmp122, a_ptr
  38:test.c        ****         v4si *b_ptr = (v4si *) &b[i];
 176              		.loc 1 38 0 discriminator 3
 177 013f 8B45F8   		movl	-8(%rbp), %eax	# i, tmp124
 178 0142 4898     		cltq
 179 0144 48C1E002 		salq	$2, %rax	#, tmp125
 180 0148 48050000 		addq	$b, %rax	#, tmp126
 180      0000
 181 014e 488945D0 		movq	%rax, -48(%rbp)	# tmp126, b_ptr
  39:test.c        ****         v4si *c_ptr = (v4si *) &c[i];
 182              		.loc 1 39 0 discriminator 3
 183 0152 8B45F8   		movl	-8(%rbp), %eax	# i, tmp128
 184 0155 4898     		cltq
 185 0157 48C1E002 		salq	$2, %rax	#, tmp129
 186 015b 48050000 		addq	$c, %rax	#, tmp130
 186      0000
 187 0161 488945C8 		movq	%rax, -56(%rbp)	# tmp130, c_ptr
  40:test.c        ****         *c_ptr = *a_ptr + *b_ptr;
 188              		.loc 1 40 0 discriminator 3
 189 0165 488B45D8 		movq	-40(%rbp), %rax	# a_ptr, tmp131
 190 0169 660F6F08 		movdqa	(%rax), %xmm1	# *a_ptr_38, D.2868
 191 016d 488B45D0 		movq	-48(%rbp), %rax	# b_ptr, tmp132
 192 0171 660F6F00 		movdqa	(%rax), %xmm0	# *b_ptr_39, D.2868
 193 0175 660FFEC1 		paddd	%xmm1, %xmm0	# D.2868, D.2868
 194 0179 488B45C8 		movq	-56(%rbp), %rax	# c_ptr, tmp133
 195 017d 0F2900   		movaps	%xmm0, (%rax)	# D.2868, *c_ptr_40
 196              	.LBE4:
  36:test.c        ****     for (int i = 0; i < size; i += 4) {
 197              		.loc 1 36 0 discriminator 3
 198 0180 8345F804 		addl	$4, -8(%rbp)	#, i
 199              	.L8:
  36:test.c        ****     for (int i = 0; i < size; i += 4) {
 200              		.loc 1 36 0 is_stmt 0 discriminator 1
 201 0184 817DF8FF 		cmpl	$99999999, -8(%rbp)	#, i
 201      E0F505
 202 018b 7E9F     		jle	.L9	#,

How to get that nice assembly output

gcc -std=c11 -S -D_POSIX_C_SOURCE=199309L -fverbose-asm -g -D USE_SIMD test.c -o test.s
as -alhnd test.s

From this nice StackOverflow answer.

Previous Post: Live Coding: Rendering the User's Project

Next Post: Live Coding: Building the Sequencer

Genesis on GitHub