Hi,
I have a program which does 1-byte vector sum using SSE intrinsics.
When I compile it with Intel compiler (ICC 15.0.1, x86_64, Linux, SandyBridge CPU) it segfaults, it looks like the end condition of the loop is not checked correctly. Same code works with GCC.
My only optimization flag -s "-O3" (When compiling with -O1 the program works).
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include <emmintrin.h>
static inline void add_single(void *dst, void *src) {
*((int8_t*)dst) += *((int8_t*)src);
}
__attribute__((noinline))
void vector_sum_char(void *dst, void *src, unsigned length)
{
const unsigned factor = sizeof(__m128i);
__m128i s[1], d[1];
int i, j;
i = 0;
while(i < (int)(length - factor + 1))
{
printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1));
j = 0;
d[j] = (__m128i)_mm_loadu_si128(dst);
s[j] = (__m128i)_mm_loadu_si128(src);
src += sizeof(__m128i);
j = 0;
_mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j]));
dst += sizeof(__m128i);
i += factor;
}
for (i = 0; i < (length%factor); ++i) {
add_single(dst, src);
++dst;
++src;
}
}
int main(int argc, char **argv)
{
int num_elems = 17;
void *src = calloc(1, num_elems);
void *dst = calloc(1, num_elems);
vector_sum_char(dst, src, num_elems);
free(src);
free(dst);
return 0;
}
Output looks like this:
src=0x6122e0 dst=0x612300 i=0 max=2 src=0x6122f0 dst=0x612310 i=16 max=2 src=0x612300 dst=0x612320 i=32 max=2 src=0x612310 dst=0x612330 i=48 max=2 src=0x612320 dst=0x612340 i=64 max=2 src=0x612330 dst=0x612350 i=80 max=2 src=0x612340 dst=0x612360 i=96 max=2 src=0x612350 dst=0x612370 i=112 max=2 ... Segmentation fault (core dumped) gdb: Program received signal SIGSEGV, Segmentation fault. vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j])); (gdb) bt #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) #1 0x00000000004015ff in main (argc=-10336, argv=0x0) (gdb) f 0 #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d[j], s[j])); 0x000000000040167f <+95>: mov %rbp,%rsi 0x0000000000401682 <+98>: mov %rbx,%rdx 0x0000000000401685 <+101>: mov %r12d,%ecx 0x0000000000401688 <+104>: mov %r15d,%r8d 0x000000000040168b <+107>: xor %eax,%eax 0x000000000040168d <+109>: callq 0x401338 <printf@plt> => 0x0000000000401692 <+114>: movdqu (%rbx),%xmm1 0x0000000000401696 <+118>: movdqu 0x0(%rbp),%xmm0 0x000000000040169b <+123>: paddb %xmm0,%xmm1 0x000000000040169f <+127>: inc %r14d 0x00000000004016a2 <+130>: movdqu %xmm1,(%rbx) 0x00000000004016a6 <+134>: add $0x10,%rbp 0x00000000004016aa <+138>: add $0x10,%rbx 0x00000000004016ae <+142>: add $0x10,%r12d 0x00000000004016b2 <+146>: cmp %r13d,%r14d 0x00000000004016b5 <+149>: jl 0x40167a <vector_sum_char+90>
please note here that in address <+146> r13d is compared, but on <+101> and <+142> r12d is being used as "i".