/*
 * Multiply two groups of four floats element by element using SSE.
 *
 *   a, b  inputs; exactly four consecutive floats are read from each.
 *   c     output; exactly four consecutive floats are written,
 *         c[i] = a[i] * b[i] for i = 0..3.
 *
 * The unaligned load/store intrinsics are used, so none of the pointers
 * needs 16-byte alignment. Both inputs are fully loaded into registers
 * before the result is stored, so c may refer to the same memory as a or b.
 */
void mul_float4_sse(const float *a, const float *b, float *c)
{
   const __m128 reg_a = _mm_loadu_ps(a);            /* { a[0], a[1], a[2], a[3] } */
   const __m128 reg_b = _mm_loadu_ps(b);            /* { b[0], b[1], b[2], b[3] } */
   const __m128 reg_c = _mm_mul_ps(reg_a, reg_b);   /* lane-wise product a[i]*b[i] */

   _mm_storeu_ps(c, reg_c);                         /* c[0..3] <- reg_c */
}
