// Element-wise product of two groups of four floats using x86 SSE
// inline assembly: c[i] = a[i] * b[i] for i = 0..3.
//
// a, b : each points to 4 readable floats; no alignment is required
//        because the loads use MOVUPS.
// c    : points to 4 writable floats; no alignment is required. It may
//        alias a and/or b, since both inputs are fully loaded into
//        registers before the single store is issued.
//
// Every instruction is written as an {AT&T|Intel} dialect alternative so
// the statement assembles both with the default AT&T dialect and with
// -masm=intel.
void mul_float4_sse_asm(const float *a, const float *b, float *c)
{
    // __asm__/__volatile__ are used instead of asm/volatile so the code
    // still builds in strict ISO modes (e.g. -std=c11), where the plain
    // `asm` keyword is not recognized.
    __asm__ __volatile__ (
        // xmm0 <- { a[0], a[1], a[2], a[3] }
        "movups  {(%[a]), %%xmm0|xmm0, [%[a]]}\n\t"
        // xmm1 <- { b[0], b[1], b[2], b[3] }
        "movups  {(%[b]), %%xmm1|xmm1, [%[b]]}\n\t"
        // xmm0 <- { a[0]*b[0], a[1]*b[1], a[2]*b[2], a[3]*b[3] }
        "mulps   {%%xmm1, %%xmm0|xmm0, xmm1}\n\t"
        // { c[0], c[1], c[2], c[3] } <- xmm0
        "movups  {%%xmm0, (%[c])|[%[c]], xmm0}"
        : /* no register outputs; the result is written through c */
        : [a] "r"(a), [b] "r"(b), [c] "r"(c)
        // Both scratch registers are overwritten, so both must be listed.
        // "memory" tells the compiler that the statement reads and writes
        // memory through the pointer operands; without it, surrounding
        // loads/stores of a, b and c could be reordered or elided.
        : "xmm0", "xmm1", "memory");
}
