Hi,

I don't understand why rev_byte_order4 is implemented the way it is. The following code compiles to 8 instructions instead of 19 and is about four times faster.

PhM

void rev_byte_order4(char* retval)
{
    char Temp;
    Temp = retval[0];
    retval[0] = retval[3];
    retval[3] = Temp;
    Temp = retval[1];
    retval[1] = retval[2];
    retval[2] = Temp;
}

Edit: If you have a bunch of them to swap (a multiple of 4), and you define a platform-specific vector implementation of rev_byte_order4, then this one compiles to 6 instructions on a PC and is even faster:

void rev_byte_order4_4(char* retval)
{
    static const long Shuffler = _MM_SHUFFLE(0,1,2,3);
    __m128 vFour32a,vFour32b;
    vFour32a = _mm_loadu_ps((float*)retval); // vFour32a = a3 a2 a1 a0
    vFour32b = _mm_movehl_ps(vFour32a,vFour32a); // vFour32b = a3 a2 a3 a2
    vFour32a = _mm_movelh_ps(vFour32a,vFour32a); // vFour32a = a1 a0 a1 a0
    vFour32a = _mm_shuffle_ps(vFour32b,vFour32a,Shuffler); // vFour32a = a0 a1 a2 a3
    _mm_storeu_ps((float*)retval,vFour32a);
}