_mm256_dp_ps
Classification
AVX_ALL, Arithmetic, CPUID Test: AVX
Header File
immintrin.h
Instruction
VDPPS ymm, ymm, ymm, imm8
Synopsis
__m256 _mm256_dp_ps(__m256 a, __m256 b, const int imm8);
Description
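Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". The operation is performed independently on each 128-bit lane of "a" and "b" (four elements per lane), with the same "imm8" applied to both lanes.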
Operation
DEFINE DP(a[127:0], b[127:0], imm8[7:0]) {
	FOR j := 0 to 3
		i := j*32
		IF imm8[(4+j)%8]
			temp[i+31:i] := a[i+31:i] * b[i+31:i]
		ELSE
			temp[i+31:i] := FP32(0.0)
		FI
	ENDFOR
	
	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
	
	FOR j := 0 to 3
		i := j*32
		IF imm8[j%8]
			tmpdst[i+31:i] := sum[31:0]
		ELSE
			tmpdst[i+31:i] := FP32(0.0)
		FI
	ENDFOR
	RETURN tmpdst[127:0]
}
dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
dst[MAX:256] := 0