_mm_mask_dpbf16_ps
Classification
AVX-512, Arithmetic, CPUID Test: AVX512_BF16 + AVX512VL
Header File
immintrin.h
Instruction
VDPBF16PS xmm {k}, xmm, xmm
Synopsis
__m128 _mm_mask_dpbf16_ps(__m128 src, __mmask8 k, __m128bh a, __m128bh b);
Description
Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
Operation
DEFINE make_fp32(x[15:0]) {
	y.fp32  := 0.0
	y[31:16] := x[15:0]
	RETURN y
}
dst := src
FOR j := 0 to 3
	IF k[j]
		dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1])
		dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0])
	ELSE
		dst.dword[j] := src.dword[j]
	FI
ENDFOR
dst[MAX:128] := 0