MV.X and MV.swizzle

Mon Jun 28 09:56:12 2021 · independent

# Pseudocode: loop over VL elements of SUBVL sub-elements each, reading the
# register file at rd plus the index value held in regs[rs+j].
# NOTE(review): neither side of the assignment depends on i and the
# destination is always regs[rd], so as written every outer iteration repeats
# the same SUBVL writes to a single register - presumably rd/rs are meant to
# advance per element; confirm against the accompanying text.
for i in range(VL):
   for j in range(SUBVL):
      regs[rd] = regs[rd+regs[rs+j]]

# Pseudocode variant of the SUBVL loop: the index register is selected with a
# running counter k (regs[rs+k]) instead of the inner loop variable j.
# NOTE(review): k is never initialised in this snippet (presumably it starts
# at 0) and the stray indentation of "k++" suggests it belongs to the inner
# loop body - confirm. As in the j-indexed form, i is unused and the
# destination is always regs[rd].
for i in range(VL):
   for j in range(SUBVL):
      regs[rd] = regs[rd+regs[rs+k]]
        k++

# Illustrative pseudocode (Python/C hybrid, not runnable) for MV.X with a
# selectable element width, treating the register file as a byte array.
#   rd     - destination register number
#   rs1    - register number whose byte address is used as the base
#   funct4 - instruction field; bits 3:2 carry the element-width selector
def mv_x(rd, rs1, funct4):
    # 2-bit element-width selector taken from funct4[3:2]
    elwidth = (funct4>>2) & 0x3
    bitwidth = {0:XLEN, 1:8, 2:16, 3:32}[elwidth] # get bits per el
    bytewidth = bitwidth / 8 # get bytes per el
    for i in range(VL):
        # byte address of register rs1 within the register file
        addr = (unsigned char *)&regs[rs1]
        # NOTE(review): addr and offset do not depend on i, and regs[rd] is
        # the only destination, so each of the VL iterations repeats the same
        # access as written - presumably the offset should step per element
        # and rd should advance; confirm.
        offset = addr + bytewidth # get offset within regfile as SRAM
        # TODO, actually, needs to respect rd and rs1 element width,
        # here, as well.  this pseudocode just illustrates that the
        # MV.X operation contains a way to compact the indices into
        # less space.
        # NOTE(review): offset was formed from a pointer (addr) but is used
        # here as a byte index into regs - address and index are mixed.
        regs[rd] = (unsigned char*)(regs)[offset]

ldimm x8, 1
ldimm x9, 3
ldimm x10, 2
ldimm x11, 0
{SVP.VL=4} MV.X x3, x8, elwidth=default

ldimm x8, 0x00020301
{SVP.VL=4} MV.X x3, x8, elwidth=8

/* 4x4 transpose of 32-bit elements held in I0..I3 (declared elsewhere).
 * Step 1: pair the rows (I0,I1) and (I2,I3) with the 32-bit unpack
 * intrinsics; the "lo" forms interleave the lower halves of each pair,
 * the "hi" forms the upper halves. */
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);

/* Step 2: combine the 64-bit halves of the intermediates and assign the
 * transposed values back into I[0-3]. */
I0 = _mm_unpacklo_epi64(T0, T1);
I1 = _mm_unpackhi_epi64(T0, T1);
I2 = _mm_unpacklo_epi64(T2, T3);
I3 = _mm_unpackhi_epi64(T2, T3);

pfscale,3 F2, F1, F10
pfscaleadd,2 F2, F1, F11, F2
pfscaleadd,1 F2, F1, F12, F2
pfscaleadd,0 F2, F1, F13, F2

fmul f2, f1.xxxx, f10
fmac f2, f1.yyyy, f11, f2
fmac f2, f1.zzzz, f12, f2
fmac f2, f1.wwww, f13, f2

/// Element types that supply a fixed table of four constants which a swizzle
/// selector field may pick in place of a source lane.
///
/// `swizzle` indexes this table with the low two bits of a selector field
/// whenever bit 2 of that field is set.
pub trait SwizzleConstants: Copy + 'static {
    /// The four selectable constants, in selector-index order (0 to 3).
    const CONSTANTS: &'static [Self; 4];
}

/// 8-bit integer constants.
impl SwizzleConstants for u8 {
    const CONSTANTS: &'static [u8; 4] = &[
        0x00, // zero
        0x01, // one
        0xFF, // all bits set
        0x7F, // all bits set except the top bit
    ];
}

/// 16-bit integer constants.
impl SwizzleConstants for u16 {
    const CONSTANTS: &'static [u16; 4] = &[
        0x0000, // zero
        0x0001, // one
        0xFFFF, // all bits set
        0x7FFF, // all bits set except the top bit
    ];
}

/// Single-precision floating-point constants.
impl SwizzleConstants for f32 {
    const CONSTANTS: &'static [f32; 4] = &[
        0.0,  // zero
        1.0,  // one
        -1.0, // minus one
        0.5,  // one half
    ];
}

// impl for other types too...

pub fn swizzle<Elm, Selector>(
    rd: &mut [Elm],
    rs1: &[Elm],
    rs2: &[Selector],
    vl: usize,
    destsubvl: usize,
    srcsubvl: usize)
where
    Elm: SwizzleConstants,
    // Selector is a copyable type that can be converted into u64
    Selector: Copy + Into<u64>,
{
    const FIELD_SIZE: usize = 3;
    const FIELD_MASK: u64 = 0b111;
    for vindex in 0..vl {
        let selector = rs2[vindex].into();
        // selector's type is u64
        if selector >> (FIELD_SIZE * destsubvl) != 0 {
            // handle illegal instruction trap
        }
        for i in 0..destsubvl {
            let mut sel_field = selector >> (FIELD_SIZE * i);
            sel_field &= FIELD_MASK;
            let src = if (sel_field & 0b100) == 0 {
                &rs1[(vindex * srcsubvl)..]
            } else {
                SwizzleConstants::CONSTANTS
            };
            sel_field &= 0b11;
            if sel_field as usize >= srcsubvl {
                // handle illegal instruction trap
            }
            let value = src[sel_field as usize];
            rd[vindex * destsubvl + i] = value;
        }
    }
}

/// Reference model of the two-source swizzle.
///
/// For each of the `vl` vectors, the matching entry of `rs2` is read as a
/// packed selector made of 3-bit fields, field `i` occupying bits
/// `3*i .. 3*i+2`. For destination sub-element `i`, bit 2 of the field picks
/// the source register group (`rs1` when set, `rs3` when clear) and the low
/// two bits pick the sub-element inside the current vector of that source,
/// i.e. `src[vindex * srcsubvl + index]`.
///
/// * `rd` - destination, written at `vindex * destsubvl + i`.
/// * `rs1`, `rs3` - the two sources, `srcsubvl` elements per vector.
/// * `rs2` - one packed selector per vector.
/// * `vl` - number of vectors to process.
/// * `destsubvl` / `srcsubvl` - sub-elements per destination / source vector.
///
/// The two illegal-instruction conditions (selector bits set above the last
/// used field, and an index `>= srcsubvl`) are detected but are still
/// placeholders: no action is taken and processing continues.
///
/// # Panics
///
/// Panics on any out-of-bounds slice access (`rs2`, the chosen source or `rd`
/// too short for the requested `vl`, sub-vector lengths and indices).
fn swizzle2<Elm, Selector>(
    rd: &mut [Elm],
    rs1: &[Elm],
    rs2: &[Selector],
    rs3: &[Elm],
    vl: usize,
    destsubvl: usize,
    srcsubvl: usize,
) where
    // Elm is a copyable type
    Elm: Copy,
    // Selector is a copyable type that can be converted into u64
    Selector: Copy + Into<u64>,
{
    const BITS_PER_FIELD: usize = 3;
    const SOURCE_FLAG: u64 = 0b100;
    const INDEX_MASK: u64 = 0b011;

    for element in 0..vl {
        let packed: u64 = rs2[element].into();

        // Anything above the last field in use must be zero.
        let unused_bits = packed >> (BITS_PER_FIELD * destsubvl);
        if unused_bits != 0 {
            // handle illegal instruction trap
        }

        for lane in 0..destsubvl {
            let field = packed >> (BITS_PER_FIELD * lane);
            let bank = match field & SOURCE_FLAG {
                0 => rs3,
                _ => rs1,
            };
            let pick = (field & INDEX_MASK) as usize;
            if pick >= srcsubvl {
                // handle illegal instruction trap
            }

            let value = bank[element * srcsubvl + pick];
            rd[element * destsubvl + lane] = value;
        }
    }
}

Encoding	31:27	26:25	24:20	19:15	14:12	11:7	6:2	1:0
RV32-I-type	imm[11:0]			rs1[4:0]	funct3	rd[4:0]	opcode	0b11

RV32-I-type	fn4[3:0]	swizzle[7:0]		rs1[4:0]	0b000	rd[4:0]	OP-V	0b11

7:6	5:4	3:2	1:0
w	z	y	x

Encoding	31:27	26:25	24:20	19:15	14:12	11:7	6:2	1:0
RV32-R-type	funct7		rs2[4:0]	rs1[4:0]	funct3	rd[4:0]	opcode	0b11

RV32-R-type	0b0000000		rs2[4:0]	rs1[4:0]	0b001	rd[4:0]	OP-V	0b11

int/fp	DESTSUBVL	31	30:29	28:20	19:15	14:12	11:7
int	1 to 3	0	DESTSUBVL	selector	rs	000	rd
fp	1 to 3	1	DESTSUBVL	selector	rs	000	rd
int	4	selector[11:0]			rs	001	rd
fp	4	selector[11:0]			rs	010	rd

int/fp	31:28	27:20	19:15	14:12	11:7
int	DESTMASK	selector	rs	000	rd
fp	DESTMASK	selector	rs	001	rd
int	DESTMASK	constsel	rs	010	rd
fp	DESTMASK	constsel	rs	011	rd

MV.X and MV.swizzle

MV.X with 3 operands

macro-op fusion

VBLOCK context?

_mm_shuffle_ps?

Transpose

Transforms for DCT

Table to evaluate

Matrix 4x4 Vector mul

Pseudocode

	31:27	26:25	24:20	19:15	14:12	11:7
swizzle2	rs3	00	rs2	rs1	000	rd
fswizzle2	rs3	01	rs2	rs1	000	rd
swizzle	0	10	rs2	rs1	000	rd
fswizzle	0	11	rs2	rs1	000	rd
swizzlei	imm			rs1	001	rd
fswizzlei	imm			rs1	010	rd