Venus User Guide | ACE-LAB Echo

This guide provides essential information for working with the Venus framework, covering data types, intrinsic functions, and sample code for task and bas file creation.

DataType

Venus supports two types of vector lengths (8-bit or 16-bit), with the longest vector length being 65535. The definition method is as follows:

//Define a variable a as a vector of x elements, each of which is y bits wide.
//The definition length can be arbitrary.

__v(x)i(y) a

//Example:
// Define a 1024-element vector short16 where each element is 16 bits wide.
__v1024i16 short16;     
// Define a 512-element vector char8 where each element is 8 bits wide.
__v512i8 char8;         

Intrinsics

General Syntax Format

/*
input1  :vector1
input2  :vector2 or constant
MASKREAD_OFF/MASKREAD_ON (optional): Specifies whether to apply the mask register during computation. Elements are processed only if the corresponding mask bit is 1.
MASKWRITE_OFF/MASKWRITE_ON (optional): Controls whether the computation result is written to the mask register.
Length: Specifies the number of elements to process, up to the maximum vector length. If omitted, defaults to the full vector length.
*/

return_vector = vfunction(input1, input2, MASKREAD_OFF/MASKREAD_ON, MASKWRITE_OFF/MASKWRITE_ON，Length);

Base Intrinsics

1. Addition（vadd,vsadd,vsaddu）

// When MASKWREAD_ON is set, only positions with a value of 1 are calculated, 
// positions with a value of 0 are filled with the vector in_1.

out = vadd(in_1, in_2, MASKREAD_OFF,Length);    // in_2 + in_1
out = vsadd(in_1, in_2, MASKREAD_OFF,Length);    // in_2 + in_1 using saturation
out = vsaddu(in_1, in_2, MASKREAD_OFF,Length);    // unsigned(in_2) + unsigned(in_1) using saturation

2. Subtraction（vrsub,vsub,vssub,vssubu）

// When MASKWREAD_ON is set, only positions with a value of 1 are calculated,
// positions with a value of 0 are filled with the vector in_1.

out = vrsub(in_1, in_2, MASKREAD_OFF,Length);    // in_1 - in_2
out = vsub(in_1, in_2, MASKREAD_OFF,Length);    // in_2 - in_1
out = vssub(in_1, in_2, MASKREAD_OFF,Length);      // in_2 - in_1 using saturation
out = vssubu(in_1, in_2, MASKREAD_OFF,Length);     // unsigned(in_2) - unsigned(in_1) using saturation

3. Multiplication（vmul,vmulh,vmulhu,vmulhsu）

// When MASKWREAD_ON is set, only positions with a value of 1 are calculated, 
// the value of the position vector a with a value of 0.

out = vmul (in_1, in_2, MASKREAD_OFF,Length);    // in_2 * in_1,outputs the lower 8 or 16 bits of the multiplication result.
out = vmulh(in_1, in_2, MASKREAD_OFF,Length);    // in_2 * in_1,outputs the high 8 or 16 bits of the multiplication result.
out = vmulhu(in_1, in_2, MASKREAD_OFF,Length);     // unsigned(in_2) * unsigned(in_1),outputs the lower 8 or 16 bits of the multiplication result.
out = vmulhsu(in_1, in_2, MASKREAD_OFF,Length);    // signed(in_2) * unsigned(in_1),outputs the lower 8 or 16 bits of the multiplication result.

4. Division（vdiv,vdivu）

// When MASKWREAD_ON is set, only positions with a value of 1 are calculated, 
// positions with a value of 0 are filled with the vector in_1.

out = vdiv(in_1, in_2, MASKREAD_OFF,Length);    // in_2/in_1
out = vdivu(in_1, in_2, MASKREAD_OFF,Length);    // unsigned(in_2)/unsigned(in_1)

5. Modulo（vrem,vremu）

out = vrem(in_1, in_2, MASKREAD_OFF,Length);    // in_2 % in_1
out = vremu(in_1, in_2, MASKREAD_OFF,Length);    // unsigned(in_2) % unsigned(in_1)

6. Logical Operations（vand,vor,vxor）

// When MASKWREAD_ON is set, only positions with a value of 1 are calculated,
// positions with a value of 0 are filled with the vector in_1.

out = vand(in_1, in_2, MASKREAD_OFF,Length);    // Compute the bitwise AND of in_1 and in_2
out = vor(in_1, in_2, MASKREAD_OFF,Length);        // Compute the bitwise OR of in_1 and in_2
out = vxor(in_1, in_2, MASKREAD_OFF,Length);    // Compute the bitwise XOR of in_1 and in_2

7. Arithmetic Shift Operations（vsll,vsrl,vsra）

out = vsll(in_1, constant, MASKREAD_OFF,Length);    // Shift signed integers in in_1 left by 'constant' bits
out = vsrl(in_1, constant, MASKREAD_OFF,Length);    // Shift unsigned integers in in_1 right by 'constant' bits
out = vsra(in_1, constant, MASKREAD_OFF,Length);    // Shift signed integers in in_1 right by 'constant' bits

8. Comparison （vseq,vsne,vsltu,vslt,vsleu,vsle,vsgtu,vsgt）

// When MASKWRITE_ON is set, the instruction has no return value.
// When MASKWREAD_ON is set, only positions with a value of 1 are compared, 
// positions with a value of 0 are filled with the vector a.
// Example: vseq(a, b, MASKREAD_OFF, MASKWRITE_ON);

out = vseq(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // b == a
out = vsne(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // b ≠ a
out = vsltu(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);     // (unsigned)b < (unsigned)a
out = vslt(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // b < a
out = vsleu(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // (unsigned)b ≤ (unsigned)a
out = vsle(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // b ≤ a
out = vsgtu(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // (unsigned)b ≥ (unsigned)a
out = vsgt(a, b, MASKREAD_OFF, MASKWRITE_OFF,Length);    // b ≥ a

💡 Note:
Comparison functions must specify MASKWRITE_OFF / MASKWRITE_ON.

9. Composite Functions（vmuladd,vmulsub,vaddmul,vsubmul）

out = vmuladd(a,b,c, MASKREAD_OFF,Length);    // (b * a) + c
out = vmulsub(a,b,c, MASKREAD_OFF,Length);     // (b * a) - c
out = vaddmul(a,b,c, MASKREAD_OFF,Length);    // (b + a) * c
out = vsubmul(a,b,c, MASKREAD_OFF,Length);    // (b - a) * c

10. Complex Multiplication Function（vcmxmul）

__v4096i8 *cmxreal_part = &tempWnResult_real;
__v4096i8 *cmximag_part = &tempWnResult_imag;

vcmxmul(cmximag_part, cmxreal_part, tempWnResult_real, tempWnResult_imag, 
        sin_stage0, cos_stage0, MASKREAD_OFF,calculate_length);
//(tempWnResult_real + tempWnResult_imag * i) * (cos_stage0 + sin_stage0 * i) 
// = (cmxreal_part + cmximag_part * i)

Extended intrinsics

1. Gather/Scatter（vshuffle）

vshuffle(out, index, in, SHUFFLE_GATHER, Length);   // Void function. Places in[index(i)] into out(i) for i ∈ [0, Length-1].
vshuffle(out, index, in, SHUFFLE_SCATTER, Length);  // Void function. Writes in(i) to out[index(i)] for i ∈ [0, Length-1].

2. Incremental Assignment Function（vrange）

//Fill vector in_and_out with {0,1,2,…,Length-1}. Void function.
vrange(in_and_out, Length); 

💡 Note: vrange only supports 16-bit data.

3. Broadcast Function（vbrdcst）

// Void function. Broadcasts 'constant' to all elements of vector in_and_out.
vbrdcst(in_and_out, constant, MASKREAD_OFF,Length);  

4. Shift and Output Function for Multiply and Divide（vsetshamt）

/* Description: 
When performing Venus multiplication with 8-bit vectors a and b, the result of a*b is 16 bits. 
Standard multiplication instructions can only output the lower 8 bits or high 8 bits of this result. 
To extract an arbitrary 8-bit segment from the 16-bit product, use the vsetshamt(constant) instruction beforehand. 
This configures subsequent multiplication operations (e.g., vmul, vmulh) to apply shifts before output:
    - vmul: Outputs the lower 8 bits of (a*b >> constant).
    - vmulh: Outputs the upper 8 bits of (a*b << constant).
*/
vsetshamt(constant);
out = vmul(a, b, MASKREAD_OFF,Length);

5. Declaration（vclaim）

// Void function. Ensures the address space of in_and_out is preserved and not eliminated by compiler optimizations.
// Typically placed after the variable definition.
vclaim(in_and_out);

6. Address Retrieval Function（vaddr）

// Returns the starting address of the vector,out is an integer.
out = vaddr(vector);    

7. Memory Locking Function（vbarrier）

// Example: memory locking for move the array data

// Transfer of 16-bit array
VSPM_OPEN();
vbarrier();
int testdata_16bit_a_addr = vaddr(testdata_16bit_a);
for (int i = 0; i < 8; i++)
{
    *(volatile unsigned short *)(testdata_16bit_a_addr + (i << 1)) = testdata_16bit_1[i];
}
int testdata_16bit_b_addr = vaddr(testdata_16bit_b);
for (int i = 0; i < 8; i++)
{
    *(volatile unsigned short *)(testdata_16bit_b_addr + (i << 1)) = testdata_16bit_2[i];
}
    VSPM_CLOSE();

// Transfer of 8-bit array
VSPM_OPEN();
vbarrier();
int testdata_8bit_a_addr = vaddr(testdata_8bit_a);
for (int i = 0; i < 8; i++)
{
    *(volatile unsigned char *)(testdata_8bit_a_addr + i) = testdata_8bit_1[i];
}
int testdata_8bit_b_addr = vaddr(testdata_8bit_b);
for (int i = 0; i < 8; i++)
{
    *(volatile unsigned char *)(testdata_8bit_b_addr + i) = testdata_8bit_2[i];
}
VSPM_CLOSE();

8. Mask Inversion Function（vmnot）

vmnot(mask_reg); // Inverting the value of the mask register

9. Vector Return Function（vreturn）

typedef struct{
    short data;
}attribute_((aligned(64)))short_struct;

__v2048i16 Vector;
short length = 256;
short_struct constant;
constant.data = 10;

vreturn (&constant,sizeof(constant),Vector,length,...,...);
//The scalar data needs to be put into the short_struct structure, 
//and the vector data needs to return the vector length length.

10. Vector Operations（vredmin,vredmax,vredsum）

/*compute the minimum\maximum values all elements and 
the sum of all elements in a single vector.
*/

vector_b = vredmin(vector_a,MASKRED_OFF,length);
//Find the minimum value of the first length variables in the vector_a, 
//and store the value in the first variable of the vector_b.

vector_b = vredmax(vector_a,MASKRED_OFF,length);
//the usage method is the same as vredmin.
//You can invert the variable and replace it with find Minimum.

vector_b = vredsum(vector_a,MASKRED_OFF,length);
//add up the first length variables in the vector_a
//and the results are stored in the first four variables of the vector_b.

A Sample Code for Single TASK File Creation

#include "data_type.h"
#include "riscv_printf.h"
#include "venus.h"

typedef short __v2048i16 __attribute__((ext_vector_type(2048)));
typedef char  __v4096i8 __attribute__((ext_vector_type(4096)));

/**
 * @section DESCRIPTION
 * This Task checks the input data vector for a CRC error for 5G New Radio (NR)
 * physical channels as specified in 3GPP TS 38.212 .
 * 
 * Features
 * - Supports all 5G NR CRC polynomials (CRC24A, CRC24B, CRC24C, CRC16, CRC11, CRC6)
 *
 * @param[in]     tmp_vin            : A 4096i8 vector for storing the input bit sequence.
 * @param[in]     in_fullLen         : A short struct for storing length of input data in bits.
 * @param[in]     in_pariLen         : A short struct for storing length of CRC data in bits.
 * @param[in]     poly               : A 4096i8 vector (stored table in bas) for storing the CRC generation polynomial.
 * @param[out]    out_crc_result     : A short struct for storing error detection result.
 * @param[out]    buf                : A 4096i8 vector for storing calculated CRC value (optional).
 * 
 */

int Task_nrCRC(__v4096i8 tmp_vin, short_struct in_fullLen, short_struct in_pariLen, __v4096i8 poly) {
  int fullLen = in_fullLen.data + 24;
  int pariLen = in_pariLen.data + 1;
  int msgLen  = fullLen - pariLen + 1;
  int tmp;

  __v4096i8 vin;
  vclaim(vin);
  vbrdcst(vin, 1, MASKREAD_OFF, fullLen);

  __v2048i16 vin_shuffle_index;
  vclaim(vin_shuffle_index);
  vrange(vin_shuffle_index, fullLen);
  vin_shuffle_index = vsadd(vin_shuffle_index, 24, MASKREAD_OFF, fullLen);

  vshuffle(vin, vin_shuffle_index, tmp_vin, SHUFFLE_SCATTER, fullLen);

  __v4096i8  buf;
  __v4096i8  msg;
  __v2048i16 index;
  vclaim(buf);
  vclaim(msg);
  vclaim(index);

  vrange(index, msgLen);
  vbrdcst(msg, 0, MASKREAD_OFF, fullLen);
  vshuffle(msg, index, vin, SHUFFLE_GATHER, msgLen);

  for (int i = 0; i < msgLen; i++) {
    int m_addr = vaddr(msg);
    vbarrier();
    VSPM_OPEN();
    unsigned int addr = m_addr + i;
    tmp               = *(volatile unsigned char *)(addr);
    VSPM_CLOSE();

    if (tmp == 1) {
      vrange(index, pariLen);
      index = vsadd(index, i, MASKREAD_OFF, pariLen);
      vshuffle(buf, index, msg, SHUFFLE_GATHER, pariLen);
      buf = vxor(buf, poly, MASKREAD_OFF, pariLen);
      vshuffle(msg, index, buf, SHUFFLE_SCATTER, pariLen);
    }
  }

  vrange(index, pariLen - 1);
  index = vsadd(index, msgLen, MASKREAD_OFF, pariLen - 1);
  vshuffle(buf, index, msg, SHUFFLE_GATHER, pariLen - 1);

  __v4096i8 si_rnti;
  vclaim(si_rnti);
  vbrdcst(si_rnti, 1, MASKREAD_OFF, pariLen - 1);
  vbrdcst(si_rnti, 0, MASKREAD_OFF, 8);
  buf = vxor(buf, si_rnti, MASKREAD_OFF, pariLen - 1);

  __v2048i16 shuffle_index;
  vclaim(shuffle_index);
  vrange(shuffle_index, pariLen);
  shuffle_index = vsadd(shuffle_index, fullLen - pariLen + 1, MASKREAD_OFF, pariLen - 1);

  __v4096i8 compare;
  vclaim(compare);
  vshuffle(compare, shuffle_index, vin, SHUFFLE_GATHER, pariLen - 1);

  __v4096i8 compare_result;
  compare_result = vsne(buf, compare, MASKREAD_OFF, MASKWRITE_OFF, pariLen - 1);

  compare_result          = vredsum(compare_result, MASKREAD_OFF, pariLen - 1);
  int compare_result_addr = vaddr(compare_result);
  vbarrier();
  VSPM_CLOSE();
  int crc_result = *(volatile unsigned char *)(compare_result_addr);
  VSPM_CLOSE();

  short_struct out_crc_result;
  out_crc_result.data = crc_result;

  vreturn(buf, sizeof(buf), &out_crc_result, sizeof(out_crc_result));
}

💡Notes:
Try not to have scalar operations before vector operations, and add vclaim when they occur.

A Sample Code for BAS File Creation

'Define vectors
parameter char in_vec1 = {1,2,3,...}         'no ";" ending
parameter short in_vec2 = {1,2,3,...}    
'Define constants
parameter short constant = {1}     

dfedata char dfe_input_0[4096]  'the input value of dfe
dfedata char dfe_input_1[4096]     'the input value of dfe
dag_input short NCELLID2[1]         'the input value of dag

return_value short subFrameNum[1]  'dag return value

dag dag1 = {
  [dfe_output_0] = Task_example(dfe_input_0, dfe_input_1,in_vec1,in_vec2,constant)
  ' _v4096i16 Task_example(_v4096i8 in_1, __v4096i8 in_2,__v409618 in_3,__v2048i16 in_4, short_struct const)
  ' typedef struct {short data;}__attribute_((aligned(64))) short_struct;
}

END

💡Notes:
1、A single task in a bas file cannot be printed.

2、Each line of code can not be added“ ; ”.

3、Comments can not be added to the bas variable definition.

4、"data" can not be used as a variable name for bas.