DFT Using NNTool + Autotiler

Requirements

No specific requirement. This example should run without issue on all chips/boards/OSes.

Description

In this example the DFT and IDFT functions are generated using NNTool. In create_dft_graph.py a simple NNTool graph is created with 2 nodes, the DFT and the IDFT (see the comments in that file for more details):

dft_node = DFTNode(
    "dft_node",
    n_dft=args.n_dft,
    n_frames=1,
    frame_size=args.frame_size,
    # frame_step=args.frame_step,
    window=args.window_type,
    power=abs(args.power),
    output_complex_stft=args.power != 0
)
...
idft_node = IDFTNode(
    "idft_node",
    n_dft=args.n_dft,
    n_frames=1,
)

Since n_dft is the same for both, the twiddles are shared between the two nodes:

twids = dft_node.gen_twiddles()
out_dft = dft_node(inp, *twids)
out_idft = idft_node(out_dft[1] if args.power != 0 else out_dft, *twids)

Autotiler code is generated. Then the Autotiler model is compiled and run to generate GAP C code. All the rules and dependencies to generate files are done via CMakeLists.txt.

In the main application a wav file is read from the PC, and the DFT and IDFT are applied to it frame by frame. The reconstructed signal is then written back to a wav file and can be played.

How to run

mkdir build
cd build
cmake ../
make run -j

Code

/*
 * Copyright (C) 2017 GreenWaves Technologies
 * All rights reserved.
 *
 * This software may be modified and distributed under the terms
 * of the BSD license.  See the LICENSE file for details.
 *
 */


/* Autotiler includes. */
#include "dft_graphKernels.h"
#include "gaplib/fs_switch.h"
#include "gaplib/wavIO.h"
/* Two-level stringification: __XSTR expands its macro argument first, then
 * __STR turns the expanded text into a string literal. */
#define __XSTR(__s) __STR(__s)
#define __STR(__s) #__s

/* External RAM device handle, opened in main(); `ram` is a pointer alias to it. */
struct pi_device DefaultRam; 
struct pi_device* ram = &DefaultRam;
// Size in bytes of the temporary L2 buffer used to load files from PC to L2 and then store them in ram
#define TEMP_L2_SIZE 1200000
// Capacity of each L3 audio buffer in 16-bit samples (TEMP_L2_SIZE bytes / 2)
#define AUDIO_BUFFER_SIZE (TEMP_L2_SIZE>>1)

// NOTE(review): presumably referenced by the generated Autotiler code - confirm
AT_DEFAULTFLASH_EXT_ADDR_TYPE dft_graph_L3_Flash = 0;

/* External RAM addresses of the input signal (inSig) and of the reconstructed
 * output signal (outSig); both are filled in by pi_ram_alloc() in main(). */
static uint32_t inSig;
static uint32_t outSig;

L2_MEM DATATYPE_SIGNAL Audio_Frame[FRAME_SIZE];  // current input frame, 1st argument of dft_graphCNN
L2_MEM DATATYPE_SIGNAL Reconstructed_Frame[FRAME_SIZE]; // 3rd argument of dft_graphCNN, overlap-added into the output signal by main()
L2_MEM short int Reconstructed_Frame_temp[FRAME_SIZE];  // 16-bit scratch frame used for the overlap-and-add with the output stored in external RAM
L2_MEM DATATYPE_SIGNAL DFTCplxOut[(N_DFT + 1)*2]; // the 2 is because of complex numbers
L2_MEM DATATYPE_SIGNAL DFTMag[(N_DFT + 1)]; // 4th argument of dft_graphCNN; one value per element, so no factor 2 here

/**
 * Cluster task entry point: runs the generated graph on the current frame.
 *
 * Calls the generated cluster-side constructor and then the generated graph
 * function with the global L2 buffers (Audio_Frame, DFTCplxOut,
 * Reconstructed_Frame, DFTMag).
 *
 * @param arg Unused. It is declared so that the function really has the
 *            void (*)(void *) type through which the cluster task calls it;
 *            calling a function through an incompatible pointer type is
 *            undefined behaviour.
 */
static void dft_graph(void *arg)
{
    (void) arg;
    dft_graphCNN_ConstructCluster();
    dft_graphCNN(Audio_Frame, DFTCplxOut, Reconstructed_Frame, DFTMag);
}

/**
 * Application entry point.
 *
 * Steps:
 *   1. open the cluster, set the frequencies and open the external RAM;
 *   2. read WAV_FILE into a temporary L2 buffer and copy it to external RAM,
 *      together with a zeroed output buffer of the same length;
 *   3. for every frame, run the generated graph on the cluster and
 *      overlap-and-add the reconstructed frame into the output buffer;
 *   4. write the reconstructed signal to OUT_FILE;
 *   5. compare a window of the reconstructed signal with the original one and
 *      fail if the signal/error power ratio is below 90.
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 on success, a non-zero code on any failure.
 */
int main(int argc, char *argv[])
{
    printf("\n\n\t *** NNTOOL dft_graph Example ***\n\n");

    /* Configure And open cluster. */
    struct pi_device cluster_dev;
    struct pi_cluster_conf cl_conf;
    pi_cluster_conf_init(&cl_conf);
    cl_conf.cc_stack_size = STACK_SIZE;

    cl_conf.id = 0; /* Set cluster ID. */
                    // Enable the special icache for the master core
    cl_conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE |
                    // Enable the prefetch for all the cores, it's a 9bits mask (from bit 2 to bit 10), each bit correspond to 1 core
                    PI_CLUSTER_ICACHE_PREFETCH_ENABLE |
                    // Enable the icache for all the cores
                    PI_CLUSTER_ICACHE_ENABLE;

    pi_open_from_conf(&cluster_dev, (void *) &cl_conf);
    if (pi_cluster_open(&cluster_dev))
    {
        printf("Cluster open failed !\n");
        return -4;
    }

    /* Frequency Settings: defined in the Makefile */
    int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000);
    int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000);
    int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000);
    if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1)
    {
        printf("Error changing frequency !\nTest failed...\n");
        return -4;
    }
    printf("FC Frequency = %d Hz CL Frequency = %d Hz PERIPH Frequency = %d Hz\n", 
            pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH));

    /****
        Configure And Open the External Ram. 
    ****/
    struct pi_default_ram_conf ram_conf;
    pi_default_ram_conf_init(&ram_conf);
    ram_conf.baudrate = FREQ_FC*1000*1000;
    pi_open_from_conf(&DefaultRam, &ram_conf);
    if (pi_ram_open(&DefaultRam))
    {
        printf("Error ram open !\n");
        return -3;
    }
    printf("RAM Opened\n");

    /****
        Load Audio Wav from file 

    ****/
    // Read Audio Data from file using temp_L2_memory as temporary buffer
    // Data are prepared in L3 external memory
    char* temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 5;
    }
    
    // Allocate L3 buffers for audio IN/OUT
    if (pi_ram_alloc(&DefaultRam, &inSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("inSig Ram malloc failed !\n");
        return -4;
    }
    if (pi_ram_alloc(&DefaultRam, &outSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("outSig Ram malloc failed !\n");
        return -5;
    }

    // Read audio from file
    header_struct header_info;
    if (ReadWavFromFile(__XSTR(WAV_FILE), temp_L2_memory, AUDIO_BUFFER_SIZE*sizeof(short), &header_info)){
        printf("\nError reading wav file\n");
        return -1;
    }
    int samplerate = header_info.SampleRate;
    int num_samples = header_info.DataSize * 8 / (header_info.NumChannels * header_info.BitsPerSample);
    // NOTE(review): SampleRate is presumably expressed in Hz while the message says kHz - confirm before changing the string
    printf("Num Samples: %d with BitsPerSample: %d SR: %dkHz\n", num_samples, header_info.BitsPerSample, samplerate);

    if(num_samples*sizeof(short) > TEMP_L2_SIZE){
        printf("The size of the audio exceeds the available L2 memory space!\n");
        return -1;
    }

    // copy input data to L3
    pi_ram_write(&DefaultRam, inSig, temp_L2_memory, num_samples * sizeof(short));

    // Reset Output Buffer and copy to L3
    short * out_temp_buffer = (short *) temp_L2_memory;
    for(int i=0; i < num_samples; i++){
        out_temp_buffer[i] = 0;
    }
    pi_ram_write(&DefaultRam, outSig, temp_L2_memory, num_samples * sizeof(short));

    // free the temporary input memory
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);

    gap_fc_starttimer();
    gap_fc_resethwtimer();
    unsigned int start, elapsed;

    // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!!
    printf("Constructor\n");
    int ConstructorErr = dft_graphCNN_Construct();
    if (ConstructorErr)
    {
        printf("Graph constructor exited with error: %d\n(check the generated file dft_graphKernels.c to see which memory have failed to be allocated)\n", ConstructorErr);
        return -6;
    }

    /****
        Load the input audio signal frame by frame and run the DFT + IDFT graph
        IMP: Audio_Frame includes only a single frame for audio
    ****/
    int tot_frames = (int) (((float) (num_samples - FRAME_SIZE) / FRAME_STEP));
    printf("Number of frames to be processed: %d\n", tot_frames);

    struct pi_cluster_task task_rfft;
    pi_cluster_task(&task_rfft, (void (*)(void *))dft_graph, NULL);
    pi_cluster_task_stacks(&task_rfft, NULL, SLAVE_STACK_SIZE);

    for (int frame_id=0; frame_id < tot_frames; frame_id++)
    {
        // Byte offset of this frame inside the L3 input/output signals
        uint32_t frame_offset = (uint32_t) (frame_id * FRAME_STEP * sizeof(short));

        printf("Frame [%3d/%3d]", frame_id+1, tot_frames);
        // Copy Data from L3 to L2: the 16-bit samples are first stored at the
        // beginning of Audio_Frame itself
        short * in_temp_buffer = (short *) Audio_Frame;
        pi_ram_read(
            &DefaultRam, 
            inSig + frame_offset, 
            in_temp_buffer, 
            (uint32_t) FRAME_SIZE*sizeof(short)
        );
        // cast data from Q16.15 to DATATYPE_SIGNAL (may be float16)
        // The conversion is done in place, walking backwards so that a wider
        // DATATYPE_SIGNAL never overwrites 16-bit samples not yet converted
        for (int i=(FRAME_SIZE-1) ; i>=0; i--){
            Audio_Frame[i] = ((DATATYPE_SIGNAL) in_temp_buffer[i])/(1<<15);
        }

        /******
            Compute the RFFT + IRFFT
        ******/
        start = gap_fc_readhwtimer();
        pi_cluster_send_task_to_cl(&cluster_dev, &task_rfft);
        elapsed = gap_fc_readhwtimer() - start;
        // elapsed is unsigned, hence %u
        printf(" --> %u (%.2fus) \n", elapsed, ( (float) elapsed ) / FREQ_FC);

        // Hanning window requires divide by 2 when overlapp and add 
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame[i] = Reconstructed_Frame[i] / 2;   // FIXME: divide by 2 because of current Hanning windowing
        }

        // Read the part of the output signal this frame overlaps with
        pi_ram_read(&DefaultRam, outSig + frame_offset, 
            Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));
        // Overlap And ADD
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame_temp[i] += (short int)(Reconstructed_Frame[i] * (1<<15));
        }
        pi_ram_write(&DefaultRam, outSig + frame_offset,
            Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));

    }   // stop looping over frames

    /*
        Exit the real-time mode (only for testing)
        and write the reconstructed audio to file
    */
    // allocate L2 Memory
    temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 18;
    }
    // copy the reconstructed signal from L3 to L2
    out_temp_buffer = (short int * ) temp_L2_memory; 
    pi_ram_read(&DefaultRam, outSig, out_temp_buffer, num_samples * sizeof(short));

    WriteWavToFile(__XSTR(OUT_FILE), 16, samplerate, 1, (uint32_t *) temp_L2_memory, num_samples* sizeof(short));
    printf("Writing wav file to %s completed successfully\n", __XSTR(OUT_FILE));

    /*
        Compare with original signal
    */
    int NFrameToCheck = Min(tot_frames-7, 10);
    // Samples [check_start, check_end) are compared
    int check_start = 4*FRAME_STEP;
    int check_end = (NFrameToCheck-1)*FRAME_STEP+FRAME_SIZE;
    if (check_end <= check_start) {
        // Too few frames: the window would be empty (or its size negative),
        // so there is nothing meaningful to allocate or compare
        printf("Not enough frames to compare with the original signal, check skipped\n");
    } else {
        printf("Frames to check: %d..%d\n", 4, 4+NFrameToCheck);
        short int *original_input = (short int *) pi_l2_malloc(check_end*sizeof(short));
        if (original_input == 0) {
            printf("Error when allocating L2 buffer\n");
            return 18;
        }
        // Copy the beginning of the original audio to compare it with the computed one
        pi_ram_read(
            &DefaultRam, 
            inSig, 
            original_input, 
            (uint32_t) check_end*sizeof(short)
        );
        float perr = 0.0f, psig = 0.0f;
        for (int i=check_start; i<check_end; i++) {
            float diff = (float) (out_temp_buffer[i] - original_input[i]);
            perr += diff * diff;
            psig += out_temp_buffer[i] * out_temp_buffer[i];
        }
        // The original samples are no longer needed: release them before any return
        pi_l2_free(original_input, check_end*sizeof(short));

        float snr = psig / perr;
        printf("SNR wrt to original signal: %.2f\n\n", snr);
        if (snr < 90) {
            printf("Big error between original signal and reconstructed\n");
            return -1;
        }
    }

    /*
        Deallocate everything and Close the cluster
    */
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);

    dft_graphCNN_Destruct();
    pi_cluster_close(&cluster_dev);

    printf("Ended\n");
    return 0;
}

import os
from nntool.api import NNGraph
from nntool.api.types import DFTNode, IDFTNode
from nntool.api.utils import quantization_options, model_settings

import argparse
import argcomplete

def create_parser():
    """Build the command-line parser of the DFT/IDFT graph generator.

    Returns:
        argparse.ArgumentParser: parser exposing ``--float_type``, ``--n_dft``,
        ``--frame_size``, ``--window_type``, ``--power``, ``--at_model_path``
        and ``--tensors_dir``.
    """
    # create the top-level parser
    parser = argparse.ArgumentParser(prog='fft_at_generators')

    parser.add_argument('--float_type', default="bfloat16",
                        help="Float data type")
    parser.add_argument('--n_dft', default=400, type=int,
                        help="number of fft points")
    # The help text used to be a copy of the --n_dft one
    parser.add_argument('--frame_size', default=400, type=int,
                        help="number of samples in the input frame")
    parser.add_argument('--window_type', default="hanning",
                        help="windowing function")
    parser.add_argument('--power', default=0, type=int,
                        help="power spectrogram: 0=X (cplx), 1=|X| (real), 2=|X|^2 (real)")

    parser.add_argument('--at_model_path', default=None,
                        help="Path to the C autotiler model file to generate")
    parser.add_argument('--tensors_dir', default=None,
                        help="Path to the autotiler model constant files to generate")
    return parser


if __name__ == '__main__':
    parser = create_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    # --at_model_path defaults to None, which would only fail much later with
    # an obscure TypeError in os.path.split: report it as a usage error instead
    if args.at_model_path is None:
        parser.error("--at_model_path is required")
    model_dir, model_file = os.path.split(args.at_model_path)

    # Graph: input frame -> DFT -> IDFT.
    # if power != 0, we still want the complex output to reconstruct the signal in the iDFT
    # hence, we add a second output to the node that outputs the complex DFT alongside the spectrum
    want_spectrogram = args.power != 0

    G = NNGraph(name='dft_graph')
    inp = G.add_input([args.frame_size])
    dft_node = DFTNode(
        "dft_node",
        n_dft=args.n_dft,
        n_frames=1,
        frame_size=args.frame_size,
        # frame_step=args.frame_step,
        window=args.window_type,
        power=abs(args.power),
        output_complex_stft=want_spectrogram
    )

    # the DFT/iDFT operators require some constants to be generated
    twids = dft_node.gen_twiddles()

    # NNTool nodes can be referenced to construct the graph structure
    # here for instance we attach to the dft_node inputs the input node
    # and the twiddles constants
    out_dft = dft_node(inp, *twids)
    if want_spectrogram:
        # two outputs: [0] is the spectrogram, [1] the complex DFT
        G.add_output(name="out_spectrogram")(out_dft[0])
        cplx_dft = out_dft[1]
    else:
        cplx_dft = out_dft
    G.add_output(name="out_cplx_dft")(cplx_dft)

    # n_dft is the same as for the DFT node, so the same twiddles are reused
    idft_node = IDFTNode(
        "idft_node",
        n_dft=args.n_dft,
        n_frames=1,
    )
    out_idft = idft_node(cplx_dft, *twids)
    G.add_output("out_idft")(out_idft)

    G.add_dimensions()
    G.adjust_order()
    G.quantize(
        graph_options=quantization_options(
            scheme="FLOAT",
            float_type=args.float_type
        )
    )

    G.draw()
    G.gen_at_model(
        directory=model_dir,
        settings=model_settings(
            model_file=model_file,
            tensor_directory=args.tensors_dir,
            l3_flash_device="AT_MEM_L3_MRAMFLASH",
            graph_l1_promotion=1,
        )
    )