RFFT + IRFFT Using NNTool

Requirements

No specific requirement. This example should run without issue on all chips/boards/OSes.

Description

In this example the RFFT and IRFFT functions are generated using NNTool. In nntool_script.py, two single-node graphs are created in NNTool, one containing the RFFT and the other the IRFFT, and the Autotiler code is generated for each of them. The Autotiler models are then compiled and run to generate GAP C code. All the rules and dependencies needed to generate these files are handled in CMakeLists.txt.

In the main application, a wav file is read from the PC and the RFFT followed by the IRFFT is applied to it frame by frame. The reconstructed signal is then written back to a wav file, which can be played.

How to run

mkdir build
cd build
cmake ../
make run -j

Code

/*
 * Copyright (C) 2017 GreenWaves Technologies
 * All rights reserved.
 *
 * This software may be modified and distributed under the terms
 * of the BSD license.  See the LICENSE file for details.
 *
 */


/* Autotiler includes. */
#include "fft_forwardKernels.h"
#include "fft_inverseKernels.h"
#include "gaplib/fs_switch.h"
#include "gaplib/wavIO.h"
/* Two-step stringification: __XSTR expands its macro argument first, then
 * __STR turns the expanded text into a string literal. Used in main() to turn
 * the WAV_FILE / OUT_FILE build-time definitions into file-name strings. */
#define __XSTR(__s) __STR(__s)
#define __STR(__s) #__s

/* External RAM device handle; configured and opened in main(). */
struct pi_device DefaultRam; 
/* Pointer alias to DefaultRam. Not referenced in the code visible here --
 * NOTE(review): presumably consumed by other translation units; confirm. */
struct pi_device* ram = &DefaultRam;
// Size in bytes of the temporary L2 buffer used to stage the wav data between the PC file and the external RAM
#define TEMP_L2_SIZE 1200000
// Number of 16-bit samples that fit in the staging buffer (TEMP_L2_SIZE / 2)
#define AUDIO_BUFFER_SIZE (TEMP_L2_SIZE>>1)

/* Flash addresses for the two generated graphs, initialised to 0.
 * NOTE(review): presumably referenced by the generated Autotiler code; confirm. */
AT_DEFAULTFLASH_EXT_ADDR_TYPE fft_forward_L3_Flash = 0;
AT_DEFAULTFLASH_EXT_ADDR_TYPE fft_inverse_L3_Flash = 0;

/* External RAM addresses (filled in by pi_ram_alloc in main()) of the */
/* input audio signal and of the reconstructed output signal. */
static uint32_t inSig;
static uint32_t outSig;

L2_MEM DATATYPE_SIGNAL Audio_Frame[FRAME_SIZE];  // one input frame; passed as input to fft_forwardCNN
L2_MEM DATATYPE_SIGNAL Reconstructed_Frame[FRAME_SIZE]; // one output frame; written by fft_inverseCNN
L2_MEM short int Reconstructed_Frame_temp[FRAME_SIZE];  // 16-bit scratch frame used for the overlap-and-add against the output signal in RAM
L2_MEM DATATYPE_SIGNAL STFT_Spectrogram[(N_FFT / 2 + 1)*2]; // the 2 is because of complex numbers

/*
 * Cluster task entry point that main() dispatches once, after the graph
 * constructors have run, and whose execution time it reports as
 * "Time to copy twiddles". The body is currently empty, so the task does
 * no work.
 */
static void copy_twiddles_to_l1(void)
{
    /* nothing to do */
}

/*
 * Cluster task entry point: processes one audio frame through the forward
 * graph and then the inverse graph.
 *
 * main() fills Audio_Frame before dispatching this task and reads
 * Reconstructed_Frame once it has completed; STFT_Spectrogram carries the
 * intermediate result from the forward call to the inverse call.
 */
static void rfft_irfft()
{
    /* Each graph's *_ConstructCluster() is called right before the graph is
     * run. NOTE(review): presumably this (re)initialises cluster-side state
     * for that graph; confirm in the generated kernels. */
    fft_forwardCNN_ConstructCluster();
    /* NOTE(review): the meaning of the trailing 0 argument is defined by the
     * generated code and is not visible here. */
    fft_forwardCNN(Audio_Frame, STFT_Spectrogram, 0);
    fft_inverseCNN_ConstructCluster();
    fft_inverseCNN(STFT_Spectrogram, Reconstructed_Frame, 0);
}

/*
 * Example entry point.
 *
 * Steps performed:
 *   1. Open the cluster, set the FC / cluster / peripheral frequencies and
 *      open the external RAM.
 *   2. Read the input wav file (WAV_FILE) into a temporary L2 buffer, copy
 *      it to external RAM and zero an output signal of the same length.
 *   3. Construct both generated graphs, then for every frame: load
 *      FRAME_SIZE samples, convert them to DATATYPE_SIGNAL, run
 *      rfft_irfft() on the cluster and overlap-and-add the result into the
 *      output signal in external RAM.
 *   4. Write the output signal to OUT_FILE and compare a window of it with
 *      the original input by computing a signal/error power ratio.
 *
 * Returns 0 on success and a non-zero code on any failure (device open,
 * allocation, wav read, graph construction, audio too short to verify, or
 * power ratio below 90).
 */
int main(int argc, char *argv[])
{
    printf("\n\n\t *** NNTOOL fft_forward Example ***\n\n");

    /* Configure And open cluster. */
    struct pi_device cluster_dev;
    struct pi_cluster_conf cl_conf;
    pi_cluster_conf_init(&cl_conf);
    cl_conf.cc_stack_size = STACK_SIZE;

    cl_conf.id = 0; /* Set cluster ID. */
                    // Enable the special icache for the master core
    cl_conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE |
                    // Enable the prefetch for all the cores, it's a 9bits mask (from bit 2 to bit 10), each bit correspond to 1 core
                    PI_CLUSTER_ICACHE_PREFETCH_ENABLE |
                    // Enable the icache for all the cores
                    PI_CLUSTER_ICACHE_ENABLE;

    pi_open_from_conf(&cluster_dev, (void *) &cl_conf);
    if (pi_cluster_open(&cluster_dev))
    {
        printf("Cluster open failed !\n");
        return -4;
    }

    /* Frequency Settings: FREQ_FC / FREQ_CL / FREQ_PE come from the build configuration (in MHz) */
    int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000);
    int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000);
    int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000);
    if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1)
    {
        printf("Error changing frequency !\nTest failed...\n");
        return -4;
    }
    printf("FC Frequency = %d Hz CL Frequency = %d Hz PERIPH Frequency = %d Hz\n", 
            pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH));

    /****
        Configure And Open the External Ram. 
    ****/
    struct pi_default_ram_conf ram_conf;
    pi_default_ram_conf_init(&ram_conf);
    ram_conf.baudrate = FREQ_FC*1000*1000;
    pi_open_from_conf(&DefaultRam, &ram_conf);
    if (pi_ram_open(&DefaultRam))
    {
        printf("Error ram open !\n");
        return -3;
    }
    printf("RAM Opened\n");

    /****
        Load Audio Wav from file 

    ****/
    // Read Audio Data from file using temp_L2_memory as temporary buffer
    // Data are prepared in L3 external memory
    char* temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 5;
    }
    
    // Allocate L3 buffers for audio IN/OUT
    if (pi_ram_alloc(&DefaultRam, &inSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("inSig Ram malloc failed !\n");
        return -4;
    }
    if (pi_ram_alloc(&DefaultRam, &outSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("outSig Ram malloc failed !\n");
        return -5;
    }

    // Read audio from file
    header_struct header_info;
    if (ReadWavFromFile(__XSTR(WAV_FILE), temp_L2_memory, AUDIO_BUFFER_SIZE*sizeof(short), &header_info)){
        printf("\nError reading wav file\n");
        return -1;
    }
    int samplerate = header_info.SampleRate;
    int num_samples = header_info.DataSize * 8 / (header_info.NumChannels * header_info.BitsPerSample);
    // The sample rate is taken verbatim from the wav header, so it is reported in Hz
    printf("Num Samples: %d with BitsPerSample: %d SR: %dHz\n", num_samples, header_info.BitsPerSample, samplerate);

    if(num_samples*sizeof(short) > TEMP_L2_SIZE){
        printf("The size of the audio exceeds the available L2 memory space!\n");
        return -1;
    }

    // copy input data to L3
    pi_ram_write(&DefaultRam, inSig, temp_L2_memory, num_samples * sizeof(short));

    // Reset Output Buffer and copy to L3
    short * out_temp_buffer = (short *) temp_L2_memory;
    for(int i=0; i < num_samples; i++){
        out_temp_buffer[i] = 0;
    }
    pi_ram_write(&DefaultRam, outSig, temp_L2_memory, num_samples * sizeof(short));

    // free the temporary input memory
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);

    gap_fc_starttimer();
    gap_fc_resethwtimer();
    unsigned int start, elapsed;

    // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!!
    printf("Constructor\n");
    int ConstructorErr = fft_inverseCNN_Construct(0, 1, 1, 1, 1);
    if (ConstructorErr)
    {
        printf("Graph constructor exited with error: %d\n(check the generated file fft_inverseKernels.c to see which memory have failed to be allocated)\n", ConstructorErr);
        return -6;
    }
    ConstructorErr = fft_forwardCNN_Construct(1, 1, 1, 1, 1);
    if (ConstructorErr)
    {
        printf("Graph constructor exited with error: %d\n(check the generated file fft_forwardKernels.c to see which memory have failed to be allocated)\n", ConstructorErr);
        return -6;
    }
    // The inverse graph reuses the L1 area of the forward graph; only alias
    // it once the forward constructor is known to have succeeded.
    fft_inverse_L1_Memory = fft_forward_L1_Memory;

    struct pi_cluster_task task_ctor;
    pi_cluster_task(&task_ctor, (void (*)(void *))copy_twiddles_to_l1, NULL);
    pi_cluster_task_stacks(&task_ctor, NULL, SLAVE_STACK_SIZE);
    start = gap_fc_readhwtimer();
    pi_cluster_send_task_to_cl(&cluster_dev, &task_ctor);
    elapsed = gap_fc_readhwtimer() - start;
    // elapsed is unsigned, hence %u
    printf("Time to copy twiddles: %u (%.2fus)\n", elapsed, ( (float) elapsed ) / FREQ_FC);

    /****
        Load the input audio signal frame by frame and compute the RFFT + IRFFT
        IMP: Audio_Frame includes only a single frame for audio
    ****/
    int tot_frames = (int) (((float) (num_samples - FRAME_SIZE) / FRAME_STEP));
    printf("Number of frames to be processed: %d\n", tot_frames);

    struct pi_cluster_task task_rfft;
    pi_cluster_task(&task_rfft, (void (*)(void *))rfft_irfft, NULL);
    pi_cluster_task_stacks(&task_rfft, NULL, SLAVE_STACK_SIZE);
    for (int frame_id=0; frame_id < tot_frames; frame_id++)
    {
        // Byte offset of this frame inside both the input and output signals in L3
        uint32_t frame_offset = (uint32_t) (frame_id * FRAME_STEP * sizeof(short));

        printf("Frame [%3d/%3d]", frame_id+1, tot_frames);
        // Copy Data from L3 to L2
        short * in_temp_buffer = (short *) Audio_Frame;
        pi_ram_read(
            &DefaultRam, 
            inSig + frame_offset, 
            in_temp_buffer, 
            (uint32_t) FRAME_SIZE*sizeof(short)
        );
        // cast data from Q16.15 to DATATYPE_SIGNAL (may be float16)
        // in_temp_buffer aliases Audio_Frame, so the conversion runs from the
        // last sample down to the first: a converted element must never
        // overwrite a 16-bit sample that has not been read yet.
        for (int i=(FRAME_SIZE-1) ; i>=0; i--){
            Audio_Frame[i] = ((DATATYPE_SIGNAL) in_temp_buffer[i])/(1<<15);
        }

        /******
            Compute the RFFT + IRFFT
        ******/
        start = gap_fc_readhwtimer();
        pi_cluster_send_task_to_cl(&cluster_dev, &task_rfft);
        elapsed = gap_fc_readhwtimer() - start;
        printf(" --> %u (%.2fus) \n", elapsed, ( (float) elapsed ) / FREQ_FC);

        // Hanning window requires divide by 2 when overlapp and add 
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame[i] = Reconstructed_Frame[i] / 2;   // FIXME: divide by 2 because of current Hanning windowing
        }

        // Read the part of the output signal that this frame overlaps
        pi_ram_read(&DefaultRam, outSig + frame_offset, 
            Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));
        // Overlap And ADD
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame_temp[i] += (short int)(Reconstructed_Frame[i] * (1<<15));
        }
        pi_ram_write(&DefaultRam, outSig + frame_offset,
            Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));

    }   // stop looping over frames

    fft_forwardCNN_Destruct(1, 1, 1, 1);
    fft_inverseCNN_Destruct(0, 1, 1, 1);
    pi_cluster_close(&cluster_dev);

    /*
        Exit the real-time mode (only for testing)
        and write the reconstructed audio to file
    */
    // allocate L2 Memory
    temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 18;
    }
    // copy the reconstructed signal from L3 back to L2
    out_temp_buffer = (short int * ) temp_L2_memory; 
    pi_ram_read(&DefaultRam, outSig, out_temp_buffer, num_samples * sizeof(short));

    WriteWavToFile(__XSTR(OUT_FILE), 16, samplerate, 1, (uint32_t *) temp_L2_memory, num_samples* sizeof(short));
    printf("Writing wav file to %s completed successfully\n", __XSTR(OUT_FILE));

    /*
        Compare with original signal
    */
    int NFrameToCheck = Min(tot_frames-7, 10);
    if (NFrameToCheck < 1) {
        // Without this guard the buffer size below would be meaningless and
        // the power ratio could end up as 0/0, which passes the threshold test.
        printf("Audio too short to compare with the original signal\n");
        pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);
        return -1;
    }
    printf("Frames to check: %d..%d\n", 4, 4+NFrameToCheck);

    // Number of samples spanned by NFrameToCheck overlapping frames
    int check_len = (NFrameToCheck-1)*FRAME_STEP+FRAME_SIZE;
    uint32_t check_bytes = (uint32_t) check_len*sizeof(short);
    short int *original_input = (short int *) pi_l2_malloc(check_bytes);
    if (original_input == 0) {
        printf("Error when allocating L2 buffer\n");
        pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);
        return 18;
    }
    // Copy the first frames of the original audio to compare with the computed one
    pi_ram_read(
        &DefaultRam, 
        inSig, 
        original_input, 
        check_bytes
    );
    // The first 4 frame steps are left out of the comparison
    float perr = 0.0f, psig = 0.0f;
    for (int i=4*FRAME_STEP; i<check_len; i++) {
        float diff = (float) (out_temp_buffer[i] - original_input[i]);
        perr += diff * diff;
        psig += out_temp_buffer[i] * out_temp_buffer[i];
    }
    // Linear power ratio (not dB) between reconstructed signal and error
    float snr = psig / perr;
    printf("SNR wrt to original signal: %.2f\n\n", snr);

    /*
        Deallocate the L2 buffers on both the success and the failure path
    */
    pi_l2_free(original_input, check_bytes);
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);

    if (snr < 90) {
        printf("Big error between original signal and reconstructed\n");
        return -1;
    }

    printf("Ended\n");
    return 0;
}

import os
from nntool.api import NNGraph
from nntool.api.utils import model_settings
import argparse
import argcomplete

def create_parser():
    """Build the command-line parser for the RFFT / IRFFT model generator.

    Returns:
        argparse.ArgumentParser: parser exposing the float type, FFT size,
        frame size and window type options, plus the output locations of the
        forward and inverse Autotiler models and their tensor files.
    """
    # create the top-level parser
    parser = argparse.ArgumentParser(prog='fft_at_generators')

    parser.add_argument('--float_type', default="bfloat16",
                        help="Float data type")
    parser.add_argument('--n_fft', default=512, type=int,
                        help="number of fft points")
    parser.add_argument('--frame_size', default=400, type=int,
                        help="number of samples in each input frame")
    parser.add_argument('--window_type', default="hanning",
                        help="windowing function")
    parser.add_argument('--forward_at_model_path', default=None,
                        help="Path to the forward (RFFT) C autotiler model file to generate")
    parser.add_argument('--inverse_at_model_path', default=None,
                        help="Path to the inverse (IRFFT) C autotiler model file to generate")
    parser.add_argument('--forward_tensors_dir', default=None,
                        help="Path to the forward (RFFT) autotiler model constant files to generate")
    parser.add_argument('--inverse_tensors_dir', default=None,
                        help="Path to the inverse (IRFFT) autotiler model constant files to generate")
    return parser


if __name__ == '__main__':
    # Script entry point: builds one single-node RFFT graph and one
    # single-node IRFFT graph, quantizes both with the requested float type
    # and generates an Autotiler model for each of them.
    parser = create_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    # Both model paths default to None; os.path.split(None) would fail with an
    # unhelpful TypeError, so report the missing option through argparse.
    if args.forward_at_model_path is None or args.inverse_at_model_path is None:
        parser.error("--forward_at_model_path and --inverse_at_model_path are required")

    # NOTE: in librosa the frame length is always n_fft. The window_size can be less and will be padded centered to n_fft before multiplying to the frame values
    frame_size = args.frame_size
    n_fft = args.n_fft
    window_type = args.window_type
    power = 0
    graph_opts={"scheme": "FLOAT", "float_type": args.float_type}

    ## Forward Graph
    G_fft_forward = NNGraph.build_rfft_graph(
        input_dim=(frame_size, ),
        rfft_node_name="RfftNode",
        graph_name='fft_forward',
        n_fft=n_fft,
        frame_size=frame_size,
        frame_step=n_fft,
        n_frames=1,
        window=window_type,
        window_size=frame_size,
        power=power,
        pad_type="center", # as librosa
        old_dsp_lib=False,
    )
    G_fft_forward.quantize(graph_options=graph_opts)
    # The model path is split into its directory and its file name
    res = G_fft_forward.gen_at_model(
        directory=os.path.split(args.forward_at_model_path)[0],
        settings=model_settings(
            model_file=os.path.split(args.forward_at_model_path)[-1],
            tensor_directory=args.forward_tensors_dir,
            l3_flash_device="AT_MEM_L3_MRAMFLASH",
            graph_l1_promotion=1,
            graph_monitor_cvar_name="RfftMonitor",
            graph_produce_operinfos_cvar_name="RfftOp",
            graph_produce_node_cvar_name="RfftNodes",
            graph_warm_construct=3,
        )
    )

    ## Inverse Graph
    # The input holds n_fft // 2 + 1 complex bins, i.e. twice as many values
    G_fft_inverse = NNGraph.build_irfft_graph(
        input_dim=(2*(n_fft // 2 + 1), ),
        irfft_node_name="IRfftNode",
        graph_name='fft_inverse',
        n_fft=n_fft,
        n_frames=1,
        window=None,
        old_dsp_lib=False,
    )
    G_fft_inverse.quantize(graph_options=graph_opts)
    res = G_fft_inverse.gen_at_model(
        directory=os.path.split(args.inverse_at_model_path)[0],
        settings=model_settings(
            model_file=os.path.split(args.inverse_at_model_path)[-1],
            tensor_directory=args.inverse_tensors_dir,
            l3_flash_device="AT_MEM_L3_MRAMFLASH",
            graph_l1_promotion=1,
            graph_monitor_cvar_name="IRfftMonitor",
            graph_produce_operinfos_cvar_name="IRfftOp",
            graph_produce_node_cvar_name="IRfftNodes",
            graph_warm_construct=3,
        )
    )