DFT Using NNTool + Autotiler
Requirements
No specific requirement. This example should run without issue on all chips/boards/OSes.
Description
In this example, the DFT and IDFT functions are generated using NNTool. In create_dft_graph.py, a simple NNTool graph is created with two nodes, the DFT and the IDFT (see the comments in that file for more details):
dft_node = DFTNode(
"dft_node",
n_dft=args.n_dft,
n_frames=1,
frame_size=args.frame_size,
# frame_step=args.frame_step,
window=args.window_type,
power=abs(args.power),
output_complex_stft=args.power != 0
)
...
idft_node = IDFTNode(
"idft_node",
n_dft=args.n_dft,
n_frames=1,
)
Since n_dft is the same for both nodes, the twiddle factors are generated once and shared between the DFT and IDFT nodes:
twids = dft_node.gen_twiddles()
out_dft = dft_node(inp, *twids)
out_idft = idft_node(out_dft[1] if args.power != 0 else out_dft, *twids)
NNTool then generates the Autotiler model code. The Autotiler model is compiled and run to generate the GAP C code. All the rules and dependencies needed to generate these files are defined in CMakeLists.txt.
In the main application, a WAV file is read from the PC and the DFT and IDFT are applied to it frame by frame. The reconstructed signal is then written back to a WAV file and can be played.
How to run
mkdir build
cd build
cmake ../
make run -j
Code
/*
* Copyright (C) 2017 GreenWaves Technologies
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
*
*/
/* Autotiler includes. */
#include "dft_graphKernels.h"
#include "gaplib/fs_switch.h"
#include "gaplib/wavIO.h"
/* Two-level stringification: __XSTR lets its argument be macro-expanded before __STR turns it into a string literal. */
#define __XSTR(__s) __STR(__s)
#define __STR(__s) #__s
/* External RAM device handle, configured and opened in main(). */
struct pi_device DefaultRam;
/* Pointer alias to DefaultRam; not referenced by the code visible in this file. */
struct pi_device* ram = &DefaultRam;
// Size in bytes of the temporary L2 buffer used to stage wav data between the PC file and the external RAM
#define TEMP_L2_SIZE 1200000
// Capacity in samples: half of TEMP_L2_SIZE, so AUDIO_BUFFER_SIZE*sizeof(short) equals TEMP_L2_SIZE when short is 2 bytes
#define AUDIO_BUFFER_SIZE (TEMP_L2_SIZE>>1)
/* NOTE(review): presumably the L3 flash base address expected by the generated graph code -- confirm against dft_graphKernels.c */
AT_DEFAULTFLASH_EXT_ADDR_TYPE dft_graph_L3_Flash = 0;
/* External RAM addresses (filled in by pi_ram_alloc in main) of the input signal ... */
/* ... and of the reconstructed output signal */
static uint32_t inSig;
static uint32_t outSig;
L2_MEM DATATYPE_SIGNAL Audio_Frame[FRAME_SIZE]; // one input frame: main reads 16-bit samples into it and converts them in place to DATATYPE_SIGNAL
L2_MEM DATATYPE_SIGNAL Reconstructed_Frame[FRAME_SIZE]; // frame written by dft_graphCNN, then scaled and overlap-added by main
L2_MEM short int Reconstructed_Frame_temp[FRAME_SIZE]; // 16-bit overlap-add accumulator, read from and written back to external RAM for each frame
L2_MEM DATATYPE_SIGNAL DFTCplxOut[(N_DFT + 1)*2]; // the 2 is because of complex numbers
L2_MEM DATATYPE_SIGNAL DFTMag[(N_DFT + 1)]; // one value per entry (no factor 2); presumably the spectrum magnitude/power output -- TODO confirm
/**
 * Cluster task entry point: runs the generated graph once on the current frame.
 *
 * Calls dft_graphCNN_ConstructCluster() and then dft_graphCNN() on the
 * file-scope buffers Audio_Frame, DFTCplxOut, Reconstructed_Frame and DFTMag.
 *
 * The signature matches the void (*)(void *) entry type the task is registered
 * with, so the function is no longer called through an incompatible function
 * pointer type.
 *
 * @param arg  Task argument; unused (the task is registered with NULL).
 */
static void dft_graph(void *arg)
{
    (void) arg;

    dft_graphCNN_ConstructCluster();
    dft_graphCNN(Audio_Frame, DFTCplxOut, Reconstructed_Frame, DFTMag);
}
/**
 * Application entry point.
 *
 * Steps:
 *   1. open the cluster, set the FC/CL/PERIPH frequencies and open the external RAM;
 *   2. read the input wav file (WAV_FILE) into a temporary L2 buffer and copy it
 *      to external RAM, together with a zeroed output buffer of the same length;
 *   3. for every frame: load FRAME_SIZE samples, convert them to DATATYPE_SIGNAL,
 *      run the DFT + IDFT graph on the cluster and overlap-add the reconstructed
 *      frame into the output buffer in external RAM;
 *   4. write the reconstructed signal to OUT_FILE;
 *   5. compare a few frames of the reconstruction with the original signal.
 *
 * @return 0 on success; a non-zero code on failure:
 *         -4 cluster open / frequency setting / input RAM allocation failed,
 *         -3 external RAM open failed, -5 output RAM allocation failed,
 *         5 or 18 L2 allocation failed, -6 graph constructor failed,
 *         -1 wav read error, invalid header, audio too large, or reconstruction
 *         check failed.
 */
int main(int argc, char *argv[])
{
    printf("\n\n\t *** NNTOOL dft_graph Example ***\n\n");

    /* Configure And open cluster. */
    struct pi_device cluster_dev;
    struct pi_cluster_conf cl_conf;
    pi_cluster_conf_init(&cl_conf);
    cl_conf.cc_stack_size = STACK_SIZE;
    cl_conf.id = 0; /* Set cluster ID. */
    // Enable the special icache for the master core
    cl_conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE |
    // Enable the prefetch for all the cores, it's a 9bits mask (from bit 2 to bit 10), each bit correspond to 1 core
                          PI_CLUSTER_ICACHE_PREFETCH_ENABLE |
    // Enable the icache for all the cores
                          PI_CLUSTER_ICACHE_ENABLE;
    pi_open_from_conf(&cluster_dev, (void *) &cl_conf);
    if (pi_cluster_open(&cluster_dev))
    {
        printf("Cluster open failed !\n");
        return -4;
    }

    /* Frequency Settings: defined in the Makefile */
    int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000);
    int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000);
    int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000);
    if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1)
    {
        printf("Error changing frequency !\nTest failed...\n");
        return -4;
    }
    printf("FC Frequency = %d Hz CL Frequency = %d Hz PERIPH Frequency = %d Hz\n",
           pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH));

    /****
        Configure And Open the External Ram.
    ****/
    struct pi_default_ram_conf ram_conf;
    pi_default_ram_conf_init(&ram_conf);
    ram_conf.baudrate = FREQ_FC*1000*1000;
    pi_open_from_conf(&DefaultRam, &ram_conf);
    if (pi_ram_open(&DefaultRam))
    {
        printf("Error ram open !\n");
        return -3;
    }
    printf("RAM Opened\n");

    /****
        Load Audio Wav from file
    ****/
    // Read Audio Data from file using temp_L2_memory as temporary buffer
    // Data are prepared in L3 external memory
    char* temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 5;
    }
    // Allocate L3 buffers for audio IN/OUT
    if (pi_ram_alloc(&DefaultRam, &inSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("inSig Ram malloc failed !\n");
        return -4;
    }
    if (pi_ram_alloc(&DefaultRam, &outSig, (uint32_t) AUDIO_BUFFER_SIZE*sizeof(short)))
    {
        printf("outSig Ram malloc failed !\n");
        return -5;
    }

    // Read audio from file
    header_struct header_info;
    if (ReadWavFromFile(__XSTR(WAV_FILE), temp_L2_memory, AUDIO_BUFFER_SIZE*sizeof(short), &header_info)){
        printf("\nError reading wav file\n");
        return -1;
    }
    // The sample count below divides by these two header fields: reject a header that would divide by zero
    if (header_info.NumChannels == 0 || header_info.BitsPerSample == 0) {
        printf("\nInvalid wav header\n");
        return -1;
    }
    int samplerate = header_info.SampleRate;
    int num_samples = header_info.DataSize * 8 / (header_info.NumChannels * header_info.BitsPerSample);
    printf("Num Samples: %d with BitsPerSample: %d SR: %dkHz\n", num_samples, header_info.BitsPerSample, samplerate);
    if(num_samples*sizeof(short) > TEMP_L2_SIZE){
        printf("The size of the audio exceeds the available L2 memory space!\n");
        return -1;
    }

    // copy input data to L3
    pi_ram_write(&DefaultRam, inSig, temp_L2_memory, num_samples * sizeof(short));

    // Reset Output Buffer and copy to L3
    short * out_temp_buffer = (short *) temp_L2_memory;
    for(int i=0; i < num_samples; i++){
        out_temp_buffer[i] = 0;
    }
    pi_ram_write(&DefaultRam, outSig, temp_L2_memory, num_samples * sizeof(short));

    // free the temporary input memory
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);

    gap_fc_starttimer();
    gap_fc_resethwtimer();
    unsigned int start, elapsed;

    // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!!
    printf("Constructor\n");
    int ConstructorErr = dft_graphCNN_Construct();
    if (ConstructorErr)
    {
        printf("Graph constructor exited with error: %d\n(check the generated file dft_graphKernels.c to see which memory have failed to be allocated)\n", ConstructorErr);
        return -6;
    }

    /****
        Load the input audio signal frame by frame and compute the DFT + IDFT
        IMP: Audio_Frame includes only a single frame for audio
    ****/
    int tot_frames = (int) (((float) (num_samples - FRAME_SIZE) / FRAME_STEP));
    printf("Number of frames to be processed: %d\n", tot_frames);

    struct pi_cluster_task task_rfft;
    pi_cluster_task(&task_rfft, (void (*)(void *))dft_graph, NULL);
    pi_cluster_task_stacks(&task_rfft, NULL, SLAVE_STACK_SIZE);

    for (int frame_id=0; frame_id < tot_frames; frame_id++)
    {
        printf("Frame [%3d/%3d]", frame_id+1, tot_frames);

        // Byte offset of this frame inside both the input and the output L3 buffers
        uint32_t frame_offset = frame_id * FRAME_STEP * sizeof(short);

        // Copy Data from L3 to L2
        short * in_temp_buffer = (short *) Audio_Frame;
        pi_ram_read(
            &DefaultRam,
            inSig + frame_offset,
            in_temp_buffer,
            (uint32_t) FRAME_SIZE*sizeof(short)
        );

        // cast data from Q16.15 to DATATYPE_SIGNAL (may be float16)
        // The conversion is done in place, walking backwards so that samples
        // not converted yet are not overwritten by wider converted values
        for (int i=(FRAME_SIZE-1) ; i>=0; i--){
            Audio_Frame[i] = ((DATATYPE_SIGNAL) in_temp_buffer[i])/(1<<15);
        }

        /******
            Compute the RFFT + IRFFT
        ******/
        start = gap_fc_readhwtimer();
        pi_cluster_send_task_to_cl(&cluster_dev, &task_rfft);
        elapsed = gap_fc_readhwtimer() - start;
        // elapsed is unsigned: print it with %u
        printf(" --> %u (%.2fus) \n", elapsed, ( (float) elapsed ) / FREQ_FC);

        // Hanning window requires divide by 2 when overlapp and add
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame[i] = Reconstructed_Frame[i] / 2; // FIXME: divide by 2 because of current Hanning windowing
        }

        // Read the outsignal
        pi_ram_read(&DefaultRam, outSig + frame_offset,
                    Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));

        // Overlap And ADD
        for (int i= 0 ; i<FRAME_SIZE; i++){
            Reconstructed_Frame_temp[i] += (short int)(Reconstructed_Frame[i] * (1<<15));
        }
        pi_ram_write(&DefaultRam, outSig + frame_offset,
                     Reconstructed_Frame_temp, FRAME_SIZE * sizeof(short));
    } // stop looping over frames

    /*
        Exit the real-time mode (only for testing)
        and write the reconstructed audio to file
    */
    // allocate L2 Memory
    temp_L2_memory = pi_l2_malloc(TEMP_L2_SIZE);
    if (temp_L2_memory == 0) {
        printf("Error when allocating L2 buffer\n");
        return 18;
    }

    // copy the reconstructed signal from L3 back to L2
    out_temp_buffer = (short int * ) temp_L2_memory;
    pi_ram_read(&DefaultRam, outSig, out_temp_buffer, num_samples * sizeof(short));
    WriteWavToFile(__XSTR(OUT_FILE), 16, samplerate, 1, (uint32_t *) temp_L2_memory, num_samples* sizeof(short));
    printf("Writing wav file to %s completed successfully\n", __XSTR(OUT_FILE));

    /*
        Compare with original signal
    */
    int NFrameToCheck = Min(tot_frames-7, 10);
    printf("Frames to check: %d..%d\n", 4, 4+NFrameToCheck);

    // The comparison covers samples [first_checked_sample, check_len)
    int first_checked_sample = 4*FRAME_STEP;
    int check_len = (NFrameToCheck-1)*FRAME_STEP+FRAME_SIZE;
    if (check_len <= first_checked_sample) {
        // Too few frames: the range is empty (or the size negative), so there
        // is nothing to allocate or compare and no meaningful SNR to compute
        printf("Audio too short to compare with the original signal: check skipped\n");
    } else {
        short int *original_input = (short int *) pi_l2_malloc(check_len*sizeof(short));
        if (original_input == 0) {
            printf("Error when allocating L2 buffer\n");
            return 18;
        }
        // Copy the first frames of the original audio and compare with the computed one
        pi_ram_read(
            &DefaultRam,
            inSig,
            original_input,
            (uint32_t) check_len*sizeof(short)
        );

        float perr = 0.0f, psig = 0.0f;
        for (int i=first_checked_sample; i<check_len; i++) {
            float diff = (float) (out_temp_buffer[i] - original_input[i]);
            perr += diff * diff;
            psig += out_temp_buffer[i] * out_temp_buffer[i];
        }
        // The original samples are no longer needed: release them before reporting
        pi_l2_free(original_input, check_len*sizeof(short));

        float snr = psig / perr;
        printf("SNR wrt to original signal: %.2f\n\n", snr);
        if (snr < 90) {
            printf("Big error between original signal and reconstructed\n");
            return -1;
        }
    }

    /*
        Deallocate everything and Close the cluster
    */
    pi_l2_free(temp_L2_memory, TEMP_L2_SIZE);
    dft_graphCNN_Destruct();
    pi_cluster_close(&cluster_dev);
    printf("Ended\n");
    return 0;
}
import os
from nntool.api import NNGraph
from nntool.api.types import DFTNode, IDFTNode
from nntool.api.utils import quantization_options, model_settings
import argparse
import argcomplete
def create_parser():
    """Build the command-line parser for the DFT/IDFT graph generator.

    Options:
        --float_type     float data type used for quantization (default "bfloat16")
        --n_dft          number of FFT points (default 400)
        --frame_size     number of samples in each input frame (default 400)
        --window_type    windowing function (default "hanning")
        --power          0 = complex output, 1 = |X|, 2 = |X|^2 (default 0)
        --at_model_path  path of the Autotiler model C file to generate
        --tensors_dir    directory for the generated constant tensor files

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # create the top-level parser
    parser = argparse.ArgumentParser(prog='fft_at_generators')

    parser.add_argument('--float_type', default="bfloat16",
                        help="Float data type")
    parser.add_argument('--n_dft', default=400, type=int,
                        help="number of fft points")
    # frame_size sizes the graph input; it is not the number of FFT points
    parser.add_argument('--frame_size', default=400, type=int,
                        help="number of samples in each input frame")
    parser.add_argument('--window_type', default="hanning",
                        help="windowing function")
    parser.add_argument('--power', default=0, type=int,
                        help="power spectrogram: 0=X (cplx), 1=|X| (real), 2=|X|^2 (real)")
    parser.add_argument('--at_model_path', default=None,
                        help="Path to the C autotiler model file to generate")
    parser.add_argument('--tensors_dir', default=None,
                        help="Path to the autotiler model constant files to generate")
    return parser
if __name__ == '__main__':
    # Script entry point: build a two-node (DFT -> IDFT) NNTool graph from the
    # command-line options, quantize it to the requested float type and
    # generate the Autotiler model.
    parser = create_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    # The model path is split into directory and file name below, so it must be
    # given: report a usage error instead of failing in os.path.split(None)
    if args.at_model_path is None:
        parser.error("--at_model_path is required")
    model_dir, model_file = os.path.split(args.at_model_path)

    # if power != 0, we still want the complex output to reconstruct the signal in the iDFT
    # hence, we add a second output to the node that outputs the complex DFT alongside the spectrum
    with_spectrum = args.power != 0

    G = NNGraph(name='dft_graph')
    inp = G.add_input([args.frame_size])

    dft_node = DFTNode(
        "dft_node",
        n_dft=args.n_dft,
        n_frames=1,
        frame_size=args.frame_size,
        # frame_step=args.frame_step,
        window=args.window_type,
        power=abs(args.power),
        output_complex_stft=with_spectrum
    )

    # the DFT/iDFT operators require some constants to be generated
    twids = dft_node.gen_twiddles()

    # NNTool nodes can be referenced to construct the graph structure
    # here for instance we attach to the dft_node inputs the input node
    # and the twiddles constants
    out_dft = dft_node(inp, *twids)

    # With a spectrum requested the node output is indexed: [0] is exported as
    # the spectrogram, [1] as the complex DFT; otherwise it is used directly
    if with_spectrum:
        G.add_output(name="out_spectrogram")(out_dft[0])
        cplx_dft = out_dft[1]
    else:
        cplx_dft = out_dft
    G.add_output(name="out_cplx_dft")(cplx_dft)

    # n_dft is the same as for the DFT node, so the same twiddles are reused
    idft_node = IDFTNode(
        "idft_node",
        n_dft=args.n_dft,
        n_frames=1,
    )
    out_idft = idft_node(cplx_dft, *twids)
    G.add_output("out_idft")(out_idft)

    G.add_dimensions()
    G.adjust_order()
    G.quantize(
        graph_options=quantization_options(
            scheme="FLOAT",
            float_type=args.float_type
        )
    )
    G.draw()

    G.gen_at_model(
        directory=model_dir,
        settings=model_settings(
            model_file=model_file,
            tensor_directory=args.tensors_dir,
            l3_flash_device="AT_MEM_L3_MRAMFLASH",
            graph_l1_promotion=1,
        )
    )