MNist using NNTool’s Python APIs (Tensorflow)
Requirements
NNTool
Autotiler
Python Requirements
nntool (see gap_sdk requirements.txt)
tqdm
Optional (only needed for NN retraining):
tensorflow >= 2.7.0
tf2onnx
Description
This example describes an end-to-end deployment of different types of NN (convolutional, GRU-based and LSTM-based) for visual recognition of handwritten digits (MNIST) using GAPflow, the GWT set of tools for NN deployment onto GAP chips.
train_tf.py
is the script to train the various NN topologies using tensorflow + keras and export the trained graph to TFLite. In train_all.sh
you have the set of commands used to train all the NNs. Note that TFLite does not support a fused operator for GRU layers (at the time this project was made, it still produced a subgraph structure which is not supported in GAPflow); for this reason we added a tf2onnx conversion to export the same graph to ONNX instead of TFLite. For the convolutional NN type, an example of full-integer quantization in TFLite is also shown. All the pretrained models are already provided in the models
folder.
nntool_script.py
uses NNTool to load the NN and prepare it for GAP inference, i.e. it optimizes the graph topology and quantizes the NN (if not already done in TFLite) using a calibration dataset (a subset of the training dataset). The NN is then used in 3 different usage modes:
- accuracy:
to test the accuracy of the deployable solution in a Python environment on the MNIST test dataset. NNTool uses a numpy backend to run inference that is bit-accurate with respect to the real chip (check the
test_nntool
function)
- performance:
to quickly deploy a solution and check the results and performance directly from a Python script. NNTool generates a template project to run an NN on GVSOC and prints out the NN layers' outputs; these are then parsed in Python and compared to the ground-truth execution. You can see the QSNRs between Target and NNTool execution reported as infinite, which means there is no difference between the NNTool and Target results. NNTool can also parse the Autotiler memory usage information layer by layer and plot it to inspect possible bottlenecks or memory under-usage during the NN inference.
- generate_at_model:
it just prepares the model and generates an Autotiler Model description that can be used to generate GAP code. This mode is used by the project to dynamically generate the Autotiler Model to deploy.
Usage:
export MODE=accuracy
python nntool_script.py --mode=${MODE} --trained_model models/mnist_quant.tflite --quantize_in_tflite
To see available options:
python nntool_script.py --help
The whole project works with a CMake build system and all the options are available through Kconfig
. To run the example with default options:
mkdir build && cd build
cmake ../
make run -j
To change the options via menuconfig
you can use:
mkdir build && cd build
cmake ../
make menuconfig
navigate the GUI interface and once saved:
make run -j
Alternatively you can pass directly some options to the cmake command:
mkdir build && cd build
cmake ../ -DCONFIG_GRU=y -DCONFIG_MODEL_REENTRANT_MODE=y
make run -j
Also different quantization options are available, see NN MNIST test menu -> Quantization Type
menu in menuconfig
to explore them.
Reentrant mode
The NNs support both standard and reentrant mode. It can be activated via menuconfig
under GAP_SDK -> tools -> Autotiler
menu in menuconfig
or with the -DCONFIG_MODEL_REENTRANT_MODE=y
flag. In this mode a slightly different main file is used: mnist_reentrant.c
, similar to the one in the examples/gap9/nn/autotiler/MnistGraphReentrant
Code
/*
* Copyright (C) 2017 GreenWaves Technologies
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
*
*/
/*
 * Two-level stringification: __XSTR(x) lets the preprocessor expand x first,
 * then __STR() turns the expanded text into a string literal.
 */
#define __XSTR(__s) __STR(__s)
#define __STR(__s) #__s
/* Autotiler includes. */
#include "mnist.h"
#include "mnistKernels.h"
#include "gaplib/ImgIO.h"
/* In emulation builds pmsis_exit() is mapped onto exit(). */
#ifdef __EMUL__
#define pmsis_exit(n) exit(n)
#endif
/* Fallback value used when the build does not define STACK_SIZE. */
#ifndef STACK_SIZE
#define STACK_SIZE 1024
#endif
/*
 * Not referenced anywhere else in this file.
 * NOTE(review): presumably consumed by the generated Autotiler code - confirm.
 */
AT_DEFAULTFLASH_EXT_ADDR_TYPE mnist_L3_Flash = 0;
/*
 * Element types of the network input and output buffers, selected by the
 * SQ8 / NE16 / FP16 build switches. If none of them is defined, IN_D_TYPE and
 * OUT_D_TYPE are left undeclared and the buffers below do not compile.
 */
#ifdef SQ8
typedef signed char IN_D_TYPE;
typedef short int OUT_D_TYPE;
#endif
#ifdef NE16
typedef unsigned char IN_D_TYPE;
typedef short int OUT_D_TYPE;
#endif
#ifdef FP16
typedef F16 IN_D_TYPE;
typedef F16 OUT_D_TYPE;
#endif
/* Inputs */
/* Network input: 28*28*1 elements, filled from the image read in test_mnist(). */
L2_MEM IN_D_TYPE Input_1[28*28*1];
/* Outputs */
/* Network output: 10 values, printed as one value per class by RunMnist(). */
L2_MEM OUT_D_TYPE Output_1[10];
/* Index of the largest entry of Output_1; written by RunMnist(), checked by test_mnist() when CI is defined. */
int rec_digit;
/*
 * Cluster entry point: runs the network on Input_1 and reports the result.
 *
 * Calls mnistCNN() to fill Output_1, prints every class value, then stores the
 * index of the largest value in the global rec_digit and prints it. Ties keep
 * the lowest index because only a strictly larger value replaces the current
 * best. With PERF defined the cluster timer is started and reset first.
 */
static void RunMnist()
{
#ifdef PERF
    printf("Start timer\n");
    gap_cl_starttimer();
    gap_cl_resethwtimer();
#endif

    /* The RNN variants of the generated graph take one extra argument
     * (NOTE(review): presumably the recurrent-state reset flag - confirm). */
#ifdef RNN
    mnistCNN(Input_1, 1, Output_1);
#else
    mnistCNN(Input_1, Output_1);
#endif

    /* Check results: print each class value and track the arg-max. */
    int best_class = 0;
    OUT_D_TYPE best_score = Output_1[0];
    int class_idx = 0;
    while (class_idx < 10) {
        OUT_D_TYPE score = Output_1[class_idx];
#ifdef FP16
        printf("class %d: %f \n", class_idx, score);
#else
        printf("class %d: %d \n", class_idx, score);
#endif
        if (best_score < score) {
            best_score = score;
            best_class = class_idx;
        }
        class_idx++;
    }
    rec_digit = best_class;

    printf("\n");
    printf("Recognized: %d\n", rec_digit);
}
/*
 * Run one MNIST inference on the cluster and report the outcome.
 *
 * Steps: open cluster 0, set the FC / CL / PERIPH frequencies, construct the
 * Autotiler graph, load the image named by AT_IMAGE, convert it into Input_1
 * according to the active quantization (SQ8 / NE16 / FP16), run RunMnist() on
 * the cluster and destruct the graph. With PERF defined, per-node cycle and
 * operation statistics are printed. With CI defined, rec_digit is compared
 * against the expected digit 5.
 *
 * Failures are reported through pmsis_exit() with these codes:
 *   -4  cluster open failed, or a frequency could not be set
 *   -6  graph constructor reported an error
 *   -3  image buffer allocation failed
 *   -2  image could not be loaded
 *   -1  CI check failed
 *
 * Returns 0 after calling pmsis_exit(0).
 */
int test_mnist(void)
{
    printf("Entering main controller\n");

    /* Configure And open cluster. */
    struct pi_device cluster_dev;
    struct pi_cluster_conf cl_conf;
    pi_cluster_conf_init(&cl_conf);
    cl_conf.cc_stack_size = STACK_SIZE;
    cl_conf.id = 0; /* Set cluster ID. */
    // Enable the special icache for the master core
    cl_conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE |
    // Enable the prefetch for all the cores, it's a 9bits mask (from bit 2 to bit 10), each bit correspond to 1 core
                          PI_CLUSTER_ICACHE_PREFETCH_ENABLE |
    // Enable the icache for all the cores
                          PI_CLUSTER_ICACHE_ENABLE;
    pi_open_from_conf(&cluster_dev, (void *) &cl_conf);
    if (pi_cluster_open(&cluster_dev))
    {
        printf("Cluster open failed !\n");
        pmsis_exit(-4);
    }

    /* Frequency Settings: defined in the Makefile */
    int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000);
    int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000);
    int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000);
    if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1)
    {
        printf("Error changing frequency !\nTest failed...\n");
        pmsis_exit(-4);
    }
    printf("FC Frequency as %d Hz, CL Frequency = %d Hz, PERIIPH Frequency = %d Hz\n",
           pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH));

    // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!!
    printf("Constructor\n");
    int ConstructorErr = mnistCNN_Construct();
    if (ConstructorErr)
    {
        printf("Graph constructor exited with error: %d\n(check the generated file mnistKernels.c to see which memory have failed to be allocated)\n", ConstructorErr);
        pmsis_exit(-6);
    }

    /* Load the input image into a temporary 8-bit buffer. */
    char *ImageName = __XSTR(AT_IMAGE);
    unsigned char *ImgIn = (unsigned char *) pi_l2_malloc(28*28*1);
    /* The buffer is written by ReadImageFromFile() and read below, so a failed
     * allocation must be caught here rather than dereferenced. */
    if (ImgIn == NULL)
    {
        printf("Failed to allocate memory for image (%d bytes)\n", 28*28*1);
        pmsis_exit(-3);
    }
    if (ReadImageFromFile(ImageName, 28,28,1, ImgIn, 28*28, IMGIO_OUTPUT_CHAR, 0))
    {
        printf("Failed to load image %s\n", ImageName);
        pmsis_exit(-2);
    }

    /* Convert the 8-bit pixels to the network input type. */
    for (int i=0; i<28*28; i++) {
#ifdef SQ8
        /* Shift [0, 255] to [-128, 127]. */
        Input_1[i] = ImgIn[i] - 128;
#endif
#ifdef NE16
        Input_1[i] = ImgIn[i];
#endif
#ifdef FP16
        /* Scale [0, 255] to roughly [-1, 1). */
        Input_1[i] = (IN_D_TYPE) (( ((float) ImgIn[i]) / 128 ) - 1.0);
#endif
    }
    /* NOTE(review): ImgIn is not released after the conversion. */

    printf("Call cluster\n");
    struct pi_cluster_task task;
    pi_cluster_task(&task, (void (*)(void *))RunMnist, NULL);
    /* NULL stack pointer: NOTE(review) presumably lets the runtime allocate the
     * stacks. SLAVE_STACK_SIZE is not defined in this file and is expected to
     * come from the build configuration - confirm. */
    pi_cluster_task_stacks(&task, NULL, SLAVE_STACK_SIZE);
    pi_cluster_send_task_to_cl(&cluster_dev, &task);

    mnistCNN_Destruct();

#ifdef PERF
    {
        /* Per-node statistics followed by a totals line. */
        unsigned int TotalCycles = 0, TotalOper = 0;
        printf("\n");
        for (unsigned int i=0; i<(sizeof(AT_GraphPerf)/sizeof(unsigned int)); i++) {
            TotalCycles += AT_GraphPerf[i]; TotalOper += AT_GraphOperInfosNames[i];
        }
        for (unsigned int i=0; i<(sizeof(AT_GraphPerf)/sizeof(unsigned int)); i++) {
            printf("%45s: Cycles: %12u, Cyc%%: %5.1f%%, Operations: %12u, Op%%: %5.1f%%, Operations/Cycle: %f\n", AT_GraphNodeNames[i], AT_GraphPerf[i], 100*((float) (AT_GraphPerf[i]) / TotalCycles), AT_GraphOperInfosNames[i], 100*((float) (AT_GraphOperInfosNames[i]) / TotalOper), ((float) AT_GraphOperInfosNames[i])/ AT_GraphPerf[i]);
        }
        printf("\n");
        printf("%45s: Cycles: %12u, Cyc%%: 100.0%%, Operations: %12u, Op%%: 100.0%%, Operations/Cycle: %f\n", "Total", TotalCycles, TotalOper, ((float) TotalOper)/ TotalCycles);
        printf("\n");
    }
#endif

#ifdef CI
    /* The CI run expects the digit 5 to be recognized. */
    if (rec_digit != 5){
        printf("Wrong result: expected %d got %d\n", 5, rec_digit);
        pmsis_exit(-1);
    }
    printf("Correct Results\n");
#endif

    printf("Ended\n");
    pmsis_exit(0);
    return 0;
}
/* Program entry point: prints the example banner, runs test_mnist() and returns 0. */
int main(int argc, char *argv[])
{
    /* The command-line arguments are not used by this example. */
    (void) argc;
    (void) argv;

    printf("\n\n\t *** NNTOOL mnist Example ***\n\n");
    (void) test_mnist();

    return 0;
}
/*
* Copyright (C) 2017 GreenWaves Technologies
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
*
*/
/*
 * Two-level stringification: __XSTR(x) lets the preprocessor expand x first,
 * then __STR() turns the expanded text into a string literal.
 */
#define __XSTR(__s) __STR(__s)
#define __STR(__s) #__s
/* Autotiler includes. */
#include "mnist.h"
#include "mnistKernels.h"
#include "gaplib/ImgIO.h"
/* In emulation builds pmsis_exit() is mapped onto exit(). */
#ifdef __EMUL__
#define pmsis_exit(n) exit(n)
#endif
/* Fallback value used when the build does not define STACK_SIZE. */
#ifndef STACK_SIZE
#define STACK_SIZE 1024
#endif
/*
 * Not referenced anywhere else in this file.
 * NOTE(review): presumably consumed by the generated Autotiler code - confirm.
 */
AT_DEFAULTFLASH_EXT_ADDR_TYPE mnist_L3_Flash = 0;
/*
 * Element types of the network input and output buffers, selected by the
 * SQ8 / NE16 / FP16 build switches. If none of them is defined, IN_D_TYPE and
 * OUT_D_TYPE are left undeclared and the buffers below do not compile.
 */
#ifdef SQ8
typedef signed char IN_D_TYPE;
typedef short int OUT_D_TYPE;
#endif
#ifdef NE16
typedef unsigned char IN_D_TYPE;
typedef short int OUT_D_TYPE;
#endif
#ifdef FP16
typedef F16 IN_D_TYPE;
typedef F16 OUT_D_TYPE;
#endif
/* Inputs */
/* Network input: 28*28*1 elements, filled from the image read in test_mnist(). */
L2_MEM IN_D_TYPE Input_1[28*28*1];
/* Outputs */
/* Network output: 10 values, one per class index examined by RunMnist(). */
L2_MEM OUT_D_TYPE Output_1[10];
/* Index of the largest entry of Output_1; written by RunMnist(), checked by test_mnist() when CI is defined. */
int rec_digit;
/* Graph execution context: allocated in test_mnist() with mnistCNN_AllocCtxt() and passed to mnistCNN() by RunMnist(). */
AT_CTXT_TYPE Ctxt;
/* Flash filesystem handle; its address is stored in the graph descriptor and opened in test_mnist(). */
AT_DEFAULTFLASH_FS_T Flash;
/*
 * Argument bundle handed to the cluster task in test_mnist().
 * RunMnist() is declared without parameters and does not read it.
 */
typedef struct {
CNN_Graph_Descr_T *Descr;
int rec_digit;
} Arg_T;
/*
 * Cluster entry point for the reentrant graph: runs (or resumes) the network
 * and, once it has completed, reports the recognized digit.
 *
 * The layer index stored in Ctxt is read before and after the mnistCNN() call.
 * A value of 0 before the call is reported as a first entry, any other value
 * as a resume. A value of 0 after the call is treated as "graph finished":
 * only then is Output_1 scanned and rec_digit updated; otherwise the function
 * reports that it is being suspended.
 *
 * rec_digit is reset before the scan and the scan starts at class 0, so the
 * result does not depend on a previous run and every class value is printed.
 * Ties keep the lowest index.
 */
static void RunMnist()
{
#ifdef PERF
    printf("Start timer\n");
    gap_cl_starttimer();
    gap_cl_resethwtimer();
#endif

    if (mnistCNN_CtxtGetLayerIndex(Ctxt) == 0) {
        printf("Inside RunMnist function for the First time\n");
    } else {
        printf("Resuming RunMnist function at layer: %d\n", mnistCNN_CtxtGetLayerIndex(Ctxt));
    }

    /* NOTE(review): the meaning of the trailing 0 argument is defined by the
     * generated mnistCNN() prototype - confirm against mnistKernels.h. */
    mnistCNN(
        Ctxt,
        Input_1,
#ifdef RNN
        1, /* Reset Command for RNN Layers only taken into account at the beginning of the NN execution (not when resumed) */
#endif
        Output_1,
        0
    );

    if (mnistCNN_CtxtGetLayerIndex(Ctxt) == 0) {
        /* Start from class 0 explicitly so a stale rec_digit cannot survive
         * when class 0 holds the largest value. */
        rec_digit = 0;
        OUT_D_TYPE highest = Output_1[0];
        for (int i = 0; i < 10; i++) {
#ifdef FP16
            printf("class %d: %f \n", i, Output_1[i]);
#else
            printf("class %d: %d \n", i, Output_1[i]);
#endif
            if (highest < Output_1[i]) {
                highest = Output_1[i];
                rec_digit = i;
            }
        }
        printf("End RunMnist function\n");
    } else {
        printf("Suspending RunMnist function after layer: %d\n", mnistCNN_CtxtGetLayerIndex(Ctxt));
    }
}
/* Progress counter of RunPrint(); kept at file scope so it survives across calls. */
int runprint=0;

/*
 * Second cluster task: prints up to four progress messages, calling
 * pi_cl_task_yield() after each one.
 *
 * If pi_cl_task_yield() returns non-zero the function returns immediately and
 * leaves runprint where it is, so a later call continues the count. Once the
 * counter reaches 4 it is reset to 0. Always returns 0.
 */
static int RunPrint(){
    while (runprint < 4) {
        int step = runprint;

        /* Advance the counter before printing, as a post-increment in the argument list would. */
        runprint = step + 1;
        printf("Higher priority Print task execution %d/4\n", step);

        if (pi_cl_task_yield() != 0) {
            return 0;
        }
    }

    runprint = 0;
    return 0;
}
/*
 * Run one MNIST inference with the reentrant graph, alongside a second
 * cluster task, and report the outcome.
 *
 * Steps: open cluster 0 with an L1 scratch area sized for the task stacks,
 * load the image named by AT_IMAGE and convert it into Input_1, open the
 * flash, construct the graph and allocate its context, then enqueue two
 * cluster tasks asynchronously: RunMnist (task_0) and RunPrint (task_1, given
 * priority 1). After both completion events have fired, the context and graph
 * are released and the cluster is closed. With PERF defined, per-node
 * statistics are printed. With CI defined, rec_digit is compared against the
 * expected digit 5.
 *
 * Failures are reported through pmsis_exit() with these codes:
 *   -7  cluster open failed
 *   -3  a memory allocation (image buffer, task descriptor, stacks) failed
 *   -2  image could not be loaded
 *   -1  context allocation reported an error, or the CI check failed
 *
 * Returns 1 if the flash cannot be opened, otherwise 0 after calling
 * pmsis_exit(0).
 */
int test_mnist(void)
{
    printf("Entering main controller\n");

    /* Configure And open cluster. */
    struct pi_device cluster_dev;
    struct pi_cluster_conf cl_conf;
    /* One STACK_SIZE stack per core count reported by pi_cl_cluster_nb_pe_cores(). */
    int stacks_size = STACK_SIZE * pi_cl_cluster_nb_pe_cores();
    pi_cluster_conf_init(&cl_conf);
    cl_conf.id = 0;
    cl_conf.scratch_size = stacks_size + 0x8000;
    pi_open_from_conf(&cluster_dev, (void *) &cl_conf);
    if (pi_cluster_open(&cluster_dev))
    {
        printf("Cluster open failed !\n");
        pmsis_exit(-7);
    }

    /* Load the input image into a temporary 8-bit buffer. */
    char *ImageName = __XSTR(AT_IMAGE);
    unsigned char *ImgIn = (unsigned char *) pi_l2_malloc(28*28*1);
    /* The buffer is written by ReadImageFromFile() and read below, so a failed
     * allocation must be caught here rather than dereferenced. */
    if (ImgIn == NULL)
    {
        printf("Failed to allocate memory for image (%d bytes)\n", 28*28*1);
        pmsis_exit(-3);
    }
    if (ReadImageFromFile(ImageName, 28,28,1, ImgIn, 28*28, IMGIO_OUTPUT_CHAR, 0))
    {
        printf("Failed to load image %s\n", ImageName);
        pmsis_exit(-2);
    }

    /* Convert the 8-bit pixels to the network input type. */
    for (int i=0; i<28*28; i++) {
#ifdef SQ8
        /* Shift [0, 255] to [-128, 127]. */
        Input_1[i] = ImgIn[i] - 128;
#endif
#ifdef NE16
        Input_1[i] = ImgIn[i];
#endif
#ifdef FP16
        /* Scale [0, 255] to roughly [-1, 1). */
        Input_1[i] = (IN_D_TYPE) (( ((float) ImgIn[i]) / 128 ) - 1.0);
#endif
    }
    /* NOTE(review): ImgIn is not released after the conversion. */

    printf("NN Construct\n");
    int Status;
    CNN_Graph_Descr_T _Descr = {0, &Flash, 0, 0, 0}, *Descr = &_Descr;
    {
        /* Open the flash referenced by the graph descriptor. */
        int Error;
        AT_DEFAULTFLASH_FS_CONF_T FlashConf;
        AT_DEFAULTFLASH_FS_CONF_INIT(&FlashConf, AT_MEM_L3_HFLASH, 0);
        AT_DEFAULTFLASH_FS_OPEN((AT_DEFAULTFLASH_FS_T *) Descr->Flash, &FlashConf, 0, 0, &Error);
        if (Error) {
            printf("Flash open failed\n"); return 1;
        } else printf("Flash is open\n");
    }
    mnistCNN_Construct(Descr);
    /* All memory arguments are 0: NOTE(review) presumably asks the generated
     * code to allocate them itself - confirm against mnistKernels.h. */
    Ctxt = mnistCNN_AllocCtxt(Descr,
                              0, /* Ctxt Mem */
                              0, /* L1 */
                              0, /* L2 Dyn */
                              0, /* L3 Dyn */
#ifdef RNN
                              0, /* HState Ptrs for LSTM/GRU Layers */
                              0, /* HState Ptrs for LSTM/GRU Layers */
#ifdef LSTM
                              0, /* CState Ptrs for LSTM Layers */
                              0, /* CState Ptrs for LSTM Layers */
#endif
#endif
                              &Status);
    if(Status){
        printf("Construct Error %d\n",Status);
        pmsis_exit(-1);
    }
    printf("NN Constructed and ready to run\n");

    /* Arg lives on this stack frame; both tasks are waited for before returning. */
    Arg_T Arg = (Arg_T) {Descr, 0xAD};
    struct pi_cluster_task *task_0 = pi_l2_malloc(sizeof(struct pi_cluster_task));
    if (task_0 == NULL)
    {
        printf("Failed to allocate the NN cluster task\n");
        pmsis_exit(-3);
    }
    pi_cluster_task(task_0, (void (*)(void *))RunMnist, (void *) &Arg);
    /*
    pi_cluster_enqueue_task_async function is similar to pi_cluster_send_task but supports priority 0 and 1 and do not support automatic stack allocation.
    Stacks must always be allocated by the caller.
    */
    void *stacks = pi_cl_l1_scratch_alloc(&cluster_dev, task_0, stacks_size);
    /* Stacks are mandatory here (see above), so a failed allocation is fatal. */
    if (stacks == NULL)
    {
        printf("Failed to allocate the stacks of the NN cluster task\n");
        pmsis_exit(-3);
    }
    pi_cluster_task_stacks(task_0, stacks, STACK_SIZE);

    struct pi_cluster_task *task_1 = pi_l2_malloc(sizeof(struct pi_cluster_task));
    if (task_1 == NULL)
    {
        printf("Failed to allocate the print cluster task\n");
        pmsis_exit(-3);
    }
    pi_cluster_task(task_1, (void (*)(void *))RunPrint, NULL);
    stacks = pi_cl_l1_scratch_alloc(&cluster_dev, task_1, stacks_size);
    if (stacks == NULL)
    {
        printf("Failed to allocate the stacks of the print cluster task\n");
        pmsis_exit(-3);
    }
    pi_cluster_task_stacks(task_1, stacks, STACK_SIZE);

    printf("Calling Cluster\n\n");
    static pi_evt_t end_task_mnist,end_task_print;
    pi_evt_sig_init(&end_task_mnist);
    pi_evt_sig_init(&end_task_print);

    /* Only the print task gets priority 1; task_0 keeps its default priority. */
    pi_cluster_task_priority(task_1, 1);
    pi_cluster_enqueue_task_async(&cluster_dev, task_0,&end_task_mnist);
    pi_cluster_enqueue_task_async(&cluster_dev, task_1,&end_task_print);

    printf("Waiting\n\n");
    pi_evt_wait(&end_task_print);
    pi_evt_wait(&end_task_mnist);
    /* NOTE(review): task_0 and task_1 are not released after completion. */

    mnistCNN_DeAllocCtxt(Ctxt, 1, 1, 1, 1);
    mnistCNN_Destruct(Descr, 1, 1);
    pi_cluster_close(&cluster_dev);

#ifdef PERF
    {
        /* Per-node statistics followed by a totals line. */
        unsigned int TotalCycles = 0, TotalOper = 0;
        printf("\n");
        for (unsigned int i=0; i<(sizeof(AT_GraphPerf)/sizeof(unsigned int)); i++) {
            TotalCycles += AT_GraphPerf[i]; TotalOper += AT_GraphOperInfosNames[i];
        }
        for (unsigned int i=0; i<(sizeof(AT_GraphPerf)/sizeof(unsigned int)); i++) {
            printf("%45s: Cycles: %12u, Cyc%%: %5.1f%%, Operations: %12u, Op%%: %5.1f%%, Operations/Cycle: %f\n", AT_GraphNodeNames[i], AT_GraphPerf[i], 100*((float) (AT_GraphPerf[i]) / TotalCycles), AT_GraphOperInfosNames[i], 100*((float) (AT_GraphOperInfosNames[i]) / TotalOper), ((float) AT_GraphOperInfosNames[i])/ AT_GraphPerf[i]);
        }
        printf("\n");
        printf("%45s: Cycles: %12u, Cyc%%: 100.0%%, Operations: %12u, Op%%: 100.0%%, Operations/Cycle: %f\n", "Total", TotalCycles, TotalOper, ((float) TotalOper)/ TotalCycles);
        printf("\n");
    }
#endif

#ifdef CI
    /* The CI run expects the digit 5 to be recognized. */
    if (rec_digit != 5){
        printf("Wrong result: expected %d got %d\n", 5, rec_digit);
        pmsis_exit(-1);
    }
    printf("Correct Results\n");
#endif

    printf("Ended\n");
    pmsis_exit(0);
    return 0;
}
/*
 * Program entry point: prints the example banner and runs test_mnist().
 *
 * The status returned by test_mnist() is propagated as the program's return
 * value, so a failure that returns instead of calling pmsis_exit() (the flash
 * open error path returns 1) is no longer reported as success.
 */
int main(int argc, char *argv[])
{
    /* The command-line arguments are not used by this example. */
    (void) argc;
    (void) argv;

    printf("\n\n\t *** NNTOOL mnist Example ***\n\n");
    return test_mnist();
}