OpenMP Example

Description

This example is a basic MatAdd (element-wise addition of two vectors) parallelized on the 8 cluster cores in two ways: with OpenMP, and with the standard hand-written parallelization code used in many of the parallelized SDK libraries, such as the Autotiler.

The OpenMP library is linked by the SDK's CMake utilities thanks to the variable CONFIG_LIB_OPENMP=y set in sdk.config.

How to run

cmake -B build
cmake --build build --target menuconfig # Select your board in the menu
cmake --build build --target run

Or use the gap command:

gap init
gap menuconfig
gap run

Results

You should have an output looking like this (order may vary):

      *** OPENMP Basic Example ***

Running the Standard parallelization code (Autotiler-like)
Elapsed: 6137 Cyc (0.16 Op/Cyc)
Running the OpenMP Code
Elapsed: 6250 Cyc (0.16 Op/Cyc)
Test Succeded !

Code

/* 
 * Copyright (C) 2024 GreenWaves Technologies
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* PMSIS includes */
#include <pmsis.h>
#include "Gap.h"

/*
 * Small generic helper macros.
 * Every argument is expanded more than once, so never pass an expression
 * with side effects (e.g. i++) to any of them.
 */
/* Absolute value of a.
 * NOTE(review): the sign test is performed on (int)(a), so a fractional value
 * in (-1, 0) truncates to 0 and is returned unchanged (still negative).
 * Not used in the code shown here. */
#define Abs(a)          (((int)(a)<0)?(-(a)):(a))
/* Smaller of a and b (b when they compare equal). */
#define Min(a, b)       (((a)<(b))?(a):(b))
/* Larger of a and b (b when they compare equal). */
#define Max(a, b)       (((a)>(b))?(a):(b))

/* Number of float elements in each of the three buffers below. */
#define MAT_SIZE 8000
/* First input operand; allocated in main() with pi_cl_l1_malloc(). */
float *In1;
/* Second input operand; allocated in main() with pi_cl_l1_malloc(). */
float *In2;
/* Result buffer, Out[i] = In1[i] + In2[i]; allocated in main() with pi_cl_l1_malloc(). */
float *Out;

/*************** OpenMP Code ******************/
/*
 * Cluster main entry for the OpenMP variant, executed by core 0.
 *
 * Resets and starts the cluster hardware timer, performs the element-wise
 * addition Out = In1 + In2 inside an OpenMP "parallel for" region, then
 * prints the elapsed cycle count and the number of additions per cycle.
 *
 * arg: task argument, unused.
 */
void openmp_cluster_delegate(void *arg)
{
    int cycles_before;
    int cycles_spent;
    float ops_per_cycle;

    printf("Running the OpenMP Code\n");

    gap_cl_resethwtimer();
    gap_cl_starttimer();
    cycles_before = gap_cl_readhwtimer();

    /* The loop stays in OpenMP canonical form so the iterations can be
     * shared among the threads of the parallel region. */
    #pragma omp parallel for
    for (unsigned int idx = 0; idx < MAT_SIZE; idx++) {
        Out[idx] = In1[idx] + In2[idx];
    }

    cycles_spent = gap_cl_readhwtimer() - cycles_before;
    ops_per_cycle = ((float) MAT_SIZE) / cycles_spent;
    printf("Elapsed: %d Cyc (%.2f Op/Cyc)\n\n", cycles_spent, ops_per_cycle);
}

/**************** Standard Code *****************/
/*
 * Number of items each core has to process when X items are split across
 * gap_ncore() cores, rounded up so that all X items are covered.
 *
 * The division is done with a shift by gap_fl1(NCore) and the remainder test
 * with a mask of (NCore - 1).
 * NOTE(review): this arithmetic presumably relies on the core count being a
 * power of two and on gap_fl1() returning its log2 - confirm against Gap.h.
 */
static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
{
	const unsigned int cores = gap_ncore();
	const unsigned int shift = gap_fl1(cores);
	/* 1 when X is not an exact multiple of the core count, 0 otherwise. */
	const unsigned int partial = ((X & (cores - 1)) != 0);

	return (X >> shift) + partial;
}

/*
 * Task executed by cluster cores.
 *
 * Each core derives its own slice [begin, end) of the MAT_SIZE elements from
 * its core id and the per-core chunk size, then adds In1 and In2 into Out
 * over that slice. Both bounds are clamped to MAT_SIZE, so a core whose slice
 * would start past the end of the data simply does nothing.
 *
 * arg: task argument, unused.
 */
void matadd(void *arg)
{
	const unsigned int core = gap_coreid();
	const unsigned int per_core = ChunkSize(MAT_SIZE);

	unsigned int begin = core * per_core;
	if (begin > MAT_SIZE) {
		begin = MAT_SIZE;
	}

	unsigned int end = begin + per_core;
	if (end > MAT_SIZE) {
		end = MAT_SIZE;
	}

	unsigned int idx = begin;
	while (idx < end) {
		Out[idx] = In1[idx] + In2[idx];
		idx++;
	}
}

/*
 * Cluster main entry for the hand-written variant, executed by core 0.
 *
 * Resets and starts the cluster hardware timer, dispatches matadd() to
 * pi_cl_cluster_nb_cores() cores with pi_cl_team_fork(), then prints the
 * elapsed cycle count and the number of additions per cycle.
 * NOTE(review): the measurement presumably relies on pi_cl_team_fork()
 * returning only once every core has finished - confirm in the PMSIS docs.
 *
 * arg: task argument, forwarded unchanged to matadd().
 */
void cluster_delegate(void *arg)
{
    int cycles_before;
    int cycles_spent;
    float ops_per_cycle;

    printf("Running the Standard parallelization code (Autotiler-like)\n");

    gap_cl_resethwtimer();
    gap_cl_starttimer();
    cycles_before = gap_cl_readhwtimer();

    /* Run matadd() on the cluster cores. */
    pi_cl_team_fork(pi_cl_cluster_nb_cores(), matadd, arg);

    cycles_spent = gap_cl_readhwtimer() - cycles_before;
    ops_per_cycle = ((float) MAT_SIZE) / cycles_spent;
    printf("Elapsed: %d Cyc (%.2f Op/Cyc)\n\n", cycles_spent, ops_per_cycle);
}

/*
 * Fill the two input vectors with a deterministic pattern.
 *
 * For every index i in [0, size): In1[i] receives the integer quotient i / 10
 * and In2[i] receives the remainder i % 10, both converted to float.
 * Nothing is written when size is zero or negative.
 *
 * In1, In2: destination buffers, each holding at least size floats.
 * size:     number of elements to initialize.
 */
void init_arrs(float* In1, float * In2, int size) {
    int idx = 0;

    while (idx < size) {
        const int tens = idx / 10;   /* integer division, truncates */
        const int units = idx % 10;

        In1[idx] = (float) tens;
        In2[idx] = (float) units;
        idx++;
    }
}

/*
 * Verify the result buffer against the inputs.
 *
 * Recomputes In1[i] + In2[i] for each of the MAT_SIZE elements and compares
 * it for exact equality with Out[i]. Every mismatching index is printed.
 *
 * Returns the number of mismatching elements (0 when Out is fully correct).
 */
int check_results() {
    int mismatches = 0;

    for (int idx = 0; idx < MAT_SIZE; idx++) {
        if ((In1[idx] + In2[idx]) == Out[idx]) {
            continue;
        }
        printf("Error @ %d\n", idx);
        mismatches++;
    }

    return mismatches;
}

/*
 * Overwrite every element of Out with a value no correct run can produce.
 *
 * init_arrs() only stores non-negative values in In1 and In2, so the expected
 * sum is never -1.0f. Poisoning Out before each run guarantees that
 * check_results() fails if a kernel leaves any element unwritten, instead of
 * silently accepting stale data left by a previous run.
 */
static void poison_out(void)
{
    for (int i = 0; i < MAT_SIZE; i++) {
        Out[i] = -1.0f;
    }
}

/*
 * Program Entry.
 *
 * Opens the cluster, allocates the three MAT_SIZE-float buffers with
 * pi_cl_l1_malloc(), then runs and verifies the vector addition twice: first
 * with the hand-written parallelization (cluster_delegate), then with OpenMP
 * (openmp_cluster_delegate). Finally releases the buffers and closes the
 * cluster.
 *
 * Returns the total number of mismatching elements over both runs (0 on
 * success), or -1 if the cluster cannot be opened or an allocation fails.
 */
int main(void)
{
    printf("\n\t *** OPENMP Basic Example ***\n\n");

    pi_device_t* cluster_dev;
    if(pi_open(PI_CORE_CLUSTER, &cluster_dev))
    {
        printf("Cluster open failed !\n");
        pmsis_exit(-1);
        /* Never go on with an unopened cluster, should pmsis_exit() return. */
        return -1;
    }
    In1 = (float *) pi_cl_l1_malloc(cluster_dev, MAT_SIZE * sizeof(float));
    In2 = (float *) pi_cl_l1_malloc(cluster_dev, MAT_SIZE * sizeof(float));
    Out = (float *) pi_cl_l1_malloc(cluster_dev, MAT_SIZE * sizeof(float));
    if ((In1 == NULL) || (In2 == NULL) || (Out == NULL)) {
        printf("Error allocating the L1\n");
        /* Give back whatever was obtained before bailing out, in reverse
         * allocation order, and release the cluster. */
        if (Out != NULL) {
            pi_cl_l1_free(cluster_dev, Out, MAT_SIZE * sizeof(float));
        }
        if (In2 != NULL) {
            pi_cl_l1_free(cluster_dev, In2, MAT_SIZE * sizeof(float));
        }
        if (In1 != NULL) {
            pi_cl_l1_free(cluster_dev, In1, MAT_SIZE * sizeof(float));
        }
        pi_cluster_close(cluster_dev);
        return -1;
    }

    /* Prepare cluster task and send it to cluster. */
    struct pi_cluster_task cl_task;

    /* Run 1: hand-written parallelization. */
    init_arrs(In1, In2, MAT_SIZE);
    poison_out();
    pi_cluster_send_task_to_cl(cluster_dev, pi_cluster_task(&cl_task, cluster_delegate, NULL));
    int err = check_results();

    /* Run 2: OpenMP. The inputs are identical to run 1, so Out must be
     * poisoned again or the check would pass on run 1's results. */
    init_arrs(In1, In2, MAT_SIZE);
    poison_out();
    pi_cluster_send_task_to_cl(cluster_dev, pi_cluster_task(&cl_task, openmp_cluster_delegate, NULL));
    err += check_results();

    pi_cl_l1_free(cluster_dev, Out, MAT_SIZE * sizeof(float));
    pi_cl_l1_free(cluster_dev, In2, MAT_SIZE * sizeof(float));
    pi_cl_l1_free(cluster_dev, In1, MAT_SIZE * sizeof(float));
    pi_cluster_close(cluster_dev);


    if (err)
        printf("Test Failed !\n");
    else
        printf("Test Succeded !\n");

    return err;
}