LU Matrix Decomposition, OpenMP
From October 30, 2019
OpenMP
Is an API for multi-processing that supports multi-platform shared memory distribution, consisting of compiler directives, routines and environment variables. OpenMP has a simple interface with a scalable and flexible model.
To finish our LU matrix decomposition, this time we will do it in parallel.
#pragma omp parallel
{
}
The final program.
#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
#define WIDTH 4
#define NUM_VALUES 16
#define INDX_POS(i,j) ((WIDTH * i) + (j))
void print(char* n, float* m)
{
fprintf(stdout, "\n%s\n", n);
for(int i = 0; i < NUM_VALUES; i++)
{
fprintf(stdout, "%.3f ", m[i]);
if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
}
}
int validate(float* input, float* output)
{
for (int i = 0; i < WIDTH; i++)
{
if ( input[i] != output[i] )
{
fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
fflush(stdout);
return 0;
}
}
return 1;
}
void createL(float* output, float* l)
{
for(int i = 0; i < WIDTH; i++)
{
for(int j = 0; j < WIDTH; j++)
{
if(i == j)
{
l[INDX_POS(i,j)] = 1;
}
else if (i < j)
{
l[INDX_POS(i,j)] = 0;
}
else if(i > j)
{
l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
}
}
}
}
void createU(float* output, float* u)
{
for(int i = 0; i < WIDTH; i++)
{
for(int j = 0; j < WIDTH; j++)
{
if(j >= i)
{
u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
}
}
}
}
void LUdecomposition(float* input, float* l, float* u)
{
for(int i = 0; i < NUM_VALUES - 1; i++)
{
int row = 0;
#pragma omp parallel for private(row) shared(input)
{
for(row = i + 1; row < NUM_VALUES; row++)
{
float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
for(int col = i + 1; col < NUM_VALUES; col++)
{
input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
}
input[INDX_POS(row,i)] = factor;
}
}
}
#pragma omp parallel
{
createL(input, l);
createU(input, u);
}
}
int main(int argc, const char * argv[])
{
float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
float* l = (float*) malloc(sizeof(float) * NUM_VALUES);
float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
for (int i = 0; i < NUM_VALUES; i++)
{
l[i] = 0;
u[i] = 0;
test_out[i] = 0;
test_in[i] = i + 1;
}
// i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
// A L U
// 4 3 1 0 4 3
// 6 3 1.5 1 0 -1.5
// A L U
// 3 1 1 0 3 1
// 4 2 1.3 1 0 0.6
//test_in[0] = 4; // 0 0
//test_in[1] = 3; // 0 1
//test_in[2] = 6; // 1 0
//test_in[3] = 3; // 1 1
// A L U
// 1 2 3 1 0 0 1 2 3
// 4 5 6 4 1 0 0 -3 -6
// 7 8 9 7 2 1 0 0 0
test_in[0] = 80.428;
test_in[1] = -12.818;
test_in[2] = -81.284;
test_in[3] = -95.437;
test_in[4] = -79.099;
test_in[5] = 14.754;
test_in[6] = 80.749;
test_in[7] = 46.408;
test_in[8] = -28.549;
test_in[9] = 76.515;
test_in[10] = -14.687;
test_in[11] = -75.622;
test_in[12] = -88.490;
test_in[13] = -82.585;
test_in[14] = 85.373;
test_in[15] = -61.574;
print("INPUT", test_in);
auto t1 = std::chrono::high_resolution_clock::now();
LUdecomposition(test_in, l, u);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
print("L", l);
print("U", u);
double sum = 0;
for(int i = 0; i < WIDTH; i++)
{
for(int j = 0; j < WIDTH; j++)
{
for(int k = 0; k < WIDTH; k++)
{
//test_out[i][j] += l[i][k] * u[k][j];
sum = sum + l[INDX_POS(i,k)] * u[INDX_POS(k,j)];
}
test_out[INDX_POS(i,j)] = sum;
sum = 0;
}
}
print("OUTPUT", test_out);
if ( validate(test_in, test_out))
{
fprintf(stdout, "\nAll values were properly calculated.\n");
}
fprintf(stdout, "\n%llu milliseconds\n", duration);
return 0;
}