Skip to content

Update GPU-Opt-Guide examples to 2023.03.14 #1432

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#define MAX 100
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))

#define IDX2(i, j) (i * P + j)
#define IDX2(i, j) (i * j)
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)

int main(void) {
Expand All @@ -43,7 +43,7 @@ int main(void) {
dx[i] = scaled_rand();

/* map data to device */
#pragma omp target enter data map(to : u [0:SIZE], dx [0:P * P])
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])

start = omp_get_wtime();

Expand All @@ -58,7 +58,7 @@ int main(void) {
double us = 0.;
double ut = 0.;

s1 = dx[IDX4(b, 0, 0, k)];
s1 = dx[IDX2(b, k)];
s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
s3 = 0.145;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#define MAX 100
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))

#define IDX2(i, j) (i * P + j)
#define IDX2(i, j) (i * j)
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)

int main(void) {
Expand All @@ -43,7 +43,7 @@ int main(void) {
dx[i] = scaled_rand();

/* map data to device */
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])

start = omp_get_wtime();

Expand All @@ -58,7 +58,7 @@ int main(void) {
double us = 0.;
double ut = 0.;

s1 = dx[IDX4(b, 0, 0, k)];
s1 = dx[IDX2(b, k)];
s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
s3 = 0.145;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#define MAX 100
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))

#define IDX2(i, j) (i * P + j)
#define IDX2(i, j) (i * j)
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)

int main(void) {
Expand All @@ -42,7 +42,7 @@ int main(void) {
dx[i] = scaled_rand();

/* map data to device */
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])

start = omp_get_wtime();

Expand All @@ -57,7 +57,7 @@ int main(void) {
double us = 0.;
double ut = 0.;

double s1 = dx[IDX4(b, 0, 0, k)];
double s1 = dx[IDX2(b, k)];
double s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
double s3 = 0.145;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ int main(int argc, char **argv) {
return EXIT_FAILURE;
}

FLOAT alpha, beta;
int niter, verify;
int HA = atoi(argv[1]);
int WA = atoi(argv[2]);
int WB = atoi(argv[3]);
FLOAT alpha, beta;
int niter, verify;

if ((HA == 0) || (WA == 0) || (WB == 0))
exit(1);

if (argc > 4) {

#if PRECISION == 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//==============================================================
// Copyright © 2022 Intel Corporation
// Copyright © 2023 Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
Expand All @@ -9,6 +9,9 @@

double * make_array(int n, double value) {
double* array = static_cast<double*>(malloc(n * sizeof(double)));
if (array == NULL)
return NULL;

for (int i = 0; i < n; i++) {
array[i] = value / (100.0 + i);
}
Expand All @@ -23,6 +26,8 @@ int main() {
double* A = make_array(N, 0.8);
double* B = make_array(N, 0.65);
double* C = make_array(N*N, 2.5);
if ((A == NULL) || (B == NULL) || (C == NULL))
exit(1);

int i, j;
double val = 0.0;
Expand All @@ -38,7 +43,7 @@ int main() {
}
}

printf("Reduced val[%f10.3]", val);
printf("val = %f10.3\n", val);

free(A);
free(B);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
add_compile_options(-g -mcmodel=medium)
add_definitions(-DPREFETCH)
add_subdirectory(c)
add_subdirectory(fortran)
add_subdirectory(c_simd)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_example(nbody_c)
150 changes: 150 additions & 0 deletions Publications/GPU-Opt-Guide/OpenMP/26_omp_prefetch/c/nbody_c.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#include <math.h>
#include <omp.h>
#include <stdio.h>

#define CACHE_CLEAN_SIZE 100000000
#define ITERATIONS 100
#define ARRAYLEN1 4096
#define ARRAYLEN2 32768
// snippet-begin
#define WORKGROUP_SIZE 1024
#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3; 2 = prefetch to L3
#define TILE_SIZE 64

// Offloaded 1-D n-body style kernel: for every i in [0, n1) it accumulates
// f * delta over the elements of b, where delta = b[...] - a[i], and stores
// the accumulated value scaled by 0.23 in c[i].
//
// The loop over i is offloaded with "omp target teams distribute parallel
// for", using WORKGROUP_SIZE as the thread_limit. b is processed in tiles of
// TILE_SIZE elements: each tile is first copied into the local array bb and
// then consumed by a second loop.
//
// When PREFETCH is defined, the tile-load loop additionally issues an
// "ompx prefetch" (with hint PREFETCH_HINT) for elements of the following
// tile of b.
//
// Parameters:
//   c  - output array; c[0 .. n1) is written
//   a  - input array; a[0 .. n1) is read
//   b  - input array; read tile by tile starting at index 0
//   n1 - number of outputs (and of elements of a)
//   n2 - number of elements of b
//
// NOTE(review): the tile load reads b[j + u] for every u in [0, TILE_SIZE)
// without comparing j + u against n2, so n2 appears to be required to be a
// multiple of TILE_SIZE; otherwise the last tile reads past b[n2 - 1].
// NOTE(review): no map clauses appear in this function; presumably the caller
// has already made a, b and c present on the device - confirm.
void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) {
#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE)
for (int i = 0; i < n1; i++) {
// ma0..ma5 are the coefficients of the polynomial in r2 that is subtracted
// from s1^3 below; eps is added to r2 before the square root, which keeps s0
// strictly positive even when delta is 0.
const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
const float eps = 0.01f;

// dx accumulates f * delta over all tiles for this i.
float dx = 0.0;
// bb holds one tile of b; it is declared per iteration of the i loop.
float bb[TILE_SIZE];
for (int j = 0; j < n2; j += TILE_SIZE) {
// Copy the current tile b[j .. j + TILE_SIZE) into bb.
for (int u = 0; u < TILE_SIZE; ++u) {
bb[u] = b[j + u];
#ifdef PREFETCH
// Index of the element at the same offset u in the following tile.
int next_tile = j + TILE_SIZE + u;
// Only indices that are multiples of 16 issue a prefetch (16 floats are
// 64 bytes; presumably one request per cache line - confirm). The if clause
// on the pragma restricts the prefetch to next_tile < n2.
if ((next_tile % 16) == 0) {
#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2)
}
#endif
}
// Consume the tile held in bb; the trip count is the compile-time constant
// TILE_SIZE, which is also the requested unroll factor.
#pragma unroll(TILE_SIZE)
for (int u = 0; u < TILE_SIZE; ++u) {
float delta = bb[u] - a[i];
float r2 = delta * delta;
float s0 = r2 + eps;
float s1 = 1.0f / sqrtf(s0);
// f = s1^3 minus a nested polynomial in r2.
// NOTE(review): the innermost term is (ma4 + ma5), not (ma4 + r2 * ma5);
// nbody_1d_cpu uses the same expression - confirm this is intended.
float f =
(s1 * s1 * s1) -
(ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
dx += f * delta;
}
}
// Store the accumulated sum for this i, scaled by 0.23.
c[i] = dx * 0.23f;
}
}
// snippet-end

// Host-side reference for the 1-D n-body style computation.
//
// For each index in [0, n1) the function walks all n2 entries of b, forms
// the difference to a[index], evaluates f = s1^3 - poly(r2) and accumulates
// f * difference. The accumulated value, scaled by 0.23, is written to
// c[index].
//
// Parameters:
//   c  - output array; c[0 .. n1) is written
//   a  - input array; a[0 .. n1) is read
//   b  - input array; b[0 .. n2) is read
//   n1 - number of outputs
//   n2 - number of entries of b visited per output
void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) {
  // Polynomial coefficients and the term added to r2 before the square root.
  // They never change, so they are set up once ahead of both loops.
  const float k0 = 0.269327f;
  const float k1 = -0.0750978f;
  const float k2 = 0.0114808f;
  const float k3 = -0.00109313f;
  const float k4 = 0.0000605491f;
  const float k5 = -0.00000147177f;
  const float softening = 0.01f;

  for (int out = 0; out < n1; ++out) {
    float accum = 0.0f;

    int src = 0;
    while (src < n2) {
      float dist = b[src] - a[out];
      float dist2 = dist * dist;
      float soft2 = dist2 + softening;
      float inv_r = 1.0f / sqrtf(soft2);
      // Same expression shape as the offloaded kernel so both paths round
      // identically.
      float force =
          (inv_r * inv_r * inv_r) -
          (k0 + dist2 * (k1 + dist2 * (k2 + dist2 * (k3 + dist2 * (k4 + k5)))));
      accum += force * dist;
      ++src;
    }

    c[out] = accum * 0.23f;
  }
}

// Overwrites d[0 .. n) with the values 0, 1, ..., n - 1 from inside an
// "omp target teams distribute parallel for" region (thread_limit 1024).
// Judging by its name and its use before each timed kernel launch, it is
// presumably meant to flush previously cached data on the device - confirm.
//
// Parameters:
//   d - array with at least n elements; no map clause is given here, so the
//       caller is presumably expected to have mapped it to the device
//   n - number of elements to overwrite; n <= 0 leaves d untouched
//
// The loop index is a signed int to match n. An unsigned index compared
// against a negative n would convert n to a huge unsigned bound and write
// far past the end of d.
void clean_cache_gpu(double *d, int n) {
#pragma omp target teams distribute parallel for thread_limit(1024)
  for (int i = 0; i < n; ++i) {
    d[i] = i;
  }
}

// Driver: builds the input arrays, runs the offloaded kernel ITERATIONS
// times (calling clean_cache_gpu before each timed launch), then prints the
// sum of the GPU result, the sum of the CPU reference result, and the
// accumulated kernel time in milliseconds.
int main() {
  // Host buffers. a and c have ARRAYLEN1 elements, b has ARRAYLEN2, and d is
  // the CACHE_CLEAN_SIZE-element array handed to clean_cache_gpu.
  float *a = new float[ARRAYLEN1];
  float *b = new float[ARRAYLEN2];
  float *c = new float[ARRAYLEN1];
  double *d = new double[CACHE_CLEAN_SIZE];

  // Initialize: b is a running sum with step 1 / ARRAYLEN2 starting at 0,
  // a copies the leading ARRAYLEN1 entries of b, and c starts out zeroed.
  const float step = 1.0f / (float)ARRAYLEN2;
  b[0] = 0.0f;
  for (int k = 1; k < ARRAYLEN2; ++k) {
    b[k] = b[k - 1] + step;
  }
  for (int k = 0; k < ARRAYLEN1; ++k) {
    a[k] = b[k];
  }
  for (int k = 0; k < ARRAYLEN1; ++k) {
    c[k] = 0.0f;
  }

  // Empty target region executed before any timing starts.
#pragma omp target
  {}

  // Allocate device storage for all four arrays, then copy the inputs over.
#pragma omp target enter data map(alloc : a [0:ARRAYLEN1], b [0:ARRAYLEN2], c [0:ARRAYLEN1])
#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE])

#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2])

  // Only the nbody_1d_gpu call sits between the two timestamps.
  double gpu_seconds = 0.0;
  for (int iter = 0; iter < ITERATIONS; ++iter) {
    clean_cache_gpu(d, CACHE_CLEAN_SIZE);

    const double t_begin = omp_get_wtime();
    nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
    const double t_end = omp_get_wtime();

    gpu_seconds += (t_end - t_begin);
  }

  // Bring the device result back and report its sum.
#pragma omp target update from(c [0:ARRAYLEN1])

  double gpu_total = 0.0;
  for (int k = 0; k < ARRAYLEN1; ++k) {
    gpu_total += c[k];
  }
  printf("Obtained output = %8.3f\n", gpu_total);

  // Recompute on the host into a cleared c and report that sum as well.
  for (int k = 0; k < ARRAYLEN1; ++k) {
    c[k] = 0.0f;
  }
  nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2);

  double cpu_total = 0.0;
  for (int k = 0; k < ARRAYLEN1; ++k) {
    cpu_total += c[k];
  }
  printf("Expected output = %8.3f\n", cpu_total);

  printf("\nTotal time = %8.1f milliseconds\n", (gpu_seconds * 1000));

  // Release the device mappings, then the host buffers.
#pragma omp target exit data map(delete : a [0:ARRAYLEN1], b [0:ARRAYLEN2], c [0:ARRAYLEN1])
#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE])

  delete[] a;
  delete[] b;
  delete[] c;
  delete[] d;

  return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
add_compile_options(-fopenmp-target-simd)
add_example(nbody_c_simd)
Loading