Skip to content

Commit db5d628

Browse files
authored
Update GPU-Opt-Guide examples to 2023.03.14 (#1432)
1 parent 3798091 commit db5d628

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+819
-154
lines changed

Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#define MAX 100
1919
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
2020

21-
#define IDX2(i, j) (i * P + j)
21+
#define IDX2(i, j) (i * j)
2222
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
2323

2424
int main(void) {
@@ -43,7 +43,7 @@ int main(void) {
4343
dx[i] = scaled_rand();
4444

4545
/* map data to device */
46-
#pragma omp target enter data map(to : u [0:SIZE], dx [0:P * P])
46+
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
4747

4848
start = omp_get_wtime();
4949

@@ -58,7 +58,7 @@ int main(void) {
5858
double us = 0.;
5959
double ut = 0.;
6060

61-
s1 = dx[IDX4(b, 0, 0, k)];
61+
s1 = dx[IDX2(b, k)];
6262
s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
6363
s3 = 0.145;
6464

Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#define MAX 100
1919
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
2020

21-
#define IDX2(i, j) (i * P + j)
21+
#define IDX2(i, j) (i * j)
2222
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
2323

2424
int main(void) {
@@ -43,7 +43,7 @@ int main(void) {
4343
dx[i] = scaled_rand();
4444

4545
/* map data to device */
46-
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
46+
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
4747

4848
start = omp_get_wtime();
4949

@@ -58,7 +58,7 @@ int main(void) {
5858
double us = 0.;
5959
double ut = 0.;
6060

61-
s1 = dx[IDX4(b, 0, 0, k)];
61+
s1 = dx[IDX2(b, k)];
6262
s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
6363
s3 = 0.145;
6464

Publications/GPU-Opt-Guide/OpenMP/06_scalars_private/test_scalars_private_2.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#define MAX 100
1919
#define scaled_rand() ((rand() % MAX) / (1.0 * MAX))
2020

21-
#define IDX2(i, j) (i * P + j)
21+
#define IDX2(i, j) (i * j)
2222
#define IDX4(b, i, j, k) (b * P * P * P + i * P * P + j * P + k)
2323

2424
int main(void) {
@@ -42,7 +42,7 @@ int main(void) {
4242
dx[i] = scaled_rand();
4343

4444
/* map data to device */
45-
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P * P])
45+
#pragma omp target enter data map(to: u[0:SIZE], dx[0:P*P])
4646

4747
start = omp_get_wtime();
4848

@@ -57,7 +57,7 @@ int main(void) {
5757
double us = 0.;
5858
double ut = 0.;
5959

60-
double s1 = dx[IDX4(b, 0, 0, k)];
60+
double s1 = dx[IDX2(b, k)];
6161
double s2 = u[IDX4(b, 0, 0, k)] + BLOCKS;
6262
double s3 = 0.145;
6363

Publications/GPU-Opt-Guide/OpenMP/22_mkl_pad/dgemm_pad_c_01.cpp

+6-2
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,15 @@ int main(int argc, char **argv) {
147147
return EXIT_FAILURE;
148148
}
149149

150+
FLOAT alpha, beta;
151+
int niter, verify;
150152
int HA = atoi(argv[1]);
151153
int WA = atoi(argv[2]);
152154
int WB = atoi(argv[3]);
153-
FLOAT alpha, beta;
154-
int niter, verify;
155+
156+
if ((HA == 0) || (WA == 0) || (WB == 0))
157+
exit(1);
158+
155159
if (argc > 4) {
156160

157161
#if PRECISION == 1

Publications/GPU-Opt-Guide/OpenMP/23_omp_work_group/test_omp_work_group.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//==============================================================
2-
// Copyright © 2022 Intel Corporation
2+
// Copyright © 203 Intel Corporation
33
//
44
// SPDX-License-Identifier: MIT
55
// =============================================================
@@ -9,6 +9,9 @@
99

1010
double * make_array(int n, double value) {
1111
double* array = static_cast<double*>(malloc(n * sizeof(double)));
12+
if (array == NULL)
13+
return NULL;
14+
1215
for (int i = 0; i < n; i++) {
1316
array[i] = value / (100.0 + i);
1417
}
@@ -23,6 +26,8 @@ int main() {
2326
double* A = make_array(N, 0.8);
2427
double* B = make_array(N, 0.65);
2528
double* C = make_array(N*N, 2.5);
29+
if ((A == NULL) || (B == NULL) || (C == NULL))
30+
exit(1);
2631

2732
int i, j;
2833
double val = 0.0;
@@ -38,7 +43,7 @@ int main() {
3843
}
3944
}
4045

41-
printf("Reduced val[%f10.3]", val);
46+
printf("val = %f10.3\n", val);
4247

4348
free(A);
4449
free(B);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
add_compile_options(-g -mcmodel=medium)
2+
add_definitions(-DPREFETCH)
3+
add_subdirectory(c)
4+
add_subdirectory(fortran)
5+
add_subdirectory(c_simd)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
add_example(nbody_c)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#include <math.h>
2+
#include <omp.h>
3+
#include <stdio.h>
4+
5+
#define CACHE_CLEAN_SIZE 100000000
6+
#define ITERATIONS 100
7+
#define ARRAYLEN1 4096
8+
#define ARRAYLEN2 32768
9+
// snippet-begin
10+
#define WORKGROUP_SIZE 1024
11+
#define PREFETCH_HINT 4 // 4 = prefetch to L1 and L3; 2 = prefetch to L3
12+
#define TILE_SIZE 64
13+
14+
void nbody_1d_gpu(float *c, float *a, float *b, int n1, int n2) {
15+
#pragma omp target teams distribute parallel for thread_limit(WORKGROUP_SIZE)
16+
for (int i = 0; i < n1; i++) {
17+
const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
18+
const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
19+
const float eps = 0.01f;
20+
21+
float dx = 0.0;
22+
float bb[TILE_SIZE];
23+
for (int j = 0; j < n2; j += TILE_SIZE) {
24+
// load tile from b
25+
for (int u = 0; u < TILE_SIZE; ++u) {
26+
bb[u] = b[j + u];
27+
#ifdef PREFETCH
28+
int next_tile = j + TILE_SIZE + u;
29+
if ((next_tile % 16) == 0) {
30+
#pragma ompx prefetch data(PREFETCH_HINT : b[next_tile]) if (next_tile < n2)
31+
}
32+
#endif
33+
}
34+
#pragma unroll(TILE_SIZE)
35+
for (int u = 0; u < TILE_SIZE; ++u) {
36+
float delta = bb[u] - a[i];
37+
float r2 = delta * delta;
38+
float s0 = r2 + eps;
39+
float s1 = 1.0f / sqrtf(s0);
40+
float f =
41+
(s1 * s1 * s1) -
42+
(ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
43+
dx += f * delta;
44+
}
45+
}
46+
c[i] = dx * 0.23f;
47+
}
48+
}
49+
// snippet-end
50+
51+
void nbody_1d_cpu(float *c, float *a, float *b, int n1, int n2) {
52+
for (int i = 0; i < n1; ++i) {
53+
const float ma0 = 0.269327f, ma1 = -0.0750978f, ma2 = 0.0114808f;
54+
const float ma3 = -0.00109313f, ma4 = 0.0000605491f, ma5 = -0.00000147177f;
55+
const float eps = 0.01f;
56+
57+
float dx = 0.0f;
58+
for (int j = 0; j < n2; ++j) {
59+
float delta = b[j] - a[i];
60+
float r2 = delta * delta;
61+
float s0 = r2 + eps;
62+
float s1 = 1.0f / sqrtf(s0);
63+
float f = (s1 * s1 * s1) -
64+
(ma0 + r2 * (ma1 + r2 * (ma2 + r2 * (ma3 + r2 * (ma4 + ma5)))));
65+
dx += f * delta;
66+
}
67+
c[i] = dx * 0.23f;
68+
}
69+
}
70+
71+
void clean_cache_gpu(double *d, int n) {
72+
73+
#pragma omp target teams distribute parallel for thread_limit(1024)
74+
for (unsigned i = 0; i < n; ++i)
75+
d[i] = i;
76+
77+
return;
78+
}
79+
80+
int main() {
81+
82+
float *a, *b, *c;
83+
double *d;
84+
85+
a = new float[ARRAYLEN1];
86+
b = new float[ARRAYLEN2];
87+
c = new float[ARRAYLEN1];
88+
d = new double[CACHE_CLEAN_SIZE];
89+
90+
// intialize
91+
float dx = 1.0f / (float)ARRAYLEN2;
92+
b[0] = 0.0f;
93+
for (int i = 1; i < ARRAYLEN2; ++i) {
94+
b[i] = b[i - 1] + dx;
95+
}
96+
for (int i = 0; i < ARRAYLEN1; ++i) {
97+
a[i] = b[i];
98+
c[i] = 0.0f;
99+
}
100+
101+
#pragma omp target
102+
{}
103+
104+
#pragma omp target enter data map(alloc \
105+
: a [0:ARRAYLEN1], b [0:ARRAYLEN2], \
106+
c [0:ARRAYLEN1])
107+
#pragma omp target enter data map(alloc : d [0:CACHE_CLEAN_SIZE])
108+
109+
#pragma omp target update to(a [0:ARRAYLEN1], b [0:ARRAYLEN2])
110+
111+
double t1, t2, elapsed_s = 0.0;
112+
for (int i = 0; i < ITERATIONS; ++i) {
113+
clean_cache_gpu(d, CACHE_CLEAN_SIZE);
114+
115+
t1 = omp_get_wtime();
116+
nbody_1d_gpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
117+
t2 = omp_get_wtime();
118+
119+
elapsed_s += (t2 - t1);
120+
}
121+
122+
#pragma omp target update from(c [0:ARRAYLEN1])
123+
124+
double sum = 0.0f;
125+
for (int i = 0; i < ARRAYLEN1; ++i)
126+
sum += c[i];
127+
printf("Obtained output = %8.3f\n", sum);
128+
129+
for (int i = 0; i < ARRAYLEN1; ++i)
130+
c[i] = 0.0f;
131+
nbody_1d_cpu(c, a, b, ARRAYLEN1, ARRAYLEN2);
132+
sum = 0.0f;
133+
for (int i = 0; i < ARRAYLEN1; ++i)
134+
sum += c[i];
135+
printf("Expected output = %8.3f\n", sum);
136+
137+
printf("\nTotal time = %8.1f milliseconds\n", (elapsed_s * 1000));
138+
139+
#pragma omp target exit data map(delete \
140+
: a [0:ARRAYLEN1], b [0:ARRAYLEN2], \
141+
c [0:ARRAYLEN1])
142+
#pragma omp target exit data map(delete : d [0:CACHE_CLEAN_SIZE])
143+
144+
delete[] a;
145+
delete[] b;
146+
delete[] c;
147+
delete[] d;
148+
149+
return 0;
150+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
add_compile_options(-fopenmp-target-simd)
2+
add_example(nbody_c_simd)

0 commit comments

Comments
 (0)