#include <cstddef>
#include <vector>

/// Element-wise vector addition: r[i] = a[i] + b[i] for every i in [0, n).
///
/// The loop is annotated with an OpenACC `kernels loop` directive. The
/// `present(r, a, b)` clause states that all three arrays already have device
/// copies, so the caller must invoke this function from inside an enclosing
/// OpenACC data region (or equivalent) that covers them. When the file is built
/// without OpenACC support the pragma is ignored and the loop runs on the host.
///
/// @param r  output array, at least n elements
/// @param a  first input array, at least n elements
/// @param b  second input array, at least n elements
/// @param n  number of elements to process
void vecaddgpu(float* r, float* a, float* b, std::size_t n) {
#pragma acc kernels loop present(r, a, b)
  for (std::size_t i = 0; i < n; ++i) r[i] = a[i] + b[i];
}

/// Test driver: fills two input vectors, adds them through vecaddgpu() inside
/// an OpenACC data region, recomputes the sum on the host, and compares.
///
/// @return 0 when every element matches; otherwise the number of mismatching
///         elements, capped at 255 so the value survives as a process exit
///         status.
int main(int, char*[]) {
  const std::size_t n = 100000; /* vector length */

  std::vector<float> a(n); /* input vector 1 */
  std::vector<float> b(n); /* input vector 2 */
  std::vector<float> r(n); /* output vector */
  std::vector<float> e(n); /* expected output values */

  for (std::size_t i = 0; i < n; ++i) {
    a[i] = static_cast<float>(i + 1);
    b[i] = static_cast<float>(1000 * i);
  }

  /* compute on the GPU */
  // The data clauses take plain pointers with an explicit [start:length]
  // section, so expose the vectors' contiguous storage first.
  auto a_ptr = a.data();
  auto b_ptr = b.data();
  auto r_ptr = r.data();

  // copyin: inputs are transferred to the device on entry to the region.
  // copyout: the result is transferred back to the host on exit.
#pragma acc data copyin(a_ptr[0:n], b_ptr[0:n]) copyout(r_ptr[0:n])
  {
    vecaddgpu(r_ptr, a_ptr, b_ptr, n);
  }

  /* compute on the host to compare */
  for (std::size_t i = 0; i < n; ++i) e[i] = a[i] + b[i];

  /* compare results */
  int errs = 0;
  for (std::size_t i = 0; i < n; ++i) {
    if (r[i] != e[i]) {
      ++errs;
    }
  }

  // Only the low 8 bits of the return value reach the parent process on POSIX
  // systems, so an unclamped count that is a multiple of 256 would be reported
  // as success. Cap the value so any mismatch yields a non-zero status.
  const int max_exit_status = 255;
  return errs < max_exit_status ? errs : max_exit_status;
}