/**
 * Element-wise multiply-add: r[i] = x[i] * y[i] + z[i] for every i in [0, size).
 *
 * Launch layout: only the x-dimension of the grid and block is used for
 * indexing. The loop is a grid-stride loop, so any 1D launch configuration
 * covers the whole range and each element is handled by exactly one thread;
 * a single-block launch behaves as a plain block-stride loop.
 *
 * @param r     output array, at least `size` elements
 * @param x     first factor, at least `size` elements
 * @param y     second factor, at least `size` elements
 * @param z     addend, at least `size` elements
 * @param size  number of elements to process; values <= 0 make the kernel a no-op
 *
 * The pointers are deliberately not marked __restrict__: r may alias an input
 * (element i only ever depends on element i of the inputs).
 */
__global__ void kernelA(float* r, float* x, float* y, float* z, int size) {
    // 64-bit index math so blockIdx.x * blockDim.x and the stride cannot wrap,
    // while a negative `size` still compares correctly and yields no iterations.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < size; i += stride) {
        r[i] = x[i] * y[i] + z[i];
    }
}

/**
 * Element-wise multiply-add: r[i] = x[i] * y[i] + z[i] for every i in [0, size).
 *
 * Performs the same computation as kernelA with the same launch layout: only
 * the x-dimension is used, and a grid-stride loop assigns each element to
 * exactly one thread for any 1D launch configuration.
 * NOTE(review): presumably kept as a separate symbol so two distinct kernels
 * can be launched side by side — confirm against the caller.
 *
 * @param r     output array, at least `size` elements
 * @param x     first factor, at least `size` elements
 * @param y     second factor, at least `size` elements
 * @param z     addend, at least `size` elements
 * @param size  number of elements to process; values <= 0 make the kernel a no-op
 */
__global__ void kernelB(float* r, float* x, float* y, float* z, int size) {
    // 64-bit index math so blockIdx.x * blockDim.x and the stride cannot wrap,
    // while a negative `size` still compares correctly and yields no iterations.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < size; i += stride) {
        r[i] = x[i] * y[i] + z[i];
    }
}