Hi, I'm trying to perform a parallel reduction with SYCL, but after every calculation it seems my device fails to copy the values back to the host. Attached is a snippet of my code:
int ddot (const int n, const double * const x, const double * const y,
double * const result, double & time_allreduce)
{
// Since pointer is being passed by value, i modified the value at that memory address directly
// Initializing it to zero
double value = 0.0;
double * value_ptr = &value;
// Create a queue for the chosen device -- GPU
queue Q{};
buffer Result(value_ptr, range<1> (1));
buffer X(x, range<1> (n-1));
buffer Y(y, range<1> (n-1));
Q.submit([&](handler &h){
accessor sum{Result, h};
accessor xAcc{X, h};
accessor yAcc{Y, h};
h.parallel_for(
// (Total work items across work groups, work item in each group
nd_range<1>{range<1>(n), range<1>(n)},
reduction(sum, plus<>()),
[=](nd_item<1> idx, auto& sum) {
int i = idx.get_global_id(0);
sum += xAcc[i] * yAcc[i];
});
});
Q.wait();
return(0);
I was expecting to get some nonzero values, given some randomised x and y vectors, but I get zero.