Commit a38ae1d9 authored and committed by Daniel Vonk

use GPU-based store

parent 5a980242
2 merge requests: !19 Merge develop into main, !17 Add CUDA backend
@@ -170,6 +170,14 @@ void SelfVectorsCUDAScatterDevice::compute()
     afinal_ = 0;
     a2final_ = 0;
+    cudaMalloc(&datfinal_, NF * sizeof(fftw_complex));
+    cudaMalloc(&dafinal_, sizeof(fftw_complex));
+    cudaMalloc(&da2final_, sizeof(fftw_complex));
+    cudaMemset(datfinal_, 0, NF * sizeof(fftw_complex));
+    cudaMemset(dafinal_, 0, sizeof(fftw_complex));
+    cudaMemset(da2final_, 0, sizeof(fftw_complex));
     size_t NNPP = partitioncomm_.size();
     // enough space for NTHREADS at a time. Then it is reduced and stored.
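This hunk gives compute() device-resident mirrors of the host accumulators (dafinal_, da2final_, datfinal_ for afinal_, a2final_, atfinal_) and zeroes them so the new store kernel can accumulate into them across batches. The calls above discard their error codes; a minimal error-checked sketch of the same allocation pattern (the CUDA_CHECK helper is illustrative, not from this repository) could look like:

#include <cuda_runtime.h>
#include <fftw3.h>
#include <cstdio>
#include <cstdlib>

// Illustrative helper, not from this repository: abort with a readable
// message when a CUDA runtime call fails instead of continuing silently.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            std::fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,   \
                         cudaGetErrorString(err_));                   \
            std::abort();                                             \
        }                                                             \
    } while (0)

// Allocate and zero a device accumulator of NF fftw_complex values,
// mirroring the datfinal_ allocation in the hunk above.
fftw_complex *alloc_device_accumulator(size_t NF) {
    fftw_complex *d = nullptr;
    CUDA_CHECK(cudaMalloc(&d, NF * sizeof(fftw_complex)));
    CUDA_CHECK(cudaMemset(d, 0, NF * sizeof(fftw_complex)));
    return d;
}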
@@ -201,7 +209,7 @@ void SelfVectorsCUDAScatterDevice::compute()
         // square every complex number in 0..NF
         dsp(pat);
         // doesn't actually store, just adds
-        store(pat);
+        // store(pat);
     }
     timer.stop("sd:c:b:dspstore");
@@ -222,6 +230,15 @@ void SelfVectorsCUDAScatterDevice::compute()
     timer.stop("sd:c:wait");
     timer.start("sd:c:reduce");
+    // Copy signal outputs back to host
+    cudaMemcpy(&afinal_, dafinal_, sizeof(fftw_complex), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&a2final_, da2final_, sizeof(fftw_complex), cudaMemcpyDeviceToHost);
+    cudaMemcpy(atfinal_, datfinal_, NF * sizeof(fftw_complex), cudaMemcpyDeviceToHost);
     if (NNPP > 1) {
         double *p_atfinal = ( double * )&(atfinal_[0][0]);
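These blocking copies retrieve the device-side accumulators after the sd:c:wait phase has completed, so all cudaFlow work is assumed finished by this point. Note that the host members afinal_ and a2final_ are std::complex<double> (see the header hunk below) while the device buffers hold fftw_complex; both are two contiguous doubles, which the raw memcpy relies on. A compile-time guard (illustrative, not in this diff) makes that assumption explicit:

#include <complex>
#include <fftw3.h>

// Both types store two contiguous doubles (real, imag), which is what the
// raw cudaMemcpy between fftw_complex buffers and std::complex<double>
// members relies on; this guard makes the assumption explicit.
static_assert(sizeof(std::complex<double>) == sizeof(fftw_complex),
              "std::complex<double> must have fftw_complex's layout");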
@@ -261,6 +278,10 @@ void SelfVectorsCUDAScatterDevice::compute()
         afinal_ *= factor;
         a2final_ *= factor;
     }
+    cudaFree(datfinal_);
+    cudaFree(dafinal_);
+    cudaFree(da2final_);
 }
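The cudaFree calls pair with the cudaMalloc calls at the top of compute(); any early return or exception between them would leak the device buffers. One illustrative alternative (not part of this codebase) is a small RAII guard:

#include <cuda_runtime.h>
#include <cstddef>

// Illustrative RAII guard, not part of this codebase: releases a device
// allocation when the owning scope ends, even on early return or throw.
struct DeviceBuffer {
    void *ptr = nullptr;
    explicit DeviceBuffer(size_t bytes) { cudaMalloc(&ptr, bytes); }
    ~DeviceBuffer() { cudaFree(ptr); }
    DeviceBuffer(const DeviceBuffer &) = delete;
    DeviceBuffer &operator=(const DeviceBuffer &) = delete;
};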
@@ -442,8 +463,14 @@ void SelfVectorsCUDAScatterDevice::tf_scatter(size_t atomindex, size_t index, si
     auto dsp = cudaflow.kernel(1, NTHREADS, 0, sass::cuda::square_elements, dat_, 2 * N, NF)
                    .name("square_dsp");
+    auto store =
+        cudaflow
+            .kernel(1, NTHREADS, 0, sass::cuda::store, dafinal_, da2final_, datfinal_, dat_, N, NF)
+            .name("store");
     kernel.succeed(zero_dat, h2d_scatter_factors, h2d_coords, h2d_subvector_index).precede(dsp);
-    dsp.precede(d2h_dat_);
+    dsp.precede(d2h_dat_, store);
+    store.precede(d2h_dat_);
     // cudaflow.dump(std::cout);
     tf::cudaStream stream;
...
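The new store task slots between the squaring kernel and the device-to-host copy of dat_: dsp.precede(d2h_dat_, store) makes both depend on dsp, and store.precede(d2h_dat_) keeps the copy from starting until the accumulation has run. A minimal self-contained sketch of the same Taskflow cudaFlow dependency pattern (the kernels here are placeholders, and the run/synchronize calls assume a Taskflow version matching the tf::cudaStream usage above):

#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>

// Placeholder kernels standing in for square_elements and store.
__global__ void producer(float *buf, int n) { /* ... fill buf ... */ }
__global__ void consumer(float *buf, int n) { /* ... reduce buf ... */ }

int main() {
    int n = 1024;
    float *dbuf = nullptr;
    cudaMalloc(&dbuf, n * sizeof(float));

    tf::cudaFlow cf;
    // 1 block of 256 threads, no dynamic shared memory, like the calls above.
    tf::cudaTask p = cf.kernel(1, 256, 0, producer, dbuf, n).name("producer");
    tf::cudaTask c = cf.kernel(1, 256, 0, consumer, dbuf, n).name("consumer");
    p.precede(c);  // consumer may only start once producer has finished

    tf::cudaStream stream;
    cf.run(stream);        // launch the whole graph on the stream
    stream.synchronize();  // block until every task has completed
    cudaFree(dbuf);
}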
@@ -50,6 +50,10 @@ protected:
     // Allocated by the data stager (must free ourselves)
     coor_t *p_coordinates;
+    std::complex<double> *dafinal_ = nullptr;
+    std::complex<double> *da2final_ = nullptr;
+    fftw_complex *datfinal_ = nullptr;
     size_t current_atomindex_ = 0;
     size_t coord_size_ = 0;
...
@@ -108,20 +108,13 @@ __global__ void sass::cuda::store(fftw_complex *afinal, fftw_complex *a2final,
     for (size_t i = 0; i < NF; i++) {
         a += thrust::complex<double>(data[i][0], data[i][1]);
-        atfinal[i][0] += data[i][0];
-        atfinal[i][1] += data[i][1];
-        // atomicAddDbl(( double * )&(atfinal[i][0]), data[i][0]);
-        // atomicAddDbl(( double * )&(atfinal[i][1]), data[i][1]);
+        atomicAddDbl(( double * )&(atfinal[i][0]), data[i][0]);
+        atomicAddDbl(( double * )&(atfinal[i][1]), data[i][1]);
     }
-    a *= 1 / NF;
-    // atomicAddDbl(( double * )&(afinal[0]), a.real());
-    // atomicAddDbl(( double * )&(afinal[1]), a.imag());
-    (*afinal)[0] += a.real();
-    (*afinal)[1] += a.imag();
+    a *= 1.0 / NF;
+    atomicAddDbl(( double * )&(afinal[0]), a.real());
+    atomicAddDbl(( double * )&(afinal[1]), a.imag());
     auto a2 = a * thrust::conj(a);
     atomicAddDbl(( double * )&(a2final[0]), a2.real());
...
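The rewritten kernel body replaces plain read-modify-write updates, which race when several threads add into the same element, with atomicAddDbl, and fixes an integer-division bug along the way: a *= 1 / NF truncates to zero whenever NF > 1, whereas a *= 1.0 / NF averages as intended. atomicAddDbl itself is defined outside this hunk; on GPUs without a native double-precision atomicAdd (compute capability below 6.0) it is presumably the canonical CAS loop from the CUDA C Programming Guide:

// Canonical software atomicAdd for doubles from the CUDA C Programming
// Guide; assumed (not shown in this diff) to be what sass::cuda's
// atomicAddDbl provides on GPUs lacking the native double atomicAdd.
__device__ double atomicAddDbl(double *address, double val) {
    unsigned long long int *address_as_ull =
        reinterpret_cast<unsigned long long int *>(address);
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Reinterpret the bits, add, and try to swap; retry whenever
        // another thread updated the slot in the meantime.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);  // value before this thread's add
}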