// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>    // std::isfinite
#include <cstdlib>  // rand, RAND_MAX, EXIT_FAILURE
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef float float_type;
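
// Note: boost::math::sph_neumann is callable inside the __global__ kernel
// below because Boost.Math provides GPU-enabled overloads of its Bessel and
// Neumann functions for NVCC/NVRTC/SYCL builds; this translation unit is
// meant to be compiled with NVCC as a .cu file.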

/**
 * CUDA Kernel Device code
 *
 * Evaluates the spherical Neumann function for each (order, argument) pair.
 */
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::sph_neumann(in1[i], in2[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector A
    cuda_managed_ptr<float_type> input_vector1(numElements);

    // Allocate the managed input vector B
    cuda_managed_ptr<float_type> input_vector2(numElements);

    // Allocate the managed output vector C
    cuda_managed_ptr<float_type> output_vector(numElements);
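
    // cuda_managed_ptr (declared in cuda_managed_ptr.hpp) hands out CUDA
    // managed (unified) memory, which is why the buffers above can be read and
    // written from both the host loops below and the device kernel.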

    // Initialize the input vectors
    for (int i = 0; i < numElements; ++i)
    {
        input_vector1[i] = rand()/(float_type)RAND_MAX;
        input_vector2[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the sph_neumann CUDA kernel
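    // One thread per element: blocksPerGrid is rounded up so that
    // blocksPerGrid * threadsPerBlock >= numElements
    // (e.g. 50000 elements at 256 threads per block -> 196 blocks).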
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();
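
    // The <<<...>>> launch is asynchronous; cudaDeviceSynchronize() above makes
    // sure the kernel has finished before the elapsed time is read below.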

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::sph_neumann(input_vector1[i], input_vector2[i]));
    double t = w.elapsed();
    bool failed = false;
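
    // epsilon_difference reports how far apart two values are in units of
    // machine epsilon; results within 5000 eps of the serial reference are
    // accepted, since the device and host code paths need not round identically.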
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
        {
            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
            {
                std::cout << "error at element: " << i
                          << "\nParallel: " << output_vector[i]
                          << "\n  Serial: " << results[i]
                          << "\n    Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
                failed = true;
            }
        }
    }

    if (failed)
    {
        return EXIT_FAILURE;
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
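
// A plausible way to build and run this example (assumed, not taken from the
// source; adjust the file name and include paths to your checkout):
//   nvcc -std=c++14 -I/path/to/boost -I. sph_neumann_cuda_test.cu -o sph_neumann_cuda_test
//   ./sph_neumann_cuda_test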